characterproperties.cpp 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431
  1. // © 2018 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. // characterproperties.cpp
  4. // created: 2018sep03 Markus W. Scherer
  5. #include "unicode/utypes.h"
  6. #include "unicode/localpointer.h"
  7. #include "unicode/uchar.h"
  8. #include "unicode/ucpmap.h"
  9. #include "unicode/ucptrie.h"
  10. #include "unicode/umutablecptrie.h"
  11. #include "unicode/uniset.h"
  12. #include "unicode/uscript.h"
  13. #include "unicode/uset.h"
  14. #include "cmemory.h"
  15. #include "emojiprops.h"
  16. #include "mutex.h"
  17. #include "normalizer2impl.h"
  18. #include "uassert.h"
  19. #include "ubidi_props.h"
  20. #include "ucase.h"
  21. #include "ucln_cmn.h"
  22. #include "umutex.h"
  23. #include "uprops.h"
  24. using icu::LocalPointer;
  25. #if !UCONFIG_NO_NORMALIZATION
  26. using icu::Normalizer2Factory;
  27. using icu::Normalizer2Impl;
  28. #endif
  29. using icu::UInitOnce;
  30. using icu::UnicodeSet;
  31. namespace {
  32. UBool U_CALLCONV characterproperties_cleanup();
  33. constexpr int32_t NUM_INCLUSIONS = UPROPS_SRC_COUNT + (UCHAR_INT_LIMIT - UCHAR_INT_START);
  34. struct Inclusion {
  35. UnicodeSet *fSet = nullptr;
  36. UInitOnce fInitOnce {};
  37. };
  38. Inclusion gInclusions[NUM_INCLUSIONS]; // cached getInclusions()
  39. UnicodeSet *sets[UCHAR_BINARY_LIMIT] = {};
  40. UCPMap *maps[UCHAR_INT_LIMIT - UCHAR_INT_START] = {};
  41. icu::UMutex cpMutex;
  42. //----------------------------------------------------------------
  43. // Inclusions list
  44. //----------------------------------------------------------------
  45. // USetAdder implementation
  46. // Does not use uset.h to reduce code dependencies
  47. void U_CALLCONV
  48. _set_add(USet *set, UChar32 c) {
  49. reinterpret_cast<UnicodeSet*>(set)->add(c);
  50. }
  51. void U_CALLCONV
  52. _set_addRange(USet *set, UChar32 start, UChar32 end) {
  53. reinterpret_cast<UnicodeSet*>(set)->add(start, end);
  54. }
  55. void U_CALLCONV
  56. _set_addString(USet *set, const char16_t *str, int32_t length) {
  57. reinterpret_cast<UnicodeSet*>(set)->add(icu::UnicodeString(static_cast<UBool>(length < 0), str, length));
  58. }
  59. UBool U_CALLCONV characterproperties_cleanup() {
  60. for (Inclusion &in: gInclusions) {
  61. delete in.fSet;
  62. in.fSet = nullptr;
  63. in.fInitOnce.reset();
  64. }
  65. for (int32_t i = 0; i < UPRV_LENGTHOF(sets); ++i) {
  66. delete sets[i];
  67. sets[i] = nullptr;
  68. }
  69. for (int32_t i = 0; i < UPRV_LENGTHOF(maps); ++i) {
  70. ucptrie_close(reinterpret_cast<UCPTrie *>(maps[i]));
  71. maps[i] = nullptr;
  72. }
  73. return true;
  74. }
  75. void U_CALLCONV initInclusion(UPropertySource src, UErrorCode &errorCode) {
  76. // This function is invoked only via umtx_initOnce().
  77. U_ASSERT(0 <= src && src < UPROPS_SRC_COUNT);
  78. if (src == UPROPS_SRC_NONE) {
  79. errorCode = U_INTERNAL_PROGRAM_ERROR;
  80. return;
  81. }
  82. U_ASSERT(gInclusions[src].fSet == nullptr);
  83. LocalPointer<UnicodeSet> incl(new UnicodeSet());
  84. if (incl.isNull()) {
  85. errorCode = U_MEMORY_ALLOCATION_ERROR;
  86. return;
  87. }
  88. USetAdder sa = {
  89. reinterpret_cast<USet*>(incl.getAlias()),
  90. _set_add,
  91. _set_addRange,
  92. _set_addString,
  93. nullptr, // don't need remove()
  94. nullptr // don't need removeRange()
  95. };
  96. switch(src) {
  97. case UPROPS_SRC_CHAR:
  98. uchar_addPropertyStarts(&sa, &errorCode);
  99. break;
  100. case UPROPS_SRC_PROPSVEC:
  101. upropsvec_addPropertyStarts(&sa, &errorCode);
  102. break;
  103. case UPROPS_SRC_CHAR_AND_PROPSVEC:
  104. uchar_addPropertyStarts(&sa, &errorCode);
  105. upropsvec_addPropertyStarts(&sa, &errorCode);
  106. break;
  107. #if !UCONFIG_NO_NORMALIZATION
  108. case UPROPS_SRC_CASE_AND_NORM: {
  109. const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode);
  110. if(U_SUCCESS(errorCode)) {
  111. impl->addPropertyStarts(&sa, errorCode);
  112. }
  113. ucase_addPropertyStarts(&sa, &errorCode);
  114. break;
  115. }
  116. case UPROPS_SRC_NFC: {
  117. const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode);
  118. if(U_SUCCESS(errorCode)) {
  119. impl->addPropertyStarts(&sa, errorCode);
  120. }
  121. break;
  122. }
  123. case UPROPS_SRC_NFKC: {
  124. const Normalizer2Impl *impl=Normalizer2Factory::getNFKCImpl(errorCode);
  125. if(U_SUCCESS(errorCode)) {
  126. impl->addPropertyStarts(&sa, errorCode);
  127. }
  128. break;
  129. }
  130. case UPROPS_SRC_NFKC_CF: {
  131. const Normalizer2Impl *impl=Normalizer2Factory::getNFKC_CFImpl(errorCode);
  132. if(U_SUCCESS(errorCode)) {
  133. impl->addPropertyStarts(&sa, errorCode);
  134. }
  135. break;
  136. }
  137. case UPROPS_SRC_NFC_CANON_ITER: {
  138. const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode);
  139. if(U_SUCCESS(errorCode)) {
  140. impl->addCanonIterPropertyStarts(&sa, errorCode);
  141. }
  142. break;
  143. }
  144. #endif
  145. case UPROPS_SRC_CASE:
  146. ucase_addPropertyStarts(&sa, &errorCode);
  147. break;
  148. case UPROPS_SRC_BIDI:
  149. ubidi_addPropertyStarts(&sa, &errorCode);
  150. break;
  151. case UPROPS_SRC_INPC:
  152. case UPROPS_SRC_INSC:
  153. case UPROPS_SRC_VO:
  154. uprops_addPropertyStarts(src, &sa, &errorCode);
  155. break;
  156. case UPROPS_SRC_EMOJI: {
  157. const icu::EmojiProps *ep = icu::EmojiProps::getSingleton(errorCode);
  158. if (U_SUCCESS(errorCode)) {
  159. ep->addPropertyStarts(&sa, errorCode);
  160. }
  161. break;
  162. }
  163. case UPROPS_SRC_IDSU:
  164. // New in Unicode 15.1 for just two characters.
  165. sa.add(sa.set, 0x2FFE);
  166. sa.add(sa.set, 0x2FFF + 1);
  167. break;
  168. case UPROPS_SRC_ID_COMPAT_MATH:
  169. case UPROPS_SRC_MCM:
  170. uprops_addPropertyStarts(src, &sa, &errorCode);
  171. break;
  172. case UPROPS_SRC_BLOCK:
  173. ublock_addPropertyStarts(&sa, errorCode);
  174. break;
  175. default:
  176. errorCode = U_INTERNAL_PROGRAM_ERROR;
  177. break;
  178. }
  179. if (U_FAILURE(errorCode)) {
  180. return;
  181. }
  182. if (incl->isBogus()) {
  183. errorCode = U_MEMORY_ALLOCATION_ERROR;
  184. return;
  185. }
  186. // Compact for caching.
  187. incl->compact();
  188. gInclusions[src].fSet = incl.orphan();
  189. ucln_common_registerCleanup(UCLN_COMMON_CHARACTERPROPERTIES, characterproperties_cleanup);
  190. }
  191. const UnicodeSet *getInclusionsForSource(UPropertySource src, UErrorCode &errorCode) {
  192. if (U_FAILURE(errorCode)) { return nullptr; }
  193. if (src < 0 || UPROPS_SRC_COUNT <= src) {
  194. errorCode = U_ILLEGAL_ARGUMENT_ERROR;
  195. return nullptr;
  196. }
  197. Inclusion &i = gInclusions[src];
  198. umtx_initOnce(i.fInitOnce, &initInclusion, src, errorCode);
  199. return i.fSet;
  200. }
  201. void U_CALLCONV initIntPropInclusion(UProperty prop, UErrorCode &errorCode) {
  202. // This function is invoked only via umtx_initOnce().
  203. U_ASSERT(UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT);
  204. int32_t inclIndex = UPROPS_SRC_COUNT + (prop - UCHAR_INT_START);
  205. U_ASSERT(gInclusions[inclIndex].fSet == nullptr);
  206. UPropertySource src = uprops_getSource(prop);
  207. const UnicodeSet *incl = getInclusionsForSource(src, errorCode);
  208. if (U_FAILURE(errorCode)) {
  209. return;
  210. }
  211. LocalPointer<UnicodeSet> intPropIncl(new UnicodeSet(0, 0));
  212. if (intPropIncl.isNull()) {
  213. errorCode = U_MEMORY_ALLOCATION_ERROR;
  214. return;
  215. }
  216. int32_t numRanges = incl->getRangeCount();
  217. int32_t prevValue = 0;
  218. for (int32_t i = 0; i < numRanges; ++i) {
  219. UChar32 rangeEnd = incl->getRangeEnd(i);
  220. for (UChar32 c = incl->getRangeStart(i); c <= rangeEnd; ++c) {
  221. // TODO: Get a UCharacterProperty.IntProperty to avoid the property dispatch.
  222. int32_t value = u_getIntPropertyValue(c, prop);
  223. if (value != prevValue) {
  224. intPropIncl->add(c);
  225. prevValue = value;
  226. }
  227. }
  228. }
  229. if (intPropIncl->isBogus()) {
  230. errorCode = U_MEMORY_ALLOCATION_ERROR;
  231. return;
  232. }
  233. // Compact for caching.
  234. intPropIncl->compact();
  235. gInclusions[inclIndex].fSet = intPropIncl.orphan();
  236. ucln_common_registerCleanup(UCLN_COMMON_CHARACTERPROPERTIES, characterproperties_cleanup);
  237. }
  238. } // namespace
  239. U_NAMESPACE_BEGIN
  240. const UnicodeSet *CharacterProperties::getInclusionsForProperty(
  241. UProperty prop, UErrorCode &errorCode) {
  242. if (U_FAILURE(errorCode)) { return nullptr; }
  243. if (UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT) {
  244. int32_t inclIndex = UPROPS_SRC_COUNT + (prop - UCHAR_INT_START);
  245. Inclusion &i = gInclusions[inclIndex];
  246. umtx_initOnce(i.fInitOnce, &initIntPropInclusion, prop, errorCode);
  247. return i.fSet;
  248. } else {
  249. UPropertySource src = uprops_getSource(prop);
  250. return getInclusionsForSource(src, errorCode);
  251. }
  252. }
  253. U_NAMESPACE_END
  254. namespace {
  255. UnicodeSet *makeSet(UProperty property, UErrorCode &errorCode) {
  256. if (U_FAILURE(errorCode)) { return nullptr; }
  257. LocalPointer<UnicodeSet> set(new UnicodeSet());
  258. if (set.isNull()) {
  259. errorCode = U_MEMORY_ALLOCATION_ERROR;
  260. return nullptr;
  261. }
  262. if (UCHAR_BASIC_EMOJI <= property && property <= UCHAR_RGI_EMOJI) {
  263. // property of strings
  264. const icu::EmojiProps *ep = icu::EmojiProps::getSingleton(errorCode);
  265. if (U_FAILURE(errorCode)) { return nullptr; }
  266. USetAdder sa = {
  267. reinterpret_cast<USet*>(set.getAlias()),
  268. _set_add,
  269. _set_addRange,
  270. _set_addString,
  271. nullptr, // don't need remove()
  272. nullptr // don't need removeRange()
  273. };
  274. ep->addStrings(&sa, property, errorCode);
  275. if (property != UCHAR_BASIC_EMOJI && property != UCHAR_RGI_EMOJI) {
  276. // property of _only_ strings
  277. set->freeze();
  278. return set.orphan();
  279. }
  280. }
  281. const UnicodeSet *inclusions =
  282. icu::CharacterProperties::getInclusionsForProperty(property, errorCode);
  283. if (U_FAILURE(errorCode)) { return nullptr; }
  284. int32_t numRanges = inclusions->getRangeCount();
  285. UChar32 startHasProperty = -1;
  286. for (int32_t i = 0; i < numRanges; ++i) {
  287. UChar32 rangeEnd = inclusions->getRangeEnd(i);
  288. for (UChar32 c = inclusions->getRangeStart(i); c <= rangeEnd; ++c) {
  289. // TODO: Get a UCharacterProperty.BinaryProperty to avoid the property dispatch.
  290. if (u_hasBinaryProperty(c, property)) {
  291. if (startHasProperty < 0) {
  292. // Transition from false to true.
  293. startHasProperty = c;
  294. }
  295. } else if (startHasProperty >= 0) {
  296. // Transition from true to false.
  297. set->add(startHasProperty, c - 1);
  298. startHasProperty = -1;
  299. }
  300. }
  301. }
  302. if (startHasProperty >= 0) {
  303. set->add(startHasProperty, 0x10FFFF);
  304. }
  305. set->freeze();
  306. return set.orphan();
  307. }
  308. UCPMap *makeMap(UProperty property, UErrorCode &errorCode) {
  309. if (U_FAILURE(errorCode)) { return nullptr; }
  310. uint32_t nullValue = property == UCHAR_SCRIPT ? USCRIPT_UNKNOWN : 0;
  311. icu::LocalUMutableCPTriePointer mutableTrie(
  312. umutablecptrie_open(nullValue, nullValue, &errorCode));
  313. const UnicodeSet *inclusions =
  314. icu::CharacterProperties::getInclusionsForProperty(property, errorCode);
  315. if (U_FAILURE(errorCode)) { return nullptr; }
  316. int32_t numRanges = inclusions->getRangeCount();
  317. UChar32 start = 0;
  318. uint32_t value = nullValue;
  319. for (int32_t i = 0; i < numRanges; ++i) {
  320. UChar32 rangeEnd = inclusions->getRangeEnd(i);
  321. for (UChar32 c = inclusions->getRangeStart(i); c <= rangeEnd; ++c) {
  322. // TODO: Get a UCharacterProperty.IntProperty to avoid the property dispatch.
  323. uint32_t nextValue = u_getIntPropertyValue(c, property);
  324. if (value != nextValue) {
  325. if (value != nullValue) {
  326. umutablecptrie_setRange(mutableTrie.getAlias(), start, c - 1, value, &errorCode);
  327. }
  328. start = c;
  329. value = nextValue;
  330. }
  331. }
  332. }
  333. if (value != 0) {
  334. umutablecptrie_setRange(mutableTrie.getAlias(), start, 0x10FFFF, value, &errorCode);
  335. }
  336. UCPTrieType type;
  337. if (property == UCHAR_BIDI_CLASS || property == UCHAR_GENERAL_CATEGORY) {
  338. type = UCPTRIE_TYPE_FAST;
  339. } else {
  340. type = UCPTRIE_TYPE_SMALL;
  341. }
  342. UCPTrieValueWidth valueWidth;
  343. // TODO: UCharacterProperty.IntProperty
  344. int32_t max = u_getIntPropertyMaxValue(property);
  345. if (max <= 0xff) {
  346. valueWidth = UCPTRIE_VALUE_BITS_8;
  347. } else if (max <= 0xffff) {
  348. valueWidth = UCPTRIE_VALUE_BITS_16;
  349. } else {
  350. valueWidth = UCPTRIE_VALUE_BITS_32;
  351. }
  352. return reinterpret_cast<UCPMap *>(
  353. umutablecptrie_buildImmutable(mutableTrie.getAlias(), type, valueWidth, &errorCode));
  354. }
  355. } // namespace
  356. U_NAMESPACE_BEGIN
  357. const UnicodeSet *CharacterProperties::getBinaryPropertySet(UProperty property, UErrorCode &errorCode) {
  358. if (U_FAILURE(errorCode)) { return nullptr; }
  359. if (property < 0 || UCHAR_BINARY_LIMIT <= property) {
  360. errorCode = U_ILLEGAL_ARGUMENT_ERROR;
  361. return nullptr;
  362. }
  363. Mutex m(&cpMutex);
  364. UnicodeSet *set = sets[property];
  365. if (set == nullptr) {
  366. sets[property] = set = makeSet(property, errorCode);
  367. }
  368. return set;
  369. }
  370. U_NAMESPACE_END
  371. U_NAMESPACE_USE
  372. U_CAPI const USet * U_EXPORT2
  373. u_getBinaryPropertySet(UProperty property, UErrorCode *pErrorCode) {
  374. const UnicodeSet *set = CharacterProperties::getBinaryPropertySet(property, *pErrorCode);
  375. return U_SUCCESS(*pErrorCode) ? set->toUSet() : nullptr;
  376. }
  377. U_CAPI const UCPMap * U_EXPORT2
  378. u_getIntPropertyMap(UProperty property, UErrorCode *pErrorCode) {
  379. if (U_FAILURE(*pErrorCode)) { return nullptr; }
  380. if (property < UCHAR_INT_START || UCHAR_INT_LIMIT <= property) {
  381. *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
  382. return nullptr;
  383. }
  384. Mutex m(&cpMutex);
  385. UCPMap *map = maps[property - UCHAR_INT_START];
  386. if (map == nullptr) {
  387. maps[property - UCHAR_INT_START] = map = makeMap(property, *pErrorCode);
  388. }
  389. return map;
  390. }