static_unicode_sets.cpp 9.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245
  1. // © 2018 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. #include "unicode/utypes.h"
  4. #if !UCONFIG_NO_FORMATTING
  5. // Allow implicit conversion from char16_t* to UnicodeString for this file:
  6. // Helpful in toString methods and elsewhere.
  7. #define UNISTR_FROM_STRING_EXPLICIT
  8. #include "static_unicode_sets.h"
  9. #include "umutex.h"
  10. #include "ucln_cmn.h"
  11. #include "unicode/uniset.h"
  12. #include "uresimp.h"
  13. #include "cstring.h"
  14. #include "uassert.h"
  15. using namespace icu;
  16. using namespace icu::unisets;
  17. namespace {
  18. UnicodeSet* gUnicodeSets[UNISETS_KEY_COUNT] = {};
  19. // Save the empty instance in static memory to have well-defined behavior if a
  20. // regular UnicodeSet cannot be allocated.
  21. alignas(UnicodeSet)
  22. char gEmptyUnicodeSet[sizeof(UnicodeSet)];
  23. // Whether the gEmptyUnicodeSet is initialized and ready to use.
  24. UBool gEmptyUnicodeSetInitialized = false;
  25. inline UnicodeSet* getImpl(Key key) {
  26. UnicodeSet* candidate = gUnicodeSets[key];
  27. if (candidate == nullptr) {
  28. return reinterpret_cast<UnicodeSet*>(gEmptyUnicodeSet);
  29. }
  30. return candidate;
  31. }
  32. UnicodeSet* computeUnion(Key k1, Key k2) {
  33. UnicodeSet* result = new UnicodeSet();
  34. if (result == nullptr) {
  35. return nullptr;
  36. }
  37. result->addAll(*getImpl(k1));
  38. result->addAll(*getImpl(k2));
  39. result->freeze();
  40. return result;
  41. }
  42. UnicodeSet* computeUnion(Key k1, Key k2, Key k3) {
  43. UnicodeSet* result = new UnicodeSet();
  44. if (result == nullptr) {
  45. return nullptr;
  46. }
  47. result->addAll(*getImpl(k1));
  48. result->addAll(*getImpl(k2));
  49. result->addAll(*getImpl(k3));
  50. result->freeze();
  51. return result;
  52. }
  53. void saveSet(Key key, const UnicodeString& unicodeSetPattern, UErrorCode& status) {
  54. // assert unicodeSets.get(key) == null;
  55. gUnicodeSets[key] = new UnicodeSet(unicodeSetPattern, status);
  56. }
  57. class ParseDataSink : public ResourceSink {
  58. public:
  59. void put(const char* key, ResourceValue& value, UBool /*noFallback*/, UErrorCode& status) override {
  60. ResourceTable contextsTable = value.getTable(status);
  61. if (U_FAILURE(status)) { return; }
  62. for (int i = 0; contextsTable.getKeyAndValue(i, key, value); i++) {
  63. if (uprv_strcmp(key, "date") == 0) {
  64. // ignore
  65. } else {
  66. ResourceTable strictnessTable = value.getTable(status);
  67. if (U_FAILURE(status)) { return; }
  68. for (int j = 0; strictnessTable.getKeyAndValue(j, key, value); j++) {
  69. bool isLenient = (uprv_strcmp(key, "lenient") == 0);
  70. ResourceArray array = value.getArray(status);
  71. if (U_FAILURE(status)) { return; }
  72. for (int k = 0; k < array.getSize(); k++) {
  73. array.getValue(k, value);
  74. UnicodeString str = value.getUnicodeString(status);
  75. if (U_FAILURE(status)) { return; }
  76. // There is both lenient and strict data for comma/period,
  77. // but not for any of the other symbols.
  78. if (str.indexOf(u'.') != -1) {
  79. saveSet(isLenient ? PERIOD : STRICT_PERIOD, str, status);
  80. } else if (str.indexOf(u',') != -1) {
  81. saveSet(isLenient ? COMMA : STRICT_COMMA, str, status);
  82. } else if (str.indexOf(u'+') != -1) {
  83. saveSet(PLUS_SIGN, str, status);
  84. } else if (str.indexOf(u'-') != -1) {
  85. saveSet(MINUS_SIGN, str, status);
  86. } else if (str.indexOf(u'$') != -1) {
  87. saveSet(DOLLAR_SIGN, str, status);
  88. } else if (str.indexOf(u'£') != -1) {
  89. saveSet(POUND_SIGN, str, status);
  90. } else if (str.indexOf(u'₹') != -1) {
  91. saveSet(RUPEE_SIGN, str, status);
  92. } else if (str.indexOf(u'¥') != -1) {
  93. saveSet(YEN_SIGN, str, status);
  94. } else if (str.indexOf(u'₩') != -1) {
  95. saveSet(WON_SIGN, str, status);
  96. } else if (str.indexOf(u'%') != -1) {
  97. saveSet(PERCENT_SIGN, str, status);
  98. } else if (str.indexOf(u'‰') != -1) {
  99. saveSet(PERMILLE_SIGN, str, status);
  100. } else if (str.indexOf(u'’') != -1) {
  101. saveSet(APOSTROPHE_SIGN, str, status);
  102. } else {
  103. // Unknown class of parse lenients
  104. // TODO(ICU-20428): Make ICU automatically accept new classes?
  105. U_ASSERT(false);
  106. }
  107. if (U_FAILURE(status)) { return; }
  108. }
  109. }
  110. }
  111. }
  112. }
  113. };
  114. icu::UInitOnce gNumberParseUniSetsInitOnce {};
  115. UBool U_CALLCONV cleanupNumberParseUniSets() {
  116. if (gEmptyUnicodeSetInitialized) {
  117. reinterpret_cast<UnicodeSet*>(gEmptyUnicodeSet)->~UnicodeSet();
  118. gEmptyUnicodeSetInitialized = false;
  119. }
  120. for (int32_t i = 0; i < UNISETS_KEY_COUNT; i++) {
  121. delete gUnicodeSets[i];
  122. gUnicodeSets[i] = nullptr;
  123. }
  124. gNumberParseUniSetsInitOnce.reset();
  125. return true;
  126. }
  127. void U_CALLCONV initNumberParseUniSets(UErrorCode& status) {
  128. ucln_common_registerCleanup(UCLN_COMMON_NUMPARSE_UNISETS, cleanupNumberParseUniSets);
  129. // Initialize the empty instance for well-defined fallback behavior
  130. new(gEmptyUnicodeSet) UnicodeSet();
  131. reinterpret_cast<UnicodeSet*>(gEmptyUnicodeSet)->freeze();
  132. gEmptyUnicodeSetInitialized = true;
  133. // These sets were decided after discussion with icu-design@. See tickets #13084 and #13309.
  134. // Zs+TAB is "horizontal whitespace" according to UTS #18 (blank property).
  135. gUnicodeSets[DEFAULT_IGNORABLES] = new UnicodeSet(
  136. u"[[:Zs:][\\u0009][:Bidi_Control:][:Variation_Selector:]]", status);
  137. gUnicodeSets[STRICT_IGNORABLES] = new UnicodeSet(u"[[:Bidi_Control:]]", status);
  138. LocalUResourceBundlePointer rb(ures_open(nullptr, "root", &status));
  139. if (U_FAILURE(status)) { return; }
  140. ParseDataSink sink;
  141. ures_getAllItemsWithFallback(rb.getAlias(), "parse", sink, status);
  142. if (U_FAILURE(status)) { return; }
  143. // NOTE: It is OK for these assertions to fail if there was a no-data build.
  144. U_ASSERT(gUnicodeSets[COMMA] != nullptr);
  145. U_ASSERT(gUnicodeSets[STRICT_COMMA] != nullptr);
  146. U_ASSERT(gUnicodeSets[PERIOD] != nullptr);
  147. U_ASSERT(gUnicodeSets[STRICT_PERIOD] != nullptr);
  148. U_ASSERT(gUnicodeSets[APOSTROPHE_SIGN] != nullptr);
  149. LocalPointer<UnicodeSet> otherGrouping(new UnicodeSet(
  150. u"[٬‘\\u0020\\u00A0\\u2000-\\u200A\\u202F\\u205F\\u3000]",
  151. status
  152. ), status);
  153. if (U_FAILURE(status)) { return; }
  154. otherGrouping->addAll(*gUnicodeSets[APOSTROPHE_SIGN]);
  155. gUnicodeSets[OTHER_GROUPING_SEPARATORS] = otherGrouping.orphan();
  156. gUnicodeSets[ALL_SEPARATORS] = computeUnion(COMMA, PERIOD, OTHER_GROUPING_SEPARATORS);
  157. gUnicodeSets[STRICT_ALL_SEPARATORS] = computeUnion(
  158. STRICT_COMMA, STRICT_PERIOD, OTHER_GROUPING_SEPARATORS);
  159. U_ASSERT(gUnicodeSets[MINUS_SIGN] != nullptr);
  160. U_ASSERT(gUnicodeSets[PLUS_SIGN] != nullptr);
  161. U_ASSERT(gUnicodeSets[PERCENT_SIGN] != nullptr);
  162. U_ASSERT(gUnicodeSets[PERMILLE_SIGN] != nullptr);
  163. gUnicodeSets[INFINITY_SIGN] = new UnicodeSet(u"[∞]", status);
  164. if (U_FAILURE(status)) { return; }
  165. U_ASSERT(gUnicodeSets[DOLLAR_SIGN] != nullptr);
  166. U_ASSERT(gUnicodeSets[POUND_SIGN] != nullptr);
  167. U_ASSERT(gUnicodeSets[RUPEE_SIGN] != nullptr);
  168. U_ASSERT(gUnicodeSets[YEN_SIGN] != nullptr);
  169. U_ASSERT(gUnicodeSets[WON_SIGN] != nullptr);
  170. gUnicodeSets[DIGITS] = new UnicodeSet(u"[:digit:]", status);
  171. if (U_FAILURE(status)) { return; }
  172. gUnicodeSets[DIGITS_OR_ALL_SEPARATORS] = computeUnion(DIGITS, ALL_SEPARATORS);
  173. gUnicodeSets[DIGITS_OR_STRICT_ALL_SEPARATORS] = computeUnion(DIGITS, STRICT_ALL_SEPARATORS);
  174. for (auto* uniset : gUnicodeSets) {
  175. if (uniset != nullptr) {
  176. uniset->freeze();
  177. }
  178. }
  179. }
  180. }
  181. const UnicodeSet* unisets::get(Key key) {
  182. UErrorCode localStatus = U_ZERO_ERROR;
  183. umtx_initOnce(gNumberParseUniSetsInitOnce, &initNumberParseUniSets, localStatus);
  184. if (U_FAILURE(localStatus)) {
  185. return reinterpret_cast<UnicodeSet*>(gEmptyUnicodeSet);
  186. }
  187. return getImpl(key);
  188. }
  189. Key unisets::chooseFrom(UnicodeString str, Key key1) {
  190. return get(key1)->contains(str) ? key1 : NONE;
  191. }
  192. Key unisets::chooseFrom(UnicodeString str, Key key1, Key key2) {
  193. return get(key1)->contains(str) ? key1 : chooseFrom(str, key2);
  194. }
  195. //Key unisets::chooseCurrency(UnicodeString str) {
  196. // if (get(DOLLAR_SIGN)->contains(str)) {
  197. // return DOLLAR_SIGN;
  198. // } else if (get(POUND_SIGN)->contains(str)) {
  199. // return POUND_SIGN;
  200. // } else if (get(RUPEE_SIGN)->contains(str)) {
  201. // return RUPEE_SIGN;
  202. // } else if (get(YEN_SIGN)->contains(str)) {
  203. // return YEN_SIGN;
  204. // } else {
  205. // return NONE;
  206. // }
  207. //}
  208. #endif /* #if !UCONFIG_NO_FORMATTING */