regexst.cpp 6.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. //
  4. // regexst.cpp
  5. //
  6. // Copyright (C) 2004-2015, International Business Machines Corporation and others.
  7. // All Rights Reserved.
  8. //
  9. // This file contains class RegexStaticSets
  10. //
  11. // This class is internal to the regular expression implementation.
  12. // For the public Regular Expression API, see the file "unicode/regex.h"
  13. //
  14. // RegexStaticSets groups together the common UnicodeSets that are needed
  15. // for compiling or executing RegularExpressions. This grouping simplifies
  16. // the thread safe lazy creation and sharing of these sets across
  17. // all instances of regular expressions.
  18. //
  19. #include "unicode/utypes.h"
  20. #if !UCONFIG_NO_REGULAR_EXPRESSIONS
  21. #include "unicode/unistr.h"
  22. #include "unicode/uniset.h"
  23. #include "unicode/uchar.h"
  24. #include "unicode/regex.h"
  25. #include "uprops.h"
  26. #include "cmemory.h"
  27. #include "cstring.h"
  28. #include "uassert.h"
  29. #include "ucln_in.h"
  30. #include "umutex.h"
  31. #include "regexcst.h" // Contains state table for the regex pattern parser.
  32. // generated by a Perl script.
  33. #include "regexst.h"
  34. U_NAMESPACE_BEGIN
  35. // "Rule Char" Characters are those with special meaning, and therefore
  36. // need to be escaped to appear as literals in a regexp.
  37. constexpr char16_t const *gRuleSet_rule_chars = u"*?+[(){}^$|\\.";
  38. //
  39. // The backslash escape characters that ICU's unescape() function will handle.
  40. //
  41. constexpr char16_t const *gUnescapeChars = u"acefnrtuUx";
  42. //
  43. // Unicode Set pattern for Regular Expression \w
  44. //
  45. constexpr char16_t const *gIsWordPattern = u"[\\p{Alphabetic}\\p{M}\\p{Nd}\\p{Pc}\\u200c\\u200d]";
  46. //
  47. // Unicode Set Definitions for Regular Expression \s
  48. //
  49. constexpr char16_t const *gIsSpacePattern = u"[\\p{WhiteSpace}]";
  50. //
  51. // UnicodeSets used in implementation of Grapheme Cluster detection, \X
  52. //
  53. constexpr char16_t const *gGC_ControlPattern = u"[[:Zl:][:Zp:][:Cc:][:Cf:]-[:Grapheme_Extend:]]";
  54. constexpr char16_t const *gGC_ExtendPattern = u"[\\p{Grapheme_Extend}]";
  55. constexpr char16_t const *gGC_LPattern = u"[\\p{Hangul_Syllable_Type=L}]";
  56. constexpr char16_t const *gGC_VPattern = u"[\\p{Hangul_Syllable_Type=V}]";
  57. constexpr char16_t const *gGC_TPattern = u"[\\p{Hangul_Syllable_Type=T}]";
  58. constexpr char16_t const *gGC_LVPattern = u"[\\p{Hangul_Syllable_Type=LV}]";
  59. constexpr char16_t const *gGC_LVTPattern = u"[\\p{Hangul_Syllable_Type=LVT}]";
  60. RegexStaticSets *RegexStaticSets::gStaticSets = nullptr;
  61. UInitOnce gStaticSetsInitOnce {};
  62. RegexStaticSets::RegexStaticSets(UErrorCode *status) {
  63. // Initialize the shared static sets to their correct values.
  64. fUnescapeCharSet.addAll(UnicodeString(true, gUnescapeChars, -1)).freeze();
  65. fPropSets[URX_ISWORD_SET].applyPattern(UnicodeString(true, gIsWordPattern, -1), *status).freeze();
  66. fPropSets[URX_ISSPACE_SET].applyPattern(UnicodeString(true, gIsSpacePattern, -1), *status).freeze();
  67. fPropSets[URX_GC_EXTEND].applyPattern(UnicodeString(true, gGC_ExtendPattern, -1), *status).freeze();
  68. fPropSets[URX_GC_CONTROL].applyPattern(UnicodeString(true, gGC_ControlPattern, -1), *status).freeze();
  69. fPropSets[URX_GC_L].applyPattern(UnicodeString(true, gGC_LPattern, -1), *status).freeze();
  70. fPropSets[URX_GC_V].applyPattern(UnicodeString(true, gGC_VPattern, -1), *status).freeze();
  71. fPropSets[URX_GC_T].applyPattern(UnicodeString(true, gGC_TPattern, -1), *status).freeze();
  72. fPropSets[URX_GC_LV].applyPattern(UnicodeString(true, gGC_LVPattern, -1), *status).freeze();
  73. fPropSets[URX_GC_LVT].applyPattern(UnicodeString(true, gGC_LVTPattern, -1), *status).freeze();
  74. //
  75. // "Normal" is the set of characters that don't need special handling
  76. // when finding grapheme cluster boundaries.
  77. //
  78. fPropSets[URX_GC_NORMAL].complement();
  79. fPropSets[URX_GC_NORMAL].remove(0xac00, 0xd7a4);
  80. fPropSets[URX_GC_NORMAL].removeAll(fPropSets[URX_GC_CONTROL]);
  81. fPropSets[URX_GC_NORMAL].removeAll(fPropSets[URX_GC_L]);
  82. fPropSets[URX_GC_NORMAL].removeAll(fPropSets[URX_GC_V]);
  83. fPropSets[URX_GC_NORMAL].removeAll(fPropSets[URX_GC_T]);
  84. fPropSets[URX_GC_NORMAL].freeze();
  85. // Initialize the 8-bit fast bit sets from the parallel full
  86. // UnicodeSets.
  87. //
  88. // TODO: 25 Oct 2019 are these fast 8-bit sets worth keeping?
  89. // Measured 3.5% gain on (non) matching with the pattern "x(?:\\S+)+x"
  90. // This runs in exponential time, making it easy to adjust the time for
  91. // convenient measuring.
  92. //
  93. // This 8 bit optimization dates from the early days of ICU,
  94. // with a less optimized UnicodeSet. At the time, the difference
  95. // was substantial.
  96. for (int32_t i=0; i<URX_LAST_SET; i++) {
  97. fPropSets8[i].init(&fPropSets[i]);
  98. }
  99. // Sets used while parsing rules, but not referenced from the parse state table
  100. fRuleSets[kRuleSet_rule_char-128]
  101. .addAll(UnicodeString(gRuleSet_rule_chars)).complement().freeze();
  102. fRuleSets[kRuleSet_digit_char-128].add(u'0', u'9').freeze();
  103. fRuleSets[kRuleSet_ascii_letter-128].add(u'A', u'Z').add(u'a', u'z').freeze();
  104. fRuleDigitsAlias = &fRuleSets[kRuleSet_digit_char-128];
  105. // Finally, initialize an empty UText string for utility purposes
  106. fEmptyText = utext_openUChars(nullptr, nullptr, 0, status);
  107. }
  108. RegexStaticSets::~RegexStaticSets() {
  109. fRuleDigitsAlias = nullptr;
  110. utext_close(fEmptyText);
  111. }
  112. //------------------------------------------------------------------------------
  113. //
  114. // regex_cleanup Memory cleanup function, free/delete all
  115. // cached memory. Called by ICU's u_cleanup() function.
  116. //
  117. //------------------------------------------------------------------------------
  118. U_CDECL_BEGIN
  119. static UBool U_CALLCONV
  120. regex_cleanup() {
  121. delete RegexStaticSets::gStaticSets;
  122. RegexStaticSets::gStaticSets = nullptr;
  123. gStaticSetsInitOnce.reset();
  124. return true;
  125. }
  126. static void U_CALLCONV initStaticSets(UErrorCode &status) {
  127. U_ASSERT(RegexStaticSets::gStaticSets == nullptr);
  128. ucln_i18n_registerCleanup(UCLN_I18N_REGEX, regex_cleanup);
  129. RegexStaticSets::gStaticSets = new RegexStaticSets(&status);
  130. if (U_FAILURE(status)) {
  131. delete RegexStaticSets::gStaticSets;
  132. RegexStaticSets::gStaticSets = nullptr;
  133. }
  134. if (RegexStaticSets::gStaticSets == nullptr && U_SUCCESS(status)) {
  135. status = U_MEMORY_ALLOCATION_ERROR;
  136. }
  137. }
  138. U_CDECL_END
  139. void RegexStaticSets::initGlobals(UErrorCode *status) {
  140. umtx_initOnce(gStaticSetsInitOnce, &initStaticSets, *status);
  141. }
  142. U_NAMESPACE_END
  143. #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS