rbt.cpp 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. **********************************************************************
  5. * Copyright (C) 1999-2015, International Business Machines
  6. * Corporation and others. All Rights Reserved.
  7. **********************************************************************
  8. * Date Name Description
  9. * 11/17/99 aliu Creation.
  10. **********************************************************************
  11. */
  12. #include "unicode/utypes.h"
  13. #if !UCONFIG_NO_TRANSLITERATION
  14. #include "unicode/rep.h"
  15. #include "unicode/uniset.h"
  16. #include "rbt_pars.h"
  17. #include "rbt_data.h"
  18. #include "rbt_rule.h"
  19. #include "rbt.h"
  20. #include "mutex.h"
  21. #include "umutex.h"
  U_NAMESPACE_BEGIN
  UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedTransliterator)
  // Address of the text object being transliterated by the handleTransliterate()
  // call that currently holds the transliterator data mutex, or nullptr when no
  // call holds it. handleTransliterate() reads and writes this variable only
  // while holding the global ICU mutex.
  static Replaceable *gLockedText = nullptr;
  25. void RuleBasedTransliterator::_construct(const UnicodeString& rules,
  26. UTransDirection direction,
  27. UParseError& parseError,
  28. UErrorCode& status) {
  29. fData = 0;
  30. isDataOwned = true;
  31. if (U_FAILURE(status)) {
  32. return;
  33. }
  34. TransliteratorParser parser(status);
  35. parser.parse(rules, direction, parseError, status);
  36. if (U_FAILURE(status)) {
  37. return;
  38. }
  39. if (parser.idBlockVector.size() != 0 ||
  40. parser.compoundFilter != nullptr ||
  41. parser.dataVector.size() == 0) {
  42. status = U_INVALID_RBT_SYNTAX; // ::ID blocks disallowed in RBT
  43. return;
  44. }
  45. fData = (TransliterationRuleData*)parser.dataVector.orphanElementAt(0);
  46. setMaximumContextLength(fData->ruleSet.getMaximumContextLength());
  47. }
  48. /**
  49. * Constructs a new transliterator from the given rules.
  50. * @param id the id for the transliterator.
  51. * @param rules rules, separated by ';'
  52. * @param direction either FORWARD or REVERSE.
  53. * @param adoptedFilter the filter for this transliterator.
  54. * @param parseError Struct to receive information on position
  55. * of error if an error is encountered
  56. * @param status Output param set to success/failure code.
  57. * @exception IllegalArgumentException if rules are malformed
  58. * or direction is invalid.
  59. */
  60. RuleBasedTransliterator::RuleBasedTransliterator(
  61. const UnicodeString& id,
  62. const UnicodeString& rules,
  63. UTransDirection direction,
  64. UnicodeFilter* adoptedFilter,
  65. UParseError& parseError,
  66. UErrorCode& status) :
  67. Transliterator(id, adoptedFilter) {
  68. _construct(rules, direction,parseError,status);
  69. }
  70. /**
  71. * Constructs a new transliterator from the given rules.
  72. * @param id the id for the transliterator.
  73. * @param rules rules, separated by ';'
  74. * @param direction either FORWARD or REVERSE.
  75. * @param adoptedFilter the filter for this transliterator.
  76. * @param status Output param set to success/failure code.
  77. * @exception IllegalArgumentException if rules are malformed
  78. * or direction is invalid.
  79. */
  80. /*RuleBasedTransliterator::RuleBasedTransliterator(
  81. const UnicodeString& id,
  82. const UnicodeString& rules,
  83. UTransDirection direction,
  84. UnicodeFilter* adoptedFilter,
  85. UErrorCode& status) :
  86. Transliterator(id, adoptedFilter) {
  87. UParseError parseError;
  88. _construct(rules, direction,parseError, status);
  89. }*/
  90. /**
  91. * Convenience constructor with no filter.
  92. */
  93. /*RuleBasedTransliterator::RuleBasedTransliterator(
  94. const UnicodeString& id,
  95. const UnicodeString& rules,
  96. UTransDirection direction,
  97. UErrorCode& status) :
  98. Transliterator(id, 0) {
  99. UParseError parseError;
  100. _construct(rules, direction,parseError, status);
  101. }*/
  102. /**
  103. * Convenience constructor with no filter and FORWARD direction.
  104. */
  105. /*RuleBasedTransliterator::RuleBasedTransliterator(
  106. const UnicodeString& id,
  107. const UnicodeString& rules,
  108. UErrorCode& status) :
  109. Transliterator(id, 0) {
  110. UParseError parseError;
  111. _construct(rules, UTRANS_FORWARD, parseError, status);
  112. }*/
  113. /**
  114. * Convenience constructor with FORWARD direction.
  115. */
  116. /*RuleBasedTransliterator::RuleBasedTransliterator(
  117. const UnicodeString& id,
  118. const UnicodeString& rules,
  119. UnicodeFilter* adoptedFilter,
  120. UErrorCode& status) :
  121. Transliterator(id, adoptedFilter) {
  122. UParseError parseError;
  123. _construct(rules, UTRANS_FORWARD,parseError, status);
  124. }*/
  125. RuleBasedTransliterator::RuleBasedTransliterator(const UnicodeString& id,
  126. const TransliterationRuleData* theData,
  127. UnicodeFilter* adoptedFilter) :
  128. Transliterator(id, adoptedFilter),
  129. fData((TransliterationRuleData*)theData), // cast away const
  130. isDataOwned(false) {
  131. setMaximumContextLength(fData->ruleSet.getMaximumContextLength());
  132. }
  133. /**
  134. * Internal constructor.
  135. */
  136. RuleBasedTransliterator::RuleBasedTransliterator(const UnicodeString& id,
  137. TransliterationRuleData* theData,
  138. UBool isDataAdopted) :
  139. Transliterator(id, 0),
  140. fData(theData),
  141. isDataOwned(isDataAdopted) {
  142. setMaximumContextLength(fData->ruleSet.getMaximumContextLength());
  143. }
  144. /**
  145. * Copy constructor.
  146. */
  147. RuleBasedTransliterator::RuleBasedTransliterator(
  148. const RuleBasedTransliterator& other) :
  149. Transliterator(other), fData(other.fData),
  150. isDataOwned(other.isDataOwned) {
  151. // The data object may or may not be owned. If it is not owned we
  152. // share it; it is invariant. If it is owned, it's still
  153. // invariant, but we need to copy it to prevent double-deletion.
  154. // If this becomes a performance issue (if people do a lot of RBT
  155. // copying -- unlikely) we can reference count the data object.
  156. // Only do a deep copy if this is owned data, that is, data that
  157. // will be later deleted. System transliterators contain
  158. // non-owned data.
  159. if (isDataOwned) {
  160. fData = new TransliterationRuleData(*other.fData);
  161. }
  162. }
  163. /**
  164. * Destructor.
  165. */
  166. RuleBasedTransliterator::~RuleBasedTransliterator() {
  167. // Delete the data object only if we own it.
  168. if (isDataOwned) {
  169. delete fData;
  170. }
  171. }
  172. RuleBasedTransliterator*
  173. RuleBasedTransliterator::clone() const {
  174. return new RuleBasedTransliterator(*this);
  175. }
  176. /**
  177. * Implements {@link Transliterator#handleTransliterate}.
  178. */
  179. void
  180. RuleBasedTransliterator::handleTransliterate(Replaceable& text, UTransPosition& index,
  181. UBool isIncremental) const {
  182. /* We keep contextStart and contextLimit fixed the entire time,
  183. * relative to the text -- contextLimit may move numerically if
  184. * text is inserted or removed. The start offset moves toward
  185. * limit, with replacements happening under it.
  186. *
  187. * Example: rules 1. ab>x|y
  188. * 2. yc>z
  189. *
  190. * |eabcd begin - no match, advance start
  191. * e|abcd match rule 1 - change text & adjust start
  192. * ex|ycd match rule 2 - change text & adjust start
  193. * exz|d no match, advance start
  194. * exzd| done
  195. */
  196. /* A rule like
  197. * a>b|a
  198. * creates an infinite loop. To prevent that, we put an arbitrary
  199. * limit on the number of iterations that we take, one that is
  200. * high enough that any reasonable rules are ok, but low enough to
  201. * prevent a server from hanging. The limit is 16 times the
  202. * number of characters n, unless n is so large that 16n exceeds a
  203. * uint32_t.
  204. */
  205. uint32_t loopCount = 0;
  206. uint32_t loopLimit = index.limit - index.start;
  207. if (loopLimit >= 0x10000000) {
  208. loopLimit = 0xFFFFFFFF;
  209. } else {
  210. loopLimit <<= 4;
  211. }
  212. // Transliterator locking. Rule-based Transliterators are not thread safe; concurrent
  213. // operations must be prevented.
  214. // A Complication: compound transliterators can result in recursive entries to this
  215. // function, sometimes with different "This" objects, always with the same text.
  216. // Double-locking must be prevented in these cases.
  217. //
  218. UBool lockedMutexAtThisLevel = false;
  219. // Test whether this request is operating on the same text string as
  220. // some other transliteration that is still in progress and holding the
  221. // transliteration mutex. If so, do not lock the transliteration
  222. // mutex again.
  223. //
  224. // gLockedText variable is protected by the global ICU mutex.
  225. // Shared RBT data protected by transliteratorDataMutex.
  226. //
  227. // TODO(andy): Need a better scheme for handling this.
  228. static UMutex transliteratorDataMutex;
  229. UBool needToLock;
  230. {
  231. Mutex m;
  232. needToLock = (&text != gLockedText);
  233. }
  234. if (needToLock) {
  235. umtx_lock(&transliteratorDataMutex); // Contention, longish waits possible here.
  236. Mutex m;
  237. gLockedText = &text;
  238. lockedMutexAtThisLevel = true;
  239. }
  240. // Check to make sure we don't dereference a null pointer.
  241. if (fData != nullptr) {
  242. while (index.start < index.limit &&
  243. loopCount <= loopLimit &&
  244. fData->ruleSet.transliterate(text, index, isIncremental)) {
  245. ++loopCount;
  246. }
  247. }
  248. if (lockedMutexAtThisLevel) {
  249. {
  250. Mutex m;
  251. gLockedText = nullptr;
  252. }
  253. umtx_unlock(&transliteratorDataMutex);
  254. }
  255. }
  256. UnicodeString& RuleBasedTransliterator::toRules(UnicodeString& rulesSource,
  257. UBool escapeUnprintable) const {
  258. return fData->ruleSet.toRules(rulesSource, escapeUnprintable);
  259. }
  260. /**
  261. * Implement Transliterator framework
  262. */
  263. void RuleBasedTransliterator::handleGetSourceSet(UnicodeSet& result) const {
  264. fData->ruleSet.getSourceTargetSet(result, false);
  265. }
  266. /**
  267. * Override Transliterator framework
  268. */
  269. UnicodeSet& RuleBasedTransliterator::getTargetSet(UnicodeSet& result) const {
  270. return fData->ruleSet.getSourceTargetSet(result, true);
  271. }
  272. U_NAMESPACE_END
  273. #endif /* #if !UCONFIG_NO_TRANSLITERATION */