123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307 |
- // © 2016 and later: Unicode, Inc. and others.
- // License & terms of use: http://www.unicode.org/copyright.html
- /*
- **********************************************************************
- * Copyright (C) 1999-2015, International Business Machines
- * Corporation and others. All Rights Reserved.
- **********************************************************************
- * Date Name Description
- * 11/17/99 aliu Creation.
- **********************************************************************
- */
- #include "unicode/utypes.h"
- #if !UCONFIG_NO_TRANSLITERATION
- #include "unicode/rep.h"
- #include "unicode/uniset.h"
- #include "rbt_pars.h"
- #include "rbt_data.h"
- #include "rbt_rule.h"
- #include "rbt.h"
- #include "mutex.h"
- #include "umutex.h"
- U_NAMESPACE_BEGIN
- UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedTransliterator)
- static Replaceable *gLockedText = nullptr;
- void RuleBasedTransliterator::_construct(const UnicodeString& rules,
- UTransDirection direction,
- UParseError& parseError,
- UErrorCode& status) {
- fData = 0;
- isDataOwned = true;
- if (U_FAILURE(status)) {
- return;
- }
- TransliteratorParser parser(status);
- parser.parse(rules, direction, parseError, status);
- if (U_FAILURE(status)) {
- return;
- }
- if (parser.idBlockVector.size() != 0 ||
- parser.compoundFilter != nullptr ||
- parser.dataVector.size() == 0) {
- status = U_INVALID_RBT_SYNTAX; // ::ID blocks disallowed in RBT
- return;
- }
- fData = (TransliterationRuleData*)parser.dataVector.orphanElementAt(0);
- setMaximumContextLength(fData->ruleSet.getMaximumContextLength());
- }
- /**
- * Constructs a new transliterator from the given rules.
- * @param id the id for the transliterator.
- * @param rules rules, separated by ';'
- * @param direction either FORWARD or REVERSE.
- * @param adoptedFilter the filter for this transliterator.
- * @param parseError Struct to receive information on position
- * of error if an error is encountered
- * @param status Output param set to success/failure code.
- * @exception IllegalArgumentException if rules are malformed
- * or direction is invalid.
- */
- RuleBasedTransliterator::RuleBasedTransliterator(
- const UnicodeString& id,
- const UnicodeString& rules,
- UTransDirection direction,
- UnicodeFilter* adoptedFilter,
- UParseError& parseError,
- UErrorCode& status) :
- Transliterator(id, adoptedFilter) {
- _construct(rules, direction,parseError,status);
- }
- /**
- * Constructs a new transliterator from the given rules.
- * @param id the id for the transliterator.
- * @param rules rules, separated by ';'
- * @param direction either FORWARD or REVERSE.
- * @param adoptedFilter the filter for this transliterator.
- * @param status Output param set to success/failure code.
- * @exception IllegalArgumentException if rules are malformed
- * or direction is invalid.
- */
- /*RuleBasedTransliterator::RuleBasedTransliterator(
- const UnicodeString& id,
- const UnicodeString& rules,
- UTransDirection direction,
- UnicodeFilter* adoptedFilter,
- UErrorCode& status) :
- Transliterator(id, adoptedFilter) {
- UParseError parseError;
- _construct(rules, direction,parseError, status);
- }*/
- /**
- * Convenience constructor with no filter.
- */
- /*RuleBasedTransliterator::RuleBasedTransliterator(
- const UnicodeString& id,
- const UnicodeString& rules,
- UTransDirection direction,
- UErrorCode& status) :
- Transliterator(id, 0) {
- UParseError parseError;
- _construct(rules, direction,parseError, status);
- }*/
- /**
- * Convenience constructor with no filter and FORWARD direction.
- */
- /*RuleBasedTransliterator::RuleBasedTransliterator(
- const UnicodeString& id,
- const UnicodeString& rules,
- UErrorCode& status) :
- Transliterator(id, 0) {
- UParseError parseError;
- _construct(rules, UTRANS_FORWARD, parseError, status);
- }*/
- /**
- * Convenience constructor with FORWARD direction.
- */
- /*RuleBasedTransliterator::RuleBasedTransliterator(
- const UnicodeString& id,
- const UnicodeString& rules,
- UnicodeFilter* adoptedFilter,
- UErrorCode& status) :
- Transliterator(id, adoptedFilter) {
- UParseError parseError;
- _construct(rules, UTRANS_FORWARD,parseError, status);
- }*/
- RuleBasedTransliterator::RuleBasedTransliterator(const UnicodeString& id,
- const TransliterationRuleData* theData,
- UnicodeFilter* adoptedFilter) :
- Transliterator(id, adoptedFilter),
- fData((TransliterationRuleData*)theData), // cast away const
- isDataOwned(false) {
- setMaximumContextLength(fData->ruleSet.getMaximumContextLength());
- }
- /**
- * Internal constructor.
- */
- RuleBasedTransliterator::RuleBasedTransliterator(const UnicodeString& id,
- TransliterationRuleData* theData,
- UBool isDataAdopted) :
- Transliterator(id, 0),
- fData(theData),
- isDataOwned(isDataAdopted) {
- setMaximumContextLength(fData->ruleSet.getMaximumContextLength());
- }
- /**
- * Copy constructor.
- */
- RuleBasedTransliterator::RuleBasedTransliterator(
- const RuleBasedTransliterator& other) :
- Transliterator(other), fData(other.fData),
- isDataOwned(other.isDataOwned) {
- // The data object may or may not be owned. If it is not owned we
- // share it; it is invariant. If it is owned, it's still
- // invariant, but we need to copy it to prevent double-deletion.
- // If this becomes a performance issue (if people do a lot of RBT
- // copying -- unlikely) we can reference count the data object.
- // Only do a deep copy if this is owned data, that is, data that
- // will be later deleted. System transliterators contain
- // non-owned data.
- if (isDataOwned) {
- fData = new TransliterationRuleData(*other.fData);
- }
- }
- /**
- * Destructor.
- */
- RuleBasedTransliterator::~RuleBasedTransliterator() {
- // Delete the data object only if we own it.
- if (isDataOwned) {
- delete fData;
- }
- }
- RuleBasedTransliterator*
- RuleBasedTransliterator::clone() const {
- return new RuleBasedTransliterator(*this);
- }
- /**
- * Implements {@link Transliterator#handleTransliterate}.
- */
- void
- RuleBasedTransliterator::handleTransliterate(Replaceable& text, UTransPosition& index,
- UBool isIncremental) const {
- /* We keep contextStart and contextLimit fixed the entire time,
- * relative to the text -- contextLimit may move numerically if
- * text is inserted or removed. The start offset moves toward
- * limit, with replacements happening under it.
- *
- * Example: rules 1. ab>x|y
- * 2. yc>z
- *
- * |eabcd begin - no match, advance start
- * e|abcd match rule 1 - change text & adjust start
- * ex|ycd match rule 2 - change text & adjust start
- * exz|d no match, advance start
- * exzd| done
- */
- /* A rule like
- * a>b|a
- * creates an infinite loop. To prevent that, we put an arbitrary
- * limit on the number of iterations that we take, one that is
- * high enough that any reasonable rules are ok, but low enough to
- * prevent a server from hanging. The limit is 16 times the
- * number of characters n, unless n is so large that 16n exceeds a
- * uint32_t.
- */
- uint32_t loopCount = 0;
- uint32_t loopLimit = index.limit - index.start;
- if (loopLimit >= 0x10000000) {
- loopLimit = 0xFFFFFFFF;
- } else {
- loopLimit <<= 4;
- }
- // Transliterator locking. Rule-based Transliterators are not thread safe; concurrent
- // operations must be prevented.
- // A Complication: compound transliterators can result in recursive entries to this
- // function, sometimes with different "This" objects, always with the same text.
- // Double-locking must be prevented in these cases.
- //
- UBool lockedMutexAtThisLevel = false;
- // Test whether this request is operating on the same text string as
- // some other transliteration that is still in progress and holding the
- // transliteration mutex. If so, do not lock the transliteration
- // mutex again.
- //
- // gLockedText variable is protected by the global ICU mutex.
- // Shared RBT data protected by transliteratorDataMutex.
- //
- // TODO(andy): Need a better scheme for handling this.
- static UMutex transliteratorDataMutex;
- UBool needToLock;
- {
- Mutex m;
- needToLock = (&text != gLockedText);
- }
- if (needToLock) {
- umtx_lock(&transliteratorDataMutex); // Contention, longish waits possible here.
- Mutex m;
- gLockedText = &text;
- lockedMutexAtThisLevel = true;
- }
-
- // Check to make sure we don't dereference a null pointer.
- if (fData != nullptr) {
- while (index.start < index.limit &&
- loopCount <= loopLimit &&
- fData->ruleSet.transliterate(text, index, isIncremental)) {
- ++loopCount;
- }
- }
- if (lockedMutexAtThisLevel) {
- {
- Mutex m;
- gLockedText = nullptr;
- }
- umtx_unlock(&transliteratorDataMutex);
- }
- }
- UnicodeString& RuleBasedTransliterator::toRules(UnicodeString& rulesSource,
- UBool escapeUnprintable) const {
- return fData->ruleSet.toRules(rulesSource, escapeUnprintable);
- }
- /**
- * Implement Transliterator framework
- */
- void RuleBasedTransliterator::handleGetSourceSet(UnicodeSet& result) const {
- fData->ruleSet.getSourceTargetSet(result, false);
- }
- /**
- * Override Transliterator framework
- */
- UnicodeSet& RuleBasedTransliterator::getTargetSet(UnicodeSet& result) const {
- return fData->ruleSet.getSourceTargetSet(result, true);
- }
- U_NAMESPACE_END
- #endif /* #if !UCONFIG_NO_TRANSLITERATION */
|