translit.cpp 61 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. **********************************************************************
  5. * Copyright (C) 1999-2016, International Business Machines
  6. * Corporation and others. All Rights Reserved.
  7. **********************************************************************
  8. * Date Name Description
  9. * 11/17/99 aliu Creation.
  10. **********************************************************************
  11. */
  12. #include "utypeinfo.h" // for 'typeid' to work
  13. #include "unicode/utypes.h"
  14. #if !UCONFIG_NO_TRANSLITERATION
  15. #include "unicode/putil.h"
  16. #include "unicode/translit.h"
  17. #include "unicode/locid.h"
  18. #include "unicode/msgfmt.h"
  19. #include "unicode/rep.h"
  20. #include "unicode/resbund.h"
  21. #include "unicode/unifilt.h"
  22. #include "unicode/uniset.h"
  23. #include "unicode/uscript.h"
  24. #include "unicode/strenum.h"
  25. #include "unicode/utf16.h"
  26. #include "cpdtrans.h"
  27. #include "nultrans.h"
  28. #include "rbt_data.h"
  29. #include "rbt_pars.h"
  30. #include "rbt.h"
  31. #include "transreg.h"
  32. #include "name2uni.h"
  33. #include "nortrans.h"
  34. #include "remtrans.h"
  35. #include "titletrn.h"
  36. #include "tolowtrn.h"
  37. #include "toupptrn.h"
  38. #include "uni2name.h"
  39. #include "brktrans.h"
  40. #include "esctrn.h"
  41. #include "unesctrn.h"
  42. #include "tridpars.h"
  43. #include "anytrans.h"
  44. #include "util.h"
  45. #include "hash.h"
  46. #include "mutex.h"
  47. #include "ucln_in.h"
  48. #include "uassert.h"
  49. #include "cmemory.h"
  50. #include "cstring.h"
  51. #include "uinvchar.h"
  // Separator code units, spelled as numeric UTF-16 values rather than as
  // character literals: hyphen-minus, semicolon and solidus respectively.
  static constexpr char16_t TARGET_SEP  = 0x002D;  // '-'
  static constexpr char16_t ID_DELIM    = 0x003B;  // ';'
  static constexpr char16_t VARIANT_SEP = 0x002F;  // '/'
  /**
   * Resource bundle key prefix for the display name of a transliterator.
   * The transliterator ID is appended to this prefix to form the key.
   * The resource bundle value should be a String.
   */
  static constexpr char RB_DISPLAY_NAME_PREFIX[] = "%Translit%%";

  /**
   * Resource bundle key prefix for the display name of a transliterator
   * SCRIPT. The ID is appended to this prefix to form the key.
   * The resource bundle value should be a String.
   */
  static constexpr char RB_SCRIPT_DISPLAY_NAME_PREFIX[] = "%Translit%";

  /**
   * Resource bundle key for the display name pattern.
   * The resource bundle value should be a String holding a
   * MessageFormat pattern, for example:
   * "{0,choice,0#|1#{1} Transliterator|2#{1} to {2} Transliterator}".
   */
  static constexpr char RB_DISPLAY_NAME_PATTERN[] = "TransliteratorNamePattern";

  /**
   * Resource bundle key for the list of RuleBasedTransliterator IDs.
   * The resource bundle value should be a String[] in which every element
   * is a valid ID. The ID will be appended to RB_RULE_BASED_PREFIX
   * to obtain the class name in which the RB_RULE key will be sought.
   */
  static constexpr char RB_RULE_BASED_IDS[] = "RuleBasedTransliteratorIDs";
  81. /**
  82. * The mutex controlling access to registry object.
  83. */
  84. static icu::UMutex registryMutex;
  85. /**
  86. * System transliterator registry; non-null when initialized.
  87. */
  88. static icu::TransliteratorRegistry* registry = 0;
  89. // Macro to check/initialize the registry. ONLY USE WITHIN
  90. // MUTEX. Avoids function call when registry is initialized.
  91. #define HAVE_REGISTRY(status) (registry!=0 || initializeRegistry(status))
  92. U_NAMESPACE_BEGIN
  93. UOBJECT_DEFINE_ABSTRACT_RTTI_IMPLEMENTATION(Transliterator)
  94. /**
  95. * Return true if the given UTransPosition is valid for text of
  96. * the given length.
  97. */
  98. static inline UBool positionIsValid(UTransPosition& index, int32_t len) {
  99. return !(index.contextStart < 0 ||
  100. index.start < index.contextStart ||
  101. index.limit < index.start ||
  102. index.contextLimit < index.limit ||
  103. len < index.contextLimit);
  104. }
  105. /**
  106. * Default constructor.
  107. * @param theID the string identifier for this transliterator
  108. * @param theFilter the filter. Any character for which
  109. * <tt>filter.contains()</tt> returns <tt>false</tt> will not be
  110. * altered by this transliterator. If <tt>filter</tt> is
  111. * <tt>null</tt> then no filtering is applied.
  112. */
  113. Transliterator::Transliterator(const UnicodeString& theID,
  114. UnicodeFilter* adoptedFilter) :
  115. UObject(), ID(theID), filter(adoptedFilter),
  116. maximumContextLength(0)
  117. {
  118. // NUL-terminate the ID string, which is a non-aliased copy.
  119. ID.append((char16_t)0);
  120. ID.truncate(ID.length()-1);
  121. }
  122. /**
  123. * Destructor.
  124. */
  125. Transliterator::~Transliterator() {
  126. if (filter) {
  127. delete filter;
  128. }
  129. }
  130. /**
  131. * Copy constructor.
  132. */
  133. Transliterator::Transliterator(const Transliterator& other) :
  134. UObject(other), ID(other.ID), filter(0),
  135. maximumContextLength(other.maximumContextLength)
  136. {
  137. // NUL-terminate the ID string, which is a non-aliased copy.
  138. ID.append((char16_t)0);
  139. ID.truncate(ID.length()-1);
  140. if (other.filter != 0) {
  141. // We own the filter, so we must have our own copy
  142. filter = other.filter->clone();
  143. }
  144. }
  145. Transliterator* Transliterator::clone() const {
  146. return nullptr;
  147. }
  148. /**
  149. * Assignment operator.
  150. */
  151. Transliterator& Transliterator::operator=(const Transliterator& other) {
  152. if (this == &other) { return *this; } // self-assignment: no-op
  153. ID = other.ID;
  154. // NUL-terminate the ID string
  155. ID.getTerminatedBuffer();
  156. maximumContextLength = other.maximumContextLength;
  157. adoptFilter((other.filter == 0) ? 0 : other.filter->clone());
  158. return *this;
  159. }
  160. /**
  161. * Transliterates a segment of a string. <code>Transliterator</code> API.
  162. * @param text the string to be transliterated
  163. * @param start the beginning index, inclusive; <code>0 <= start
  164. * <= limit</code>.
  165. * @param limit the ending index, exclusive; <code>start <= limit
  166. * <= text.length()</code>.
  167. * @return the new limit index, or -1
  168. */
  169. int32_t Transliterator::transliterate(Replaceable& text,
  170. int32_t start, int32_t limit) const {
  171. if (start < 0 ||
  172. limit < start ||
  173. text.length() < limit) {
  174. return -1;
  175. }
  176. UTransPosition offsets;
  177. offsets.contextStart= start;
  178. offsets.contextLimit = limit;
  179. offsets.start = start;
  180. offsets.limit = limit;
  181. filteredTransliterate(text, offsets, false, true);
  182. return offsets.limit;
  183. }
  184. /**
  185. * Transliterates an entire string in place. Convenience method.
  186. * @param text the string to be transliterated
  187. */
  188. void Transliterator::transliterate(Replaceable& text) const {
  189. transliterate(text, 0, text.length());
  190. }
  191. /**
  192. * Transliterates the portion of the text buffer that can be
  193. * transliterated unambiguosly after new text has been inserted,
  194. * typically as a result of a keyboard event. The new text in
  195. * <code>insertion</code> will be inserted into <code>text</code>
  196. * at <code>index.contextLimit</code>, advancing
  197. * <code>index.contextLimit</code> by <code>insertion.length()</code>.
  198. * Then the transliterator will try to transliterate characters of
  199. * <code>text</code> between <code>index.start</code> and
  200. * <code>index.contextLimit</code>. Characters before
  201. * <code>index.start</code> will not be changed.
  202. *
  203. * <p>Upon return, values in <code>index</code> will be updated.
  204. * <code>index.contextStart</code> will be advanced to the first
  205. * character that future calls to this method will read.
  206. * <code>index.start</code> and <code>index.contextLimit</code> will
  207. * be adjusted to delimit the range of text that future calls to
  208. * this method may change.
  209. *
  210. * <p>Typical usage of this method begins with an initial call
  211. * with <code>index.contextStart</code> and <code>index.contextLimit</code>
  212. * set to indicate the portion of <code>text</code> to be
  213. * transliterated, and <code>index.start == index.contextStart</code>.
  214. * Thereafter, <code>index</code> can be used without
  215. * modification in future calls, provided that all changes to
  216. * <code>text</code> are made via this method.
  217. *
  218. * <p>This method assumes that future calls may be made that will
  219. * insert new text into the buffer. As a result, it only performs
  220. * unambiguous transliterations. After the last call to this
  221. * method, there may be untransliterated text that is waiting for
  222. * more input to resolve an ambiguity. In order to perform these
  223. * pending transliterations, clients should call {@link
  224. * #finishKeyboardTransliteration} after the last call to this
  225. * method has been made.
  226. *
  227. * @param text the buffer holding transliterated and untransliterated text
  228. * @param index an array of three integers.
  229. *
  230. * <ul><li><code>index.contextStart</code>: the beginning index,
  231. * inclusive; <code>0 <= index.contextStart <= index.contextLimit</code>.
  232. *
  233. * <li><code>index.contextLimit</code>: the ending index, exclusive;
  234. * <code>index.contextStart <= index.contextLimit <= text.length()</code>.
  235. * <code>insertion</code> is inserted at
  236. * <code>index.contextLimit</code>.
  237. *
  238. * <li><code>index.start</code>: the next character to be
  239. * considered for transliteration; <code>index.contextStart <=
  240. * index.start <= index.contextLimit</code>. Characters before
  241. * <code>index.start</code> will not be changed by future calls
  242. * to this method.</ul>
  243. *
  244. * @param insertion text to be inserted and possibly
  245. * transliterated into the translation buffer at
  246. * <code>index.contextLimit</code>. If <code>null</code> then no text
  247. * is inserted.
  248. * @see #START
  249. * @see #LIMIT
  250. * @see #CURSOR
  251. * @see #handleTransliterate
  252. * @exception IllegalArgumentException if <code>index</code>
  253. * is invalid
  254. */
  255. void Transliterator::transliterate(Replaceable& text,
  256. UTransPosition& index,
  257. const UnicodeString& insertion,
  258. UErrorCode &status) const {
  259. _transliterate(text, index, &insertion, status);
  260. }
  261. /**
  262. * Transliterates the portion of the text buffer that can be
  263. * transliterated unambiguosly after a new character has been
  264. * inserted, typically as a result of a keyboard event. This is a
  265. * convenience method; see {@link
  266. * #transliterate(Replaceable, int[], String)} for details.
  267. * @param text the buffer holding transliterated and
  268. * untransliterated text
  269. * @param index an array of three integers. See {@link
  270. * #transliterate(Replaceable, int[], String)}.
  271. * @param insertion text to be inserted and possibly
  272. * transliterated into the translation buffer at
  273. * <code>index.contextLimit</code>.
  274. * @see #transliterate(Replaceable, int[], String)
  275. */
  276. void Transliterator::transliterate(Replaceable& text,
  277. UTransPosition& index,
  278. UChar32 insertion,
  279. UErrorCode& status) const {
  280. UnicodeString str(insertion);
  281. _transliterate(text, index, &str, status);
  282. }
  283. /**
  284. * Transliterates the portion of the text buffer that can be
  285. * transliterated unambiguosly. This is a convenience method; see
  286. * {@link #transliterate(Replaceable, int[], String)} for
  287. * details.
  288. * @param text the buffer holding transliterated and
  289. * untransliterated text
  290. * @param index an array of three integers. See {@link
  291. * #transliterate(Replaceable, int[], String)}.
  292. * @see #transliterate(Replaceable, int[], String)
  293. */
  294. void Transliterator::transliterate(Replaceable& text,
  295. UTransPosition& index,
  296. UErrorCode& status) const {
  297. _transliterate(text, index, 0, status);
  298. }
  299. /**
  300. * Finishes any pending transliterations that were waiting for
  301. * more characters. Clients should call this method as the last
  302. * call after a sequence of one or more calls to
  303. * <code>transliterate()</code>.
  304. * @param text the buffer holding transliterated and
  305. * untransliterated text.
  306. * @param index the array of indices previously passed to {@link
  307. * #transliterate}
  308. */
  309. void Transliterator::finishTransliteration(Replaceable& text,
  310. UTransPosition& index) const {
  311. if (!positionIsValid(index, text.length())) {
  312. return;
  313. }
  314. filteredTransliterate(text, index, false, true);
  315. }
  316. /**
  317. * This internal method does keyboard transliteration. If the
  318. * 'insertion' is non-null then we append it to 'text' before
  319. * proceeding. This method calls through to the pure virtual
  320. * framework method handleTransliterate() to do the actual
  321. * work.
  322. */
  323. void Transliterator::_transliterate(Replaceable& text,
  324. UTransPosition& index,
  325. const UnicodeString* insertion,
  326. UErrorCode &status) const {
  327. if (U_FAILURE(status)) {
  328. return;
  329. }
  330. if (!positionIsValid(index, text.length())) {
  331. status = U_ILLEGAL_ARGUMENT_ERROR;
  332. return;
  333. }
  334. // int32_t originalStart = index.contextStart;
  335. if (insertion != 0) {
  336. text.handleReplaceBetween(index.limit, index.limit, *insertion);
  337. index.limit += insertion->length();
  338. index.contextLimit += insertion->length();
  339. }
  340. if (index.limit > 0 &&
  341. U16_IS_LEAD(text.charAt(index.limit - 1))) {
  342. // Oops, there is a dangling lead surrogate in the buffer.
  343. // This will break most transliterators, since they will
  344. // assume it is part of a pair. Don't transliterate until
  345. // more text comes in.
  346. return;
  347. }
  348. filteredTransliterate(text, index, true, true);
  349. #if 0
  350. // TODO
  351. // I CAN'T DO what I'm attempting below now that the Kleene star
  352. // operator is supported. For example, in the rule
  353. // ([:Lu:]+) { x } > $1;
  354. // what is the maximum context length? getMaximumContextLength()
  355. // will return 1, but this is just the length of the ante context
  356. // part of the pattern string -- 1 character, which is a standin
  357. // for a Quantifier, which contains a StringMatcher, which
  358. // contains a UnicodeSet.
  359. // There is a complicated way to make this work again, and that's
  360. // to add a "maximum left context" protocol into the
  361. // UnicodeMatcher hierarchy. At present I'm not convinced this is
  362. // worth it.
  363. // ---
  364. // The purpose of the code below is to keep the context small
  365. // while doing incremental transliteration. When part of the left
  366. // context (between contextStart and start) is no longer needed,
  367. // we try to advance contextStart past that portion. We use the
  368. // maximum context length to do so.
  369. int32_t newCS = index.start;
  370. int32_t n = getMaximumContextLength();
  371. while (newCS > originalStart && n-- > 0) {
  372. --newCS;
  373. newCS -= U16_LENGTH(text.char32At(newCS)) - 1;
  374. }
  375. index.contextStart = uprv_max(newCS, originalStart);
  376. #endif
  377. }
  378. /**
  379. * This method breaks up the input text into runs of unfiltered
  380. * characters. It passes each such run to
  381. * <subclass>.handleTransliterate(). Subclasses that can handle the
  382. * filter logic more efficiently themselves may override this method.
  383. *
  384. * All transliteration calls in this class go through this method.
  *
  * @param text the text to transliterate; it is modified in place
  * @param index on entry, index.start and index.limit delimit the text to
  * process. On return, index.start has been moved past the text that was
  * processed and index.limit has been adjusted for any change in length.
  * @param incremental the incremental flag handed to handleTransliterate()
  * for a run that reaches the overall limit; a run that is followed by
  * more text is always processed non-incrementally
  * @param rollback if true, an incremental run is processed in passes that
  * grow by one code point at a time, and the text changes of a pass that
  * does not complete are rolled back
  385. */
  386. void Transliterator::filteredTransliterate(Replaceable& text,
  387. UTransPosition& index,
  388. UBool incremental,
  389. UBool rollback) const {
  390. // Short circuit path for transliterators with no filter when
  391. // rollback has not been requested.
  392. if (filter == 0 && !rollback) {
  393. handleTransliterate(text, index, incremental);
  394. return;
  395. }
  396. //----------------------------------------------------------------------
  397. // This method processes text in two groupings:
  398. //
  399. // RUNS -- A run is a contiguous group of characters which are contained
  400. // in the filter for this transliterator (filter.contains(ch) == true).
  401. // Text outside of runs may appear as context but it is not modified.
  402. // The start and limit Position values are narrowed to each run.
  403. //
  404. // PASSES (incremental only) -- To make incremental mode work correctly,
  405. // each run is broken up into n passes, where n is the length (in code
  406. // points) of the run. Each pass contains the first n characters. If a
  407. // pass is completely transliterated, it is committed, and further passes
  408. // include characters after the committed text. If a pass is blocked,
  409. // and does not transliterate completely, then this method rolls back
  410. // the changes made during the pass, extends the pass by one code point,
  411. // and tries again.
  412. //----------------------------------------------------------------------
  413. // globalLimit is the limit value for the entire operation. We
  414. // set index.limit to the end of each unfiltered run before
  415. // calling handleTransliterate(), so we need to maintain the real
  416. // value of index.limit here. After each transliteration, we
  417. // update globalLimit for insertions or deletions that have
  418. // happened.
  419. int32_t globalLimit = index.limit;
  420. // If there is a non-null filter, then break the input text up. Say the
  421. // input text has the form:
  422. // xxxabcxxdefxx
  423. // where 'x' represents a filtered character (filter.contains('x') ==
  424. // false). Then we break this up into:
  425. // xxxabc xxdef xx
  426. // Each pass through the loop consumes a run of filtered
  427. // characters (which are ignored) and a subsequent run of
  428. // unfiltered characters (which are transliterated).
  429. for (;;) {
  430. if (filter != nullptr) {
  431. // Narrow the range to be transliterated to the first segment
  432. // of unfiltered characters at or after index.start.
  433. // Advance past filtered chars
  434. UChar32 c;
  435. while (index.start < globalLimit &&
  436. !filter->contains(c=text.char32At(index.start))) {
  437. index.start += U16_LENGTH(c);
  438. }
  439. // Find the end of this run of unfiltered chars
  440. index.limit = index.start;
  441. while (index.limit < globalLimit &&
  442. filter->contains(c=text.char32At(index.limit))) {
  443. index.limit += U16_LENGTH(c);
  444. }
  445. }
  446. // Check to see if the unfiltered run is empty. This only
  447. // happens at the end of the string when all the remaining
  448. // characters are filtered.
  449. if (index.limit == index.start) {
  450. // assert(index.start == globalLimit);
  451. break;
  452. }
  453. // Is this run incremental? If there is additional
  454. // filtered text (if limit < globalLimit) then we pass in
  455. // an incremental value of false to force the subclass to
  456. // complete the transliteration for this run.
  457. UBool isIncrementalRun =
  458. (index.limit < globalLimit ? false : incremental);
  459. int32_t delta;
  460. // Implement rollback. To understand the need for rollback,
  461. // consider the following transliterator:
  462. //
  463. // "t" is "a > A;"
  464. // "u" is "A > b;"
  465. // "v" is a compound of "t; NFD; u" with a filter [:Ll:]
  466. //
  467. // Now apply "v" to the input text "a". The result is "b". But if
  468. // the transliteration is done incrementally, then the NFD holds
  469. // things up after "t" has already transformed "a" to "A". When
  470. // finishTransliteration() is called, "A" is _not_ processed because
  471. // it gets excluded by the [:Ll:] filter, and the end result is "A"
  472. // -- incorrect. The problem is that the filter is applied to a
  473. // partially-transliterated result, when we only want it to apply to
  474. // input text. Although this example hinges on a compound
  475. // transliterator containing NFD and a specific filter, it can
  476. // actually happen with any transliterator which may do a partial
  477. // transformation in incremental mode into characters outside its
  478. // filter.
  479. //
  480. // To handle this, when in incremental mode we supply characters to
  481. // handleTransliterate() in several passes. Each pass adds one more
  482. // input character to the input text. That is, for input "ABCD", we
  483. // first try "A", then "AB", then "ABC", and finally "ABCD". If at
  484. // any point we block (upon return, start < limit) then we roll
  485. // back. If at any point we complete the run (upon return start ==
  486. // limit) then we commit that run.
  487. if (rollback && isIncrementalRun) {
  488. int32_t runStart = index.start;
  489. int32_t runLimit = index.limit;
  490. int32_t runLength = runLimit - runStart;
  491. // Make a rollback copy at the end of the string
  492. int32_t rollbackOrigin = text.length();
  493. text.copy(runStart, runLimit, rollbackOrigin);
  494. // Variables reflecting the commitment of completely
  495. // transliterated text. passStart is the runStart, advanced
  496. // past committed text. rollbackStart is the rollbackOrigin,
  497. // advanced past rollback text that corresponds to committed
  498. // text.
  499. int32_t passStart = runStart;
  500. int32_t rollbackStart = rollbackOrigin;
  501. // The limit for each pass; we advance by one code point with
  502. // each iteration.
  503. int32_t passLimit = index.start;
  504. // Total length, in 16-bit code units, of uncommitted text.
  505. // This is the length to be rolled back.
  506. int32_t uncommittedLength = 0;
  507. // Total delta (change in length) for all passes
  508. int32_t totalDelta = 0;
  509. // PASS MAIN LOOP -- Start with a single character, and extend
  510. // the text by one character at a time. Roll back partial
  511. // transliterations and commit complete transliterations.
  512. for (;;) {
  513. // Length of additional code point, either one or two
  // (Once passLimit has reached runLimit, this reads the code point at
  // runLimit itself; that value is only used to push passLimit beyond
  // runLimit, which ends the loop just below.)
  514. int32_t charLength = U16_LENGTH(text.char32At(passLimit));
  515. passLimit += charLength;
  516. if (passLimit > runLimit) {
  517. break;
  518. }
  519. uncommittedLength += charLength;
  520. index.limit = passLimit;
  521. // Delegate to subclass for actual transliteration. Upon
  522. // return, start will be updated to point after the
  523. // transliterated text, and limit and contextLimit will be
  524. // adjusted for length changes.
  525. handleTransliterate(text, index, true);
  526. delta = index.limit - passLimit; // change in length
  527. // We failed to completely transliterate this pass.
  528. // Roll back the text. Indices remain unchanged; reset
  529. // them where necessary.
  530. if (index.start != index.limit) {
  531. // Find the rollbackStart, adjusted for length changes
  532. // and the deletion of partially transliterated text.
  533. int32_t rs = rollbackStart + delta - (index.limit - passStart);
  534. // Delete the partially transliterated text
  535. text.handleReplaceBetween(passStart, index.limit, UnicodeString());
  536. // Copy the rollback text back
  537. text.copy(rs, rs + uncommittedLength, passStart);
  538. // Restore indices to their original values
  539. index.start = passStart;
  540. index.limit = passLimit;
  541. index.contextLimit -= delta;
  542. }
  543. // We did completely transliterate this pass. Update the
  544. // commit indices to record how far we got. Adjust indices
  545. // for length change.
  546. else {
  547. // Move the pass indices past the committed text.
  548. passStart = passLimit = index.start;
  549. // Adjust the rollbackStart for length changes and move
  550. // it past the committed text. All characters we've
  551. // processed to this point are committed now, so zero
  552. // out the uncommittedLength.
  553. rollbackStart += delta + uncommittedLength;
  554. uncommittedLength = 0;
  555. // Adjust indices for length changes.
  556. runLimit += delta;
  557. totalDelta += delta;
  558. }
  559. }
  560. // Adjust overall limit and rollbackOrigin for insertions and
  561. // deletions. Don't need to worry about contextLimit because
  562. // handleTransliterate() maintains that.
  563. rollbackOrigin += totalDelta;
  564. globalLimit += totalDelta;
  565. // Delete the rollback copy
  566. text.handleReplaceBetween(rollbackOrigin, rollbackOrigin + runLength, UnicodeString());
  567. // Move start past committed text
  568. index.start = passStart;
  569. }
  570. else {
  571. // Delegate to subclass for actual transliteration.
  572. int32_t limit = index.limit;
  573. handleTransliterate(text, index, isIncrementalRun);
  574. delta = index.limit - limit; // change in length
  575. // In a properly written transliterator, start == limit after
  576. // handleTransliterate() returns when incremental is false.
  577. // Catch cases where the subclass doesn't do this, and throw
  578. // an exception. (Just pinning start to limit is a bad idea,
  579. // because what's probably happening is that the subclass
  580. // isn't transliterating all the way to the end, and it should
  581. // in non-incremental mode.)
  582. if (!incremental && index.start != index.limit) {
  583. // We can't throw an exception, so just fudge things
  584. index.start = index.limit;
  585. }
  586. // Adjust overall limit for insertions/deletions. Don't need
  587. // to worry about contextLimit because handleTransliterate()
  588. // maintains that.
  589. globalLimit += delta;
  590. }
  // Without a filter the range is never narrowed, so there is exactly one
  // run. An incremental run is one whose limit was not below globalLimit,
  // so no text follows it. Either way there is nothing more to process.
  591. if (filter == nullptr || isIncrementalRun) {
  592. break;
  593. }
  594. // If we did completely transliterate this
  595. // run, then repeat with the next unfiltered run.
  596. }
  597. // Start is valid where it is. Limit needs to be put back where
  598. // it was, modulo adjustments for deletions/insertions.
  599. index.limit = globalLimit;
  600. }
  601. void Transliterator::filteredTransliterate(Replaceable& text,
  602. UTransPosition& index,
  603. UBool incremental) const {
  604. filteredTransliterate(text, index, incremental, false);
  605. }
  606. /**
  607. * Method for subclasses to use to set the maximum context length.
  608. * @see #getMaximumContextLength
  609. */
  610. void Transliterator::setMaximumContextLength(int32_t maxContextLength) {
  611. maximumContextLength = maxContextLength;
  612. }
  613. /**
  614. * Returns a programmatic identifier for this transliterator.
  615. * If this identifier is passed to <code>getInstance()</code>, it
  616. * will return this object, if it has been registered.
  617. * @see #registerInstance
  618. * @see #getAvailableIDs
  619. */
  620. const UnicodeString& Transliterator::getID() const {
  621. return ID;
  622. }
  623. /**
  624. * Returns a name for this transliterator that is appropriate for
  625. * display to the user in the default locale. See {@link
  626. * #getDisplayName(Locale)} for details.
  627. */
  628. UnicodeString& U_EXPORT2 Transliterator::getDisplayName(const UnicodeString& ID,
  629. UnicodeString& result) {
  630. return getDisplayName(ID, Locale::getDefault(), result);
  631. }
  632. /**
  633. * Returns a name for this transliterator that is appropriate for
  634. * display to the user in the given locale. This name is taken
  635. * from the locale resource data in the standard manner of the
  636. * <code>java.text</code> package.
  637. *
  638. * <p>If no localized names exist in the system resource bundles,
  639. * a name is synthesized using a localized
  640. * <code>MessageFormat</code> pattern from the resource data. The
  641. * arguments to this pattern are an integer followed by one or two
  642. * strings. The integer is the number of strings, either 1 or 2.
  643. * The strings are formed by splitting the ID for this
  644. * transliterator at the first TARGET_SEP. If there is no TARGET_SEP, then the
  645. * entire ID forms the only string.
  646. * @param inLocale the Locale in which the display name should be
  647. * localized.
  648. * @see java.text.MessageFormat
  649. */
  650. UnicodeString& U_EXPORT2 Transliterator::getDisplayName(const UnicodeString& id,
  651. const Locale& inLocale,
  652. UnicodeString& result) {
  653. UErrorCode status = U_ZERO_ERROR;
  654. ResourceBundle bundle(U_ICUDATA_TRANSLIT, inLocale, status);
  655. // Suspend checking status until later...
  656. result.truncate(0);
  657. // Normalize the ID
  658. UnicodeString source, target, variant;
  659. UBool sawSource;
  660. TransliteratorIDParser::IDtoSTV(id, source, target, variant, sawSource);
  661. if (target.length() < 1) {
  662. // No target; malformed id
  663. return result;
  664. }
  665. if (variant.length() > 0) { // Change "Foo" to "/Foo"
  666. variant.insert(0, VARIANT_SEP);
  667. }
  668. UnicodeString ID(source);
  669. ID.append(TARGET_SEP).append(target).append(variant);
  670. // build the char* key
  671. if (uprv_isInvariantUString(ID.getBuffer(), ID.length())) {
  672. char key[200];
  673. uprv_strcpy(key, RB_DISPLAY_NAME_PREFIX);
  674. int32_t length=(int32_t)uprv_strlen(RB_DISPLAY_NAME_PREFIX);
  675. ID.extract(0, (int32_t)(sizeof(key)-length), key+length, (int32_t)(sizeof(key)-length), US_INV);
  676. // Try to retrieve a UnicodeString from the bundle.
  677. UnicodeString resString = bundle.getStringEx(key, status);
  678. if (U_SUCCESS(status) && resString.length() != 0) {
  679. return result = resString; // [sic] assign & return
  680. }
  681. #if !UCONFIG_NO_FORMATTING
  682. // We have failed to get a name from the locale data. This is
  683. // typical, since most transliterators will not have localized
  684. // name data. The next step is to retrieve the MessageFormat
  685. // pattern from the locale data and to use it to synthesize the
  686. // name from the ID.
  687. status = U_ZERO_ERROR;
  688. resString = bundle.getStringEx(RB_DISPLAY_NAME_PATTERN, status);
  689. if (U_SUCCESS(status) && resString.length() != 0) {
  690. MessageFormat msg(resString, inLocale, status);
  691. // Suspend checking status until later...
  692. // We pass either 2 or 3 Formattable objects to msg.
  693. Formattable args[3];
  694. int32_t nargs;
  695. args[0].setLong(2); // # of args to follow
  696. args[1].setString(source);
  697. args[2].setString(target);
  698. nargs = 3;
  699. // Use display names for the scripts, if they exist
  700. UnicodeString s;
  701. length=(int32_t)uprv_strlen(RB_SCRIPT_DISPLAY_NAME_PREFIX);
  702. for (int j=1; j<=2; ++j) {
  703. status = U_ZERO_ERROR;
  704. uprv_strcpy(key, RB_SCRIPT_DISPLAY_NAME_PREFIX);
  705. args[j].getString(s);
  706. if (uprv_isInvariantUString(s.getBuffer(), s.length())) {
  707. s.extract(0, sizeof(key)-length-1, key+length, (int32_t)sizeof(key)-length-1, US_INV);
  708. resString = bundle.getStringEx(key, status);
  709. if (U_SUCCESS(status)) {
  710. args[j] = resString;
  711. }
  712. }
  713. }
  714. status = U_ZERO_ERROR;
  715. FieldPosition pos; // ignored by msg
  716. msg.format(args, nargs, result, pos, status);
  717. if (U_SUCCESS(status)) {
  718. result.append(variant);
  719. return result;
  720. }
  721. }
  722. #endif
  723. }
  724. // We should not reach this point unless there is something
  725. // wrong with the build or the RB_DISPLAY_NAME_PATTERN has
  726. // been deleted from the root RB_LOCALE_ELEMENTS resource.
  727. result = ID;
  728. return result;
  729. }
  730. /**
  731. * Returns the filter used by this transliterator, or <tt>null</tt>
  732. * if this transliterator uses no filter. Caller musn't delete
  733. * the result!
  734. */
  735. const UnicodeFilter* Transliterator::getFilter() const {
  736. return filter;
  737. }
  738. /**
  739. * Returns the filter used by this transliterator, or
  740. * <tt>nullptr</tt> if this transliterator uses no filter. The
  741. * caller must eventually delete the result. After this call,
  742. * this transliterator's filter is set to <tt>nullptr</tt>.
  743. */
  744. UnicodeFilter* Transliterator::orphanFilter() {
  745. UnicodeFilter *result = filter;
  746. filter = nullptr;
  747. return result;
  748. }
  749. /**
  750. * Changes the filter used by this transliterator. If the filter
  751. * is set to <tt>null</tt> then no filtering will occur.
  752. *
  753. * <p>Callers must take care if a transliterator is in use by
  754. * multiple threads. The filter should not be changed by one
  755. * thread while another thread may be transliterating.
  756. */
  757. void Transliterator::adoptFilter(UnicodeFilter* filterToAdopt) {
  758. delete filter;
  759. filter = filterToAdopt;
  760. }
  761. /**
  762. * Returns this transliterator's inverse. See the class
  763. * documentation for details. This implementation simply inverts
  764. * the two entities in the ID and attempts to retrieve the
  765. * resulting transliterator. That is, if <code>getID()</code>
  766. * returns "A-B", then this method will return the result of
  767. * <code>getInstance("B-A")</code>, or <code>null</code> if that
  768. * call fails.
  769. *
  770. * <p>This method does not take filtering into account. The
  771. * returned transliterator will have no filter.
  772. *
  773. * <p>Subclasses with knowledge of their inverse may wish to
  774. * override this method.
  775. *
  776. * @return a transliterator that is an inverse, not necessarily
  777. * exact, of this transliterator, or <code>null</code> if no such
  778. * transliterator is registered.
  779. * @see #registerInstance
  780. */
  781. Transliterator* Transliterator::createInverse(UErrorCode& status) const {
  782. UParseError parseError;
  783. return Transliterator::createInstance(ID, UTRANS_REVERSE,parseError,status);
  784. }
  785. Transliterator* U_EXPORT2
  786. Transliterator::createInstance(const UnicodeString& ID,
  787. UTransDirection dir,
  788. UErrorCode& status)
  789. {
  790. UParseError parseError;
  791. return createInstance(ID, dir, parseError, status);
  792. }
  793. /**
  794. * Returns a <code>Transliterator</code> object given its ID.
  795. * The ID must be either a system transliterator ID or a ID registered
  796. * using <code>registerInstance()</code>.
  797. *
  798. * @param ID a valid ID, as enumerated by <code>getAvailableIDs()</code>
  799. * @return A <code>Transliterator</code> object with the given ID
  800. * @see #registerInstance
  801. * @see #getAvailableIDs
  802. * @see #getID
  803. */
  804. Transliterator* U_EXPORT2
  805. Transliterator::createInstance(const UnicodeString& ID,
  806. UTransDirection dir,
  807. UParseError& parseError,
  808. UErrorCode& status)
  809. {
  810. if (U_FAILURE(status)) {
  811. return 0;
  812. }
  813. UnicodeString canonID;
  814. UVector list(status);
  815. if (U_FAILURE(status)) {
  816. return nullptr;
  817. }
  818. UnicodeSet* globalFilter = nullptr;
  819. // TODO add code for parseError...currently unused, but
  820. // later may be used by parsing code...
  821. if (!TransliteratorIDParser::parseCompoundID(ID, dir, canonID, list, globalFilter)) {
  822. status = U_INVALID_ID;
  823. delete globalFilter;
  824. return nullptr;
  825. }
  826. LocalPointer<UnicodeSet> lpGlobalFilter(globalFilter);
  827. TransliteratorIDParser::instantiateList(list, status);
  828. if (U_FAILURE(status)) {
  829. return nullptr;
  830. }
  831. U_ASSERT(list.size() > 0);
  832. Transliterator* t = nullptr;
  833. if (list.size() > 1 || canonID.indexOf(ID_DELIM) >= 0) {
  834. // [NOTE: If it's a compoundID, we instantiate a CompoundTransliterator even if it only
  835. // has one child transliterator. This is so that toRules() will return the right thing
  836. // (without any inactive ID), but our main ID still comes out correct. That is, if we
  837. // instantiate "(Lower);Latin-Greek;", we want the rules to come out as "::Latin-Greek;"
  838. // even though the ID is "(Lower);Latin-Greek;".
  839. t = new CompoundTransliterator(list, parseError, status);
  840. }
  841. else {
  842. t = (Transliterator*)list.elementAt(0);
  843. }
  844. // Check null pointer
  845. if (t != nullptr) {
  846. t->setID(canonID);
  847. if (lpGlobalFilter.isValid()) {
  848. t->adoptFilter(lpGlobalFilter.orphan());
  849. }
  850. }
  851. else if (U_SUCCESS(status)) {
  852. status = U_MEMORY_ALLOCATION_ERROR;
  853. }
  854. return t;
  855. }
  856. /**
  857. * Create a transliterator from a basic ID. This is an ID
  858. * containing only the forward direction source, target, and
  859. * variant.
  860. * @param id a basic ID of the form S-T or S-T/V.
  861. * @return a newly created Transliterator or null if the ID is
  862. * invalid.
  863. */
  864. Transliterator* Transliterator::createBasicInstance(const UnicodeString& id,
  865. const UnicodeString* canon) {
  866. UParseError pe;
  867. UErrorCode ec = U_ZERO_ERROR;
  868. TransliteratorAlias* alias = 0;
  869. Transliterator* t = 0;
  870. umtx_lock(&registryMutex);
  871. if (HAVE_REGISTRY(ec)) {
  872. t = registry->get(id, alias, ec);
  873. }
  874. umtx_unlock(&registryMutex);
  875. if (U_FAILURE(ec)) {
  876. delete t;
  877. delete alias;
  878. return 0;
  879. }
  880. // We may have not gotten a transliterator: Because we can't
  881. // instantiate a transliterator from inside TransliteratorRegistry::
  882. // get() (that would deadlock), we sometimes pass back an alias. This
  883. // contains the data we need to finish the instantiation outside the
  884. // registry mutex. The alias may, in turn, generate another alias, so
  885. // we handle aliases in a loop. The max times through the loop is two.
  886. // [alan]
  887. while (alias != 0) {
  888. U_ASSERT(t==0);
  889. // Rule-based aliases are handled with TransliteratorAlias::
  890. // parse(), followed by TransliteratorRegistry::reget().
  891. // Other aliases are handled with TransliteratorAlias::create().
  892. if (alias->isRuleBased()) {
  893. // Step 1. parse
  894. TransliteratorParser parser(ec);
  895. alias->parse(parser, pe, ec);
  896. delete alias;
  897. alias = 0;
  898. // Step 2. reget
  899. umtx_lock(&registryMutex);
  900. if (HAVE_REGISTRY(ec)) {
  901. t = registry->reget(id, parser, alias, ec);
  902. }
  903. umtx_unlock(&registryMutex);
  904. // Step 3. Loop back around!
  905. } else {
  906. t = alias->create(pe, ec);
  907. delete alias;
  908. alias = 0;
  909. break;
  910. }
  911. if (U_FAILURE(ec)) {
  912. delete t;
  913. delete alias;
  914. t = nullptr;
  915. break;
  916. }
  917. }
  918. if (t != nullptr && canon != nullptr) {
  919. t->setID(*canon);
  920. }
  921. return t;
  922. }
  923. /**
  924. * Returns a <code>Transliterator</code> object constructed from
  925. * the given rule string. This will be a RuleBasedTransliterator,
  926. * if the rule string contains only rules, or a
  927. * CompoundTransliterator, if it contains ID blocks, or a
  928. * NullTransliterator, if it contains ID blocks which parse as
  929. * empty for the given direction.
  930. */
  931. Transliterator* U_EXPORT2
  932. Transliterator::createFromRules(const UnicodeString& ID,
  933. const UnicodeString& rules,
  934. UTransDirection dir,
  935. UParseError& parseError,
  936. UErrorCode& status)
  937. {
  938. Transliterator* t = nullptr;
  939. TransliteratorParser parser(status);
  940. parser.parse(rules, dir, parseError, status);
  941. if (U_FAILURE(status)) {
  942. return 0;
  943. }
  944. // NOTE: The logic here matches that in TransliteratorRegistry.
  945. if (parser.idBlockVector.size() == 0 && parser.dataVector.size() == 0) {
  946. t = new NullTransliterator();
  947. }
  948. else if (parser.idBlockVector.size() == 0 && parser.dataVector.size() == 1) {
  949. t = new RuleBasedTransliterator(ID, (TransliterationRuleData*)parser.dataVector.orphanElementAt(0), true);
  950. }
  951. else if (parser.idBlockVector.size() == 1 && parser.dataVector.size() == 0) {
  952. // idBlock, no data -- this is an alias. The ID has
  953. // been munged from reverse into forward mode, if
  954. // necessary, so instantiate the ID in the forward
  955. // direction.
  956. if (parser.compoundFilter != nullptr) {
  957. UnicodeString filterPattern;
  958. parser.compoundFilter->toPattern(filterPattern, false);
  959. t = createInstance(filterPattern + UnicodeString(ID_DELIM)
  960. + *((UnicodeString*)parser.idBlockVector.elementAt(0)), UTRANS_FORWARD, parseError, status);
  961. }
  962. else
  963. t = createInstance(*((UnicodeString*)parser.idBlockVector.elementAt(0)), UTRANS_FORWARD, parseError, status);
  964. if (t != nullptr) {
  965. t->setID(ID);
  966. }
  967. }
  968. else {
  969. UVector transliterators(status);
  970. // TODO ICU-21701 missing U_FAILURE check here.
  971. // Error and nullptr checking through this whole block looks suspect.
  972. int32_t passNumber = 1;
  973. int32_t limit = parser.idBlockVector.size();
  974. if (parser.dataVector.size() > limit)
  975. limit = parser.dataVector.size();
  976. for (int32_t i = 0; i < limit; i++) {
  977. if (i < parser.idBlockVector.size()) {
  978. UnicodeString* idBlock = (UnicodeString*)parser.idBlockVector.elementAt(i);
  979. if (!idBlock->isEmpty()) {
  980. Transliterator* temp = createInstance(*idBlock, UTRANS_FORWARD, parseError, status);
  981. if (U_FAILURE(status)) {
  982. delete temp;
  983. return nullptr;
  984. }
  985. if (temp != nullptr && typeid(*temp) != typeid(NullTransliterator)) {
  986. transliterators.addElement(temp, status);
  987. if (U_FAILURE(status)) {
  988. delete temp;
  989. return nullptr;
  990. }
  991. } else {
  992. delete temp;
  993. }
  994. }
  995. }
  996. if (!parser.dataVector.isEmpty()) {
  997. TransliterationRuleData* data = (TransliterationRuleData*)parser.dataVector.orphanElementAt(0);
  998. // TODO: Should passNumber be turned into a decimal-string representation (1 -> "1")?
  999. RuleBasedTransliterator* temprbt = new RuleBasedTransliterator(UnicodeString(CompoundTransliterator::PASS_STRING) + UnicodeString(passNumber++),
  1000. data, true);
  1001. // Check if nullptr before adding it to transliterators to avoid future usage of nullptr pointer.
  1002. if (temprbt == nullptr) {
  1003. if (U_SUCCESS(status)) {
  1004. status = U_MEMORY_ALLOCATION_ERROR;
  1005. }
  1006. return t;
  1007. }
  1008. transliterators.addElement(temprbt, status);
  1009. if (U_FAILURE(status)) {
  1010. delete temprbt;
  1011. return t;
  1012. }
  1013. // TODO: ICU-21701 the transliterators vector will leak its contents if anything goes wrong.
  1014. // Under normal operation, the CompoundTransliterator constructor adopts the
  1015. // the contents of the vector.
  1016. }
  1017. }
  1018. t = new CompoundTransliterator(transliterators, passNumber - 1, parseError, status);
  1019. // Null pointer check
  1020. if (t != nullptr) {
  1021. t->setID(ID);
  1022. t->adoptFilter(parser.orphanCompoundFilter());
  1023. }
  1024. }
  1025. if (U_SUCCESS(status) && t == nullptr) {
  1026. status = U_MEMORY_ALLOCATION_ERROR;
  1027. }
  1028. return t;
  1029. }
  1030. UnicodeString& Transliterator::toRules(UnicodeString& rulesSource,
  1031. UBool escapeUnprintable) const {
  1032. // The base class implementation of toRules munges the ID into
  1033. // the correct format. That is: foo => ::foo
  1034. if (escapeUnprintable) {
  1035. rulesSource.truncate(0);
  1036. UnicodeString id = getID();
  1037. for (int32_t i=0; i<id.length();) {
  1038. UChar32 c = id.char32At(i);
  1039. if (!ICU_Utility::escapeUnprintable(rulesSource, c)) {
  1040. rulesSource.append(c);
  1041. }
  1042. i += U16_LENGTH(c);
  1043. }
  1044. } else {
  1045. rulesSource = getID();
  1046. }
  1047. // KEEP in sync with rbt_pars
  1048. rulesSource.insert(0, UNICODE_STRING_SIMPLE("::"));
  1049. rulesSource.append(ID_DELIM);
  1050. return rulesSource;
  1051. }
  1052. int32_t Transliterator::countElements() const {
  1053. const CompoundTransliterator* ct = dynamic_cast<const CompoundTransliterator*>(this);
  1054. return ct != nullptr ? ct->getCount() : 0;
  1055. }
  1056. const Transliterator& Transliterator::getElement(int32_t index, UErrorCode& ec) const {
  1057. if (U_FAILURE(ec)) {
  1058. return *this;
  1059. }
  1060. const CompoundTransliterator* cpd = dynamic_cast<const CompoundTransliterator*>(this);
  1061. int32_t n = (cpd == nullptr) ? 1 : cpd->getCount();
  1062. if (index < 0 || index >= n) {
  1063. ec = U_INDEX_OUTOFBOUNDS_ERROR;
  1064. return *this;
  1065. } else {
  1066. return (n == 1) ? *this : cpd->getTransliterator(index);
  1067. }
  1068. }
  1069. UnicodeSet& Transliterator::getSourceSet(UnicodeSet& result) const {
  1070. handleGetSourceSet(result);
  1071. if (filter != nullptr) {
  1072. UnicodeSet* filterSet = dynamic_cast<UnicodeSet*>(filter);
  1073. UBool deleteFilterSet = false;
  1074. // Most, but not all filters will be UnicodeSets. Optimize for
  1075. // the high-runner case.
  1076. if (filterSet == nullptr) {
  1077. filterSet = new UnicodeSet();
  1078. // Check null pointer
  1079. if (filterSet == nullptr) {
  1080. return result;
  1081. }
  1082. deleteFilterSet = true;
  1083. filter->addMatchSetTo(*filterSet);
  1084. }
  1085. result.retainAll(*filterSet);
  1086. if (deleteFilterSet) {
  1087. delete filterSet;
  1088. }
  1089. }
  1090. return result;
  1091. }
  1092. void Transliterator::handleGetSourceSet(UnicodeSet& result) const {
  1093. result.clear();
  1094. }
  1095. UnicodeSet& Transliterator::getTargetSet(UnicodeSet& result) const {
  1096. return result.clear();
  1097. }
  1098. // For public consumption
  1099. void U_EXPORT2 Transliterator::registerFactory(const UnicodeString& id,
  1100. Transliterator::Factory factory,
  1101. Transliterator::Token context) {
  1102. Mutex lock(&registryMutex);
  1103. UErrorCode ec = U_ZERO_ERROR;
  1104. if (HAVE_REGISTRY(ec)) {
  1105. _registerFactory(id, factory, context);
  1106. }
  1107. }
  1108. // To be called only by Transliterator subclasses that are called
  1109. // to register themselves by initializeRegistry().
  1110. void Transliterator::_registerFactory(const UnicodeString& id,
  1111. Transliterator::Factory factory,
  1112. Transliterator::Token context) {
  1113. UErrorCode ec = U_ZERO_ERROR;
  1114. registry->put(id, factory, context, true, ec);
  1115. }
  1116. // To be called only by Transliterator subclasses that are called
  1117. // to register themselves by initializeRegistry().
  1118. void Transliterator::_registerSpecialInverse(const UnicodeString& target,
  1119. const UnicodeString& inverseTarget,
  1120. UBool bidirectional) {
  1121. UErrorCode status = U_ZERO_ERROR;
  1122. TransliteratorIDParser::registerSpecialInverse(target, inverseTarget, bidirectional, status);
  1123. }
  1124. /**
  1125. * Registers a instance <tt>obj</tt> of a subclass of
  1126. * <code>Transliterator</code> with the system. This object must
  1127. * implement the <tt>clone()</tt> method. When
  1128. * <tt>getInstance()</tt> is called with an ID string that is
  1129. * equal to <tt>obj.getID()</tt>, then <tt>obj.clone()</tt> is
  1130. * returned.
  1131. *
  1132. * @param obj an instance of subclass of
  1133. * <code>Transliterator</code> that defines <tt>clone()</tt>
  1134. * @see #getInstance
  1135. * @see #unregister
  1136. */
  1137. void U_EXPORT2 Transliterator::registerInstance(Transliterator* adoptedPrototype) {
  1138. Mutex lock(&registryMutex);
  1139. UErrorCode ec = U_ZERO_ERROR;
  1140. if (HAVE_REGISTRY(ec)) {
  1141. _registerInstance(adoptedPrototype);
  1142. }
  1143. }
  1144. void Transliterator::_registerInstance(Transliterator* adoptedPrototype) {
  1145. UErrorCode ec = U_ZERO_ERROR;
  1146. registry->put(adoptedPrototype, true, ec);
  1147. }
  1148. void U_EXPORT2 Transliterator::registerAlias(const UnicodeString& aliasID,
  1149. const UnicodeString& realID) {
  1150. Mutex lock(&registryMutex);
  1151. UErrorCode ec = U_ZERO_ERROR;
  1152. if (HAVE_REGISTRY(ec)) {
  1153. _registerAlias(aliasID, realID);
  1154. }
  1155. }
  1156. void Transliterator::_registerAlias(const UnicodeString& aliasID,
  1157. const UnicodeString& realID) {
  1158. UErrorCode ec = U_ZERO_ERROR;
  1159. registry->put(aliasID, realID, false, true, ec);
  1160. }
  1161. /**
  1162. * Unregisters a transliterator or class. This may be either
  1163. * a system transliterator or a user transliterator or class.
  1164. *
  1165. * @param ID the ID of the transliterator or class
  1166. * @see #registerInstance
  1167. */
  1168. void U_EXPORT2 Transliterator::unregister(const UnicodeString& ID) {
  1169. Mutex lock(&registryMutex);
  1170. UErrorCode ec = U_ZERO_ERROR;
  1171. if (HAVE_REGISTRY(ec)) {
  1172. registry->remove(ID);
  1173. }
  1174. }
  1175. /**
  1176. * == OBSOLETE - remove in ICU 3.4 ==
  1177. * Return the number of IDs currently registered with the system.
  1178. * To retrieve the actual IDs, call getAvailableID(i) with
  1179. * i from 0 to countAvailableIDs() - 1.
  1180. */
  1181. int32_t U_EXPORT2 Transliterator::countAvailableIDs() {
  1182. int32_t retVal = 0;
  1183. Mutex lock(&registryMutex);
  1184. UErrorCode ec = U_ZERO_ERROR;
  1185. if (HAVE_REGISTRY(ec)) {
  1186. retVal = registry->countAvailableIDs();
  1187. }
  1188. return retVal;
  1189. }
  1190. /**
  1191. * == OBSOLETE - remove in ICU 3.4 ==
  1192. * Return the index-th available ID. index must be between 0
  1193. * and countAvailableIDs() - 1, inclusive. If index is out of
  1194. * range, the result of getAvailableID(0) is returned.
  1195. */
  1196. const UnicodeString& U_EXPORT2 Transliterator::getAvailableID(int32_t index) {
  1197. const UnicodeString* result = nullptr;
  1198. umtx_lock(&registryMutex);
  1199. UErrorCode ec = U_ZERO_ERROR;
  1200. if (HAVE_REGISTRY(ec)) {
  1201. result = &registry->getAvailableID(index);
  1202. }
  1203. umtx_unlock(&registryMutex);
  1204. U_ASSERT(result != nullptr); // fail if no registry
  1205. return *result;
  1206. }
  1207. StringEnumeration* U_EXPORT2 Transliterator::getAvailableIDs(UErrorCode& ec) {
  1208. if (U_FAILURE(ec)) return nullptr;
  1209. StringEnumeration* result = nullptr;
  1210. umtx_lock(&registryMutex);
  1211. if (HAVE_REGISTRY(ec)) {
  1212. result = registry->getAvailableIDs();
  1213. }
  1214. umtx_unlock(&registryMutex);
  1215. if (result == nullptr) {
  1216. ec = U_INTERNAL_TRANSLITERATOR_ERROR;
  1217. }
  1218. return result;
  1219. }
  1220. int32_t U_EXPORT2 Transliterator::countAvailableSources() {
  1221. Mutex lock(&registryMutex);
  1222. UErrorCode ec = U_ZERO_ERROR;
  1223. return HAVE_REGISTRY(ec) ? _countAvailableSources() : 0;
  1224. }
  1225. UnicodeString& U_EXPORT2 Transliterator::getAvailableSource(int32_t index,
  1226. UnicodeString& result) {
  1227. Mutex lock(&registryMutex);
  1228. UErrorCode ec = U_ZERO_ERROR;
  1229. if (HAVE_REGISTRY(ec)) {
  1230. _getAvailableSource(index, result);
  1231. }
  1232. return result;
  1233. }
  1234. int32_t U_EXPORT2 Transliterator::countAvailableTargets(const UnicodeString& source) {
  1235. Mutex lock(&registryMutex);
  1236. UErrorCode ec = U_ZERO_ERROR;
  1237. return HAVE_REGISTRY(ec) ? _countAvailableTargets(source) : 0;
  1238. }
  1239. UnicodeString& U_EXPORT2 Transliterator::getAvailableTarget(int32_t index,
  1240. const UnicodeString& source,
  1241. UnicodeString& result) {
  1242. Mutex lock(&registryMutex);
  1243. UErrorCode ec = U_ZERO_ERROR;
  1244. if (HAVE_REGISTRY(ec)) {
  1245. _getAvailableTarget(index, source, result);
  1246. }
  1247. return result;
  1248. }
  1249. int32_t U_EXPORT2 Transliterator::countAvailableVariants(const UnicodeString& source,
  1250. const UnicodeString& target) {
  1251. Mutex lock(&registryMutex);
  1252. UErrorCode ec = U_ZERO_ERROR;
  1253. return HAVE_REGISTRY(ec) ? _countAvailableVariants(source, target) : 0;
  1254. }
  1255. UnicodeString& U_EXPORT2 Transliterator::getAvailableVariant(int32_t index,
  1256. const UnicodeString& source,
  1257. const UnicodeString& target,
  1258. UnicodeString& result) {
  1259. Mutex lock(&registryMutex);
  1260. UErrorCode ec = U_ZERO_ERROR;
  1261. if (HAVE_REGISTRY(ec)) {
  1262. _getAvailableVariant(index, source, target, result);
  1263. }
  1264. return result;
  1265. }
  1266. int32_t Transliterator::_countAvailableSources() {
  1267. return registry->countAvailableSources();
  1268. }
  1269. UnicodeString& Transliterator::_getAvailableSource(int32_t index,
  1270. UnicodeString& result) {
  1271. return registry->getAvailableSource(index, result);
  1272. }
  1273. int32_t Transliterator::_countAvailableTargets(const UnicodeString& source) {
  1274. return registry->countAvailableTargets(source);
  1275. }
  1276. UnicodeString& Transliterator::_getAvailableTarget(int32_t index,
  1277. const UnicodeString& source,
  1278. UnicodeString& result) {
  1279. return registry->getAvailableTarget(index, source, result);
  1280. }
  1281. int32_t Transliterator::_countAvailableVariants(const UnicodeString& source,
  1282. const UnicodeString& target) {
  1283. return registry->countAvailableVariants(source, target);
  1284. }
  1285. UnicodeString& Transliterator::_getAvailableVariant(int32_t index,
  1286. const UnicodeString& source,
  1287. const UnicodeString& target,
  1288. UnicodeString& result) {
  1289. return registry->getAvailableVariant(index, source, target, result);
  1290. }
  1291. #ifdef U_USE_DEPRECATED_TRANSLITERATOR_API
  1292. /**
  1293. * Method for subclasses to use to obtain a character in the given
  1294. * string, with filtering.
  1295. * @deprecated the new architecture provides filtering at the top
  1296. * level. This method will be removed Dec 31 2001.
  1297. */
  1298. char16_t Transliterator::filteredCharAt(const Replaceable& text, int32_t i) const {
  1299. char16_t c;
  1300. const UnicodeFilter* localFilter = getFilter();
  1301. return (localFilter == 0) ? text.charAt(i) :
  1302. (localFilter->contains(c = text.charAt(i)) ? c : (char16_t)0xFFFE);
  1303. }
  1304. #endif
  1305. /**
  1306. * If the registry is initialized, return true. If not, initialize it
  1307. * and return true. If the registry cannot be initialized, return
  1308. * false (rare).
  1309. *
  1310. * IMPORTANT: Upon entry, registryMutex must be LOCKED. The entire
  1311. * initialization is done with the lock held. There is NO REASON to
  1312. * unlock, since any other thread that is waiting on the registryMutex
  1313. * cannot itself proceed until the registry is initialized.
  1314. */
  1315. UBool Transliterator::initializeRegistry(UErrorCode &status) {
  1316. if (registry != 0) {
  1317. return true;
  1318. }
  1319. registry = new TransliteratorRegistry(status);
  1320. if (registry == 0 || U_FAILURE(status)) {
  1321. delete registry;
  1322. registry = 0;
  1323. return false; // can't create registry, no recovery
  1324. }
  1325. /* The following code parses the index table located in
  1326. * icu/data/translit/root.txt. The index is an n x 4 table
  1327. * that follows this format:
  1328. * <id>{
  1329. * file{
  1330. * resource{"<resource>"}
  1331. * direction{"<direction>"}
  1332. * }
  1333. * }
  1334. * <id>{
  1335. * internal{
  1336. * resource{"<resource>"}
  1337. * direction{"<direction"}
  1338. * }
  1339. * }
  1340. * <id>{
  1341. * alias{"<getInstanceArg"}
  1342. * }
  1343. * <id> is the ID of the system transliterator being defined. These
  1344. * are public IDs enumerated by Transliterator.getAvailableIDs(),
  1345. * unless the second field is "internal".
  1346. *
  1347. * <resource> is a ResourceReader resource name. Currently these refer
  1348. * to file names under com/ibm/text/resources. This string is passed
  1349. * directly to ResourceReader, together with <encoding>.
  1350. *
  1351. * <direction> is either "FORWARD" or "REVERSE".
  1352. *
  1353. * <getInstanceArg> is a string to be passed directly to
  1354. * Transliterator.getInstance(). The returned Transliterator object
  1355. * then has its ID changed to <id> and is returned.
  1356. *
  1357. * The extra blank field on "alias" lines is to make the array square.
  1358. */
  1359. //static const char translit_index[] = "translit_index";
  1360. UErrorCode lstatus = U_ZERO_ERROR;
  1361. UResourceBundle *bundle, *transIDs, *colBund;
  1362. bundle = ures_open(U_ICUDATA_TRANSLIT, nullptr/*open default locale*/, &lstatus);
  1363. transIDs = ures_getByKey(bundle, RB_RULE_BASED_IDS, 0, &lstatus);
  1364. const UnicodeString T_PART = UNICODE_STRING_SIMPLE("-t-");
  1365. int32_t row, maxRows;
  1366. if (lstatus == U_MEMORY_ALLOCATION_ERROR) {
  1367. delete registry;
  1368. registry = nullptr;
  1369. status = U_MEMORY_ALLOCATION_ERROR;
  1370. return false;
  1371. }
  1372. if (U_SUCCESS(lstatus)) {
  1373. maxRows = ures_getSize(transIDs);
  1374. for (row = 0; row < maxRows; row++) {
  1375. colBund = ures_getByIndex(transIDs, row, 0, &lstatus);
  1376. if (U_SUCCESS(lstatus)) {
  1377. UnicodeString id(ures_getKey(colBund), -1, US_INV);
  1378. if(id.indexOf(T_PART) != -1) {
  1379. ures_close(colBund);
  1380. continue;
  1381. }
  1382. UResourceBundle* res = ures_getNextResource(colBund, nullptr, &lstatus);
  1383. const char* typeStr = ures_getKey(res);
  1384. char16_t type;
  1385. u_charsToUChars(typeStr, &type, 1);
  1386. if (U_SUCCESS(lstatus)) {
  1387. int32_t len = 0;
  1388. const char16_t *resString;
  1389. switch (type) {
  1390. case 0x66: // 'f'
  1391. case 0x69: // 'i'
  1392. // 'file' or 'internal';
  1393. // row[2]=resource, row[3]=direction
  1394. {
  1395. resString = ures_getStringByKey(res, "resource", &len, &lstatus);
  1396. UBool visible = (type == 0x0066 /*f*/);
  1397. UTransDirection dir =
  1398. (ures_getUnicodeStringByKey(res, "direction", &lstatus).charAt(0) ==
  1399. 0x0046 /*F*/) ?
  1400. UTRANS_FORWARD : UTRANS_REVERSE;
  1401. registry->put(id, UnicodeString(true, resString, len), dir, true, visible, lstatus);
  1402. }
  1403. break;
  1404. case 0x61: // 'a'
  1405. // 'alias'; row[2]=createInstance argument
  1406. resString = ures_getString(res, &len, &lstatus);
  1407. registry->put(id, UnicodeString(true, resString, len), true, true, lstatus);
  1408. break;
  1409. }
  1410. }
  1411. ures_close(res);
  1412. }
  1413. ures_close(colBund);
  1414. }
  1415. }
  1416. ures_close(transIDs);
  1417. ures_close(bundle);
  1418. // Manually add prototypes that the system knows about to the
  1419. // cache. This is how new non-rule-based transliterators are
  1420. // added to the system.
  1421. // This is to allow for null pointer check
  1422. NullTransliterator* tempNullTranslit = new NullTransliterator();
  1423. LowercaseTransliterator* tempLowercaseTranslit = new LowercaseTransliterator();
  1424. UppercaseTransliterator* tempUppercaseTranslit = new UppercaseTransliterator();
  1425. TitlecaseTransliterator* tempTitlecaseTranslit = new TitlecaseTransliterator();
  1426. UnicodeNameTransliterator* tempUnicodeTranslit = new UnicodeNameTransliterator();
  1427. NameUnicodeTransliterator* tempNameUnicodeTranslit = new NameUnicodeTransliterator();
  1428. #if !UCONFIG_NO_BREAK_ITERATION
  1429. // TODO: could or should these transliterators be referenced polymorphically once constructed?
  1430. BreakTransliterator* tempBreakTranslit = new BreakTransliterator();
  1431. #endif
  1432. // Check for null pointers
  1433. if (tempNullTranslit == nullptr || tempLowercaseTranslit == nullptr || tempUppercaseTranslit == nullptr ||
  1434. tempTitlecaseTranslit == nullptr || tempUnicodeTranslit == nullptr ||
  1435. #if !UCONFIG_NO_BREAK_ITERATION
  1436. tempBreakTranslit == nullptr ||
  1437. #endif
  1438. tempNameUnicodeTranslit == nullptr )
  1439. {
  1440. delete tempNullTranslit;
  1441. delete tempLowercaseTranslit;
  1442. delete tempUppercaseTranslit;
  1443. delete tempTitlecaseTranslit;
  1444. delete tempUnicodeTranslit;
  1445. delete tempNameUnicodeTranslit;
  1446. #if !UCONFIG_NO_BREAK_ITERATION
  1447. delete tempBreakTranslit;
  1448. #endif
  1449. // Since there was an error, remove registry
  1450. delete registry;
  1451. registry = nullptr;
  1452. status = U_MEMORY_ALLOCATION_ERROR;
  1453. return 0;
  1454. }
  1455. registry->put(tempNullTranslit, true, status);
  1456. registry->put(tempLowercaseTranslit, true, status);
  1457. registry->put(tempUppercaseTranslit, true, status);
  1458. registry->put(tempTitlecaseTranslit, true, status);
  1459. registry->put(tempUnicodeTranslit, true, status);
  1460. registry->put(tempNameUnicodeTranslit, true, status);
  1461. #if !UCONFIG_NO_BREAK_ITERATION
  1462. registry->put(tempBreakTranslit, false, status); // false means invisible.
  1463. #endif
  1464. RemoveTransliterator::registerIDs(); // Must be within mutex
  1465. EscapeTransliterator::registerIDs();
  1466. UnescapeTransliterator::registerIDs();
  1467. NormalizationTransliterator::registerIDs();
  1468. AnyTransliterator::registerIDs();
  1469. _registerSpecialInverse(UNICODE_STRING_SIMPLE("Null"),
  1470. UNICODE_STRING_SIMPLE("Null"), false);
  1471. _registerSpecialInverse(UNICODE_STRING_SIMPLE("Upper"),
  1472. UNICODE_STRING_SIMPLE("Lower"), true);
  1473. _registerSpecialInverse(UNICODE_STRING_SIMPLE("Title"),
  1474. UNICODE_STRING_SIMPLE("Lower"), false);
  1475. ucln_i18n_registerCleanup(UCLN_I18N_TRANSLITERATOR, utrans_transliterator_cleanup);
  1476. return true;
  1477. }
  1478. U_NAMESPACE_END
  1479. // Defined in transreg.h:
  1480. /**
  1481. * Release all static memory held by transliterator. This will
  1482. * necessarily invalidate any rule-based transliterators held by the
  1483. * user, because RBTs hold pointers to common data objects.
  1484. */
  1485. U_CFUNC UBool utrans_transliterator_cleanup() {
  1486. U_NAMESPACE_USE
  1487. TransliteratorIDParser::cleanup();
  1488. if (registry) {
  1489. delete registry;
  1490. registry = nullptr;
  1491. }
  1492. return true;
  1493. }
  1494. #endif /* #if !UCONFIG_NO_TRANSLITERATION */
  1495. //eof