translit.h 66 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. **********************************************************************
  5. * Copyright (C) 1999-2014, International Business Machines
  6. * Corporation and others. All Rights Reserved.
  7. **********************************************************************
  8. * Date Name Description
  9. * 11/17/99 aliu Creation.
  10. **********************************************************************
  11. */
  12. #ifndef TRANSLIT_H
  13. #define TRANSLIT_H
  14. #include "unicode/utypes.h"
  15. #if U_SHOW_CPLUSPLUS_API
  16. /**
  17. * \file
  18. * \brief C++ API: Transforms text from one format to another.
  19. */
  20. #if !UCONFIG_NO_TRANSLITERATION
  21. #include "unicode/uobject.h"
  22. #include "unicode/unistr.h"
  23. #include "unicode/parseerr.h"
  24. #include "unicode/utrans.h" // UTransPosition, UTransDirection
  25. #include "unicode/strenum.h"
  26. U_NAMESPACE_BEGIN
  27. class UnicodeFilter;
  28. class UnicodeSet;
  29. class TransliteratorParser;
  30. class NormalizationTransliterator;
  31. class TransliteratorIDParser;
  32. /**
  33. *
  34. * <code>Transliterator</code> is an abstract class that
  35. * transliterates text from one format to another. The most common
  36. * kind of transliterator is a script, or alphabet, transliterator.
  37. * For example, a Russian to Latin transliterator changes Russian text
  38. * written in Cyrillic characters to phonetically equivalent Latin
  39. * characters. It does not <em>translate</em> Russian to English!
  40. * Transliteration, unlike translation, operates on characters, without
  41. * reference to the meanings of words and sentences.
  42. *
  43. * <p>Although script conversion is its most common use, a
  44. * transliterator can actually perform a more general class of tasks.
  45. * In fact, <code>Transliterator</code> defines a very general API
  46. * which specifies only that a segment of the input text is replaced
  47. * by new text. The particulars of this conversion are determined
  48. * entirely by subclasses of <code>Transliterator</code>.
  49. *
  50. * <p><b>Transliterators are stateless</b>
  51. *
  52. * <p><code>Transliterator</code> objects are <em>stateless</em>; they
  53. * retain no information between calls to
  54. * <code>transliterate()</code>. (However, this does <em>not</em>
  55. * mean that threads may share transliterators without synchronizing
  56. * them. Transliterators are not immutable, so they must be
  57. * synchronized when shared between threads.) This might seem to
  58. * limit the complexity of the transliteration operation. In
  59. * practice, subclasses perform complex transliterations by delaying
  60. * the replacement of text until it is known that no other
  61. * replacements are possible. In other words, although the
  62. * <code>Transliterator</code> objects are stateless, the source text
  63. * itself embodies all the needed information, and delayed operation
  64. * allows arbitrary complexity.
  65. *
  66. * <p><b>Batch transliteration</b>
  67. *
  68. * <p>The simplest way to perform transliteration is all at once, on a
  69. * string of existing text. This is referred to as <em>batch</em>
  70. * transliteration. For example, given a string <code>input</code>
  71. * and a transliterator <code>t</code>, the call
  72. *
  73. * String result = t.transliterate(input);
  74. *
  75. * will transliterate it and return the result. Other methods allow
  76. * the client to specify a substring to be transliterated and to use
  77. * {@link Replaceable } objects instead of strings, in order to
  78. * preserve out-of-band information (such as text styles).
  79. *
  80. * <p><b>Keyboard transliteration</b>
  81. *
  82. * <p>Somewhat more involved is <em>keyboard</em>, or incremental
  83. * transliteration. This is the transliteration of text that is
  84. * arriving from some source (typically the user's keyboard) one
  85. * character at a time, or in some other piecemeal fashion.
  86. *
  87. * <p>In keyboard transliteration, a <code>Replaceable</code> buffer
  88. * stores the text. As text is inserted, as much as possible is
  89. * transliterated on the fly. This means a GUI that displays the
  90. * contents of the buffer may show text being modified as each new
  91. * character arrives.
  92. *
  93. * <p>Consider the simple rule-based Transliterator:
  94. * <pre>
  95. * th>{theta}
  96. * t>{tau}
  97. * </pre>
  98. *
  99. * When the user types 't', nothing will happen, since the
  100. * transliterator is waiting to see if the next character is 'h'. To
  101. * remedy this, we introduce the notion of a cursor, marked by a '|'
  102. * in the output string:
  103. * <pre>
  104. * t>|{tau}
  105. * {tau}h>{theta}
  106. * </pre>
  107. *
  108. * Now when the user types 't', tau appears, and if the next character
  109. * is 'h', the tau changes to a theta. This is accomplished by
  110. * maintaining a cursor position (independent of the insertion point,
  111. * and invisible in the GUI) across calls to
  112. * <code>transliterate()</code>. Typically, the cursor will
  113. * be coincident with the insertion point, but in a case like the one
  114. * above, it will precede the insertion point.
  115. *
  116. * <p>Keyboard transliteration methods maintain a set of three indices
  117. * that are updated with each call to
  118. * <code>transliterate()</code>, including the cursor, start,
  119. * and limit. Since these indices are changed by the method, they are
  120. * passed in an <code>int[]</code> array. The <code>START</code> index
  121. * marks the beginning of the substring that the transliterator will
  122. * look at. It is advanced as text becomes committed (but it is not
  123. * the committed index; that's the <code>CURSOR</code>). The
  124. * <code>CURSOR</code> index, described above, marks the point at
  125. * which the transliterator last stopped, either because it reached
  126. * the end, or because it required more characters to disambiguate
  127. * between possible inputs. The <code>CURSOR</code> can also be
  128. * explicitly set by rules in a rule-based Transliterator.
  129. * Any characters before the <code>CURSOR</code> index are frozen;
  130. * future keyboard transliteration calls within this input sequence
  131. * will not change them. New text is inserted at the
  132. * <code>LIMIT</code> index, which marks the end of the substring that
  133. * the transliterator looks at.
  134. *
  135. * <p>Because keyboard transliteration assumes that more characters
  136. * are to arrive, it is conservative in its operation. It only
  137. * transliterates when it can do so unambiguously. Otherwise it waits
  138. * for more characters to arrive. When the client code knows that no
  139. * more characters are forthcoming, perhaps because the user has
  140. * performed some input termination operation, then it should call
  141. * <code>finishTransliteration()</code> to complete any
  142. * pending transliterations.
  143. *
  144. * <p><b>Inverses</b>
  145. *
  146. * <p>Pairs of transliterators may be inverses of one another. For
  147. * example, if transliterator <b>A</b> transliterates characters by
  148. * incrementing their Unicode value (so "abc" -> "def"), and
  149. * transliterator <b>B</b> decrements character values, then <b>A</b>
  150. * is an inverse of <b>B</b> and vice versa. If we compose <b>A</b>
  151. * with <b>B</b> in a compound transliterator, the result is the
  152. * identity transliterator, that is, a transliterator that does not
  153. * change its input text.
  154. *
  155. * The <code>Transliterator</code> method <code>getInverse()</code>
  156. * returns a transliterator's inverse, if one exists, or
  157. * <code>null</code> otherwise. However, the result of
  158. * <code>getInverse()</code> usually will <em>not</em> be a true
  159. * mathematical inverse. This is because true inverse transliterators
  160. * are difficult to formulate. For example, consider two
  161. * transliterators: <b>AB</b>, which transliterates the character 'A'
  162. * to 'B', and <b>BA</b>, which transliterates 'B' to 'A'. It might
  163. * seem that these are exact inverses, since
  164. *
  165. * \htmlonly<blockquote>\endhtmlonly"A" x <b>AB</b> -> "B"<br>
  166. * "B" x <b>BA</b> -> "A"\htmlonly</blockquote>\endhtmlonly
  167. *
  168. * where 'x' represents transliteration. However,
  169. *
  170. * \htmlonly<blockquote>\endhtmlonly"ABCD" x <b>AB</b> -> "BBCD"<br>
  171. * "BBCD" x <b>BA</b> -> "AACD"\htmlonly</blockquote>\endhtmlonly
  172. *
  173. * so <b>AB</b> composed with <b>BA</b> is not the
  174. * identity. Nonetheless, <b>BA</b> may be usefully considered to be
  175. * <b>AB</b>'s inverse, and it is on this basis that
  176. * <b>AB</b><code>.getInverse()</code> could legitimately return
  177. * <b>BA</b>.
  178. *
  179. * <p><b>IDs and display names</b>
  180. *
  181. * <p>A transliterator is designated by a short identifier string or
  182. * <em>ID</em>. IDs follow the format <em>source-destination</em>,
  183. * where <em>source</em> describes the entity being replaced, and
  184. * <em>destination</em> describes the entity replacing
  185. * <em>source</em>. The entities may be the names of scripts,
  186. * particular sequences of characters, or whatever else it is that the
  187. * transliterator converts to or from. For example, a transliterator
  188. * from Russian to Latin might be named "Russian-Latin". A
  189. * transliterator from keyboard escape sequences to Latin-1 characters
  190. * might be named "KeyboardEscape-Latin1". By convention, system
  191. * entity names are in English, with the initial letters of words
  192. * capitalized; user entity names may follow any format so long as
  193. * they do not contain dashes.
  194. *
  195. * <p>In addition to programmatic IDs, transliterator objects have
  196. * display names for presentation in user interfaces, returned by
  197. * {@link #getDisplayName }.
  198. *
  199. * <p><b>Factory methods and registration</b>
  200. *
  201. * <p>In general, client code should use the factory method
  202. * {@link #createInstance } to obtain an instance of a
  203. * transliterator given its ID. Valid IDs may be enumerated using
  204. * <code>getAvailableIDs()</code>. Since transliterators are mutable,
  205. * multiple calls to {@link #createInstance } with the same ID will
  206. * return distinct objects.
  207. *
  208. * <p>In addition to the system transliterators registered at startup,
  209. * user transliterators may be registered by calling
  210. * <code>registerInstance()</code> at run time. A registered instance
  211. * acts as a template; future calls to {@link #createInstance } with the ID
  212. * of the registered object return clones of that object. Thus any
  213. * object passed to <tt>registerInstance()</tt> must implement
  214. * <tt>clone()</tt> properly. To register a transliterator subclass
  215. * without instantiating it (until it is needed), users may call
  216. * {@link #registerFactory }. In this case, the objects are
  217. * instantiated by invoking the zero-argument public constructor of
  218. * the class.
  219. *
  220. * <p><b>Subclassing</b>
  221. *
  222. * Subclasses must implement the abstract method
  223. * <code>handleTransliterate()</code>. <p>Subclasses should override
  224. * the <code>transliterate()</code> method taking a
  225. * <code>Replaceable</code> and the <code>transliterate()</code>
  226. * method taking a <code>String</code> and <code>StringBuffer</code>
  227. * if the performance of these methods can be improved over the
  228. * performance obtained by the default implementations in this class.
  229. *
  230. * <p><b>Rule syntax</b>
  231. *
  232. * <p>A set of rules determines how to perform translations.
  233. * Rules within a rule set are separated by semicolons (';').
  234. * To include a literal semicolon, prefix it with a backslash ('\').
  235. * Unicode Pattern_White_Space is ignored.
  236. * If the first non-blank character on a line is '#',
  237. * the entire line is ignored as a comment.
  238. *
  239. * <p>Each set of rules consists of two groups, one forward, and one
  240. * reverse. This is a convention that is not enforced; rules for one
  241. * direction may be omitted, with the result that translations in
  242. * that direction will not modify the source text. In addition,
  243. * bidirectional forward-reverse rules may be specified for
  244. * symmetrical transformations.
  245. *
  246. * <p>Note: Another description of the Transliterator rule syntax is available in
  247. * <a href="https://www.unicode.org/reports/tr35/tr35-general.html#Transform_Rules_Syntax">section
  248. * Transform Rules Syntax of UTS #35: Unicode LDML</a>.
  249. * The rules are shown there using arrow symbols ← and → and ↔.
  250. * ICU supports both those and the equivalent ASCII symbols &lt; and &gt; and &lt;&gt;.
  251. *
  252. * <p>Rule statements take one of the following forms:
  253. *
  254. * <dl>
  255. * <dt><code>$alefmadda=\\u0622;</code></dt>
  256. * <dd><strong>Variable definition.</strong> The name on the
  257. * left is assigned the text on the right. In this example,
  258. * after this statement, instances of the left hand name,
  259. * &quot;<code>$alefmadda</code>&quot;, will be replaced by
  260. * the Unicode character U+0622. Variable names must begin
  261. * with a letter and consist only of letters, digits, and
  262. * underscores. Case is significant. Duplicate names cause
  263. * an exception to be thrown, that is, variables cannot be
  264. * redefined. The right hand side may contain well-formed
  265. * text of any length, including no text at all (&quot;<code>$empty=;</code>&quot;).
  266. * The right hand side may contain embedded <code>UnicodeSet</code>
  267. * patterns, for example, &quot;<code>$softvowel=[eiyEIY]</code>&quot;.</dd>
  268. * <dt><code>ai&gt;$alefmadda;</code></dt>
  269. * <dd><strong>Forward translation rule.</strong> This rule
  270. * states that the string on the left will be changed to the
  271. * string on the right when performing forward
  272. * transliteration.</dd>
  273. * <dt><code>ai&lt;$alefmadda;</code></dt>
  274. * <dd><strong>Reverse translation rule.</strong> This rule
  275. * states that the string on the right will be changed to
  276. * the string on the left when performing reverse
  277. * transliteration.</dd>
  278. * </dl>
  279. *
  280. * <dl>
  281. * <dt><code>ai&lt;&gt;$alefmadda;</code></dt>
  282. * <dd><strong>Bidirectional translation rule.</strong> This
  283. * rule states that the string on the left will be changed
  284. * to the string on the right when performing forward
  285. * transliteration, and vice versa when performing reverse
  286. * transliteration.</dd>
  287. * </dl>
  288. *
  289. * <p>Translation rules consist of a <em>match pattern</em> and an <em>output
  290. * string</em>. The match pattern consists of literal characters,
  291. * optionally preceded by context, and optionally followed by
  292. * context. Context characters, like literal pattern characters,
  293. * must be matched in the text being transliterated. However, unlike
  294. * literal pattern characters, they are not replaced by the output
  295. * text. For example, the pattern &quot;<code>abc{def}</code>&quot;
  296. * indicates the characters &quot;<code>def</code>&quot; must be
  297. * preceded by &quot;<code>abc</code>&quot; for a successful match.
  298. * If there is a successful match, &quot;<code>def</code>&quot; will
  299. * be replaced, but not &quot;<code>abc</code>&quot;. The final '<code>}</code>'
  300. * is optional, so &quot;<code>abc{def</code>&quot; is equivalent to
  301. * &quot;<code>abc{def}</code>&quot;. Another example is &quot;<code>{123}456</code>&quot;
  302. * (or &quot;<code>123}456</code>&quot;) in which the literal
  303. * pattern &quot;<code>123</code>&quot; must be followed by &quot;<code>456</code>&quot;.
  304. *
  305. * <p>The output string of a forward or reverse rule consists of
  306. * characters to replace the literal pattern characters. If the
  307. * output string contains the character '<code>|</code>', this is
  308. * taken to indicate the location of the <em>cursor</em> after
  309. * replacement. The cursor is the point in the text at which the
  310. * next replacement, if any, will be applied. The cursor is usually
  311. * placed within the replacement text; however, it can actually be
  312. * placed into the preceding or following context by using the
  313. * special character '@'. Examples:
  314. *
  315. * <pre>
  316. * a {foo} z &gt; | @ bar; # foo -&gt; bar, move cursor before a
  317. * {foo} xyz &gt; bar @@|; #&nbsp;foo -&gt; bar, cursor between y and z
  318. * </pre>
  319. *
  320. * <p><b>UnicodeSet</b>
  321. *
  322. * <p><code>UnicodeSet</code> patterns may appear anywhere that
  323. * makes sense. They may appear in variable definitions.
  324. * Contrariwise, <code>UnicodeSet</code> patterns may themselves
  325. * contain variable references, such as &quot;<code>$a=[a-z];$not_a=[^$a]</code>&quot;,
  326. * or &quot;<code>$range=a-z;$ll=[$range]</code>&quot;.
  327. *
  328. * <p><code>UnicodeSet</code> patterns may also be embedded directly
  329. * into rule strings. Thus, the following two rules are equivalent:
  330. *
  331. * <pre>
  332. * $vowel=[aeiou]; $vowel&gt;'*'; # One way to do this
  333. * [aeiou]&gt;'*'; # Another way
  334. * </pre>
  335. *
  336. * <p>See {@link UnicodeSet} for more documentation and examples.
  337. *
  338. * <p><b>Segments</b>
  339. *
  340. * <p>Segments of the input string can be matched and copied to the
  341. * output string. This makes certain sets of rules simpler and more
  342. * general, and makes reordering possible. For example:
  343. *
  344. * <pre>
  345. * ([a-z]) &gt; $1 $1; # double lowercase letters
  346. * ([:Lu:]) ([:Ll:]) &gt; $2 $1; # reverse order of Lu-Ll pairs
  347. * </pre>
  348. *
  349. * <p>The segment of the input string to be copied is delimited by
  350. * &quot;<code>(</code>&quot; and &quot;<code>)</code>&quot;. Up to
  351. * nine segments may be defined. Segments may not overlap. In the
  352. * output string, &quot;<code>$1</code>&quot; through &quot;<code>$9</code>&quot;
  353. * represent the input string segments, in left-to-right order of
  354. * definition.
  355. *
  356. * <p><b>Anchors</b>
  357. *
  358. * <p>Patterns can be anchored to the beginning or the end of the text. This is done with the
  359. * special characters '<code>^</code>' and '<code>$</code>'. For example:
  360. *
  361. * <pre>
  362. * ^ a&nbsp;&nbsp; &gt; 'BEG_A'; &nbsp;&nbsp;# match 'a' at start of text
  363. * &nbsp; a&nbsp;&nbsp; &gt; 'A'; # match other instances of 'a'
  364. * &nbsp; z $ &gt; 'END_Z'; &nbsp;&nbsp;# match 'z' at end of text
  365. * &nbsp; z&nbsp;&nbsp; &gt; 'Z';&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; # match other instances of 'z'
  366. * </pre>
  367. *
  368. * <p>It is also possible to match the beginning or the end of the text using a <code>UnicodeSet</code>.
  369. * This is done by including a virtual anchor character '<code>$</code>' at the end of the
  370. * set pattern. Although this is usually the match character for the end anchor, the set will
  371. * match either the beginning or the end of the text, depending on its placement. For
  372. * example:
  373. *
  374. * <pre>
  375. * $x = [a-z$]; &nbsp;&nbsp;# match 'a' through 'z' OR anchor
  376. * $x 1&nbsp;&nbsp;&nbsp; &gt; 2;&nbsp;&nbsp; # match '1' after a-z or at the start
  377. * &nbsp;&nbsp; 3 $x &gt; 4; &nbsp;&nbsp;# match '3' before a-z or at the end
  378. * </pre>
  379. *
  380. * <p><b>Example</b>
  381. *
  382. * <p>The following example rules illustrate many of the features of
  383. * the rule language.
  384. *
  385. * <table border="0" cellpadding="4">
  386. * <tr>
  387. * <td style="vertical-align: top;">Rule 1.</td>
  388. * <td style="vertical-align: top; white-space: nowrap;"><code>abc{def}&gt;x|y</code></td>
  389. * </tr>
  390. * <tr>
  391. * <td style="vertical-align: top;">Rule 2.</td>
  392. * <td style="vertical-align: top; white-space: nowrap;"><code>xyz&gt;r</code></td>
  393. * </tr>
  394. * <tr>
  395. * <td style="vertical-align: top;">Rule 3.</td>
  396. * <td style="vertical-align: top; white-space: nowrap;"><code>yz&gt;q</code></td>
  397. * </tr>
  398. * </table>
  399. *
  400. * <p>Applying these rules to the string &quot;<code>adefabcdefz</code>&quot;
  401. * yields the following results:
  402. *
  403. * <table border="0" cellpadding="4">
  404. * <tr>
  405. * <td style="vertical-align: top; white-space: nowrap;"><code>|adefabcdefz</code></td>
  406. * <td style="vertical-align: top;">Initial state, no rules match. Advance
  407. * cursor.</td>
  408. * </tr>
  409. * <tr>
  410. * <td style="vertical-align: top; white-space: nowrap;"><code>a|defabcdefz</code></td>
  411. * <td style="vertical-align: top;">Still no match. Rule 1 does not match
  412. * because the preceding context is not present.</td>
  413. * </tr>
  414. * <tr>
  415. * <td style="vertical-align: top; white-space: nowrap;"><code>ad|efabcdefz</code></td>
  416. * <td style="vertical-align: top;">Still no match. Keep advancing until
  417. * there is a match...</td>
  418. * </tr>
  419. * <tr>
  420. * <td style="vertical-align: top; white-space: nowrap;"><code>ade|fabcdefz</code></td>
  421. * <td style="vertical-align: top;">...</td>
  422. * </tr>
  423. * <tr>
  424. * <td style="vertical-align: top; white-space: nowrap;"><code>adef|abcdefz</code></td>
  425. * <td style="vertical-align: top;">...</td>
  426. * </tr>
  427. * <tr>
  428. * <td style="vertical-align: top; white-space: nowrap;"><code>adefa|bcdefz</code></td>
  429. * <td style="vertical-align: top;">...</td>
  430. * </tr>
  431. * <tr>
  432. * <td style="vertical-align: top; white-space: nowrap;"><code>adefab|cdefz</code></td>
  433. * <td style="vertical-align: top;">...</td>
  434. * </tr>
  435. * <tr>
  436. * <td style="vertical-align: top; white-space: nowrap;"><code>adefabc|defz</code></td>
  437. * <td style="vertical-align: top;">Rule 1 matches; replace &quot;<code>def</code>&quot;
  438. * with &quot;<code>xy</code>&quot; and back up the cursor
  439. * to before the '<code>y</code>'.</td>
  440. * </tr>
  441. * <tr>
  442. * <td style="vertical-align: top; white-space: nowrap;"><code>adefabcx|yz</code></td>
  443. * <td style="vertical-align: top;">Although &quot;<code>xyz</code>&quot; is
  444. * present, rule 2 does not match because the cursor is
  445. * before the '<code>y</code>', not before the '<code>x</code>'.
  446. * Rule 3 does match. Replace &quot;<code>yz</code>&quot;
  447. * with &quot;<code>q</code>&quot;.</td>
  448. * </tr>
  449. * <tr>
  450. * <td style="vertical-align: top; white-space: nowrap;"><code>adefabcxq|</code></td>
  451. * <td style="vertical-align: top;">The cursor is at the end;
  452. * transliteration is complete.</td>
  453. * </tr>
  454. * </table>
  455. *
  456. * <p>The order of rules is significant. If multiple rules may match
  457. * at some point, the first matching rule is applied.
  458. *
  459. * <p>Forward and reverse rules may have an empty output string.
  460. * Otherwise, an empty left or right hand side of any statement is a
  461. * syntax error.
  462. *
  463. * <p>Single quotes are used to quote any character other than a
  464. * digit or letter. To specify a single quote itself, inside or
  465. * outside of quotes, use two single quotes in a row. For example,
  466. * the rule &quot;<code>'&gt;'&gt;o''clock</code>&quot; changes the
  467. * string &quot;<code>&gt;</code>&quot; to the string &quot;<code>o'clock</code>&quot;.
  468. *
  469. * <p><b>Notes</b>
  470. *
  471. * <p>While a Transliterator is being built from rules, it checks that
  472. * the rules are added in proper order. For example, if the rule
  473. * &quot;a&gt;x&quot; is followed by the rule &quot;ab&gt;y&quot;,
  474. * then the second rule will throw an exception. The reason is that
  475. * the second rule can never be triggered, since the first rule
  476. * always matches anything it matches. In other words, the first
  477. * rule <em>masks</em> the second rule.
  478. *
  479. * @author Alan Liu
  480. * @stable ICU 2.0
  481. */
  482. class U_I18N_API Transliterator : public UObject {
  483. private:
  484. /**
  485. * Programmatic name, e.g., "Latin-Arabic".
  486. */
  487. UnicodeString ID;
  488. /**
  489. * This transliterator's filter. Any character for which
  490. * <tt>filter.contains()</tt> returns <tt>false</tt> will not be
  491. * altered by this transliterator. If <tt>filter</tt> is
  492. * <tt>null</tt> then no filtering is applied.
  493. */
  494. UnicodeFilter* filter;
  495. int32_t maximumContextLength;
  496. public:
  497. /**
  498. * A context integer or pointer for a factory function, passed by
  499. * value. Because this is a union, the two members share storage; read back only the member that was last written.
  500. * @stable ICU 2.4
  501. */
  502. union Token {
  503. /**
  504. * This token, interpreted as a 32-bit integer.
  505. * @stable ICU 2.4
  506. */
  507. int32_t integer;
  508. /**
  509. * This token, interpreted as a native pointer.
  510. * @stable ICU 2.4
  511. */
  512. void* pointer;
  513. };
  514. #ifndef U_HIDE_INTERNAL_API
  515. /**
  516. * Return a token containing an integer.
  517. * @return a token containing an integer.
  518. * @internal
  519. */
  520. inline static Token integerToken(int32_t);
  521. /**
  522. * Return a token containing a pointer.
  523. * @return a token containing a pointer.
  524. * @internal
  525. */
  526. inline static Token pointerToken(void*);
  527. #endif /* U_HIDE_INTERNAL_API */
  528. /**
  529. * A function that creates and returns a Transliterator. When
  530. * invoked, it will be passed the ID string that is being
  531. * instantiated, together with the context pointer that was passed
  532. * in when the factory function was first registered. Many
  533. * factory functions will ignore both parameters, however,
  534. * functions that are registered to more than one ID may use the
  535. * ID or the context parameter to parameterize the transliterator
  536. * they create.
  537. * @param ID the string identifier for this transliterator
  538. * @param context a context pointer that will be stored and
  539. * later passed to the factory function when an ID matching
  540. * the registration ID is being instantiated with this factory.
  541. * @stable ICU 2.4
  542. */
  543. typedef Transliterator* (U_EXPORT2 *Factory)(const UnicodeString& ID, Token context);
  544. protected:
  545. /**
  546. * Constructs a transliterator from an ID and an optional filter.
  547. * @param ID the string identifier for this transliterator
  548. * @param adoptedFilter the filter. Any character for which
  549. * <tt>adoptedFilter->contains()</tt> returns <tt>false</tt> will not be
  550. * altered by this transliterator. If <tt>adoptedFilter</tt> is
  551. * <tt>nullptr</tt> then no filtering is applied.
  552. * @stable ICU 2.4
  553. */
  554. Transliterator(const UnicodeString& ID, UnicodeFilter* adoptedFilter);
  555. /**
  556. * Copy constructor.
  557. * @stable ICU 2.4
  558. */
  559. Transliterator(const Transliterator&);
  560. /**
  561. * Assignment operator.
  562. * @stable ICU 2.4
  563. */
  564. Transliterator& operator=(const Transliterator&);
  565. /**
  566. * Create a transliterator from a basic ID. This is an ID
  567. * containing only the forward direction source, target, and
  568. * variant.
  569. * @param id a basic ID of the form S-T or S-T/V.
  570. * @param canon canonical ID to assign to the object, or
  571. * nullptr to leave the ID unchanged
  572. * @return a newly created Transliterator or null if the ID is
  573. * invalid.
  574. * @stable ICU 2.4
  575. */
  576. static Transliterator* createBasicInstance(const UnicodeString& id,
  577. const UnicodeString* canon);
  578. friend class TransliteratorParser; // for parseID()
  579. friend class TransliteratorIDParser; // for createBasicInstance()
  580. friend class TransliteratorAlias; // for setID()
  581. public:
  582. /**
  583. * Destructor.
  584. * @stable ICU 2.0
  585. */
  586. virtual ~Transliterator();
  587. /**
  588. * Implements Cloneable.
  589. * All subclasses are encouraged to implement this method if it is
  590. * possible and reasonable to do so. Subclasses that are to be
  591. * registered with the system using <tt>registerInstance()</tt>
  592. * are required to implement this method. If a subclass does not
  593. * implement clone() properly and is registered with the system
  594. * using registerInstance(), then the default clone() implementation
  595. * will return null, and calls to createInstance() will fail.
  596. *
  597. * @return a copy of the object.
  598. * @see #registerInstance
  599. * @stable ICU 2.0
  600. */
  601. virtual Transliterator* clone() const;
  602. /**
  603. * Transliterates a segment of a string, with optional filtering.
  604. *
  605. * @param text the string to be transliterated
  606. * @param start the beginning index, inclusive; <code>0 <= start
  607. * <= limit</code>.
  608. * @param limit the ending index, exclusive; <code>start <= limit
  609. * <= text.length()</code>.
  610. * @return The new limit index. The text previously occupying <code>[start,
  611. * limit)</code> has been transliterated, possibly to a string of a different
  612. * length, at <code>[start, </code><em>new-limit</em><code>)</code>, where
  613. * <em>new-limit</em> is the return value. If the input offsets are out of bounds,
  614. * the returned value is -1 and the input string remains unchanged.
  615. * @stable ICU 2.0
  616. */
  617. virtual int32_t transliterate(Replaceable& text,
  618. int32_t start, int32_t limit) const;
  619. /**
  620. * Transliterates an entire string in place. Convenience method.
  621. * @param text the string to be transliterated
  622. * @stable ICU 2.0
  623. */
  624. virtual void transliterate(Replaceable& text) const;
  625. /**
  626. * Transliterates the portion of the text buffer that can be
  627. * transliterated unambiguously after new text has been inserted,
  628. * typically as a result of a keyboard event. The new text in
  629. * <code>insertion</code> will be inserted into <code>text</code>
  630. * at <code>index.limit</code>, advancing
  631. * <code>index.limit</code> by <code>insertion.length()</code>.
  632. * Then the transliterator will try to transliterate characters of
  633. * <code>text</code> between <code>index.cursor</code> and
  634. * <code>index.limit</code>. Characters before
  635. * <code>index.cursor</code> will not be changed.
  636. *
  637. * <p>Upon return, values in <code>index</code> will be updated.
  638. * <code>index.start</code> will be advanced to the first
  639. * character that future calls to this method will read.
  640. * <code>index.cursor</code> and <code>index.limit</code> will
  641. * be adjusted to delimit the range of text that future calls to
  642. * this method may change.
  643. *
  644. * <p>Typical usage of this method begins with an initial call
  645. * with <code>index.start</code> and <code>index.limit</code>
  646. * set to indicate the portion of <code>text</code> to be
  647. * transliterated, and <code>index.cursor == index.start</code>.
  648. * Thereafter, <code>index</code> can be used without
  649. * modification in future calls, provided that all changes to
  650. * <code>text</code> are made via this method.
  651. *
  652. * <p>This method assumes that future calls may be made that will
  653. * insert new text into the buffer. As a result, it only performs
  654. * unambiguous transliterations. After the last call to this
  655. * method, there may be untransliterated text that is waiting for
  656. * more input to resolve an ambiguity. In order to perform these
  657. * pending transliterations, clients should call
  658. * {@link #finishTransliteration } after the last call to this
  659. * method has been made.
  660. *
  661. * @param text the buffer holding transliterated and untransliterated text
  662. * @param index the position indices, passed as a UTransPosition.
  663. *
  664. * <ul><li><code>index.start</code>: the beginning index,
  665. * inclusive; <code>0 <= index.start <= index.limit</code>.
  666. *
  667. * <li><code>index.limit</code>: the ending index, exclusive;
  668. * <code>index.start <= index.limit <= text.length()</code>.
  669. * <code>insertion</code> is inserted at
  670. * <code>index.limit</code>.
  671. *
  672. * <li><code>index.cursor</code>: the next character to be
  673. * considered for transliteration; <code>index.start <=
  674. * index.cursor <= index.limit</code>. Characters before
  675. * <code>index.cursor</code> will not be changed by future calls
  676. * to this method.</ul>
  677. *
  678. * @param insertion text to be inserted and possibly
  679. * transliterated into the translation buffer at
  680. * <code>index.limit</code>. If <code>null</code> then no text
  681. * is inserted.
  682. * @param status Output param to be filled in with a success or an error.
  683. * @see #handleTransliterate
  684. * @exception IllegalArgumentException if <code>index</code>
  685. * is invalid
  686. * @see UTransPosition
  687. * @stable ICU 2.0
  688. */
  689. virtual void transliterate(Replaceable& text, UTransPosition& index,
  690. const UnicodeString& insertion,
  691. UErrorCode& status) const;
  692. /**
  693. * Transliterates the portion of the text buffer that can be
  694. * transliterated unambiguously after a new character has been
  695. * inserted, typically as a result of a keyboard event. This is a
  696. * convenience method.
  697. * @param text the buffer holding transliterated and
  698. * untransliterated text
  699. * @param index the position indices, passed as a UTransPosition.
  700. * @param insertion the character (a UChar32) to be inserted and possibly
  701. * transliterated into the translation buffer at
  702. * <code>index.limit</code>.
  703. * @param status Output param to be filled in with a success or an error.
  704. * @see #transliterate(Replaceable&, UTransPosition&, const UnicodeString&, UErrorCode&) const
  705. * @stable ICU 2.0
  706. */
  707. virtual void transliterate(Replaceable& text, UTransPosition& index,
  708. UChar32 insertion,
  709. UErrorCode& status) const;
  710. /**
  711. * Transliterates the portion of the text buffer that can be
  712. * transliterated unambiguously. This is a convenience method; see
  713. * {@link #transliterate(Replaceable&, UTransPosition&, const UnicodeString&, UErrorCode&) const }
  714. * for details.
  715. * @param text the buffer holding transliterated and
  716. * untransliterated text
  717. * @param index the position indices, passed as a UTransPosition.
  718. * @param status Output param to be filled in with a success or an error.
  719. * @see #transliterate(Replaceable&, UTransPosition&, const UnicodeString&, UErrorCode &) const
  720. * @stable ICU 2.0
  721. */
  722. virtual void transliterate(Replaceable& text, UTransPosition& index,
  723. UErrorCode& status) const;
  724. /**
  725. * Finishes any pending transliterations that were waiting for
  726. * more characters. Clients should call this method as the last
  727. * call after a sequence of one or more calls to
  728. * <code>transliterate()</code>.
  729. * @param text the buffer holding transliterated and
  730. * untransliterated text.
  731. * @param index the array of indices previously passed to {@link #transliterate }
  732. * @stable ICU 2.0
  733. */
  734. virtual void finishTransliteration(Replaceable& text,
  735. UTransPosition& index) const;
  736. private:
  737. /**
  738. * This internal method does incremental transliteration. If the
  739. * 'insertion' is non-null then we append it to 'text' before
  740. * proceeding. This method calls through to the pure virtual
  741. * framework method handleTransliterate() to do the actual
  742. * work.
  743. * @param text the buffer holding transliterated and
  744. * untransliterated text
  745. * @param index the position indices. See {@link
  746. * #transliterate(Replaceable&, UTransPosition&, const UnicodeString&, UErrorCode&) const }.
  747. * @param insertion text to be inserted and possibly
  748. * transliterated into the translation buffer at
  749. * <code>index.limit</code>.
  750. * @param status Output param to be filled in with a success or an error.
  751. */
  752. void _transliterate(Replaceable& text,
  753. UTransPosition& index,
  754. const UnicodeString* insertion,
  755. UErrorCode &status) const;
  756. protected:
  757. /**
  758. * Abstract method that concrete subclasses define to implement
  759. * their transliteration algorithm. This method handles both
  760. * incremental and non-incremental transliteration. Let
  761. * <code>originalStart</code> refer to the value of
  762. * <code>pos.start</code> upon entry.
  763. *
  764. * <ul>
  765. * <li>If <code>incremental</code> is false, then this method
  766. * should transliterate all characters between
  767. * <code>pos.start</code> and <code>pos.limit</code>. Upon return
  768. * <code>pos.start</code> must == <code> pos.limit</code>.</li>
  769. *
  770. * <li>If <code>incremental</code> is true, then this method
  771. * should transliterate all characters between
  772. * <code>pos.start</code> and <code>pos.limit</code> that can be
  773. * unambiguously transliterated, regardless of future insertions
  774. * of text at <code>pos.limit</code>. Upon return,
  775. * <code>pos.start</code> should be in the range
  776. * [<code>originalStart</code>, <code>pos.limit</code>).
  777. * <code>pos.start</code> should be positioned such that
  778. * characters [<code>originalStart</code>, <code>
  779. * pos.start</code>) will not be changed in the future by this
  780. * transliterator and characters [<code>pos.start</code>,
  781. * <code>pos.limit</code>) are unchanged.</li>
  782. * </ul>
  783. *
  784. * <p>Implementations of this method should also obey the
  785. * following invariants:</p>
  786. *
  787. * <ul>
  788. * <li> <code>pos.limit</code> and <code>pos.contextLimit</code>
  789. * should be updated to reflect changes in length of the text
  790. * between <code>pos.start</code> and <code>pos.limit</code>. The
  791. * difference <code> pos.contextLimit - pos.limit</code> should
  792. * not change.</li>
  793. *
  794. * <li><code>pos.contextStart</code> should not change.</li>
  795. *
  796. * <li>Upon return, neither <code>pos.start</code> nor
  797. * <code>pos.limit</code> should be less than
  798. * <code>originalStart</code>.</li>
  799. *
  800. * <li>Text before <code>originalStart</code> and text after
  801. * <code>pos.limit</code> should not change.</li>
  802. *
  803. * <li>Text before <code>pos.contextStart</code> and text after
  804. * <code> pos.contextLimit</code> should be ignored.</li>
  805. * </ul>
  806. *
  807. * <p>Subclasses may safely assume that all characters in
  808. * [<code>pos.start</code>, <code>pos.limit</code>) are filtered.
  809. * In other words, the filter has already been applied by the time
  810. * this method is called. See
  811. * <code>filteredTransliterate()</code>.
  812. *
  813. * <p>This method is <b>not</b> for public consumption. Calling
  814. * this method directly will transliterate
  815. * [<code>pos.start</code>, <code>pos.limit</code>) without
  816. * applying the filter. End user code should call <code>
  817. * transliterate()</code> instead of this method. Subclass code
  818. * and wrapping transliterators should call
  819. * <code>filteredTransliterate()</code> instead of this method.</p>
  820. *
  821. * @param text the buffer holding transliterated and
  822. * untransliterated text
  823. *
  824. * @param pos the indices indicating the start, limit, context
  825. * start, and context limit of the text.
  826. *
  827. * @param incremental if true, assume more text may be inserted at
  828. * <code>pos.limit</code> and act accordingly. Otherwise,
  829. * transliterate all text between <code>pos.start</code> and
  830. * <code>pos.limit</code> and move <code>pos.start</code> up to
  831. * <code>pos.limit</code>.
  832. *
  833. * @see #transliterate
  834. * @stable ICU 2.4
  835. */
  836. virtual void handleTransliterate(Replaceable& text,
  837. UTransPosition& pos,
  838. UBool incremental) const = 0;
  839. public:
  840. /**
  841. * Transliterate a substring of text, as specified by index, taking filters
  842. * into account. This method is for subclasses that need to delegate to
  843. * another transliterator.
  844. * @param text the text to be transliterated
  845. * @param index the position indices
  846. * @param incremental if true, then assume more characters may be inserted
  847. * at index.limit, and postpone processing to accommodate future incoming
  848. * characters
  849. * @stable ICU 2.4
  850. */
  851. virtual void filteredTransliterate(Replaceable& text,
  852. UTransPosition& index,
  853. UBool incremental) const;
  854. private:
  855. /**
  856. * Top-level transliteration method, handling filtering, incremental and
  857. * non-incremental transliteration, and rollback. All transliteration
  858. * public API methods eventually call this method with a rollback argument
  859. * of true. Other entities may call this method but rollback should be
  860. * false.
  861. *
  862. * <p>If this transliterator has a filter, break up the input text into runs
  863. * of unfiltered characters. Pass each run to
  864. * subclass.handleTransliterate().
  865. *
  866. * <p>In incremental mode, if rollback is true, perform a special
  867. * incremental procedure in which several passes are made over the input
  868. * text, adding one character at a time, and committing successful
  869. * transliterations as they occur. Unsuccessful transliterations are rolled
  870. * back and retried with additional characters to give correct results.
  871. *
  872. * @param text the text to be transliterated
  873. * @param index the position indices
  874. * @param incremental if true, then assume more characters may be inserted
  875. * at index.limit, and postpone processing to accommodate future incoming
  876. * characters
  877. * @param rollback if true and if incremental is true, then perform special
  878. * incremental processing, as described above, and undo partial
  879. * transliterations where necessary. If incremental is false then this
  880. * parameter is ignored.
  881. */
  882. virtual void filteredTransliterate(Replaceable& text,
  883. UTransPosition& index,
  884. UBool incremental,
  885. UBool rollback) const;
  886. public:
  887. /**
  888. * Returns the length of the longest context required by this transliterator.
  889. * This is <em>preceding</em> context. The default implementation supplied
  890. * by <code>Transliterator</code> returns zero; subclasses
  891. * that use preceding context should override this method to return the
  892. * correct value. For example, if a transliterator translates "ddd" (where
  893. * d is any digit) to "555" when preceded by "(ddd)", then the preceding
  894. * context length is 5, the length of "(ddd)".
  895. *
  896. * @return The maximum number of preceding context characters this
  897. * transliterator needs to examine
  898. * @stable ICU 2.0
  899. */
  900. int32_t getMaximumContextLength() const;
  901. protected:
  902. /**
  903. * Method for subclasses to use to set the maximum context length.
  904. * @param maxContextLength the new value to be set.
  905. * @see #getMaximumContextLength
  906. * @stable ICU 2.4
  907. */
  908. void setMaximumContextLength(int32_t maxContextLength);
  909. public:
  910. /**
  911. * Returns a programmatic identifier for this transliterator.
  912. * If this identifier is passed to <code>createInstance()</code>, it
  913. * will return this object, if it has been registered.
  914. * @return a programmatic identifier for this transliterator.
  915. * @see #registerInstance
  916. * @see #registerFactory
  917. * @see #getAvailableIDs
  918. * @stable ICU 2.0
  919. */
  920. virtual const UnicodeString& getID() const;
  921. /**
  922. * Returns a name for this transliterator that is appropriate for
  923. * display to the user in the default locale. See {@link #getDisplayName }
  924. * for details.
  925. * @param ID the string identifier for this transliterator
  926. * @param result Output param to receive the display name
  927. * @return A reference to 'result'.
  928. * @stable ICU 2.0
  929. */
  930. static UnicodeString& U_EXPORT2 getDisplayName(const UnicodeString& ID,
  931. UnicodeString& result);
  932. /**
  933. * Returns a name for this transliterator that is appropriate for
  934. * display to the user in the given locale. This name is taken
  935. * from the locale resource data in the standard manner of the
  936. * <code>java.text</code> package.
  937. *
  938. * <p>If no localized names exist in the system resource bundles,
  939. * a name is synthesized using a localized
  940. * <code>MessageFormat</code> pattern from the resource data. The
  941. * arguments to this pattern are an integer followed by one or two
  942. * strings. The integer is the number of strings, either 1 or 2.
  943. * The strings are formed by splitting the ID for this
  944. * transliterator at the first '-'. If there is no '-', then the
  945. * entire ID forms the only string.
  946. * @param ID the string identifier for this transliterator
  947. * @param inLocale the Locale in which the display name should be
  948. * localized.
  949. * @param result Output param to receive the display name
  950. * @return A reference to 'result'.
  951. * @stable ICU 2.0
  952. */
  953. static UnicodeString& U_EXPORT2 getDisplayName(const UnicodeString& ID,
  954. const Locale& inLocale,
  955. UnicodeString& result);
  956. /**
  957. * Returns the filter used by this transliterator, or <tt>nullptr</tt>
  958. * if this transliterator uses no filter.
  959. * @return the filter used by this transliterator, or <tt>nullptr</tt>
  960. * if this transliterator uses no filter.
  961. * @stable ICU 2.0
  962. */
  963. const UnicodeFilter* getFilter() const;
  964. /**
  965. * Returns the filter used by this transliterator, or <tt>nullptr</tt> if this
  966. * transliterator uses no filter. The caller must eventually delete the
  967. * result. After this call, this transliterator's filter is set to
  968. * <tt>nullptr</tt>.
  969. * @return the filter used by this transliterator, or <tt>nullptr</tt> if this
  970. * transliterator uses no filter.
  971. * @stable ICU 2.4
  972. */
  973. UnicodeFilter* orphanFilter();
  974. /**
  975. * Changes the filter used by this transliterator. If the filter
  976. * is set to <tt>null</tt> then no filtering will occur.
  977. *
  978. * <p>Callers must take care if a transliterator is in use by
  979. * multiple threads. The filter should not be changed by one
  980. * thread while another thread may be transliterating.
  981. * @param adoptedFilter the new filter to be adopted.
  982. * @stable ICU 2.0
  983. */
  984. void adoptFilter(UnicodeFilter* adoptedFilter);
  985. /**
  986. * Returns this transliterator's inverse. See the class
  987. * documentation for details. This implementation simply inverts
  988. * the two entities in the ID and attempts to retrieve the
  989. * resulting transliterator. That is, if <code>getID()</code>
  990. * returns "A-B", then this method will return the result of
  991. * <code>createInstance("B-A")</code>, or <code>null</code> if that
  992. * call fails.
  993. *
  994. * <p>Subclasses with knowledge of their inverse may wish to
  995. * override this method.
  996. *
  997. * @param status Output param to be filled in with a success or an error.
  998. * @return a transliterator that is an inverse, not necessarily
  999. * exact, of this transliterator, or <code>null</code> if no such
  1000. * transliterator is registered.
  1001. * @see #registerInstance
  1002. * @stable ICU 2.0
  1003. */
  1004. Transliterator* createInverse(UErrorCode& status) const;
  1005. /**
  1006. * Returns a <code>Transliterator</code> object given its ID.
  1007. * The ID must be either a system transliterator ID or an ID registered
  1008. * using <code>registerInstance()</code>.
  1009. *
  1010. * @param ID a valid ID, as enumerated by <code>getAvailableIDs()</code>
  1011. * @param dir either FORWARD or REVERSE.
  1012. * @param parseError Struct to receive information on position
  1013. * of error if an error is encountered
  1014. * @param status Output param to be filled in with a success or an error.
  1015. * @return A <code>Transliterator</code> object with the given ID
  1016. * @see #registerInstance
  1017. * @see #getAvailableIDs
  1018. * @see #getID
  1019. * @stable ICU 2.0
  1020. */
  1021. static Transliterator* U_EXPORT2 createInstance(const UnicodeString& ID,
  1022. UTransDirection dir,
  1023. UParseError& parseError,
  1024. UErrorCode& status);
  1025. /**
  1026. * Returns a <code>Transliterator</code> object given its ID.
  1027. * The ID must be either a system transliterator ID or an ID registered
  1028. * using <code>registerInstance()</code>.
  1029. * @param ID a valid ID, as enumerated by <code>getAvailableIDs()</code>
  1030. * @param dir either FORWARD or REVERSE.
  1031. * @param status Output param to be filled in with a success or an error.
  1032. * @return A <code>Transliterator</code> object with the given ID
  1033. * @stable ICU 2.0
  1034. */
  1035. static Transliterator* U_EXPORT2 createInstance(const UnicodeString& ID,
  1036. UTransDirection dir,
  1037. UErrorCode& status);
  1038. /**
  1039. * Returns a <code>Transliterator</code> object constructed from
  1040. * the given rule string. This will be a rule-based Transliterator,
  1041. * if the rule string contains only rules, or a
  1042. * compound Transliterator, if it contains ID blocks, or a
  1043. * null Transliterator, if it contains ID blocks which parse as
  1044. * empty for the given direction.
  1045. *
  1046. * @param ID the id for the transliterator.
  1047. * @param rules rules, separated by ';'
  1048. * @param dir either FORWARD or REVERSE.
  1049. * @param parseError Struct to receive information on position
  1050. * of error if an error is encountered
  1051. * @param status Output param set to success/failure code.
  1052. * @return a newly created Transliterator
  1053. * @stable ICU 2.0
  1054. */
  1055. static Transliterator* U_EXPORT2 createFromRules(const UnicodeString& ID,
  1056. const UnicodeString& rules,
  1057. UTransDirection dir,
  1058. UParseError& parseError,
  1059. UErrorCode& status);
  1060. /**
  1061. * Create a rule string that can be passed to createFromRules()
  1062. * to recreate this transliterator.
  1063. * @param result the string to receive the rules. Previous
  1064. * contents will be deleted.
  1065. * @param escapeUnprintable if true then convert unprintable
  1066. * characters to their hex escape representations, \\uxxxx or
  1067. * \\Uxxxxxxxx. Unprintable characters are those other than
  1068. * U+000A, U+0020..U+007E.
  1069. * @stable ICU 2.0
  1070. */
  1071. virtual UnicodeString& toRules(UnicodeString& result,
  1072. UBool escapeUnprintable) const;
  1073. /**
  1074. * Return the number of elements that make up this transliterator.
  1075. * For example, if the transliterator "NFD;Jamo-Latin;Latin-Greek"
  1076. * were created, the return value of this method would be 3.
  1077. *
  1078. * <p>If this transliterator is not composed of other
  1079. * transliterators, then this method returns 1.
  1080. * @return the number of transliterators that compose this
  1081. * transliterator, or 1 if this transliterator is not composed of
  1082. * multiple transliterators
  1083. * @stable ICU 3.0
  1084. */
  1085. int32_t countElements() const;
  1086. /**
  1087. * Return an element that makes up this transliterator. For
  1088. * example, if the transliterator "NFD;Jamo-Latin;Latin-Greek"
  1089. * were created, the return value of this method would be one
  1090. * of the three transliterator objects that make up that
  1091. * transliterator: [NFD, Jamo-Latin, Latin-Greek].
  1092. *
  1093. * <p>If this transliterator is not composed of other
  1094. * transliterators, then this method will return a reference to
  1095. * this transliterator when given the index 0.
  1096. * @param index a value from 0..countElements()-1 indicating the
  1097. * transliterator to return
  1098. * @param ec input-output error code
  1099. * @return one of the transliterators that makes up this
  1100. * transliterator, if this transliterator is made up of multiple
  1101. * transliterators, otherwise a reference to this object if given
  1102. * an index of 0
  1103. * @stable ICU 3.0
  1104. */
  1105. const Transliterator& getElement(int32_t index, UErrorCode& ec) const;
  1106. /**
  1107. * Returns the set of all characters that may be modified in the
  1108. * input text by this Transliterator. This incorporates this
  1109. * object's current filter; if the filter is changed, the return
  1110. * value of this function will change. The default implementation
  1111. * returns an empty set. Some subclasses may override
  1112. * {@link #handleGetSourceSet } to return a more precise result. The
  1113. * return result is approximate in any case and is intended for
  1114. * use by tests, tools, or utilities.
  1115. * @param result receives result set; previous contents lost
  1116. * @return a reference to result
  1117. * @see #getTargetSet
  1118. * @see #handleGetSourceSet
  1119. * @stable ICU 2.4
  1120. */
  1121. UnicodeSet& getSourceSet(UnicodeSet& result) const;
  1122. /**
  1123. * Framework method that computes the set of all characters that
  1124. * may be modified in the input text by this Transliterator,
  1125. * ignoring the effect of this object's filter. The base class
  1126. * implementation produces the empty set. Subclasses that wish to
  1127. * implement this should override this method.
  1128. * This method has no return value; the set of characters that this
  1129. * transliterator may modify is delivered through the
  1130. * <code>result</code> output parameter.
  1131. * @param result receives result set; previous contents lost
  1132. * @see #getSourceSet
  1133. * @see #getTargetSet
  1134. * @stable ICU 2.4
  1135. */
  1136. virtual void handleGetSourceSet(UnicodeSet& result) const;
  1137. /**
  1138. * Returns the set of all characters that may be generated as
  1139. * replacement text by this transliterator. The default
  1140. * implementation returns the empty set. Some subclasses may
  1141. * override this method to return a more precise result. The
  1142. * return result is approximate in any case and is intended for
  1143. * use by tests, tools, or utilities requiring such
  1144. * meta-information.
  1145. * @param result receives result set; previous contents lost
  1146. * @return a reference to result
  1147. * @see #getSourceSet
  1148. * @stable ICU 2.4
  1149. */
  1150. virtual UnicodeSet& getTargetSet(UnicodeSet& result) const;
  1151. public:
  1152. /**
  1153. * Registers a factory function that creates transliterators of
  1154. * a given ID.
  1155. *
  1156. * Because ICU may choose to cache Transliterators internally, this must
  1157. * be called at application startup, prior to any calls to
  1158. * Transliterator::createXXX to avoid undefined behavior.
  1159. *
  1160. * @param id the ID being registered
  1161. * @param factory a function pointer that will be copied and
  1162. * called later when the given ID is passed to createInstance()
  1163. * @param context a context pointer that will be stored and
  1164. * later passed to the factory function when an ID matching
  1165. * the registration ID is being instantiated with this factory.
  1166. * @stable ICU 2.0
  1167. */
  1168. static void U_EXPORT2 registerFactory(const UnicodeString& id,
  1169. Factory factory,
  1170. Token context);
  1171. /**
  1172. * Registers an instance <tt>obj</tt> of a subclass of
  1173. * <code>Transliterator</code> with the system. When
  1174. * <tt>createInstance()</tt> is called with an ID string that is
  1175. * equal to <tt>obj->getID()</tt>, then <tt>obj->clone()</tt> is
  1176. * returned.
  1177. *
  1178. * After this call the Transliterator class owns the adoptedObj
  1179. * and will delete it.
  1180. *
  1181. * Because ICU may choose to cache Transliterators internally, this must
  1182. * be called at application startup, prior to any calls to
  1183. * Transliterator::createXXX to avoid undefined behavior.
  1184. *
  1185. * @param adoptedObj an instance of subclass of
  1186. * <code>Transliterator</code> that defines <tt>clone()</tt>
  1187. * @see #createInstance
  1188. * @see #registerFactory
  1189. * @see #unregister
  1190. * @stable ICU 2.0
  1191. */
  1192. static void U_EXPORT2 registerInstance(Transliterator* adoptedObj);
  1193. /**
  1194. * Registers an ID string as an alias of another ID string.
  1195. * That is, after calling this function, <tt>createInstance(aliasID)</tt>
  1196. * will return the same thing as <tt>createInstance(realID)</tt>.
  1197. * This is generally used to create shorter, more mnemonic aliases
  1198. * for long compound IDs.
  1199. *
  1200. * @param aliasID The new ID being registered.
  1201. * @param realID The ID that the new ID is to be an alias for.
  1202. * This can be a compound ID and can include filters and should
  1203. * refer to transliterators that have already been registered with
  1204. * the framework, although this isn't checked.
  1205. * @stable ICU 3.6
  1206. */
  1207. static void U_EXPORT2 registerAlias(const UnicodeString& aliasID,
  1208. const UnicodeString& realID);
  1209. protected:
  1210. #ifndef U_HIDE_INTERNAL_API
  1211. /**
        * Internal helper that registers a factory function under an ID.
        * NOTE(review): presumably the worker behind registerFactory() -
        * confirm against the implementation.
        *
  1212. * @param id the ID being registered
  1213. * @param factory a function pointer that will be copied and
  1214. * called later when the given ID is passed to createInstance()
  1215. * @param context a context Token that will be stored and
  1216. * later passed to the factory function when an ID matching
  1217. * the registration ID is being instantiated with this factory.
  1218. * @internal
  1219. */
  1220. static void _registerFactory(const UnicodeString& id,
  1221. Factory factory,
  1222. Token context);
  1223. /**
        * Internal helper that registers a Transliterator instance.
        * NOTE(review): presumably the worker behind registerInstance(), in
        * which case adoptedObj is owned by the framework afterwards -
        * confirm against the implementation.
        * @param adoptedObj the instance being registered
  1224. * @internal
  1225. */
  1226. static void _registerInstance(Transliterator* adoptedObj);
  1227. /**
        * Internal helper that registers one ID string as an alias of another.
        * NOTE(review): presumably the worker behind registerAlias() -
        * confirm against the implementation.
        * @param aliasID the new ID being registered
        * @param realID the ID that aliasID is to be an alias for
  1228. * @internal
  1229. */
  1230. static void _registerAlias(const UnicodeString& aliasID, const UnicodeString& realID);
  1231. /**
  1232. * Register two targets as being inverses of one another. For
  1233. * example, calling _registerSpecialInverse("NFC", "NFD", true) causes
  1234. * Transliterator to form the following inverse relationships:
  1235. *
  1236. * <pre>NFC => NFD
  1237. * Any-NFC => Any-NFD
  1238. * NFD => NFC
  1239. * Any-NFD => Any-NFC</pre>
  1240. *
  1241. * (Without the special inverse registration, the inverse of NFC
  1242. * would be NFC-Any.) Note that NFD is shorthand for Any-NFD, but
  1243. * that the presence or absence of "Any-" is preserved.
  1244. *
  1245. * <p>The relationship is symmetrical; registering (a, b) is
  1246. * equivalent to registering (b, a).
  1247. *
  1248. * <p>The relevant IDs must still be registered separately as
  1249. * factories or classes.
  1250. *
  1251. * <p>Only the targets are specified. Special inverses always
  1252. * have the form Any-Target1 <=> Any-Target2. The target should
  1253. * have canonical casing (the casing desired to be produced when
  1254. * an inverse is formed) and should contain no whitespace or other
  1255. * extraneous characters.
  1256. *
  1257. * @param target the target against which to register the inverse
  1258. * @param inverseTarget the inverse of target, that is
  1259. * Any-target.getInverse() => Any-inverseTarget
  1260. * @param bidirectional if true, register the reverse relation
  1261. * as well, that is, Any-inverseTarget.getInverse() => Any-target
  1262. * @internal
  1263. */
  1264. static void _registerSpecialInverse(const UnicodeString& target,
  1265. const UnicodeString& inverseTarget,
  1266. UBool bidirectional);
  1267. #endif /* U_HIDE_INTERNAL_API */
  1268. public:
  1269. /**
  1270. * Unregisters a transliterator or class. This may be either
  1271. * a system transliterator or a user transliterator or class.
  1272. * Any attempt to construct an unregistered transliterator based
  1273. * on its ID will fail.
  1274. *
  1275. * Because ICU may choose to cache Transliterators internally, this should
  1276. * be called during application shutdown, after all calls to
  1277. * Transliterator::createXXX to avoid undefined behavior.
  1278. *
  1279. * This function returns nothing; whatever was registered under
  1280. * <code>ID</code> is not handed back to the caller.
  1281. * @param ID the ID of the transliterator or class
  1282. * @see #registerInstance
  1283. * @see #registerFactory
  1284. * @stable ICU 2.0
  1285. */
  1286. static void U_EXPORT2 unregister(const UnicodeString& ID);
  1287. public:
  1288. /**
  1289. * Return a StringEnumeration over the IDs available at the time of the
  1290. * call, including user-registered IDs.
  1291. * @param ec input-output error code
  1292. * @return a newly-created StringEnumeration over the transliterators
  1293. * available at the time of the call. The caller should delete this object
  1294. * when done using it.
  1295. * @stable ICU 3.0
  1296. */
  1297. static StringEnumeration* U_EXPORT2 getAvailableIDs(UErrorCode& ec);
  1298. /**
  1299. * Return the number of registered source specifiers.
  1300. * @return the number of registered source specifiers.
  1301. * @stable ICU 2.0
  1302. */
  1303. static int32_t U_EXPORT2 countAvailableSources();
  1304. /**
  1305. * Return a registered source specifier.
  1306. * @param index which specifier to return, from 0 to n-1, where
  1307. * n = countAvailableSources()
  1308. * @param result fill-in parameter to receive the source specifier.
  1309. * If index is out of range, result will be empty.
  1310. * @return reference to result
  1311. * @stable ICU 2.0
  1312. */
  1313. static UnicodeString& U_EXPORT2 getAvailableSource(int32_t index,
  1314. UnicodeString& result);
  1315. /**
  1316. * Return the number of registered target specifiers for a given
  1317. * source specifier.
  1318. * @param source the given source specifier.
  1319. * @return the number of registered target specifiers for a given
  1320. * source specifier.
  1321. * @stable ICU 2.0
  1322. */
  1323. static int32_t U_EXPORT2 countAvailableTargets(const UnicodeString& source);
  1324. /**
  1325. * Return a registered target specifier for a given source.
  1326. * @param index which specifier to return, from 0 to n-1, where
  1327. * n = countAvailableTargets(source)
  1328. * @param source the source specifier
  1329. * @param result fill-in parameter to receive the target specifier.
  1330. * If source is invalid or if index is out of range, result will
  1331. * be empty.
  1332. * @return reference to result
  1333. * @stable ICU 2.0
  1334. */
  1335. static UnicodeString& U_EXPORT2 getAvailableTarget(int32_t index,
  1336. const UnicodeString& source,
  1337. UnicodeString& result);
  1338. /**
  1339. * Return the number of registered variant specifiers for a given
  1340. * source-target pair.
  1341. * @param source the source specifier.
  1342. * @param target the target specifier.
        * @return the number of registered variant specifiers for the
        * given source-target pair.
  1343. * @stable ICU 2.0
  1344. */
  1345. static int32_t U_EXPORT2 countAvailableVariants(const UnicodeString& source,
  1346. const UnicodeString& target);
  1347. /**
  1348. * Return a registered variant specifier for a given source-target
  1349. * pair.
  1350. * @param index which specifier to return, from 0 to n-1, where
  1351. * n = countAvailableVariants(source, target)
  1352. * @param source the source specifier
  1353. * @param target the target specifier
  1354. * @param result fill-in parameter to receive the variant
  1355. * specifier. If source is invalid or if target is invalid or if
  1356. * index is out of range, result will be empty.
  1357. * @return reference to result
  1358. * @stable ICU 2.0
  1359. */
  1360. static UnicodeString& U_EXPORT2 getAvailableVariant(int32_t index,
  1361. const UnicodeString& source,
  1362. const UnicodeString& target,
  1363. UnicodeString& result);
  1364. protected:
  1365. #ifndef U_HIDE_INTERNAL_API
  1366. /**
  1367. * Non-mutexed internal method.
        * NOTE(review): presumably the counterpart of countAvailableSources()
        * that does no locking of its own - confirm.
  1368. * @internal
  1369. */
  1370. static int32_t _countAvailableSources();
  1371. /**
  1372. * Non-mutexed internal method.
        * NOTE(review): presumably the counterpart of getAvailableSource()
        * that does no locking of its own - confirm.
  1373. * @internal
  1374. */
  1375. static UnicodeString& _getAvailableSource(int32_t index,
  1376. UnicodeString& result);
  1377. /**
  1378. * Non-mutexed internal method.
        * NOTE(review): presumably the counterpart of countAvailableTargets()
        * that does no locking of its own - confirm.
  1379. * @internal
  1380. */
  1381. static int32_t _countAvailableTargets(const UnicodeString& source);
  1382. /**
  1383. * Non-mutexed internal method.
        * NOTE(review): presumably the counterpart of getAvailableTarget()
        * that does no locking of its own - confirm.
  1384. * @internal
  1385. */
  1386. static UnicodeString& _getAvailableTarget(int32_t index,
  1387. const UnicodeString& source,
  1388. UnicodeString& result);
  1389. /**
  1390. * Non-mutexed internal method.
        * NOTE(review): presumably the counterpart of countAvailableVariants()
        * that does no locking of its own - confirm.
  1391. * @internal
  1392. */
  1393. static int32_t _countAvailableVariants(const UnicodeString& source,
  1394. const UnicodeString& target);
  1395. /**
  1396. * Non-mutexed internal method.
        * NOTE(review): presumably the counterpart of getAvailableVariant()
        * that does no locking of its own - confirm.
  1397. * @internal
  1398. */
  1399. static UnicodeString& _getAvailableVariant(int32_t index,
  1400. const UnicodeString& source,
  1401. const UnicodeString& target,
  1402. UnicodeString& result);
  1403. #endif /* U_HIDE_INTERNAL_API */
  1404. protected:
  1405. /**
  1406. * Set the ID of this transliterator. Subclasses shouldn't do
  1407. * this, unless the underlying script behavior has changed.
  1408. * @param id the new ID to be set.
  1409. * @stable ICU 2.4
  1410. */
  1411. void setID(const UnicodeString& id);
  1412. public:
  1413. /**
  1414. * Return the class ID for this class. This is useful only for
  1415. * comparing to a return value from getDynamicClassID().
  1416. * Note that Transliterator is an abstract base class, and therefore
  1417. * no fully constructed object will have a dynamic
  1418. * UClassID that equals the UClassID returned from
  1419. * Transliterator::getStaticClassID().
  1420. * @return The class ID for class Transliterator.
  1421. * @stable ICU 2.0
  1422. */
  1423. static UClassID U_EXPORT2 getStaticClassID();
  1424. /**
  1425. * Returns a unique class ID <b>polymorphically</b>. This method
  1426. * is to implement a simple version of RTTI, since not all C++
  1427. * compilers support genuine RTTI. Polymorphic operator==() and
  1428. * clone() methods call this method.
  1429. *
  1430. * <p>Concrete subclasses of Transliterator must use the
  1431. * UOBJECT_DEFINE_RTTI_IMPLEMENTATION macro from
  1432. * uobject.h to provide the RTTI functions.
  1433. *
  1434. * @return The class ID for this object. All objects of a given
  1435. * class have the same class ID. Objects of other classes have
  1436. * different class IDs.
  1437. * @stable ICU 2.0
  1438. */
  1439. virtual UClassID getDynamicClassID() const override = 0;
  1440. private:
  // NOTE(review): undocumented private helper. Going by its name and
  // signature it presumably sets up the transliterator registry and reports
  // failure through status; the meaning of the UBool result is not visible
  // here - confirm against the implementation.
  1441. static UBool initializeRegistry(UErrorCode &status);
  1442. public:
  1443. #ifndef U_HIDE_OBSOLETE_API
  1444. /**
  1445. * Return the number of IDs currently registered with the system.
  1446. * To retrieve the actual IDs, call getAvailableID(i) with
  1447. * i from 0 to countAvailableIDs() - 1.
  1448. * @return the number of IDs currently registered with the system.
  1449. * @obsolete ICU 3.4 use getAvailableIDs() instead
  1450. */
  1451. static int32_t U_EXPORT2 countAvailableIDs();
  1452. /**
  1453. * Return the index-th available ID. index must be between 0
  1454. * and countAvailableIDs() - 1, inclusive. If index is out of
  1455. * range, the result of getAvailableID(0) is returned.
  1456. * @param index the given ID index.
  1457. * @return the index-th available ID. index must be between 0
  1458. * and countAvailableIDs() - 1, inclusive. If index is out of
  1459. * range, the result of getAvailableID(0) is returned.
  1460. * @obsolete ICU 3.4 use getAvailableIDs() instead; this function
  1461. * is not thread safe, since it returns a reference to storage that
  1462. * may become invalid if another thread calls unregister
  1463. */
  1464. static const UnicodeString& U_EXPORT2 getAvailableID(int32_t index);
  1465. #endif /* U_HIDE_OBSOLETE_API */
  1466. };
  1467. inline int32_t Transliterator::getMaximumContextLength() const {
  1468. return maximumContextLength;
  1469. }
  1470. inline void Transliterator::setID(const UnicodeString& id) {
  1471. ID = id;
  1472. // NUL-terminate the ID string, which is a non-aliased copy.
  1473. ID.append(static_cast<char16_t>(0));
  1474. ID.truncate(ID.length()-1);
  1475. }
  1476. #ifndef U_HIDE_INTERNAL_API
  1477. inline Transliterator::Token Transliterator::integerToken(int32_t i) {
  1478. Token t;
  1479. t.integer = i;
  1480. return t;
  1481. }
  1482. inline Transliterator::Token Transliterator::pointerToken(void* p) {
  1483. Token t;
  1484. t.pointer = p;
  1485. return t;
  1486. }
  1487. #endif /* U_HIDE_INTERNAL_API */
  1488. U_NAMESPACE_END
  1489. #endif /* #if !UCONFIG_NO_TRANSLITERATION */
  1490. #endif /* U_SHOW_CPLUSPLUS_API */
  1491. #endif