normalizer2.h 35 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. *******************************************************************************
  5. *
  6. * Copyright (C) 2009-2013, International Business Machines
  7. * Corporation and others. All Rights Reserved.
  8. *
  9. *******************************************************************************
  10. * file name: normalizer2.h
  11. * encoding: UTF-8
  12. * tab size: 8 (not used)
  13. * indentation:4
  14. *
  15. * created on: 2009nov22
  16. * created by: Markus W. Scherer
  17. */
  18. #ifndef __NORMALIZER2_H__
  19. #define __NORMALIZER2_H__
  20. /**
  21. * \file
  22. * \brief C++ API: New API for Unicode Normalization.
  23. */
  24. #include "unicode/utypes.h"
  25. #if U_SHOW_CPLUSPLUS_API
  26. #if !UCONFIG_NO_NORMALIZATION
  27. #include "unicode/stringpiece.h"
  28. #include "unicode/uniset.h"
  29. #include "unicode/unistr.h"
  30. #include "unicode/unorm2.h"
  31. U_NAMESPACE_BEGIN
  32. class ByteSink;
  33. /**
  34. * Unicode normalization functionality for standard Unicode normalization or
  35. * for using custom mapping tables.
  36. * All instances of this class are unmodifiable/immutable.
  37. * Instances returned by getInstance() are singletons that must not be deleted by the caller.
  38. * The Normalizer2 class is not intended for public subclassing.
  39. *
  40. * The primary functions are to produce a normalized string and to detect whether
  41. * a string is already normalized.
  42. * The most commonly used normalization forms are those defined in
  43. * http://www.unicode.org/unicode/reports/tr15/
  44. * However, this API supports additional normalization forms for specialized purposes.
  45. * For example, NFKC_Casefold is provided via getInstance(nullptr, "nfkc_cf", UNORM2_COMPOSE, errorCode)
  46. * and can be used in implementations of UTS #46.
  47. *
  48. * Not only are the standard compose and decompose modes supplied,
  49. * but additional modes are provided as documented in the UNormalization2Mode enum.
  50. *
  51. * Some of the functions in this class identify normalization boundaries.
  52. * At a normalization boundary, the portions of the string
  53. * before it and starting from it do not interact and can be handled independently.
  54. *
  55. * The spanQuickCheckYes() stops at a normalization boundary.
  56. * When the goal is a normalized string, then the text before the boundary
  57. * can be copied, and the remainder can be processed with normalizeSecondAndAppend().
  58. *
  59. * The hasBoundaryBefore(), hasBoundaryAfter() and isInert() functions test whether
  60. * a character is guaranteed to be at a normalization boundary,
  61. * regardless of context.
  62. * This is used for moving from one normalization boundary to the next
  63. * or preceding boundary, and for performing iterative normalization.
  64. *
  65. * Iterative normalization is useful when only a small portion of a
  66. * longer string needs to be processed.
  67. * For example, in ICU, iterative normalization is used by the NormalizationTransliterator
  68. * (to avoid replacing already-normalized text) and ucol_nextSortKeyPart()
  69. * (to process only the substring for which sort key bytes are computed).
  70. *
  71. * The set of normalization boundaries returned by these functions may not be
  72. * complete: There may be more boundaries that could be returned.
  73. * Different functions may return different boundaries.
  74. * @stable ICU 4.4
  75. */
  76. class U_COMMON_API Normalizer2 : public UObject {
  77. public:
  78. /**
  79. * Destructor. Singleton instances returned by the static getInstance()-style functions must not be deleted by the caller.
  80. * @stable ICU 4.4
  81. */
  82. ~Normalizer2();
  83. /**
  84. * Returns a Normalizer2 instance for Unicode NFC normalization.
  85. * Same as getInstance(nullptr, "nfc", UNORM2_COMPOSE, errorCode).
  86. * Returns an unmodifiable singleton instance. Do not delete it.
  87. * @param errorCode Standard ICU error code. Its input value must
  88. * pass the U_SUCCESS() test, or else the function returns
  89. * immediately. Check for U_FAILURE() on output or use with
  90. * function chaining. (See User Guide for details.)
  91. * @return the requested Normalizer2, if successful
  92. * @stable ICU 49
  93. */
  94. static const Normalizer2 *
  95. getNFCInstance(UErrorCode &errorCode);
  96. /**
  97. * Returns a Normalizer2 instance for Unicode NFD normalization.
  98. * Same as getInstance(nullptr, "nfc", UNORM2_DECOMPOSE, errorCode); the name "nfc" is intentional, NFD uses the "nfc" data.
  99. * Returns an unmodifiable singleton instance. Do not delete it.
  100. * @param errorCode Standard ICU error code. Its input value must
  101. * pass the U_SUCCESS() test, or else the function returns
  102. * immediately. Check for U_FAILURE() on output or use with
  103. * function chaining. (See User Guide for details.)
  104. * @return the requested Normalizer2, if successful
  105. * @stable ICU 49
  106. */
  107. static const Normalizer2 *
  108. getNFDInstance(UErrorCode &errorCode);
  109. /**
  110. * Returns a Normalizer2 instance for Unicode NFKC normalization.
  111. * Same as getInstance(nullptr, "nfkc", UNORM2_COMPOSE, errorCode).
  112. * Returns an unmodifiable singleton instance. Do not delete it.
  113. * @param errorCode Standard ICU error code. Its input value must
  114. * pass the U_SUCCESS() test, or else the function returns
  115. * immediately. Check for U_FAILURE() on output or use with
  116. * function chaining. (See User Guide for details.)
  117. * @return the requested Normalizer2, if successful
  118. * @stable ICU 49
  119. */
  120. static const Normalizer2 *
  121. getNFKCInstance(UErrorCode &errorCode);
  122. /**
  123. * Returns a Normalizer2 instance for Unicode NFKD normalization.
  124. * Same as getInstance(nullptr, "nfkc", UNORM2_DECOMPOSE, errorCode); the name "nfkc" is intentional, NFKD uses the "nfkc" data.
  125. * Returns an unmodifiable singleton instance. Do not delete it.
  126. * @param errorCode Standard ICU error code. Its input value must
  127. * pass the U_SUCCESS() test, or else the function returns
  128. * immediately. Check for U_FAILURE() on output or use with
  129. * function chaining. (See User Guide for details.)
  130. * @return the requested Normalizer2, if successful
  131. * @stable ICU 49
  132. */
  133. static const Normalizer2 *
  134. getNFKDInstance(UErrorCode &errorCode);
  135. /**
  136. * Returns a Normalizer2 instance for Unicode toNFKC_Casefold() normalization
  137. * which is equivalent to applying the NFKC_Casefold mappings and then NFC.
  138. * See https://www.unicode.org/reports/tr44/#NFKC_Casefold
  139. *
  140. * Same as getInstance(nullptr, "nfkc_cf", UNORM2_COMPOSE, errorCode).
  141. * Returns an unmodifiable singleton instance. Do not delete it.
  142. * @param errorCode Standard ICU error code. Its input value must
  143. * pass the U_SUCCESS() test, or else the function returns
  144. * immediately. Check for U_FAILURE() on output or use with
  145. * function chaining. (See User Guide for details.)
  146. * @return the requested Normalizer2, if successful
  147. * @stable ICU 49
  148. */
  149. static const Normalizer2 *
  150. getNFKCCasefoldInstance(UErrorCode &errorCode);
  151. /**
  152. * Returns a Normalizer2 instance for a variant of Unicode toNFKC_Casefold() normalization
  153. * which is equivalent to applying the NFKC_Simple_Casefold mappings and then NFC.
  154. * See https://www.unicode.org/reports/tr44/#NFKC_Simple_Casefold
  155. *
  156. * Same as getInstance(nullptr, "nfkc_scf", UNORM2_COMPOSE, errorCode).
  157. * Returns an unmodifiable singleton instance. Do not delete it.
  158. * @param errorCode Standard ICU error code. Its input value must
  159. * pass the U_SUCCESS() test, or else the function returns
  160. * immediately. Check for U_FAILURE() on output or use with
  161. * function chaining. (See User Guide for details.)
  162. * @return the requested Normalizer2, if successful
  163. * @stable ICU 74
  164. */
  165. static const Normalizer2 *
  166. getNFKCSimpleCasefoldInstance(UErrorCode &errorCode);
  167. /**
  168. * Returns a Normalizer2 instance which uses the specified data file
  169. * (packageName/name similar to ucnv_openPackage() and ures_open()/ResourceBundle)
  170. * and which composes or decomposes text according to the specified mode.
  171. * Returns an unmodifiable singleton instance. Do not delete it.
  172. *
  173. * Use packageName=nullptr for data files that are part of ICU's own data.
  174. * Use name="nfc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFC/NFD.
  175. * Use name="nfkc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFKC/NFKD.
  176. * Use name="nfkc_cf" (or "nfkc_scf") and UNORM2_COMPOSE for NFKC_CF=NFKC_Casefold (or NFKC_SCF=NFKC_Simple_Casefold).
  177. *
  178. * @param packageName nullptr for ICU built-in data, otherwise application data package name
  179. * @param name "nfc" or "nfkc" or "nfkc_cf" or "nfkc_scf" or name of custom data file
  180. * @param mode normalization mode (compose or decompose etc.)
  181. * @param errorCode Standard ICU error code. Its input value must
  182. * pass the U_SUCCESS() test, or else the function returns
  183. * immediately. Check for U_FAILURE() on output or use with
  184. * function chaining. (See User Guide for details.)
  185. * @return the requested Normalizer2, if successful
  186. * @stable ICU 4.4
  187. */
  188. static const Normalizer2 *
  189. getInstance(const char *packageName,
  190. const char *name,
  191. UNormalization2Mode mode,
  192. UErrorCode &errorCode);
  193. /**
  194. * Returns the normalized form of the source string.
  195. * Convenience overload: creates a new string and fills it via normalize(src, dest, errorCode).
  196. * @param errorCode Standard ICU error code. Its input value must
  197. * pass the U_SUCCESS() test, or else the function returns
  198. * immediately. Check for U_FAILURE() on output or use with
  199. * function chaining. (See User Guide for details.)
  200. * @return normalized src, as a new string object
  201. * @stable ICU 4.4
  202. */
  203. UnicodeString
  204. normalize(const UnicodeString &src, UErrorCode &errorCode) const {
  205. UnicodeString result;
  206. normalize(src, result, errorCode);  // delegates to the pure virtual three-argument overload
  207. return result;
  208. }
  209. /**
  210. * Writes the normalized form of the source string to the destination string
  211. * (replacing its contents) and returns the destination string.
  212. * The source and destination strings must be different objects.
  213. * @param src source string
  214. * @param dest destination string; its contents are replaced with normalized src
  215. * @param errorCode Standard ICU error code. Its input value must
  216. * pass the U_SUCCESS() test, or else the function returns
  217. * immediately. Check for U_FAILURE() on output or use with
  218. * function chaining. (See User Guide for details.)
  219. * @return dest
  220. * @stable ICU 4.4
  221. */
  222. virtual UnicodeString &
  223. normalize(const UnicodeString &src,
  224. UnicodeString &dest,
  225. UErrorCode &errorCode) const = 0;
  226. /**
  227. * Normalizes a UTF-8 string and optionally records how source substrings
  228. * relate to changed and unchanged result substrings.
  229. *
  230. * Implemented completely for all built-in modes except for FCD.
  231. * The base class implementation converts to & from UTF-16 and does not support edits.
  232. *
  233. * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
  234. * @param src Source UTF-8 string.
  235. * @param sink A ByteSink to which the normalized UTF-8 result string is written.
  236. * sink.Flush() is called at the end.
  237. * @param edits Records edits for index mapping, working with styled text,
  238. * and getting only changes (if any).
  239. * The contents of edits are undefined if any error occurs.
  240. * This function calls edits->reset() first unless
  241. * options includes U_EDITS_NO_RESET. edits can be nullptr.
  242. * @param errorCode Standard ICU error code. Its input value must
  243. * pass the U_SUCCESS() test, or else the function returns
  244. * immediately. Check for U_FAILURE() on output or use with
  245. * function chaining. (See User Guide for details.)
  246. * @stable ICU 60
  247. */
  248. virtual void
  249. normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
  250. Edits *edits, UErrorCode &errorCode) const;
  251. /**
  252. * Appends the normalized form of the second string to the first string
  253. * (merging them at the boundary) and returns the first string.
  254. * The result is normalized if the first string was normalized.
  255. * The first and second strings must be different objects.
  256. * @param first string, should be normalized
  257. * @param second string, will be normalized
  258. * @param errorCode Standard ICU error code. Its input value must
  259. * pass the U_SUCCESS() test, or else the function returns
  260. * immediately. Check for U_FAILURE() on output or use with
  261. * function chaining. (See User Guide for details.)
  262. * @return first
  263. * @stable ICU 4.4
  264. */
  265. virtual UnicodeString &
  266. normalizeSecondAndAppend(UnicodeString &first,
  267. const UnicodeString &second,
  268. UErrorCode &errorCode) const = 0;
  269. /**
  270. * Appends the second string to the first string
  271. * (merging them at the boundary) and returns the first string.
  272. * The result is normalized if both the strings were normalized.
  273. * The first and second strings must be different objects.
  274. * @param first string, should be normalized
  275. * @param second string, should be normalized
  276. * @param errorCode Standard ICU error code. Its input value must
  277. * pass the U_SUCCESS() test, or else the function returns
  278. * immediately. Check for U_FAILURE() on output or use with
  279. * function chaining. (See User Guide for details.)
  280. * @return first
  281. * @stable ICU 4.4
  282. */
  283. virtual UnicodeString &
  284. append(UnicodeString &first,
  285. const UnicodeString &second,
  286. UErrorCode &errorCode) const = 0;
  287. /**
  288. * Gets the decomposition mapping of c.
  289. * Roughly equivalent to normalizing the String form of c
  290. * on a UNORM2_DECOMPOSE Normalizer2 instance, but much faster, and except that this function
  291. * returns false and does not write a string
  292. * if c does not have a decomposition mapping in this instance's data.
  293. * This function is independent of the mode of the Normalizer2.
  294. * @param c code point
  295. * @param decomposition String object which will be set to c's
  296. * decomposition mapping, if there is one.
  297. * @return true if c has a decomposition, otherwise false
  298. * @stable ICU 4.6
  299. */
  300. virtual UBool
  301. getDecomposition(UChar32 c, UnicodeString &decomposition) const = 0;
  302. /**
  303. * Gets the raw decomposition mapping of c.
  304. *
  305. * This is similar to the getDecomposition() method but returns the
  306. * raw decomposition mapping as specified in UnicodeData.txt or
  307. * (for custom data) in the mapping files processed by the gennorm2 tool.
  308. * By contrast, getDecomposition() returns the processed,
  309. * recursively-decomposed version of this mapping.
  310. *
  311. * When used on a standard NFKC Normalizer2 instance,
  312. * getRawDecomposition() returns the Unicode Decomposition_Mapping (dm) property.
  313. *
  314. * When used on a standard NFC Normalizer2 instance,
  315. * it returns the Decomposition_Mapping only if the Decomposition_Type (dt) is Canonical (Can);
  316. * in this case, the result contains either one or two code points (=1..4 char16_ts).
  317. *
  318. * This function is independent of the mode of the Normalizer2.
  319. * The default implementation returns false.
  320. * @param c code point
  321. * @param decomposition String object which will be set to c's
  322. * raw decomposition mapping, if there is one.
  323. * @return true if c has a decomposition, otherwise false
  324. * @stable ICU 49
  325. */
  326. virtual UBool
  327. getRawDecomposition(UChar32 c, UnicodeString &decomposition) const;
  328. /**
  329. * Performs pairwise composition of a & b and returns the composite if there is one.
  330. *
  331. * Returns a composite code point c only if c has a two-way mapping to a+b.
  332. * In standard Unicode normalization, this means that
  333. * c has a canonical decomposition to a+b
  334. * and c does not have the Full_Composition_Exclusion property.
  335. *
  336. * This function is independent of the mode of the Normalizer2.
  337. * The default implementation returns a negative value.
  338. * @param a A (normalization starter) code point.
  339. * @param b Another code point.
  340. * @return The non-negative composite code point if there is one; otherwise a negative value.
  341. * @stable ICU 49
  342. */
  343. virtual UChar32
  344. composePair(UChar32 a, UChar32 b) const;
  345. /**
  346. * Gets the combining class of c.
  347. * The default implementation returns 0
  348. * but all standard implementations return the Unicode Canonical_Combining_Class value.
  349. * @param c code point
  350. * @return c's combining class
  351. * @stable ICU 49
  352. */
  353. virtual uint8_t
  354. getCombiningClass(UChar32 c) const;
  355. /**
  356. * Tests if the string is normalized.
  357. * Internally, in cases where the quickCheck() method would return "maybe"
  358. * (which is only possible for the two COMPOSE modes) this method
  359. * resolves to "yes" or "no" to provide a definitive result,
  360. * at the cost of doing more work in those cases.
  361. * @param s input string
  362. * @param errorCode Standard ICU error code. Its input value must
  363. * pass the U_SUCCESS() test, or else the function returns
  364. * immediately. Check for U_FAILURE() on output or use with
  365. * function chaining. (See User Guide for details.)
  366. * @return true if s is normalized
  367. * @stable ICU 4.4
  368. */
  369. virtual UBool
  370. isNormalized(const UnicodeString &s, UErrorCode &errorCode) const = 0;
  371. /**
  372. * Tests if the UTF-8 string is normalized.
  373. * Internally, in cases where the quickCheck() method would return "maybe"
  374. * (which is only possible for the two COMPOSE modes) this method
  375. * resolves to "yes" or "no" to provide a definitive result,
  376. * at the cost of doing more work in those cases.
  377. *
  378. * This works for all normalization modes.
  379. * It is optimized for UTF-8 for all built-in modes except for FCD.
  380. * The base class implementation converts to UTF-16 and calls isNormalized().
  381. *
  382. * @param s UTF-8 input string
  383. * @param errorCode Standard ICU error code. Its input value must
  384. * pass the U_SUCCESS() test, or else the function returns
  385. * immediately. Check for U_FAILURE() on output or use with
  386. * function chaining. (See User Guide for details.)
  387. * @return true if s is normalized
  388. * @stable ICU 60
  389. */
  390. virtual UBool
  391. isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const;
  392. /**
  393. * Tests if the string is normalized.
  394. * For the two COMPOSE modes, the result could be "maybe" in cases that
  395. * would take a little more work to resolve definitively.
  396. * Use spanQuickCheckYes() and normalizeSecondAndAppend() for a faster
  397. * combination of quick check + normalization, to avoid
  398. * re-checking the "yes" prefix.
  399. * @param s input string
  400. * @param errorCode Standard ICU error code. Its input value must
  401. * pass the U_SUCCESS() test, or else the function returns
  402. * immediately. Check for U_FAILURE() on output or use with
  403. * function chaining. (See User Guide for details.)
  404. * @return UNormalizationCheckResult: "yes", "no", or (for the two COMPOSE modes only) "maybe"
  405. * @stable ICU 4.4
  406. */
  407. virtual UNormalizationCheckResult
  408. quickCheck(const UnicodeString &s, UErrorCode &errorCode) const = 0;
  409. /**
  410. * Returns the end of the normalized substring of the input string.
  411. * In other words, with <code>end=spanQuickCheckYes(s, ec);</code>
  412. * the substring <code>UnicodeString(s, 0, end)</code>
  413. * will pass the quick check with a "yes" result.
  414. *
  415. * The returned end index is usually one or more characters before the
  416. * "no" or "maybe" character: The end index is at a normalization boundary.
  417. * (See the class documentation for more about normalization boundaries.)
  418. *
  419. * When the goal is a normalized string and most input strings are expected
  420. * to be normalized already, then call this method,
  421. * and if it returns a prefix shorter than the input string,
  422. * copy that prefix and use normalizeSecondAndAppend() for the remainder.
  423. * @param s input string
  424. * @param errorCode Standard ICU error code. Its input value must
  425. * pass the U_SUCCESS() test, or else the function returns
  426. * immediately. Check for U_FAILURE() on output or use with
  427. * function chaining. (See User Guide for details.)
  428. * @return "yes" span end index
  429. * @stable ICU 4.4
  430. */
  431. virtual int32_t
  432. spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const = 0;
  433. /**
  434. * Tests if the character always has a normalization boundary before it,
  435. * regardless of context.
  436. * If true, then the character does not normalization-interact with
  437. * preceding characters.
  438. * In other words, a string containing this character can be normalized
  439. * by processing portions before this character and starting from this
  440. * character independently.
  441. * This is used for iterative normalization. See the class documentation for details.
  442. * @param c character to test
  443. * @return true if c has a normalization boundary before it
  444. * @stable ICU 4.4
  445. */
  446. virtual UBool hasBoundaryBefore(UChar32 c) const = 0;
  447. /**
  448. * Tests if the character always has a normalization boundary after it,
  449. * regardless of context.
  450. * If true, then the character does not normalization-interact with
  451. * following characters.
  452. * In other words, a string containing this character can be normalized
  453. * by processing portions up to this character and after this
  454. * character independently.
  455. * This is used for iterative normalization. See the class documentation for details.
  456. * Note that this operation may be significantly slower than hasBoundaryBefore().
  457. * @param c character to test
  458. * @return true if c has a normalization boundary after it
  459. * @stable ICU 4.4
  460. */
  461. virtual UBool hasBoundaryAfter(UChar32 c) const = 0;
  462. /**
  463. * Tests if the character is normalization-inert.
  464. * If true, then the character does not change, nor normalization-interact with
  465. * preceding or following characters.
  466. * In other words, a string containing this character can be normalized
  467. * by processing portions before this character and after this
  468. * character independently.
  469. * This is used for iterative normalization. See the class documentation for details.
  470. * Note that this operation may be significantly slower than hasBoundaryBefore().
  471. * @param c character to test
  472. * @return true if c is normalization-inert
  473. * @stable ICU 4.4
  474. */
  475. virtual UBool isInert(UChar32 c) const = 0;
  476. };
  477. /**
  478. * Normalization filtered by a UnicodeSet.
  479. * Normalizes portions of the text contained in the filter set and leaves
  480. * portions not contained in the filter set unchanged.
  481. * Filtering is done via UnicodeSet::span(..., USET_SPAN_SIMPLE).
  482. * Not-in-the-filter text is treated as "is normalized" and "quick check yes".
  483. * This class implements all of (and only) the Normalizer2 API.
  484. * An instance of this class is unmodifiable/immutable but is constructed and
  485. * must be destructed by the owner.
  486. * @stable ICU 4.4
  487. */
  488. class U_COMMON_API FilteredNormalizer2 : public Normalizer2 {
  489. public:
  490. /**
  491. * Constructs a filtered normalizer wrapping any Normalizer2 instance
  492. * and a filter set.
  493. * Both are aliased and must not be modified or deleted while this object
  494. * is used.
  495. * The filter set should be frozen; otherwise the performance will suffer greatly.
  496. * @param n2 wrapped Normalizer2 instance
  497. * @param filterSet UnicodeSet which determines the characters to be normalized
  498. * @stable ICU 4.4
  499. */
  500. FilteredNormalizer2(const Normalizer2 &n2, const UnicodeSet &filterSet) :
  501. norm2(n2), set(filterSet) {}
  502. /**
  503. * Destructor.
  504. * @stable ICU 4.4
  505. */
  506. ~FilteredNormalizer2();
  507. /**
  508. * Writes the normalized form of the source string to the destination string
  509. * (replacing its contents) and returns the destination string.
  510. * The source and destination strings must be different objects.
  511. * @param src source string
  512. * @param dest destination string; its contents is replaced with normalized src
  513. * @param errorCode Standard ICU error code. Its input value must
  514. * pass the U_SUCCESS() test, or else the function returns
  515. * immediately. Check for U_FAILURE() on output or use with
  516. * function chaining. (See User Guide for details.)
  517. * @return dest
  518. * @stable ICU 4.4
  519. */
  520. virtual UnicodeString &
  521. normalize(const UnicodeString &src,
  522. UnicodeString &dest,
  523. UErrorCode &errorCode) const override;
  524. /**
  525. * Normalizes a UTF-8 string and optionally records how source substrings
  526. * relate to changed and unchanged result substrings.
  527. *
  528. * Implemented completely for most built-in modes except for FCD.
  529. * The base class implementation converts to & from UTF-16 and does not support edits.
  530. *
  531. * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
  532. * @param src Source UTF-8 string.
  533. * @param sink A ByteSink to which the normalized UTF-8 result string is written.
  534. * sink.Flush() is called at the end.
  535. * @param edits Records edits for index mapping, working with styled text,
  536. * and getting only changes (if any).
  537. * The Edits contents is undefined if any error occurs.
  538. * This function calls edits->reset() first unless
  539. * options includes U_EDITS_NO_RESET. edits can be nullptr.
  540. * @param errorCode Standard ICU error code. Its input value must
  541. * pass the U_SUCCESS() test, or else the function returns
  542. * immediately. Check for U_FAILURE() on output or use with
  543. * function chaining. (See User Guide for details.)
  544. * @stable ICU 60
  545. */
  546. virtual void
  547. normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
  548. Edits *edits, UErrorCode &errorCode) const override;
  549. /**
  550. * Appends the normalized form of the second string to the first string
  551. * (merging them at the boundary) and returns the first string.
  552. * The result is normalized if the first string was normalized.
  553. * The first and second strings must be different objects.
  554. * @param first string, should be normalized
  555. * @param second string, will be normalized
  556. * @param errorCode Standard ICU error code. Its input value must
  557. * pass the U_SUCCESS() test, or else the function returns
  558. * immediately. Check for U_FAILURE() on output or use with
  559. * function chaining. (See User Guide for details.)
  560. * @return first
  561. * @stable ICU 4.4
  562. */
  563. virtual UnicodeString &
  564. normalizeSecondAndAppend(UnicodeString &first,
  565. const UnicodeString &second,
  566. UErrorCode &errorCode) const override;
  567. /**
  568. * Appends the second string to the first string
  569. * (merging them at the boundary) and returns the first string.
  570. * The result is normalized if both the strings were normalized.
  571. * The first and second strings must be different objects.
  572. * @param first string, should be normalized
  573. * @param second string, should be normalized
  574. * @param errorCode Standard ICU error code. Its input value must
  575. * pass the U_SUCCESS() test, or else the function returns
  576. * immediately. Check for U_FAILURE() on output or use with
  577. * function chaining. (See User Guide for details.)
  578. * @return first
  579. * @stable ICU 4.4
  580. */
  581. virtual UnicodeString &
  582. append(UnicodeString &first,
  583. const UnicodeString &second,
  584. UErrorCode &errorCode) const override;
  585. /**
  586. * Gets the decomposition mapping of c.
  587. * For details see the base class documentation.
  588. *
  589. * This function is independent of the mode of the Normalizer2.
  590. * @param c code point
  591. * @param decomposition String object which will be set to c's
  592. * decomposition mapping, if there is one.
  593. * @return true if c has a decomposition, otherwise false
  594. * @stable ICU 4.6
  595. */
  596. virtual UBool
  597. getDecomposition(UChar32 c, UnicodeString &decomposition) const override;
  598. /**
  599. * Gets the raw decomposition mapping of c.
  600. * For details see the base class documentation.
       * NOTE(review): how the "raw" mapping differs from the mapping
       * returned by getDecomposition() is not described here; consult the
       * base class documentation.
  601. *
  602. * This function is independent of the mode of the Normalizer2.
       * It takes no UErrorCode; the boolean return value is the only
       * indication of whether a mapping was produced.
  603. * @param c code point
  604. * @param decomposition Output parameter: String object which will be set to c's
  605. * raw decomposition mapping, if there is one.
       * NOTE(review): the contents of the string when false is returned
       * are not specified here.
  606. * @return true if c has a decomposition, otherwise false
  607. * @stable ICU 49
  608. */
  609. virtual UBool
  610. getRawDecomposition(UChar32 c, UnicodeString &decomposition) const override;
  611. /**
  612. * Performs pairwise composition of a and b and returns the composite if there is one.
  613. * For details see the base class documentation.
  614. *
  615. * This function is independent of the mode of the Normalizer2.
       * It takes no UErrorCode: the absence of a composite is reported only
       * through a negative return value, so callers must test the sign of
       * the result before using it as a code point.
  616. * @param a A (normalization starter) code point.
  617. * @param b Another code point.
  618. * @return The non-negative composite code point if there is one; otherwise a negative value.
  619. * @stable ICU 49
  620. */
  621. virtual UChar32
  622. composePair(UChar32 a, UChar32 b) const override;
  623. /**
  624. * Gets the combining class of c.
  625. * The default implementation returns 0
  626. * but all standard implementations return the Unicode Canonical_Combining_Class value.
       * NOTE(review): this declaration is itself an override, so the
       * "default implementation" wording presumably describes the base
       * class rather than this class; confirm which value this override
       * returns against its implementation.
  627. * @param c code point
  628. * @return c's combining class, as an 8-bit unsigned value
  629. * @stable ICU 49
  630. */
  631. virtual uint8_t
  632. getCombiningClass(UChar32 c) const override;
  633. /**
  634. * Tests if the string is normalized.
  635. * For details see the Normalizer2 base class documentation.
  636. * @param s input string (taken by const reference, so it is not modified)
  637. * @param errorCode Standard ICU error code. Its input value must
  638. * pass the U_SUCCESS() test, or else the function returns
  639. * immediately. Check for U_FAILURE() on output or use with
  640. * function chaining. (See User Guide for details.)
  641. * @return true if s is normalized.
       * NOTE(review): the value returned when errorCode reports a failure
       * is not specified here, so test errorCode before relying on it.
  642. * @stable ICU 4.4
  643. */
  644. virtual UBool
  645. isNormalized(const UnicodeString &s, UErrorCode &errorCode) const override;
  646. /**
  647. * Tests if the UTF-8 string is normalized.
  648. * Internally, in cases where the quickCheck() method would return "maybe"
  649. * (which is only possible for the two COMPOSE modes) this method
  650. * resolves to "yes" or "no" to provide a definitive result,
  651. * at the cost of doing more work in those cases.
  652. *
  653. * This works for all normalization modes.
  654. * It is optimized for UTF-8 for all built-in modes except for FCD.
  655. * The base class implementation converts to UTF-16 and calls isNormalized().
  656. *
  657. * @param s UTF-8 input string, passed by value as a StringPiece
  658. * @param errorCode Standard ICU error code. Its input value must
  659. * pass the U_SUCCESS() test, or else the function returns
  660. * immediately. Check for U_FAILURE() on output or use with
  661. * function chaining. (See User Guide for details.)
  662. * @return true if s is normalized.
       * NOTE(review): the value returned when errorCode reports a failure
       * is not specified here, so test errorCode before relying on it.
  663. * @stable ICU 60
  664. */
  665. virtual UBool
  666. isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const override;
  667. /**
  668. * Quick-checks whether the string is normalized.
       * Unlike the UBool-returning isNormalized(), the result is a
       * UNormalizationCheckResult, which can be an inconclusive "maybe"
       * (only possible for the two COMPOSE modes) in addition to a
       * definite answer.
  669. * For details see the Normalizer2 base class documentation.
  670. * @param s input string (taken by const reference, so it is not modified)
  671. * @param errorCode Standard ICU error code. Its input value must
  672. * pass the U_SUCCESS() test, or else the function returns
  673. * immediately. Check for U_FAILURE() on output or use with
  674. * function chaining. (See User Guide for details.)
  675. * @return UNormalizationCheckResult
  676. * @stable ICU 4.4
  677. */
  678. virtual UNormalizationCheckResult
  679. quickCheck(const UnicodeString &s, UErrorCode &errorCode) const override;
  680. /**
  681. * Returns the end of the normalized substring of the input string.
  682. * For details see the Normalizer2 base class documentation.
       * NOTE(review): the span is presumably measured from the start of s,
       * making the result the index of the first position that does not
       * quick-check as "yes"; confirm against the base class documentation.
  683. * @param s input string (taken by const reference, so it is not modified)
  684. * @param errorCode Standard ICU error code. Its input value must
  685. * pass the U_SUCCESS() test, or else the function returns
  686. * immediately. Check for U_FAILURE() on output or use with
  687. * function chaining. (See User Guide for details.)
  688. * @return "yes" span end index, as an index into s
  689. * @stable ICU 4.4
  690. */
  691. virtual int32_t
  692. spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const override;
  693. /**
  694. * Tests if the character always has a normalization boundary before it,
  695. * regardless of context.
  696. * For details see the Normalizer2 base class documentation.
       * This query takes no UErrorCode; the answer is carried entirely by
       * the return value.
  697. * @param c code point to test
  698. * @return true if c has a normalization boundary before it
  699. * @stable ICU 4.4
  700. */
  701. virtual UBool hasBoundaryBefore(UChar32 c) const override;
  702. /**
  703. * Tests if the character always has a normalization boundary after it,
  704. * regardless of context.
  705. * For details see the Normalizer2 base class documentation.
       * This query takes no UErrorCode; the answer is carried entirely by
       * the return value.
  706. * @param c code point to test
  707. * @return true if c has a normalization boundary after it
  708. * @stable ICU 4.4
  709. */
  710. virtual UBool hasBoundaryAfter(UChar32 c) const override;
  711. /**
  712. * Tests if the character is normalization-inert.
  713. * For details see the Normalizer2 base class documentation.
       * This query takes no UErrorCode; the answer is carried entirely by
       * the return value.
  714. * @param c code point to test
  715. * @return true if c is normalization-inert
  716. * @stable ICU 4.4
  717. */
  718. virtual UBool isInert(UChar32 c) const override;
  719. private:
  /**
   * Private, non-virtual helper that takes a source string, a destination
   * string and a USetSpanCondition in addition to the usual error code.
   * NOTE(review): only the declaration is visible here. Presumably
   * spanCondition tells the helper how the beginning of src relates to the
   * UnicodeSet member `set`, and the returned reference is dest; confirm
   * both against the implementation file.
   * @param src source string (const reference, not modified)
   * @param dest destination string (non-const reference)
   * @param spanCondition set-span condition; see the note above
   * @param errorCode Standard ICU error code (in/out reference)
   * @return a UnicodeString reference
   */
  720. UnicodeString &
  721. normalize(const UnicodeString &src,
  722. UnicodeString &dest,
  723. USetSpanCondition spanCondition,
  724. UErrorCode &errorCode) const;
  /**
   * Private, non-virtual helper operating on a raw char buffer.
   * It returns void, so any output has to go through the ByteSink, the
   * Edits object and/or errorCode.
   * NOTE(review): only the declaration is visible here. Presumably src holds
   * UTF-8 text of `length` bytes, edits may be null when no change record is
   * wanted, and spanCondition tells the helper how the beginning of src
   * relates to the UnicodeSet member `set`; confirm all three against the
   * implementation file.
   * @param options option bits (meaning not visible from this declaration)
   * @param src pointer to the input bytes
   * @param length length of the input
   * @param sink ByteSink passed by non-const reference
   * @param edits pointer to an Edits object
   * @param spanCondition set-span condition; see the note above
   * @param errorCode Standard ICU error code (in/out reference)
   */
  725. void
  726. normalizeUTF8(uint32_t options, const char *src, int32_t length,
  727. ByteSink &sink, Edits *edits,
  728. USetSpanCondition spanCondition,
  729. UErrorCode &errorCode) const;
  /**
   * Private, non-virtual overload of normalizeSecondAndAppend() that adds a
   * doNormalize flag to the (first, second, errorCode) parameter list used
   * by the public normalizeSecondAndAppend() and append().
   * NOTE(review): only the declaration is visible here. Presumably this is
   * the shared worker for those two public functions, with doNormalize
   * selecting between "normalize second, then append" and "append only",
   * and the returned reference is first; confirm against the
   * implementation file.
   * @param first destination string (non-const reference)
   * @param second string to append (const reference, not modified)
   * @param doNormalize see the note above
   * @param errorCode Standard ICU error code (in/out reference)
   * @return a UnicodeString reference
   */
  730. UnicodeString &
  731. normalizeSecondAndAppend(UnicodeString &first,
  732. const UnicodeString &second,
  733. UBool doNormalize,
  734. UErrorCode &errorCode) const;
  // Both data members are const references: the objects are not copied into
  // this class, cannot be modified through it, and cannot be re-bound after
  // construction.
  // NOTE(review): because they are references, the Normalizer2 and the
  // UnicodeSet presumably have to outlive this object; confirm against the
  // constructor documentation.
  735. const Normalizer2 &norm2;
  736. const UnicodeSet &set;
  737. };
  738. U_NAMESPACE_END
  739. #endif // !UCONFIG_NO_NORMALIZATION
  740. #endif /* U_SHOW_CPLUSPLUS_API */
  741. #endif // __NORMALIZER2_H__