normalizer2.h 35 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. *******************************************************************************
  5. *
  6. * Copyright (C) 2009-2013, International Business Machines
  7. * Corporation and others. All Rights Reserved.
  8. *
  9. *******************************************************************************
  10. * file name: normalizer2.h
  11. * encoding: UTF-8
  12. * tab size: 8 (not used)
  13. * indentation:4
  14. *
  15. * created on: 2009nov22
  16. * created by: Markus W. Scherer
  17. */
  18. #ifndef __NORMALIZER2_H__
  19. #define __NORMALIZER2_H__
  20. /**
  21. * \file
  22. * \brief C++ API: New API for Unicode Normalization.
  23. */
  24. #include "unicode/utypes.h"
  25. #if U_SHOW_CPLUSPLUS_API
  26. #if !UCONFIG_NO_NORMALIZATION
  27. #include "unicode/stringpiece.h"
  28. #include "unicode/uniset.h"
  29. #include "unicode/unistr.h"
  30. #include "unicode/unorm2.h"
  31. U_NAMESPACE_BEGIN
  32. class ByteSink;
  33. /**
  34. * Unicode normalization functionality for standard Unicode normalization or
  35. * for using custom mapping tables.
  36. * All instances of this class are unmodifiable/immutable.
  37. * Instances returned by getInstance() are singletons that must not be deleted by the caller.
  38. * The Normalizer2 class is not intended for public subclassing.
  39. *
  40. * The primary functions are to produce a normalized string and to detect whether
  41. * a string is already normalized.
  42. * The most commonly used normalization forms are those defined in
  43. * http://www.unicode.org/unicode/reports/tr15/
  44. * However, this API supports additional normalization forms for specialized purposes.
  45. * For example, NFKC_Casefold is provided via getInstance(nullptr, "nfkc_cf", UNORM2_COMPOSE, errorCode)
  46. * and can be used in implementations of UTS #46.
  47. *
  48. * Not only are the standard compose and decompose modes supplied,
  49. * but additional modes are provided as documented in the UNormalization2Mode enum.
  50. *
  51. * Some of the functions in this class identify normalization boundaries.
  52. * At a normalization boundary, the portions of the string
  53. * before it and starting from it do not interact and can be handled independently.
  54. *
  55. * The spanQuickCheckYes() stops at a normalization boundary.
  56. * When the goal is a normalized string, then the text before the boundary
  57. * can be copied, and the remainder can be processed with normalizeSecondAndAppend().
  58. *
  59. * The hasBoundaryBefore(), hasBoundaryAfter() and isInert() functions test whether
  60. * a character is guaranteed to be at a normalization boundary,
  61. * regardless of context.
  62. * This is used for moving from one normalization boundary to the next
  63. * or preceding boundary, and for performing iterative normalization.
  64. *
  65. * Iterative normalization is useful when only a small portion of a
  66. * longer string needs to be processed.
  67. * For example, in ICU, iterative normalization is used by the NormalizationTransliterator
  68. * (to avoid replacing already-normalized text) and ucol_nextSortKeyPart()
  69. * (to process only the substring for which sort key bytes are computed).
  70. *
  71. * The set of normalization boundaries returned by these functions may not be
  72. * complete: There may be more boundaries that could be returned.
  73. * Different functions may return different boundaries.
  74. * @stable ICU 4.4
  75. */
  76. class U_COMMON_API Normalizer2 : public UObject {
  77. public:
  78. /**
  79. * Destructor.
  80. * @stable ICU 4.4
  81. */
  82. ~Normalizer2();
  83. /**
  84. * Returns a Normalizer2 instance for Unicode NFC normalization.
  85. * Same as getInstance(nullptr, "nfc", UNORM2_COMPOSE, errorCode).
  86. * Returns an unmodifiable singleton instance. Do not delete it.
  87. * @param errorCode Standard ICU error code. Its input value must
  88. * pass the U_SUCCESS() test, or else the function returns
  89. * immediately. Check for U_FAILURE() on output or use with
  90. * function chaining. (See User Guide for details.)
  91. * @return the requested Normalizer2, if successful
  92. * @stable ICU 49
  93. */
  94. static const Normalizer2 *
  95. getNFCInstance(UErrorCode &errorCode);
  96. /**
  97. * Returns a Normalizer2 instance for Unicode NFD normalization.
  98. * Same as getInstance(nullptr, "nfc", UNORM2_DECOMPOSE, errorCode).
  99. * Returns an unmodifiable singleton instance. Do not delete it.
  100. * @param errorCode Standard ICU error code. Its input value must
  101. * pass the U_SUCCESS() test, or else the function returns
  102. * immediately. Check for U_FAILURE() on output or use with
  103. * function chaining. (See User Guide for details.)
  104. * @return the requested Normalizer2, if successful
  105. * @stable ICU 49
  106. */
  107. static const Normalizer2 *
  108. getNFDInstance(UErrorCode &errorCode);
  109. /**
  110. * Returns a Normalizer2 instance for Unicode NFKC normalization.
  111. * Same as getInstance(nullptr, "nfkc", UNORM2_COMPOSE, errorCode).
  112. * Returns an unmodifiable singleton instance. Do not delete it.
  113. * @param errorCode Standard ICU error code. Its input value must
  114. * pass the U_SUCCESS() test, or else the function returns
  115. * immediately. Check for U_FAILURE() on output or use with
  116. * function chaining. (See User Guide for details.)
  117. * @return the requested Normalizer2, if successful
  118. * @stable ICU 49
  119. */
  120. static const Normalizer2 *
  121. getNFKCInstance(UErrorCode &errorCode);
  122. /**
  123. * Returns a Normalizer2 instance for Unicode NFKD normalization.
  124. * Same as getInstance(nullptr, "nfkc", UNORM2_DECOMPOSE, errorCode).
  125. * Returns an unmodifiable singleton instance. Do not delete it.
  126. * @param errorCode Standard ICU error code. Its input value must
  127. * pass the U_SUCCESS() test, or else the function returns
  128. * immediately. Check for U_FAILURE() on output or use with
  129. * function chaining. (See User Guide for details.)
  130. * @return the requested Normalizer2, if successful
  131. * @stable ICU 49
  132. */
  133. static const Normalizer2 *
  134. getNFKDInstance(UErrorCode &errorCode);
  135. /**
  136. * Returns a Normalizer2 instance for Unicode toNFKC_Casefold() normalization
  137. * which is equivalent to applying the NFKC_Casefold mappings and then NFC.
  138. * See https://www.unicode.org/reports/tr44/#NFKC_Casefold
  139. *
  140. * Same as getInstance(nullptr, "nfkc_cf", UNORM2_COMPOSE, errorCode).
  141. * Returns an unmodifiable singleton instance. Do not delete it.
  142. * @param errorCode Standard ICU error code. Its input value must
  143. * pass the U_SUCCESS() test, or else the function returns
  144. * immediately. Check for U_FAILURE() on output or use with
  145. * function chaining. (See User Guide for details.)
  146. * @return the requested Normalizer2, if successful
  147. * @stable ICU 49
  148. */
  149. static const Normalizer2 *
  150. getNFKCCasefoldInstance(UErrorCode &errorCode);
  151. #ifndef U_HIDE_DRAFT_API
  152. /**
  153. * Returns a Normalizer2 instance for a variant of Unicode toNFKC_Casefold() normalization
  154. * which is equivalent to applying the NFKC_Simple_Casefold mappings and then NFC.
  155. * See https://www.unicode.org/reports/tr44/#NFKC_Simple_Casefold
  156. *
  157. * Same as getInstance(nullptr, "nfkc_scf", UNORM2_COMPOSE, errorCode).
  158. * Returns an unmodifiable singleton instance. Do not delete it.
  159. * @param errorCode Standard ICU error code. Its input value must
  160. * pass the U_SUCCESS() test, or else the function returns
  161. * immediately. Check for U_FAILURE() on output or use with
  162. * function chaining. (See User Guide for details.)
  163. * @return the requested Normalizer2, if successful
  164. * @draft ICU 74
  165. */
  166. static const Normalizer2 *
  167. getNFKCSimpleCasefoldInstance(UErrorCode &errorCode);
  168. #endif // U_HIDE_DRAFT_API
  169. /**
  170. * Returns a Normalizer2 instance which uses the specified data file
  171. * (packageName/name similar to ucnv_openPackage() and ures_open()/ResourceBundle)
  172. * and which composes or decomposes text according to the specified mode.
  173. * Returns an unmodifiable singleton instance. Do not delete it.
  174. *
  175. * Use packageName=nullptr for data files that are part of ICU's own data.
  176. * Use name="nfc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFC/NFD.
  177. * Use name="nfkc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFKC/NFKD.
  178. * Use name="nfkc_cf" and UNORM2_COMPOSE for Unicode standard NFKC_CF=NFKC_Casefold.
  179. *
  180. * @param packageName nullptr for ICU built-in data, otherwise application data package name
  181. * @param name "nfc" or "nfkc" or "nfkc_cf" or "nfkc_scf" or name of custom data file
  182. * @param mode normalization mode (compose or decompose etc.)
  183. * @param errorCode Standard ICU error code. Its input value must
  184. * pass the U_SUCCESS() test, or else the function returns
  185. * immediately. Check for U_FAILURE() on output or use with
  186. * function chaining. (See User Guide for details.)
  187. * @return the requested Normalizer2, if successful
  188. * @stable ICU 4.4
  189. */
  190. static const Normalizer2 *
  191. getInstance(const char *packageName,
  192. const char *name,
  193. UNormalization2Mode mode,
  194. UErrorCode &errorCode);
  195. /**
  196. * Returns the normalized form of the source string.
  197. * @param src source string
  198. * @param errorCode Standard ICU error code. Its input value must
  199. * pass the U_SUCCESS() test, or else the function returns
  200. * immediately. Check for U_FAILURE() on output or use with
  201. * function chaining. (See User Guide for details.)
  202. * @return normalized src
  203. * @stable ICU 4.4
  204. */
  205. UnicodeString
  206. normalize(const UnicodeString &src, UErrorCode &errorCode) const {
  207. UnicodeString result;
  208. normalize(src, result, errorCode);
  209. return result;
  210. }
  211. /**
  212. * Writes the normalized form of the source string to the destination string
  213. * (replacing its contents) and returns the destination string.
  214. * The source and destination strings must be different objects.
  215. * @param src source string
  216. * @param dest destination string; its contents is replaced with normalized src
  217. * @param errorCode Standard ICU error code. Its input value must
  218. * pass the U_SUCCESS() test, or else the function returns
  219. * immediately. Check for U_FAILURE() on output or use with
  220. * function chaining. (See User Guide for details.)
  221. * @return dest
  222. * @stable ICU 4.4
  223. */
  224. virtual UnicodeString &
  225. normalize(const UnicodeString &src,
  226. UnicodeString &dest,
  227. UErrorCode &errorCode) const = 0;
  228. /**
  229. * Normalizes a UTF-8 string and optionally records how source substrings
  230. * relate to changed and unchanged result substrings.
  231. *
  232. * Implemented completely for all built-in modes except for FCD.
  233. * The base class implementation converts to & from UTF-16 and does not support edits.
  234. *
  235. * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
  236. * @param src Source UTF-8 string.
  237. * @param sink A ByteSink to which the normalized UTF-8 result string is written.
  238. * sink.Flush() is called at the end.
  239. * @param edits Records edits for index mapping, working with styled text,
  240. * and getting only changes (if any).
  241. * The Edits contents is undefined if any error occurs.
  242. * This function calls edits->reset() first unless
  243. * options includes U_EDITS_NO_RESET. edits can be nullptr.
  244. * @param errorCode Standard ICU error code. Its input value must
  245. * pass the U_SUCCESS() test, or else the function returns
  246. * immediately. Check for U_FAILURE() on output or use with
  247. * function chaining. (See User Guide for details.)
  248. * @stable ICU 60
  249. */
  250. virtual void
  251. normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
  252. Edits *edits, UErrorCode &errorCode) const;
  253. /**
  254. * Appends the normalized form of the second string to the first string
  255. * (merging them at the boundary) and returns the first string.
  256. * The result is normalized if the first string was normalized.
  257. * The first and second strings must be different objects.
  258. * @param first string, should be normalized
  259. * @param second string, will be normalized
  260. * @param errorCode Standard ICU error code. Its input value must
  261. * pass the U_SUCCESS() test, or else the function returns
  262. * immediately. Check for U_FAILURE() on output or use with
  263. * function chaining. (See User Guide for details.)
  264. * @return first
  265. * @stable ICU 4.4
  266. */
  267. virtual UnicodeString &
  268. normalizeSecondAndAppend(UnicodeString &first,
  269. const UnicodeString &second,
  270. UErrorCode &errorCode) const = 0;
  271. /**
  272. * Appends the second string to the first string
  273. * (merging them at the boundary) and returns the first string.
  274. * The result is normalized if both the strings were normalized.
  275. * The first and second strings must be different objects.
  276. * @param first string, should be normalized
  277. * @param second string, should be normalized
  278. * @param errorCode Standard ICU error code. Its input value must
  279. * pass the U_SUCCESS() test, or else the function returns
  280. * immediately. Check for U_FAILURE() on output or use with
  281. * function chaining. (See User Guide for details.)
  282. * @return first
  283. * @stable ICU 4.4
  284. */
  285. virtual UnicodeString &
  286. append(UnicodeString &first,
  287. const UnicodeString &second,
  288. UErrorCode &errorCode) const = 0;
  289. /**
  290. * Gets the decomposition mapping of c.
  291. * Roughly equivalent to normalizing the String form of c
  292. * on a UNORM2_DECOMPOSE Normalizer2 instance, but much faster, and except that this function
  293. * returns false and does not write a string
  294. * if c does not have a decomposition mapping in this instance's data.
  295. * This function is independent of the mode of the Normalizer2.
  296. * @param c code point
  297. * @param decomposition String object which will be set to c's
  298. * decomposition mapping, if there is one.
  299. * @return true if c has a decomposition, otherwise false
  300. * @stable ICU 4.6
  301. */
  302. virtual UBool
  303. getDecomposition(UChar32 c, UnicodeString &decomposition) const = 0;
  304. /**
  305. * Gets the raw decomposition mapping of c.
  306. *
  307. * This is similar to the getDecomposition() method but returns the
  308. * raw decomposition mapping as specified in UnicodeData.txt or
  309. * (for custom data) in the mapping files processed by the gennorm2 tool.
  310. * By contrast, getDecomposition() returns the processed,
  311. * recursively-decomposed version of this mapping.
  312. *
  313. * When used on a standard NFKC Normalizer2 instance,
  314. * getRawDecomposition() returns the Unicode Decomposition_Mapping (dm) property.
  315. *
  316. * When used on a standard NFC Normalizer2 instance,
  317. * it returns the Decomposition_Mapping only if the Decomposition_Type (dt) is Canonical (Can);
  318. * in this case, the result contains either one or two code points (=1..4 char16_ts).
  319. *
  320. * This function is independent of the mode of the Normalizer2.
  321. * The default implementation returns false.
  322. * @param c code point
  323. * @param decomposition String object which will be set to c's
  324. * raw decomposition mapping, if there is one.
  325. * @return true if c has a decomposition, otherwise false
  326. * @stable ICU 49
  327. */
  328. virtual UBool
  329. getRawDecomposition(UChar32 c, UnicodeString &decomposition) const;
  330. /**
  331. * Performs pairwise composition of a & b and returns the composite if there is one.
  332. *
  333. * Returns a composite code point c only if c has a two-way mapping to a+b.
  334. * In standard Unicode normalization, this means that
  335. * c has a canonical decomposition to a+b
  336. * and c does not have the Full_Composition_Exclusion property.
  337. *
  338. * This function is independent of the mode of the Normalizer2.
  339. * The default implementation returns a negative value.
  340. * @param a A (normalization starter) code point.
  341. * @param b Another code point.
  342. * @return The non-negative composite code point if there is one; otherwise a negative value.
  343. * @stable ICU 49
  344. */
  345. virtual UChar32
  346. composePair(UChar32 a, UChar32 b) const;
  347. /**
  348. * Gets the combining class of c.
  349. * The default implementation returns 0
  350. * but all standard implementations return the Unicode Canonical_Combining_Class value.
  351. * @param c code point
  352. * @return c's combining class
  353. * @stable ICU 49
  354. */
  355. virtual uint8_t
  356. getCombiningClass(UChar32 c) const;
  357. /**
  358. * Tests if the string is normalized.
  359. * Internally, in cases where the quickCheck() method would return "maybe"
  360. * (which is only possible for the two COMPOSE modes) this method
  361. * resolves to "yes" or "no" to provide a definitive result,
  362. * at the cost of doing more work in those cases.
  363. * @param s input string
  364. * @param errorCode Standard ICU error code. Its input value must
  365. * pass the U_SUCCESS() test, or else the function returns
  366. * immediately. Check for U_FAILURE() on output or use with
  367. * function chaining. (See User Guide for details.)
  368. * @return true if s is normalized
  369. * @stable ICU 4.4
  370. */
  371. virtual UBool
  372. isNormalized(const UnicodeString &s, UErrorCode &errorCode) const = 0;
  373. /**
  374. * Tests if the UTF-8 string is normalized.
  375. * Internally, in cases where the quickCheck() method would return "maybe"
  376. * (which is only possible for the two COMPOSE modes) this method
  377. * resolves to "yes" or "no" to provide a definitive result,
  378. * at the cost of doing more work in those cases.
  379. *
  380. * This works for all normalization modes.
  381. * It is optimized for UTF-8 for all built-in modes except for FCD.
  382. * The base class implementation converts to UTF-16 and calls isNormalized().
  383. *
  384. * @param s UTF-8 input string
  385. * @param errorCode Standard ICU error code. Its input value must
  386. * pass the U_SUCCESS() test, or else the function returns
  387. * immediately. Check for U_FAILURE() on output or use with
  388. * function chaining. (See User Guide for details.)
  389. * @return true if s is normalized
  390. * @stable ICU 60
  391. */
  392. virtual UBool
  393. isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const;
  394. /**
  395. * Tests if the string is normalized.
  396. * For the two COMPOSE modes, the result could be "maybe" in cases that
  397. * would take a little more work to resolve definitively.
  398. * Use spanQuickCheckYes() and normalizeSecondAndAppend() for a faster
  399. * combination of quick check + normalization, to avoid
  400. * re-checking the "yes" prefix.
  401. * @param s input string
  402. * @param errorCode Standard ICU error code. Its input value must
  403. * pass the U_SUCCESS() test, or else the function returns
  404. * immediately. Check for U_FAILURE() on output or use with
  405. * function chaining. (See User Guide for details.)
  406. * @return UNormalizationCheckResult
  407. * @stable ICU 4.4
  408. */
  409. virtual UNormalizationCheckResult
  410. quickCheck(const UnicodeString &s, UErrorCode &errorCode) const = 0;
  411. /**
  412. * Returns the end of the normalized substring of the input string.
  413. * In other words, with <code>end=spanQuickCheckYes(s, ec);</code>
  414. * the substring <code>UnicodeString(s, 0, end)</code>
  415. * will pass the quick check with a "yes" result.
  416. *
  417. * The returned end index is usually one or more characters before the
  418. * "no" or "maybe" character: The end index is at a normalization boundary.
  419. * (See the class documentation for more about normalization boundaries.)
  420. *
  421. * When the goal is a normalized string and most input strings are expected
  422. * to be normalized already, then call this method,
  423. * and if it returns a prefix shorter than the input string,
  424. * copy that prefix and use normalizeSecondAndAppend() for the remainder.
  425. * @param s input string
  426. * @param errorCode Standard ICU error code. Its input value must
  427. * pass the U_SUCCESS() test, or else the function returns
  428. * immediately. Check for U_FAILURE() on output or use with
  429. * function chaining. (See User Guide for details.)
  430. * @return "yes" span end index
  431. * @stable ICU 4.4
  432. */
  433. virtual int32_t
  434. spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const = 0;
  435. /**
  436. * Tests if the character always has a normalization boundary before it,
  437. * regardless of context.
  438. * If true, then the character does not normalization-interact with
  439. * preceding characters.
  440. * In other words, a string containing this character can be normalized
  441. * by processing portions before this character and starting from this
  442. * character independently.
  443. * This is used for iterative normalization. See the class documentation for details.
  444. * @param c character to test
  445. * @return true if c has a normalization boundary before it
  446. * @stable ICU 4.4
  447. */
  448. virtual UBool hasBoundaryBefore(UChar32 c) const = 0;
  449. /**
  450. * Tests if the character always has a normalization boundary after it,
  451. * regardless of context.
  452. * If true, then the character does not normalization-interact with
  453. * following characters.
  454. * In other words, a string containing this character can be normalized
  455. * by processing portions up to this character and after this
  456. * character independently.
  457. * This is used for iterative normalization. See the class documentation for details.
  458. * Note that this operation may be significantly slower than hasBoundaryBefore().
  459. * @param c character to test
  460. * @return true if c has a normalization boundary after it
  461. * @stable ICU 4.4
  462. */
  463. virtual UBool hasBoundaryAfter(UChar32 c) const = 0;
  464. /**
  465. * Tests if the character is normalization-inert.
  466. * If true, then the character does not change, nor normalization-interact with
  467. * preceding or following characters.
  468. * In other words, a string containing this character can be normalized
  469. * by processing portions before this character and after this
  470. * character independently.
  471. * This is used for iterative normalization. See the class documentation for details.
  472. * Note that this operation may be significantly slower than hasBoundaryBefore().
  473. * @param c character to test
  474. * @return true if c is normalization-inert
  475. * @stable ICU 4.4
  476. */
  477. virtual UBool isInert(UChar32 c) const = 0;
  478. };
  479. /**
  480. * Normalization filtered by a UnicodeSet.
  481. * Normalizes portions of the text contained in the filter set and leaves
  482. * portions not contained in the filter set unchanged.
  483. * Filtering is done via UnicodeSet::span(..., USET_SPAN_SIMPLE).
  484. * Not-in-the-filter text is treated as "is normalized" and "quick check yes".
  485. * This class implements all of (and only) the Normalizer2 API.
  486. * An instance of this class is unmodifiable/immutable but is constructed and
  487. * must be destructed by the owner.
  488. * @stable ICU 4.4
  489. */
  490. class U_COMMON_API FilteredNormalizer2 : public Normalizer2 {
  491. public:
  492. /**
  493. * Constructs a filtered normalizer wrapping any Normalizer2 instance
  494. * and a filter set.
  495. * Both are aliased and must not be modified or deleted while this object
  496. * is used.
  497. * The filter set should be frozen; otherwise the performance will suffer greatly.
  498. * @param n2 wrapped Normalizer2 instance
  499. * @param filterSet UnicodeSet which determines the characters to be normalized
  500. * @stable ICU 4.4
  501. */
  502. FilteredNormalizer2(const Normalizer2 &n2, const UnicodeSet &filterSet) :
  503. norm2(n2), set(filterSet) {}
  504. /**
  505. * Destructor.
  506. * @stable ICU 4.4
  507. */
  508. ~FilteredNormalizer2();
  509. /**
  510. * Writes the normalized form of the source string to the destination string
  511. * (replacing its contents) and returns the destination string.
  512. * The source and destination strings must be different objects.
  513. * @param src source string
  514. * @param dest destination string; its contents is replaced with normalized src
  515. * @param errorCode Standard ICU error code. Its input value must
  516. * pass the U_SUCCESS() test, or else the function returns
  517. * immediately. Check for U_FAILURE() on output or use with
  518. * function chaining. (See User Guide for details.)
  519. * @return dest
  520. * @stable ICU 4.4
  521. */
  522. virtual UnicodeString &
  523. normalize(const UnicodeString &src,
  524. UnicodeString &dest,
  525. UErrorCode &errorCode) const override;
  526. /**
  527. * Normalizes a UTF-8 string and optionally records how source substrings
  528. * relate to changed and unchanged result substrings.
  529. *
  530. * Implemented completely for most built-in modes except for FCD.
  531. * The base class implementation converts to & from UTF-16 and does not support edits.
  532. *
  533. * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
  534. * @param src Source UTF-8 string.
  535. * @param sink A ByteSink to which the normalized UTF-8 result string is written.
  536. * sink.Flush() is called at the end.
  537. * @param edits Records edits for index mapping, working with styled text,
  538. * and getting only changes (if any).
  539. * The Edits contents is undefined if any error occurs.
  540. * This function calls edits->reset() first unless
  541. * options includes U_EDITS_NO_RESET. edits can be nullptr.
  542. * @param errorCode Standard ICU error code. Its input value must
  543. * pass the U_SUCCESS() test, or else the function returns
  544. * immediately. Check for U_FAILURE() on output or use with
  545. * function chaining. (See User Guide for details.)
  546. * @stable ICU 60
  547. */
  548. virtual void
  549. normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
  550. Edits *edits, UErrorCode &errorCode) const override;
  551. /**
  552. * Appends the normalized form of the second string to the first string
  553. * (merging them at the boundary) and returns the first string.
  554. * The result is normalized if the first string was normalized.
  555. * The first and second strings must be different objects.
  556. * @param first string, should be normalized
  557. * @param second string, will be normalized
  558. * @param errorCode Standard ICU error code. Its input value must
  559. * pass the U_SUCCESS() test, or else the function returns
  560. * immediately. Check for U_FAILURE() on output or use with
  561. * function chaining. (See User Guide for details.)
  562. * @return first
  563. * @stable ICU 4.4
  564. */
  565. virtual UnicodeString &
  566. normalizeSecondAndAppend(UnicodeString &first,
  567. const UnicodeString &second,
  568. UErrorCode &errorCode) const override;
  569. /**
  570. * Appends the second string to the first string
  571. * (merging them at the boundary) and returns the first string.
  572. * The result is normalized if both the strings were normalized.
  573. * The first and second strings must be different objects.
  574. * @param first string, should be normalized
  575. * @param second string, should be normalized
  576. * @param errorCode Standard ICU error code. Its input value must
  577. * pass the U_SUCCESS() test, or else the function returns
  578. * immediately. Check for U_FAILURE() on output or use with
  579. * function chaining. (See User Guide for details.)
  580. * @return first
  581. * @stable ICU 4.4
  582. */
  583. virtual UnicodeString &
  584. append(UnicodeString &first,
  585. const UnicodeString &second,
  586. UErrorCode &errorCode) const override;
  587. /**
  588. * Gets the decomposition mapping of c.
  589. * For details see the base class documentation.
  590. *
  591. * This function is independent of the mode of the Normalizer2.
  592. * @param c code point
  593. * @param decomposition String object which will be set to c's
  594. * decomposition mapping, if there is one.
  595. * @return true if c has a decomposition, otherwise false
  596. * @stable ICU 4.6
  597. */
  598. virtual UBool
  599. getDecomposition(UChar32 c, UnicodeString &decomposition) const override;
  600. /**
  601. * Gets the raw decomposition mapping of c.
  602. * For details see the base class documentation.
  603. * See also getDecomposition(), declared right before this function.
  604. * This function is independent of the mode of the Normalizer2.
  605. * @param c code point
  606. * @param decomposition String object (output parameter) which will be set to c's
  607. * raw decomposition mapping, if there is one.
  608. * @return true if c has a decomposition, otherwise false
  609. * @stable ICU 49
  610. */
  611. virtual UBool
  612. getRawDecomposition(UChar32 c, UnicodeString &decomposition) const override;
  613. /**
  614. * Performs pairwise composition of a and b and returns the composite if there is one.
  615. * For details see the base class documentation.
  616. *
  617. * This function is independent of the mode of the Normalizer2.
  618. * @param a A (normalization starter) code point.
  619. * @param b Another code point.
  620. * @return The non-negative composite code point if there is one; otherwise a negative value
  621. * (so callers should test the result for "less than zero"). @stable ICU 49
  622. */
  623. virtual UChar32
  624. composePair(UChar32 a, UChar32 b) const override;
  625. /**
  626. * Gets the combining class of c.
  627. * The default implementation returns 0
  628. * but all standard implementations return the Unicode Canonical_Combining_Class value.
       * NOTE(review): the "default implementation" sentence reads like text written for
       * the base class; this declaration is an override, so confirm in the
       * implementation which value this class actually returns.
  629. * @param c code point
  630. * @return c's combining class, as a uint8_t
  631. * @stable ICU 49
  632. */
  633. virtual uint8_t
  634. getCombiningClass(UChar32 c) const override;
  635. /**
  636. * Tests if the string is normalized and reports the answer as a UBool
  637. * (quickCheck(), declared below, returns a UNormalizationCheckResult instead).
  638. * For details see the Normalizer2 base class documentation.
       * @param s input string
  639. * @param errorCode Standard ICU error code. Its input value must
  640. * pass the U_SUCCESS() test, or else the function returns
  641. * immediately. Check for U_FAILURE() on output or use with
  642. * function chaining. (See User Guide for details.)
  643. * @return true if s is normalized
  644. * @stable ICU 4.4
  645. */
  646. virtual UBool
  647. isNormalized(const UnicodeString &s, UErrorCode &errorCode) const override;
  648. /**
  649. * Tests if the UTF-8 string is normalized.
  650. * Internally, in cases where the quickCheck() method would return "maybe"
  651. * (which is only possible for the two COMPOSE modes) this method
  652. * resolves to "yes" or "no" to provide a definitive result,
  653. * at the cost of doing more work in those cases.
  654. *
  655. * This works for all normalization modes.
  656. * It is optimized for UTF-8 for all built-in modes except for FCD.
  657. * The base class implementation converts to UTF-16 and calls isNormalized().
  658. *
  659. * @param s UTF-8 input string, passed by value as a StringPiece
  660. * @param errorCode Standard ICU error code. Its input value must
  661. * pass the U_SUCCESS() test, or else the function returns
  662. * immediately. Check for U_FAILURE() on output or use with
  663. * function chaining. (See User Guide for details.)
  664. * @return true if s is normalized (a definitive UBool result, never "maybe")
  665. * @stable ICU 60
  666. */
  667. virtual UBool
  668. isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const override;
  669. /**
  670. * Quick-checks whether the string is normalized.
  671. * For details see the Normalizer2 base class documentation.
  672. * @param s input string
  673. * @param errorCode Standard ICU error code. Its input value must
  674. * pass the U_SUCCESS() test, or else the function returns
  675. * immediately. Check for U_FAILURE() on output or use with
  676. * function chaining. (See User Guide for details.)
  677. * @return the quick check result as a UNormalizationCheckResult; the isNormalizedUTF8()
       * documentation in this class notes that "maybe" is possible for the two
       * COMPOSE modes, so this is not a plain true/false answer like isNormalized()
  678. * @stable ICU 4.4
  679. */
  680. virtual UNormalizationCheckResult
  681. quickCheck(const UnicodeString &s, UErrorCode &errorCode) const override;
  682. /**
  683. * Returns the end of the normalized substring of the input string.
  684. * For details see the Normalizer2 base class documentation.
  685. * @param s input string
  686. * @param errorCode Standard ICU error code. Its input value must
  687. * pass the U_SUCCESS() test, or else the function returns
  688. * immediately. Check for U_FAILURE() on output or use with
  689. * function chaining. (See User Guide for details.)
  690. * @return "yes" span end index, i.e. the int32_t end of the normalized substring of s
  691. * @stable ICU 4.4
  692. */
  693. virtual int32_t
  694. spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const override;
  695. /**
  696. * Tests if the character always has a normalization boundary before it,
  697. * regardless of context.
  698. * For details see the Normalizer2 base class documentation.
  699. * @param c character to test (a UChar32 code point)
  700. * @return true if c has a normalization boundary before it
  701. * @stable ICU 4.4
  702. */
  703. virtual UBool hasBoundaryBefore(UChar32 c) const override;
  704. /**
  705. * Tests if the character always has a normalization boundary after it,
  706. * regardless of context.
  707. * For details see the Normalizer2 base class documentation.
  708. * @param c character to test (a UChar32 code point)
  709. * @return true if c has a normalization boundary after it
  710. * @stable ICU 4.4
  711. */
  712. virtual UBool hasBoundaryAfter(UChar32 c) const override;
  713. /**
  714. * Tests if the character is normalization-inert.
  715. * For details see the Normalizer2 base class documentation.
  716. * @param c character to test (a UChar32 code point)
  717. * @return true if c is normalization-inert
  718. * @stable ICU 4.4
  719. */
  720. virtual UBool isInert(UChar32 c) const override;
  721. private:
  /**
   * Private, non-virtual helper named normalize() that takes a USetSpanCondition
   * in addition to source/destination strings and an error code.
   * Only the declaration is visible in this header; see the implementation
   * file for the exact behavior.
   * @param src source string (const reference)
   * @param dest destination string (non-const reference)
   * @param spanCondition span condition; presumably used together with the
   *        `set` member of this class -- TODO confirm against the implementation
   * @param errorCode Standard ICU error code
   * @return a UnicodeString reference; presumably dest -- TODO confirm
   */
  722. UnicodeString &
  723. normalize(const UnicodeString &src,
  724. UnicodeString &dest,
  725. USetSpanCondition spanCondition,
  726. UErrorCode &errorCode) const;
  /**
   * Private, non-virtual helper named normalizeUTF8() that takes a raw
   * (pointer, length) UTF-8 input plus a USetSpanCondition.
   * Only the declaration is visible in this header; see the implementation
   * file for the exact behavior.
   * @param options option bits (uint32_t); meaning not visible here
   * @param src pointer to the input bytes
   * @param length length associated with src (int32_t)
   * @param sink ByteSink passed by non-const reference; presumably receives the
   *        output -- TODO confirm
   * @param edits Edits pointer; NOTE(review): looks optional since it is a
   *        pointer -- confirm whether null is accepted
   * @param spanCondition span condition; presumably used together with the
   *        `set` member of this class -- TODO confirm
   * @param errorCode Standard ICU error code
   */
  727. void
  728. normalizeUTF8(uint32_t options, const char *src, int32_t length,
  729. ByteSink &sink, Edits *edits,
  730. USetSpanCondition spanCondition,
  731. UErrorCode &errorCode) const;
  /**
   * Private overload of normalizeSecondAndAppend() that adds a doNormalize flag
   * (the three-argument public override is declared earlier in this class).
   * Only the declaration is visible in this header.
   * @param first string passed by non-const reference
   * @param second string passed by const reference
   * @param doNormalize presumably selects between the behavior of the public
   *        normalizeSecondAndAppend() and append() -- TODO confirm against the
   *        implementation
   * @param errorCode Standard ICU error code
   * @return a UnicodeString reference; presumably first -- TODO confirm
   */
  732. UnicodeString &
  733. normalizeSecondAndAppend(UnicodeString &first,
  734. const UnicodeString &second,
  735. UBool doNormalize,
  736. UErrorCode &errorCode) const;
  // Reference data members: each refers to an object defined elsewhere and,
  // being a reference, is bound once and cannot be reseated afterwards.
  // Both are references to const, so this class only reads through them.
  // NOTE(review): presumably norm2 is the wrapped normalizer and set is the
  // filter set -- confirm against the constructor.
  737. const Normalizer2 &norm2;
  738. const UnicodeSet &set;
  739. };
  740. U_NAMESPACE_END
  741. #endif // !UCONFIG_NO_NORMALIZATION
  742. #endif /* U_SHOW_CPLUSPLUS_API */
  743. #endif // __NORMALIZER2_H__