tblcoll.h 37 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. ******************************************************************************
  5. * Copyright (C) 1996-2016, International Business Machines Corporation and
  6. * others. All Rights Reserved.
  7. ******************************************************************************
  8. */
  9. /**
  10. * \file
  11. * \brief C++ API: The RuleBasedCollator class implements the Collator abstract base class.
  12. */
  13. /**
  14. * File tblcoll.h
  15. *
  16. * Created by: Helena Shih
  17. *
  18. * Modification History:
  19. *
  20. * Date Name Description
  21. * 2/5/97 aliu Added streamIn and streamOut methods. Added
  22. * constructor which reads RuleBasedCollator object from
  23. * a binary file. Added writeToFile method which streams
  24. * RuleBasedCollator out to a binary file. The streamIn
  25. * and streamOut methods use istream and ostream objects
  26. * in binary mode.
  27. * 2/12/97 aliu Modified to use TableCollationData sub-object to
  28. * hold invariant data.
  29. * 2/13/97 aliu Moved several methods into this class from Collation.
  30. * Added a private RuleBasedCollator(Locale&) constructor,
  31. * to be used by Collator::createDefault(). General
  32. * clean up.
  33. * 2/20/97 helena Added clone, operator==, operator!=, operator=, and copy
  34. * constructor and getDynamicClassID.
  35. * 3/5/97 aliu Modified constructFromFile() to add parameter
  36. * specifying whether or not binary loading is to be
  37. * attempted. This is required for dynamic rule loading.
  38. * 05/07/97 helena Added memory allocation error detection.
  39. * 6/17/97 helena Added IDENTICAL strength for compare, changed getRules to
  40. * use MergeCollation::getPattern.
  41. * 6/20/97 helena Java class name change.
  42. * 8/18/97 helena Added internal API documentation.
  43. * 09/03/97 helena Added createCollationKeyValues().
  44. * 02/10/98 damiba Added compare with "length" parameter
  45. * 08/05/98 erm Synched with 1.2 version of RuleBasedCollator.java
  46. * 04/23/99 stephen Removed EDecompositionMode, merged with
  47. * Normalizer::EMode
  48. * 06/14/99 stephen Removed kResourceBundleSuffix
  49. * 11/02/99 helena Collator performance enhancements. Eliminates the
  50. * UnicodeString construction and special case for NO_OP.
  51. * 11/23/99 srl More performance enhancements. Updates to NormalizerIterator
  52. * internal state management.
  53. * 12/15/99 aliu Update to support Thai collation. Move NormalizerIterator
  54. * to implementation file.
  55. * 01/29/01 synwee Modified into a C++ wrapper which calls C API
  56. * (ucol.h)
  57. * 2012-2014 markus Rewritten in C++ again.
  58. */
  59. #ifndef TBLCOLL_H
  60. #define TBLCOLL_H
  61. #include "unicode/utypes.h"
  62. #if U_SHOW_CPLUSPLUS_API
  63. #if !UCONFIG_NO_COLLATION
  64. #include "unicode/coll.h"
  65. #include "unicode/locid.h"
  66. #include "unicode/uiter.h"
  67. #include "unicode/ucol.h"
  68. U_NAMESPACE_BEGIN
  69. struct CollationCacheEntry;
  70. struct CollationData;
  71. struct CollationSettings;
  72. struct CollationTailoring;
  73. /**
  74. * @stable ICU 2.0
  75. */
  76. class StringSearch;
  77. /**
  78. * @stable ICU 2.0
  79. */
  80. class CollationElementIterator;
  81. class CollationKey;
  82. class SortKeyByteSink;
  83. class UnicodeSet;
  84. class UnicodeString;
  85. class UVector64;
  86. /**
  87. * The RuleBasedCollator class provides the implementation of
  88. * Collator, using data-driven tables. The user can create a customized
  89. * table-based collation.
  90. * <p>
  91. * For more information about the collation service see
  92. * <a href="https://unicode-org.github.io/icu/userguide/collation">the User Guide</a>.
  93. * <p>
  94. * Collation service provides correct sorting orders for most locales supported in ICU.
  95. * If specific data for a locale is not available, the order eventually falls back
  96. * to the <a href="http://www.unicode.org/reports/tr35/tr35-collation.html#Root_Collation">CLDR root sort order</a>.
  97. * <p>
  98. * Sort ordering may be customized by providing your own set of rules. For more on
  99. * this subject see the <a href="https://unicode-org.github.io/icu/userguide/collation/customization">
  100. * Collation Customization</a> section of the User Guide.
  101. * <p>
  102. * Note, RuleBasedCollator is not to be subclassed.
  103. * @see Collator
  104. */
  105. class U_I18N_API RuleBasedCollator final : public Collator {
  106. public:
  107. /**
  108. * RuleBasedCollator constructor. This takes the table rules and builds a
  109. * collation table out of them. Please see RuleBasedCollator class
  110. * description for more details on the collation rule syntax.
  111. * @param rules the collation rules to build the collation table from.
  112. * @param status reporting a success or an error.
  113. * @stable ICU 2.0
  114. */
  115. RuleBasedCollator(const UnicodeString& rules, UErrorCode& status);
  116. /**
  117. * RuleBasedCollator constructor. This takes the table rules and builds a
  118. * collation table out of them. Please see RuleBasedCollator class
  119. * description for more details on the collation rule syntax.
  120. * @param rules the collation rules to build the collation table from.
  121. * @param collationStrength strength for comparison
  122. * @param status reporting a success or an error.
  123. * @stable ICU 2.0
  124. */
  125. RuleBasedCollator(const UnicodeString& rules,
  126. ECollationStrength collationStrength,
  127. UErrorCode& status);
  128. /**
  129. * RuleBasedCollator constructor. This takes the table rules and builds a
  130. * collation table out of them. Please see RuleBasedCollator class
  131. * description for more details on the collation rule syntax.
  132. * @param rules the collation rules to build the collation table from.
  133. * @param decompositionMode the normalisation mode
  134. * @param status reporting a success or an error.
  135. * @stable ICU 2.0
  136. */
  137. RuleBasedCollator(const UnicodeString& rules,
  138. UColAttributeValue decompositionMode,
  139. UErrorCode& status);
  140. /**
  141. * RuleBasedCollator constructor. This takes the table rules and builds a
  142. * collation table out of them. Please see RuleBasedCollator class
  143. * description for more details on the collation rule syntax.
  144. * @param rules the collation rules to build the collation table from.
  145. * @param collationStrength strength for comparison
  146. * @param decompositionMode the normalisation mode
  147. * @param status reporting a success or an error.
  148. * @stable ICU 2.0
  149. */
  150. RuleBasedCollator(const UnicodeString& rules,
  151. ECollationStrength collationStrength,
  152. UColAttributeValue decompositionMode,
  153. UErrorCode& status);
  154. #ifndef U_HIDE_INTERNAL_API
  155. /**
  156. * TODO: document & propose as public API
  157. * @internal
  158. */
  159. RuleBasedCollator(const UnicodeString &rules,
  160. UParseError &parseError, UnicodeString &reason,
  161. UErrorCode &errorCode);
  162. #endif /* U_HIDE_INTERNAL_API */
  163. /**
  164. * Copy constructor.
  165. * @param other the RuleBasedCollator object to be copied
  166. * @stable ICU 2.0
  167. */
  168. RuleBasedCollator(const RuleBasedCollator& other);
  169. /** Opens a collator from a collator binary image created using
  170. * cloneBinary. Binary image used in instantiation of the
  171. * collator remains owned by the user and should stay around for
  172. * the lifetime of the collator. The API also takes a base collator
  173. * which must be the root collator.
  174. * @param bin binary image owned by the user and required through the
  175. * lifetime of the collator
  176. * @param length size of the image. If negative, the API will try to
  177. * figure out the length of the image
  178. * @param base Base collator, for lookup of untailored characters.
  179. * Must be the root collator, must not be nullptr.
  180. * The base is required to be present through the lifetime of the collator.
  181. * @param status for catching errors
  182. * @return newly created collator
  183. * @see cloneBinary
  184. * @stable ICU 3.4
  185. */
  186. RuleBasedCollator(const uint8_t *bin, int32_t length,
  187. const RuleBasedCollator *base,
  188. UErrorCode &status);
  189. /**
  190. * Destructor.
  191. * @stable ICU 2.0
  192. */
  193. virtual ~RuleBasedCollator();
  194. /**
  195. * Assignment operator.
  196. * @param other other RuleBasedCollator object to copy from.
  197. * @stable ICU 2.0
  198. */
  199. RuleBasedCollator& operator=(const RuleBasedCollator& other);
  200. /**
  201. * Returns true if argument is the same as this object.
  202. * @param other Collator object to be compared.
  203. * @return true if the argument is the same as this object.
  204. * @stable ICU 2.0
  205. */
  206. virtual bool operator==(const Collator& other) const override;
  207. /**
  208. * Makes a copy of this object.
  209. * @return a copy of this object, owned by the caller
  210. * @stable ICU 2.0
  211. */
  212. virtual RuleBasedCollator* clone() const override;
  213. /**
  214. * Creates a collation element iterator for the source string. The caller of
  215. * this method is responsible for the memory management of the return
  216. * pointer.
  217. * @param source the string over which the CollationElementIterator will
  218. * iterate.
  219. * @return the collation element iterator of the source string using this as
  220. * the base Collator.
  221. * @stable ICU 2.2
  222. */
  223. virtual CollationElementIterator* createCollationElementIterator(
  224. const UnicodeString& source) const;
  225. /**
  226. * Creates a collation element iterator for the source. The caller of this
  227. * method is responsible for the memory management of the returned pointer.
  228. * @param source the CharacterIterator which produces the characters over
  229. * which the CollationElementIterator will iterate.
  230. * @return the collation element iterator of the source using this as the
  231. * base Collator.
  232. * @stable ICU 2.2
  233. */
  234. virtual CollationElementIterator* createCollationElementIterator(
  235. const CharacterIterator& source) const;
  236. // Make deprecated versions of Collator::compare() visible.
  237. using Collator::compare;
  238. /**
  239. * The comparison function compares the character data stored in two
  240. * different strings. Returns information about whether a string is less
  241. * than, greater than or equal to another string.
  242. * @param source the source string to be compared with.
  243. * @param target the string that is to be compared with the source string.
  244. * @param status possible error code
  245. * @return Returns an enum value. UCOL_GREATER if source is greater
  246. * than target; UCOL_EQUAL if source is equal to target; UCOL_LESS if source is less
  247. * than target
  248. * @stable ICU 2.6
  249. **/
  250. virtual UCollationResult compare(const UnicodeString& source,
  251. const UnicodeString& target,
  252. UErrorCode &status) const override;
  253. /**
  254. * Does the same thing as compare but limits the comparison to a specified
  255. * length
  256. * @param source the source string to be compared with.
  257. * @param target the string that is to be compared with the source string.
  258. * @param length the length the comparison is limited to
  259. * @param status possible error code
  260. * @return Returns an enum value. UCOL_GREATER if source (up to the specified
  261. * length) is greater than target; UCOL_EQUAL if source (up to specified
  262. * length) is equal to target; UCOL_LESS if source (up to the specified
  263. * length) is less than target.
  264. * @stable ICU 2.6
  265. */
  266. virtual UCollationResult compare(const UnicodeString& source,
  267. const UnicodeString& target,
  268. int32_t length,
  269. UErrorCode &status) const override;
  270. /**
  271. * The comparison function compares the character data stored in two
  272. * different string arrays. Returns information about whether a string array
  273. * is less than, greater than or equal to another string array.
  274. * @param source the source string array to be compared with.
  275. * @param sourceLength the length of the source string array. If this value
  276. * is equal to -1, the string array is null-terminated.
  277. * @param target the string that is to be compared with the source string.
  278. * @param targetLength the length of the target string array. If this value
  279. * is equal to -1, the string array is null-terminated.
  280. * @param status possible error code
  281. * @return Returns an enum value. UCOL_GREATER if source is greater
  282. * than target; UCOL_EQUAL if source is equal to target; UCOL_LESS if source is less
  283. * than target
  284. * @stable ICU 2.6
  285. */
  286. virtual UCollationResult compare(const char16_t* source, int32_t sourceLength,
  287. const char16_t* target, int32_t targetLength,
  288. UErrorCode &status) const override;
  289. /**
  290. * Compares two strings using the Collator.
  291. * Returns whether the first one compares less than/equal to/greater than
  292. * the second one.
  293. * This version takes UCharIterator input.
  294. * @param sIter the first ("source") string iterator
  295. * @param tIter the second ("target") string iterator
  296. * @param status ICU status
  297. * @return UCOL_LESS, UCOL_EQUAL or UCOL_GREATER
  298. * @stable ICU 4.2
  299. */
  300. virtual UCollationResult compare(UCharIterator &sIter,
  301. UCharIterator &tIter,
  302. UErrorCode &status) const override;
  303. /**
  304. * Compares two UTF-8 strings using the Collator.
  305. * Returns whether the first one compares less than/equal to/greater than
  306. * the second one.
  307. * This version takes UTF-8 input.
  308. * Note that a StringPiece can be implicitly constructed
  309. * from a std::string or a NUL-terminated const char * string.
  310. * @param source the first UTF-8 string
  311. * @param target the second UTF-8 string
  312. * @param status ICU status
  313. * @return UCOL_LESS, UCOL_EQUAL or UCOL_GREATER
  314. * @stable ICU 51
  315. */
  316. virtual UCollationResult compareUTF8(const StringPiece &source,
  317. const StringPiece &target,
  318. UErrorCode &status) const override;
  319. /**
  320. * Transforms the string into a series of characters
  321. * that can be compared with CollationKey.compare().
  322. *
  323. * Note that sort keys are often less efficient than simply doing comparison.
  324. * For more details, see the ICU User Guide.
  325. *
  326. * @param source the source string.
  327. * @param key the transformed key of the source string.
  328. * @param status the error code status.
  329. * @return the transformed key.
  330. * @see CollationKey
  331. * @stable ICU 2.0
  332. */
  333. virtual CollationKey& getCollationKey(const UnicodeString& source,
  334. CollationKey& key,
  335. UErrorCode& status) const override;
  336. /**
  337. * Transforms a specified region of the string into a series of characters
  338. * that can be compared with CollationKey.compare.
  339. *
  340. * Note that sort keys are often less efficient than simply doing comparison.
  341. * For more details, see the ICU User Guide.
  342. *
  343. * @param source the source string.
  344. * @param sourceLength the length of the source string.
  345. * @param key the transformed key of the source string.
  346. * @param status the error code status.
  347. * @return the transformed key.
  348. * @see CollationKey
  349. * @stable ICU 2.0
  350. */
  351. virtual CollationKey& getCollationKey(const char16_t *source,
  352. int32_t sourceLength,
  353. CollationKey& key,
  354. UErrorCode& status) const override;
  355. /**
  356. * Generates the hash code for the rule-based collation object.
  357. * @return the hash code.
  358. * @stable ICU 2.0
  359. */
  360. virtual int32_t hashCode() const override;
  361. #ifndef U_FORCE_HIDE_DEPRECATED_API
  362. /**
  363. * Gets the locale of the Collator
  364. * @param type can be either requested, valid or actual locale. For more
  365. * information see the definition of ULocDataLocaleType in
  366. * uloc.h
  367. * @param status the error code status.
  368. * @return locale where the collation data lives. If the collator
  369. * was instantiated from rules, locale is empty.
  370. * @deprecated ICU 2.8 likely to change in ICU 3.0, based on feedback
  371. */
  372. virtual Locale getLocale(ULocDataLocaleType type, UErrorCode& status) const override;
  373. #endif // U_FORCE_HIDE_DEPRECATED_API
  374. /**
  375. * Gets the tailoring rules for this collator.
  376. * @return the collation tailoring from which this collator was created
  377. * @stable ICU 2.0
  378. */
  379. const UnicodeString& getRules() const;
  380. /**
  381. * Gets the version information for a Collator.
  382. * @param info the version # information, the result will be filled in
  383. * @stable ICU 2.0
  384. */
  385. virtual void getVersion(UVersionInfo info) const override;
  386. #ifndef U_HIDE_DEPRECATED_API
  387. /**
  388. * Returns the maximum length of any expansion sequences that end with the
  389. * specified comparison order.
  390. *
  391. * This is specific to the kind of collation element values and sequences
  392. * returned by the CollationElementIterator.
  393. * Call CollationElementIterator::getMaxExpansion() instead.
  394. *
  395. * @param order a collation order returned by CollationElementIterator::previous
  396. * or CollationElementIterator::next.
  397. * @return maximum size of the expansion sequences ending with the collation
  398. * element, or 1 if the collation element does not occur at the end of
  399. * any expansion sequence
  400. * @see CollationElementIterator#getMaxExpansion
  401. * @deprecated ICU 51 Use CollationElementIterator::getMaxExpansion() instead.
  402. */
  403. int32_t getMaxExpansion(int32_t order) const;
  404. #endif /* U_HIDE_DEPRECATED_API */
  405. /**
  406. * Returns a unique class ID POLYMORPHICALLY. Pure virtual override. This
  407. * method is to implement a simple version of RTTI, since not all C++
  408. * compilers support genuine RTTI. Polymorphic operator==() and clone()
  409. * methods call this method.
  410. * @return The class ID for this object. All objects of a given class have
  411. * the same class ID. Objects of other classes have different class
  412. * IDs.
  413. * @stable ICU 2.0
  414. */
  415. virtual UClassID getDynamicClassID() const override;
  416. /**
  417. * Returns the class ID for this class. This is useful only for comparing to
  418. * a return value from getDynamicClassID(). For example:
  419. * <pre>
  420. * Base* polymorphic_pointer = createPolymorphicObject();
  421. * if (polymorphic_pointer->getDynamicClassID() ==
  422. * Derived::getStaticClassID()) ...
  423. * </pre>
  424. * @return The class ID for all objects of this class.
  425. * @stable ICU 2.0
  426. */
  427. static UClassID U_EXPORT2 getStaticClassID();
  428. #ifndef U_HIDE_DEPRECATED_API
  429. /**
  430. * Do not use this method: The caller and the ICU library might use different heaps.
  431. * Use cloneBinary() instead which writes to caller-provided memory.
  432. *
  433. * Returns a binary format of this collator.
  434. * @param length Returns the length of the data, in bytes
  435. * @param status the error code status.
  436. * @return memory, owned by the caller, of size 'length' bytes.
  437. * @deprecated ICU 52. Use cloneBinary() instead.
  438. */
  439. uint8_t *cloneRuleData(int32_t &length, UErrorCode &status) const;
  440. #endif /* U_HIDE_DEPRECATED_API */
  441. /** Creates a binary image of a collator. This binary image can be stored and
  442. * later used to instantiate a collator using ucol_openBinary.
  443. * This API supports preflighting.
  444. * @param buffer a fill-in buffer to receive the binary image
  445. * @param capacity capacity of the destination buffer
  446. * @param status for catching errors
  447. * @return size of the image
  448. * @see ucol_openBinary
  449. * @stable ICU 3.4
  450. */
  451. int32_t cloneBinary(uint8_t *buffer, int32_t capacity, UErrorCode &status) const;
  452. /**
  453. * Returns current rules. Delta defines whether full rules are returned or
  454. * just the tailoring.
  455. *
  456. * getRules(void) should normally be used instead.
  457. * See https://unicode-org.github.io/icu/userguide/collation/customization#building-on-existing-locales
  458. * @param delta one of UCOL_TAILORING_ONLY, UCOL_FULL_RULES.
  459. * @param buffer UnicodeString to store the result rules
  460. * @stable ICU 2.2
  461. * @see UCOL_FULL_RULES
  462. */
  463. void getRules(UColRuleOption delta, UnicodeString &buffer) const;
  464. /**
  465. * Universal attribute setter
  466. * @param attr attribute type
  467. * @param value attribute value
  468. * @param status to indicate whether the operation went on smoothly or there were errors
  469. * @stable ICU 2.2
  470. */
  471. virtual void setAttribute(UColAttribute attr, UColAttributeValue value,
  472. UErrorCode &status) override;
  473. /**
  474. * Universal attribute getter.
  475. * @param attr attribute type
  476. * @param status to indicate whether the operation went on smoothly or there were errors
  477. * @return attribute value
  478. * @stable ICU 2.2
  479. */
  480. virtual UColAttributeValue getAttribute(UColAttribute attr,
  481. UErrorCode &status) const override;
  482. /**
  483. * Sets the variable top to the top of the specified reordering group.
  484. * The variable top determines the highest-sorting character
  485. * which is affected by UCOL_ALTERNATE_HANDLING.
  486. * If that attribute is set to UCOL_NON_IGNORABLE, then the variable top has no effect.
  487. * @param group one of UCOL_REORDER_CODE_SPACE, UCOL_REORDER_CODE_PUNCTUATION,
  488. * UCOL_REORDER_CODE_SYMBOL, UCOL_REORDER_CODE_CURRENCY;
  489. * or UCOL_REORDER_CODE_DEFAULT to restore the default max variable group
  490. * @param errorCode Standard ICU error code. Its input value must
  491. * pass the U_SUCCESS() test, or else the function returns
  492. * immediately. Check for U_FAILURE() on output or use with
  493. * function chaining. (See User Guide for details.)
  494. * @return *this
  495. * @see getMaxVariable
  496. * @stable ICU 53
  497. */
  498. virtual Collator &setMaxVariable(UColReorderCode group, UErrorCode &errorCode) override;
  499. /**
  500. * Returns the maximum reordering group whose characters are affected by UCOL_ALTERNATE_HANDLING.
  501. * @return the maximum variable reordering group.
  502. * @see setMaxVariable
  503. * @stable ICU 53
  504. */
  505. virtual UColReorderCode getMaxVariable() const override;
  506. #ifndef U_FORCE_HIDE_DEPRECATED_API
  507. /**
  508. * Sets the variable top to the primary weight of the specified string.
  509. *
  510. * Beginning with ICU 53, the variable top is pinned to
  511. * the top of one of the supported reordering groups,
  512. * and it must not be beyond the last of those groups.
  513. * See setMaxVariable().
  514. * @param varTop one or more (if contraction) char16_ts to which the variable top should be set
  515. * @param len length of variable top string. If -1 it is considered to be zero terminated.
  516. * @param status error code. If error code is set, the return value is undefined. Errors set by this function are: <br>
  517. * U_CE_NOT_FOUND_ERROR if more than one character was passed and there is no such contraction<br>
  518. * U_ILLEGAL_ARGUMENT_ERROR if the variable top is beyond
  519. * the last reordering group supported by setMaxVariable()
  520. * @return variable top primary weight
  521. * @deprecated ICU 53 Call setMaxVariable() instead.
  522. */
  523. virtual uint32_t setVariableTop(const char16_t *varTop, int32_t len, UErrorCode &status) override;
  524. /**
  525. * Sets the variable top to the primary weight of the specified string.
  526. *
  527. * Beginning with ICU 53, the variable top is pinned to
  528. * the top of one of the supported reordering groups,
  529. * and it must not be beyond the last of those groups.
  530. * See setMaxVariable().
  531. * @param varTop a UnicodeString size 1 or more (if contraction) of char16_ts to which the variable top should be set
  532. * @param status error code. If error code is set, the return value is undefined. Errors set by this function are: <br>
  533. * U_CE_NOT_FOUND_ERROR if more than one character was passed and there is no such contraction<br>
  534. * U_ILLEGAL_ARGUMENT_ERROR if the variable top is beyond
  535. * the last reordering group supported by setMaxVariable()
  536. * @return variable top primary weight
  537. * @deprecated ICU 53 Call setMaxVariable() instead.
  538. */
  539. virtual uint32_t setVariableTop(const UnicodeString &varTop, UErrorCode &status) override;
  540. /**
  541. * Sets the variable top to the specified primary weight.
  542. *
  543. * Beginning with ICU 53, the variable top is pinned to
  544. * the top of one of the supported reordering groups,
  545. * and it must not be beyond the last of those groups.
  546. * See setMaxVariable().
  547. * @param varTop primary weight, as returned by setVariableTop or ucol_getVariableTop
  548. * @param status error code
  549. * @deprecated ICU 53 Call setMaxVariable() instead.
  550. */
  551. virtual void setVariableTop(uint32_t varTop, UErrorCode &status) override;
  552. #endif // U_FORCE_HIDE_DEPRECATED_API
  553. /**
  554. * Gets the variable top value of a Collator.
  555. * @param status error code (not changed by function). If error code is set, the return value is undefined.
  556. * @return the variable top primary weight
  557. * @see getMaxVariable
  558. * @stable ICU 2.0
  559. */
  560. virtual uint32_t getVariableTop(UErrorCode &status) const override;
  561. /**
  562. * Get a UnicodeSet that contains all the characters and sequences tailored in
  563. * this collator.
  564. * @param status error code of the operation
  565. * @return a pointer to a UnicodeSet object containing all the
  566. * code points and sequences that may sort differently than
  567. * in the root collator. The object must be disposed of by using delete
  568. * @stable ICU 2.4
  569. */
  570. virtual UnicodeSet *getTailoredSet(UErrorCode &status) const override;
  571. /**
  572. * Get the sort key as an array of bytes from a UnicodeString.
  573. *
  574. * Note that sort keys are often less efficient than simply doing comparison.
  575. * For more details, see the ICU User Guide.
  576. *
  577. * @param source string to be processed.
  578. * @param result buffer to store result in. If nullptr, number of bytes needed
  579. * will be returned.
  580. * @param resultLength length of the result buffer. If it is not large enough, the
  581. * buffer will be filled to capacity.
  582. * @return Number of bytes needed for storing the sort key
  583. * @stable ICU 2.0
  584. */
  585. virtual int32_t getSortKey(const UnicodeString& source, uint8_t *result,
  586. int32_t resultLength) const override;
  587. /**
  588. * Get the sort key as an array of bytes from a char16_t buffer.
  589. *
  590. * Note that sort keys are often less efficient than simply doing comparison.
  591. * For more details, see the ICU User Guide.
  592. *
  593. * @param source string to be processed.
  594. * @param sourceLength length of string to be processed. If -1, the string
  595. * is 0 terminated and length will be decided by the function.
  596. * @param result buffer to store result in. If nullptr, number of bytes needed
  597. * will be returned.
  598. * @param resultLength length of the result buffer. If it is not large enough, the
  599. * buffer will be filled to capacity.
  600. * @return Number of bytes needed for storing the sort key
  601. * @stable ICU 2.2
  602. */
  603. virtual int32_t getSortKey(const char16_t *source, int32_t sourceLength,
  604. uint8_t *result, int32_t resultLength) const override;
  605. /**
  606. * Retrieves the reordering codes for this collator.
  607. * @param dest The array to fill with the script ordering.
  608. * @param destCapacity The length of dest. If it is 0, then dest may be nullptr and the function
  609. * will only return the length of the result without writing any codes (pre-flighting).
  610. * @param status A reference to an error code value, which must not indicate
  611. * a failure before the function call.
  612. * @return The length of the script ordering array.
  613. * @see ucol_setReorderCodes
  614. * @see Collator#getEquivalentReorderCodes
  615. * @see Collator#setReorderCodes
  616. * @stable ICU 4.8
  617. */
  618. virtual int32_t getReorderCodes(int32_t *dest,
  619. int32_t destCapacity,
  620. UErrorCode& status) const override;
  621. /**
  622. * Sets the ordering of scripts for this collator.
  623. * @param reorderCodes An array of script codes in the new order. This can be nullptr if the
  624. * length is also set to 0. An empty array will clear any reordering codes on the collator.
  625. * @param reorderCodesLength The length of reorderCodes.
  626. * @param status error code
  627. * @see ucol_setReorderCodes
  628. * @see Collator#getReorderCodes
  629. * @see Collator#getEquivalentReorderCodes
  630. * @stable ICU 4.8
  631. */
  632. virtual void setReorderCodes(const int32_t* reorderCodes,
  633. int32_t reorderCodesLength,
  634. UErrorCode& status) override;
  635. /**
  636. * Implements ucol_strcollUTF8().
  637. * @internal
  638. */
  639. virtual UCollationResult internalCompareUTF8(
  640. const char *left, int32_t leftLength,
  641. const char *right, int32_t rightLength,
  642. UErrorCode &errorCode) const override;
  643. /** Get the short definition string for a collator. This internal API harvests the collator's
  644. * locale and the attribute set and produces a string that can be used for opening
  645. * a collator with the same attributes using the ucol_openFromShortString API.
  646. * This string will be normalized.
  647. * The structure and the syntax of the string is defined in the "Naming collators"
  648. * section of the users guide:
  649. * https://unicode-org.github.io/icu/userguide/collation/concepts#collator-naming-scheme
  650. * This function supports preflighting.
  651. *
  652. * This is internal, and intended to be used with delegate converters.
  653. *
  654. * @param locale a locale that will appear as a collators locale in the resulting
  655. * short string definition. If nullptr, the locale will be harvested
  656. * from the collator.
  657. * @param buffer space to hold the resulting string
  658. * @param capacity capacity of the buffer
  659. * @param status for returning errors. All the preflighting errors are featured
  660. * @return length of the resulting string
  661. * @see ucol_openFromShortString
  662. * @see ucol_normalizeShortDefinitionString
  663. * @see ucol_getShortDefinitionString
  664. * @internal
  665. */
  666. virtual int32_t internalGetShortDefinitionString(const char *locale,
  667. char *buffer,
  668. int32_t capacity,
  669. UErrorCode &status) const override;
  670. /**
  671. * Implements ucol_nextSortKeyPart().
  672. * @internal
  673. */
  674. virtual int32_t internalNextSortKeyPart(
  675. UCharIterator *iter, uint32_t state[2],
  676. uint8_t *dest, int32_t count, UErrorCode &errorCode) const override;
  677. // Do not enclose the default constructor with #ifndef U_HIDE_INTERNAL_API
  678. /**
  679. * Only for use in ucol_openRules().
  680. * @internal
  681. */
  682. RuleBasedCollator();
  683. #ifndef U_HIDE_INTERNAL_API
  684. /**
  685. * Implements ucol_getLocaleByType().
  686. * Needed because the lifetime of the locale ID string must match that of the collator.
  687. * getLocale() returns a copy of a Locale, with minimal lifetime in a C wrapper.
  688. * @internal
  689. */
  690. const char *internalGetLocaleID(ULocDataLocaleType type, UErrorCode &errorCode) const;
  691. /**
  692. * Implements ucol_getContractionsAndExpansions().
  693. * Gets this collator's sets of contraction strings and/or
  694. * characters and strings that map to multiple collation elements (expansions).
  695. * If addPrefixes is true, then contractions that are expressed as
  696. * prefix/pre-context rules are included.
  697. * @param contractions if not nullptr, the set to hold the contractions
  698. * @param expansions if not nullptr, the set to hold the expansions
  699. * @param addPrefixes include prefix contextual mappings
  700. * @param errorCode in/out ICU error code
  701. * @internal
  702. */
  703. void internalGetContractionsAndExpansions(
  704. UnicodeSet *contractions, UnicodeSet *expansions,
  705. UBool addPrefixes, UErrorCode &errorCode) const;
  706. /**
  707. * Adds the contractions that start with character c to the set.
  708. * Ignores prefixes. Used by AlphabeticIndex.
  709. * @internal
  710. */
  711. void internalAddContractions(UChar32 c, UnicodeSet &set, UErrorCode &errorCode) const;
  712. /**
  713. * Implements from-rule constructors, and ucol_openRules().
  714. * @internal
  715. */
  716. void internalBuildTailoring(
  717. const UnicodeString &rules,
  718. int32_t strength,
  719. UColAttributeValue decompositionMode,
  720. UParseError *outParseError, UnicodeString *outReason,
  721. UErrorCode &errorCode);
  722. /** @internal */
  723. static inline RuleBasedCollator *rbcFromUCollator(UCollator *uc) {
  724. return dynamic_cast<RuleBasedCollator *>(fromUCollator(uc));
  725. }
  726. /** @internal */
  727. static inline const RuleBasedCollator *rbcFromUCollator(const UCollator *uc) {
  728. return dynamic_cast<const RuleBasedCollator *>(fromUCollator(uc));
  729. }
  730. /**
  731. * Appends the CEs for the string to the vector.
  732. * @internal for tests & tools
  733. */
  734. void internalGetCEs(const UnicodeString &str, UVector64 &ces, UErrorCode &errorCode) const;
  735. #endif // U_HIDE_INTERNAL_API
  736. protected:
  737. /**
  738. * Used internally by registration to define the requested and valid locales.
  739. * @param requestedLocale the requested locale
  740. * @param validLocale the valid locale
  741. * @param actualLocale the actual locale
  742. * @internal
  743. */
  744. virtual void setLocales(const Locale& requestedLocale, const Locale& validLocale, const Locale& actualLocale) override;
  745. private:
  746. friend class CollationElementIterator;
  747. friend class Collator;
  748. RuleBasedCollator(const CollationCacheEntry *entry);
  749. /**
  750. * Enumeration of attributes that are relevant for short definition strings
  751. * (e.g., ucol_getShortDefinitionString()).
  752. * Effectively extends UColAttribute.
  753. */
  754. enum Attributes {
  755. ATTR_VARIABLE_TOP = UCOL_ATTRIBUTE_COUNT,
  756. ATTR_LIMIT
  757. };
  758. void adoptTailoring(CollationTailoring *t, UErrorCode &errorCode);
  759. // Both lengths must be <0 or else both must be >=0.
  760. UCollationResult doCompare(const char16_t *left, int32_t leftLength,
  761. const char16_t *right, int32_t rightLength,
  762. UErrorCode &errorCode) const;
  763. UCollationResult doCompare(const uint8_t *left, int32_t leftLength,
  764. const uint8_t *right, int32_t rightLength,
  765. UErrorCode &errorCode) const;
  766. void writeSortKey(const char16_t *s, int32_t length,
  767. SortKeyByteSink &sink, UErrorCode &errorCode) const;
  768. void writeIdenticalLevel(const char16_t *s, const char16_t *limit,
  769. SortKeyByteSink &sink, UErrorCode &errorCode) const;
  770. const CollationSettings &getDefaultSettings() const;
  771. void setAttributeDefault(int32_t attribute) {
  772. explicitlySetAttributes &= ~((uint32_t)1 << attribute);
  773. }
  774. void setAttributeExplicitly(int32_t attribute) {
  775. explicitlySetAttributes |= (uint32_t)1 << attribute;
  776. }
  777. UBool attributeHasBeenSetExplicitly(int32_t attribute) const {
  778. // assert(0 <= attribute < ATTR_LIMIT);
  779. return (UBool)((explicitlySetAttributes & ((uint32_t)1 << attribute)) != 0);
  780. }
  781. /**
  782. * Tests whether a character is "unsafe" for use as a collation starting point.
  783. *
  784. * @param c code point or code unit
  785. * @return true if c is unsafe
  786. * @see CollationElementIterator#setOffset(int)
  787. */
  788. UBool isUnsafe(UChar32 c) const;
  789. static void U_CALLCONV computeMaxExpansions(const CollationTailoring *t, UErrorCode &errorCode);
  790. UBool initMaxExpansions(UErrorCode &errorCode) const;
  791. void setFastLatinOptions(CollationSettings &ownedSettings) const;
  792. const CollationData *data;
  793. const CollationSettings *settings; // reference-counted
  794. const CollationTailoring *tailoring; // alias of cacheEntry->tailoring
  795. const CollationCacheEntry *cacheEntry; // reference-counted
  796. Locale validLocale;
  797. uint32_t explicitlySetAttributes;
  798. UBool actualLocaleIsSameAsValid;
  799. };
  800. U_NAMESPACE_END
  801. #endif // !UCONFIG_NO_COLLATION
  802. #endif /* U_SHOW_CPLUSPLUS_API */
  803. #endif // TBLCOLL_H