coleitr.h 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. ******************************************************************************
  5. * Copyright (C) 1997-2014, International Business Machines
  6. * Corporation and others. All Rights Reserved.
  7. ******************************************************************************
  8. */
  9. /**
  10. * \file
  11. * \brief C++ API: Collation Element Iterator.
  12. */
  13. /**
  14. * File coleitr.h
  15. *
  16. * Created by: Helena Shih
  17. *
  18. * Modification History:
  19. *
  20. * Date Name Description
  21. *
  22. * 8/18/97 helena Added internal API documentation.
  23. * 08/03/98 erm Synched with 1.2 version CollationElementIterator.java
  24. * 12/10/99 aliu Ported Thai collation support from Java.
  25. * 01/25/01 swquek Modified into a C++ wrapper calling C APIs (ucoliter.h)
  26. * 02/19/01 swquek Removed CollationElementsIterator() since it is
  27. * private constructor and no calls are made to it
  28. * 2012-2014 markus Rewritten in C++ again.
  29. */
  30. #ifndef COLEITR_H
  31. #define COLEITR_H
  32. #include "unicode/utypes.h"
  33. #if U_SHOW_CPLUSPLUS_API
  34. #if !UCONFIG_NO_COLLATION
  35. #include "unicode/unistr.h"
  36. #include "unicode/uobject.h"
  37. struct UCollationElements;
  38. struct UHashtable;
  39. U_NAMESPACE_BEGIN
  40. struct CollationData;
  41. class CharacterIterator;
  42. class CollationIterator;
  43. class RuleBasedCollator;
  44. class UCollationPCE;
  45. class UVector32;
  46. /**
  47. * The CollationElementIterator class is used as an iterator to walk through
  48. * each character of an international string. Use the iterator to return the
  49. * ordering priority of the positioned character. The ordering priority of a
  50. * character, which we refer to as a key, defines how a character is collated in
  51. * the given collation object.
  52. * For example, consider the following in Slovak and in traditional Spanish collation:
  53. * <pre>
  54. * "ca" -> the first key is key('c') and second key is key('a').
  55. * "cha" -> the first key is key('ch') and second key is key('a').</pre>
  56. * And in German phonebook collation,
  57. * <pre> \htmlonly "&#x00E6;b"-> the first key is key('a'), the second key is key('e'), and
  58. * the third key is key('b'). \endhtmlonly </pre>
  59. * The key of a character is an integer composed of primary order(short),
  60. * secondary order(char), and tertiary order(char). Java strictly defines the
  61. * size and signedness of its primitive data types. Therefore, the static
  62. * functions primaryOrder(), secondaryOrder(), and tertiaryOrder() return
  63. * int32_t to ensure the correctness of the key value.
  64. * <p>Example of the iterator usage: (without error checking)
  65. * <pre>
  66. * \code
  67. * void CollationElementIterator_Example()
  68. * {
  69. * UnicodeString str = "This is a test";
  70. * UErrorCode success = U_ZERO_ERROR;
  71. * RuleBasedCollator* rbc =
  72. * (RuleBasedCollator*) RuleBasedCollator::createInstance(success);
  73. * CollationElementIterator* c =
  74. * rbc->createCollationElementIterator( str );
  75. * int32_t order = c->next(success);
  76. * c->reset();
  77. * order = c->previous(success);
  78. * delete c;
  79. * delete rbc;
  80. * }
  81. * \endcode
  82. * </pre>
  83. * <p>
  84. * The method next() returns the collation order of the next character based on
  85. * the comparison level of the collator. The method previous() returns the
  86. * collation order of the previous character based on the comparison level of
  87. * the collator. The Collation Element Iterator moves only in one direction
  88. * between calls to reset(), setOffset(), or setText(). That is, next()
  89. * and previous() cannot be interleaved. Whenever previous() is to be called after
  90. * next() or vice versa, reset(), setOffset() or setText() has to be called first
  91. * to reset the status, shifting pointers to either the end or the start of
  92. * the string (reset() or setText()), or the specified position (setOffset()).
  93. * Hence at the next call of next() or previous(), the first or last collation order,
  94. * or collation order at the specified position will be returned. If a change of
  95. * direction is done without one of these calls, the result is undefined.
  96. * <p>
  97. * The result of a forward iterate (next()) and reversed result of the backward
  98. * iterate (previous()) on the same string are equivalent, if collation orders
  99. * with the value 0 are ignored.
  100. * A collation order
  101. * consists of primary order, secondary order and tertiary order. The data
  102. * type of the collation order is <strong>int32_t</strong>.
  103. *
  104. * Note, CollationElementIterator should not be subclassed.
  105. * @see Collator
  106. * @see RuleBasedCollator
  107. * @version 1.8 Jan 16 2001
  108. */
  109. class U_I18N_API CollationElementIterator final : public UObject {
  110. public:
  111. // CollationElementIterator public data member ------------------------------
  112. enum {
  113. /**
  114. * NULLORDER indicates that an error has occurred while processing
  115. * @stable ICU 2.0
  116. */
  117. NULLORDER = (int32_t)0xffffffff
  118. };
  119. // CollationElementIterator public constructor/destructor -------------------
  120. /**
  121. * Copy constructor.
  122. *
  123. * @param other the object to be copied from
  124. * @stable ICU 2.0
  125. */
  126. CollationElementIterator(const CollationElementIterator& other);
  127. /**
  128. * Destructor
  129. * @stable ICU 2.0
  130. */
  131. virtual ~CollationElementIterator();
  132. // CollationElementIterator public methods ----------------------------------
  133. /**
  134. * Returns true if "other" is the same as "this"
  135. *
  136. * @param other the object to be compared
  137. * @return true if "other" is the same as "this"
  138. * @stable ICU 2.0
  139. */
  140. bool operator==(const CollationElementIterator& other) const;
  141. /**
  142. * Returns true if "other" is not the same as "this".
  143. *
  144. * @param other the object to be compared
  145. * @return true if "other" is not the same as "this"
  146. * @stable ICU 2.0
  147. */
  148. bool operator!=(const CollationElementIterator& other) const;
  149. /**
  150. * Resets the cursor to the beginning of the string.
  151. * @stable ICU 2.0
  152. */
  153. void reset();
  154. /**
  155. * Gets the ordering priority of the next character in the string.
  156. * @param status the error code status.
  157. * @return the next character's ordering. otherwise returns NULLORDER if an
  158. * error has occurred or if the end of string has been reached
  159. * @stable ICU 2.0
  160. */
  161. int32_t next(UErrorCode& status);
  162. /**
  163. * Get the ordering priority of the previous collation element in the string.
  164. * @param status the error code status.
  165. * @return the previous element's ordering. otherwise returns NULLORDER if an
  166. * error has occurred or if the start of string has been reached
  167. * @stable ICU 2.0
  168. */
  169. int32_t previous(UErrorCode& status);
  170. /**
  171. * Gets the primary order of a collation order.
  172. * @param order the collation order
  173. * @return the primary order of a collation order.
  174. * @stable ICU 2.0
  175. */
  176. static inline int32_t primaryOrder(int32_t order);
  177. /**
  178. * Gets the secondary order of a collation order.
  179. * @param order the collation order
  180. * @return the secondary order of a collation order.
  181. * @stable ICU 2.0
  182. */
  183. static inline int32_t secondaryOrder(int32_t order);
  184. /**
  185. * Gets the tertiary order of a collation order.
  186. * @param order the collation order
  187. * @return the tertiary order of a collation order.
  188. * @stable ICU 2.0
  189. */
  190. static inline int32_t tertiaryOrder(int32_t order);
  191. /**
  192. * Return the maximum length of any expansion sequences that end with the
  193. * specified comparison order.
  194. * @param order a collation order returned by previous or next.
  195. * @return maximum size of the expansion sequences ending with the collation
  196. * element or 1 if collation element does not occur at the end of any
  197. * expansion sequence
  198. * @stable ICU 2.0
  199. */
  200. int32_t getMaxExpansion(int32_t order) const;
  201. /**
  202. * Gets the comparison order in the desired strength. Ignore the other
  203. * differences.
  204. * @param order The order value
  205. * @stable ICU 2.0
  206. */
  207. int32_t strengthOrder(int32_t order) const;
  208. /**
  209. * Sets the source string.
  210. * @param str the source string.
  211. * @param status the error code status.
  212. * @stable ICU 2.0
  213. */
  214. void setText(const UnicodeString& str, UErrorCode& status);
  215. /**
  216. * Sets the source string.
  217. * @param str the source character iterator.
  218. * @param status the error code status.
  219. * @stable ICU 2.0
  220. */
  221. void setText(CharacterIterator& str, UErrorCode& status);
  222. /**
  223. * Checks if a comparison order is ignorable.
  224. * @param order the collation order.
  225. * @return true if a character is ignorable, false otherwise.
  226. * @stable ICU 2.0
  227. */
  228. static inline UBool isIgnorable(int32_t order);
  229. /**
  230. * Gets the offset of the currently processed character in the source string.
  231. * @return the offset of the character.
  232. * @stable ICU 2.0
  233. */
  234. int32_t getOffset() const;
  235. /**
  236. * Sets the offset of the currently processed character in the source string.
  237. * @param newOffset the new offset.
  238. * @param status the error code status.
  239. * @return the offset of the character.
  240. * @stable ICU 2.0
  241. */
  242. void setOffset(int32_t newOffset, UErrorCode& status);
  243. /**
  244. * ICU "poor man's RTTI", returns a UClassID for the actual class.
  245. *
  246. * @stable ICU 2.2
  247. */
  248. virtual UClassID getDynamicClassID() const override;
  249. /**
  250. * ICU "poor man's RTTI", returns a UClassID for this class.
  251. *
  252. * @stable ICU 2.2
  253. */
  254. static UClassID U_EXPORT2 getStaticClassID();
  255. #ifndef U_HIDE_INTERNAL_API
  256. /** @internal */
  257. static inline CollationElementIterator *fromUCollationElements(UCollationElements *uc) {
  258. return reinterpret_cast<CollationElementIterator *>(uc);
  259. }
  260. /** @internal */
  261. static inline const CollationElementIterator *fromUCollationElements(const UCollationElements *uc) {
  262. return reinterpret_cast<const CollationElementIterator *>(uc);
  263. }
  264. /** @internal */
  265. inline UCollationElements *toUCollationElements() {
  266. return reinterpret_cast<UCollationElements *>(this);
  267. }
  268. /** @internal */
  269. inline const UCollationElements *toUCollationElements() const {
  270. return reinterpret_cast<const UCollationElements *>(this);
  271. }
  272. #endif // U_HIDE_INTERNAL_API
  273. private:
  274. friend class RuleBasedCollator;
  275. friend class UCollationPCE;
  276. /**
  277. * CollationElementIterator constructor. This takes the source string and the
  278. * collation object. The cursor will walk thru the source string based on the
  279. * predefined collation rules. If the source string is empty, NULLORDER will
  280. * be returned on the calls to next().
  281. * @param sourceText the source string.
  282. * @param order the collation object.
  283. * @param status the error code status.
  284. */
  285. CollationElementIterator(const UnicodeString& sourceText,
  286. const RuleBasedCollator* order, UErrorCode& status);
  287. // Note: The constructors should take settings & tailoring, not a collator,
  288. // to avoid circular dependencies.
  289. // However, for operator==() we would need to be able to compare tailoring data for equality
  290. // without making CollationData or CollationTailoring depend on TailoredSet.
  291. // (See the implementation of RuleBasedCollator::operator==().)
  292. // That might require creating an intermediate class that would be used
  293. // by both CollationElementIterator and RuleBasedCollator
  294. // but only contain the part of RBC== related to data and rules.
  295. /**
  296. * CollationElementIterator constructor. This takes the source string and the
  297. * collation object. The cursor will walk thru the source string based on the
  298. * predefined collation rules. If the source string is empty, NULLORDER will
  299. * be returned on the calls to next().
  300. * @param sourceText the source string.
  301. * @param order the collation object.
  302. * @param status the error code status.
  303. */
  304. CollationElementIterator(const CharacterIterator& sourceText,
  305. const RuleBasedCollator* order, UErrorCode& status);
  306. /**
  307. * Assignment operator
  308. *
  309. * @param other the object to be copied
  310. */
  311. const CollationElementIterator&
  312. operator=(const CollationElementIterator& other);
  313. CollationElementIterator() = delete; // default constructor not implemented
  314. /** Normalizes dir_=1 (just after setOffset()) to dir_=0 (just after reset()). */
  315. inline int8_t normalizeDir() const { return dir_ == 1 ? 0 : dir_; }
  316. static UHashtable *computeMaxExpansions(const CollationData *data, UErrorCode &errorCode);
  317. static int32_t getMaxExpansion(const UHashtable *maxExpansions, int32_t order);
  318. // CollationElementIterator private data members ----------------------------
  319. CollationIterator *iter_; // owned
  320. const RuleBasedCollator *rbc_; // aliased
  321. uint32_t otherHalf_;
  322. /**
  323. * <0: backwards; 0: just after reset() (previous() begins from end);
  324. * 1: just after setOffset(); >1: forward
  325. */
  326. int8_t dir_;
  327. /**
  328. * Stores offsets from expansions and from unsafe-backwards iteration,
  329. * so that getOffset() returns intermediate offsets for the CEs
  330. * that are consistent with forward iteration.
  331. */
  332. UVector32 *offsets_;
  333. UnicodeString string_;
  334. };
  335. // CollationElementIterator inline method definitions --------------------------
  336. inline int32_t CollationElementIterator::primaryOrder(int32_t order)
  337. {
  338. return (order >> 16) & 0xffff;
  339. }
  340. inline int32_t CollationElementIterator::secondaryOrder(int32_t order)
  341. {
  342. return (order >> 8) & 0xff;
  343. }
  344. inline int32_t CollationElementIterator::tertiaryOrder(int32_t order)
  345. {
  346. return order & 0xff;
  347. }
  348. inline UBool CollationElementIterator::isIgnorable(int32_t order)
  349. {
  350. return (order & 0xffff0000) == 0;
  351. }
  352. U_NAMESPACE_END
  353. #endif /* #if !UCONFIG_NO_COLLATION */
  354. #endif /* U_SHOW_CPLUSPLUS_API */
  355. #endif