coleitr.cpp 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. *******************************************************************************
  5. * Copyright (C) 1996-2014, International Business Machines Corporation and
  6. * others. All Rights Reserved.
  7. *******************************************************************************
  8. */
  9. /*
  10. * File coleitr.cpp
  11. *
  12. * Created by: Helena Shih
  13. *
  14. * Modification History:
  15. *
  16. * Date Name Description
  17. *
  18. * 6/23/97 helena Adding comments to make code more readable.
  19. * 08/03/98 erm Synched with 1.2 version of CollationElementIterator.java
  20. * 12/10/99 aliu Ported Thai collation support from Java.
  21. * 01/25/01 swquek Modified to a C++ wrapper calling C APIs (ucoliter.h)
  22. * 02/19/01 swquek Removed CollationElementIterator() since it is
  23. * private constructor and no calls are made to it
  24. * 2012-2014 markus Rewritten in C++ again.
  25. */
  26. #include "unicode/utypes.h"
  27. #if !UCONFIG_NO_COLLATION
  28. #include "unicode/chariter.h"
  29. #include "unicode/coleitr.h"
  30. #include "unicode/tblcoll.h"
  31. #include "unicode/ustring.h"
  32. #include "cmemory.h"
  33. #include "collation.h"
  34. #include "collationdata.h"
  35. #include "collationiterator.h"
  36. #include "collationsets.h"
  37. #include "collationtailoring.h"
  38. #include "uassert.h"
  39. #include "uhash.h"
  40. #include "utf16collationiterator.h"
  41. #include "uvectr32.h"
  42. /* Constants --------------------------------------------------------------- */
  43. U_NAMESPACE_BEGIN
  44. UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CollationElementIterator)
  45. /* CollationElementIterator public constructor/destructor ------------------ */
  46. CollationElementIterator::CollationElementIterator(
  47. const CollationElementIterator& other)
  48. : UObject(other), iter_(nullptr), rbc_(nullptr), otherHalf_(0), dir_(0), offsets_(nullptr) {
  49. *this = other;
  50. }
  51. CollationElementIterator::~CollationElementIterator()
  52. {
  53. delete iter_;
  54. delete offsets_;
  55. }
  56. /* CollationElementIterator public methods --------------------------------- */
  57. namespace {
  58. uint32_t getFirstHalf(uint32_t p, uint32_t lower32) {
  59. return (p & 0xffff0000) | ((lower32 >> 16) & 0xff00) | ((lower32 >> 8) & 0xff);
  60. }
  61. uint32_t getSecondHalf(uint32_t p, uint32_t lower32) {
  62. return (p << 16) | ((lower32 >> 8) & 0xff00) | (lower32 & 0x3f);
  63. }
  64. UBool ceNeedsTwoParts(int64_t ce) {
  65. return (ce & INT64_C(0xffff00ff003f)) != 0;
  66. }
  67. } // namespace
  68. int32_t CollationElementIterator::getOffset() const
  69. {
  70. if (dir_ < 0 && offsets_ != nullptr && !offsets_->isEmpty()) {
  71. // CollationIterator::previousCE() decrements the CEs length
  72. // while it pops CEs from its internal buffer.
  73. int32_t i = iter_->getCEsLength();
  74. if (otherHalf_ != 0) {
  75. // Return the trailing CE offset while we are in the middle of a 64-bit CE.
  76. ++i;
  77. }
  78. U_ASSERT(i < offsets_->size());
  79. return offsets_->elementAti(i);
  80. }
  81. return iter_->getOffset();
  82. }
  83. /**
  84. * Get the ordering priority of the next character in the string.
  85. * @return the next character's ordering. Returns NULLORDER if an error has
  86. * occurred or if the end of string has been reached
  87. */
  88. int32_t CollationElementIterator::next(UErrorCode& status)
  89. {
  90. if (U_FAILURE(status)) { return NULLORDER; }
  91. if (dir_ > 1) {
  92. // Continue forward iteration. Test this first.
  93. if (otherHalf_ != 0) {
  94. uint32_t oh = otherHalf_;
  95. otherHalf_ = 0;
  96. return oh;
  97. }
  98. } else if (dir_ == 1) {
  99. // next() after setOffset()
  100. dir_ = 2;
  101. } else if (dir_ == 0) {
  102. // The iter_ is already reset to the start of the text.
  103. dir_ = 2;
  104. } else /* dir_ < 0 */ {
  105. // illegal change of direction
  106. status = U_INVALID_STATE_ERROR;
  107. return NULLORDER;
  108. }
  109. // No need to keep all CEs in the buffer when we iterate.
  110. iter_->clearCEsIfNoneRemaining();
  111. int64_t ce = iter_->nextCE(status);
  112. if (ce == Collation::NO_CE) { return NULLORDER; }
  113. // Turn the 64-bit CE into two old-style 32-bit CEs, without quaternary bits.
  114. uint32_t p = (uint32_t)(ce >> 32);
  115. uint32_t lower32 = (uint32_t)ce;
  116. uint32_t firstHalf = getFirstHalf(p, lower32);
  117. uint32_t secondHalf = getSecondHalf(p, lower32);
  118. if (secondHalf != 0) {
  119. otherHalf_ = secondHalf | 0xc0; // continuation CE
  120. }
  121. return firstHalf;
  122. }
  123. bool CollationElementIterator::operator!=(
  124. const CollationElementIterator& other) const
  125. {
  126. return !(*this == other);
  127. }
  128. bool CollationElementIterator::operator==(
  129. const CollationElementIterator& that) const
  130. {
  131. if (this == &that) {
  132. return true;
  133. }
  134. return
  135. (rbc_ == that.rbc_ || *rbc_ == *that.rbc_) &&
  136. otherHalf_ == that.otherHalf_ &&
  137. normalizeDir() == that.normalizeDir() &&
  138. string_ == that.string_ &&
  139. *iter_ == *that.iter_;
  140. }
  141. /**
  142. * Get the ordering priority of the previous collation element in the string.
  143. * @param status the error code status.
  144. * @return the previous element's ordering. Returns NULLORDER if an error has
  145. * occurred or if the start of string has been reached.
  146. */
  147. int32_t CollationElementIterator::previous(UErrorCode& status)
  148. {
  149. if (U_FAILURE(status)) { return NULLORDER; }
  150. if (dir_ < 0) {
  151. // Continue backwards iteration. Test this first.
  152. if (otherHalf_ != 0) {
  153. uint32_t oh = otherHalf_;
  154. otherHalf_ = 0;
  155. return oh;
  156. }
  157. } else if (dir_ == 0) {
  158. iter_->resetToOffset(string_.length());
  159. dir_ = -1;
  160. } else if (dir_ == 1) {
  161. // previous() after setOffset()
  162. dir_ = -1;
  163. } else /* dir_ > 1 */ {
  164. // illegal change of direction
  165. status = U_INVALID_STATE_ERROR;
  166. return NULLORDER;
  167. }
  168. if (offsets_ == nullptr) {
  169. offsets_ = new UVector32(status);
  170. if (offsets_ == nullptr) {
  171. status = U_MEMORY_ALLOCATION_ERROR;
  172. return NULLORDER;
  173. }
  174. }
  175. // If we already have expansion CEs, then we also have offsets.
  176. // Otherwise remember the trailing offset in case we need to
  177. // write offsets for an artificial expansion.
  178. int32_t limitOffset = iter_->getCEsLength() == 0 ? iter_->getOffset() : 0;
  179. int64_t ce = iter_->previousCE(*offsets_, status);
  180. if (ce == Collation::NO_CE) { return NULLORDER; }
  181. // Turn the 64-bit CE into two old-style 32-bit CEs, without quaternary bits.
  182. uint32_t p = (uint32_t)(ce >> 32);
  183. uint32_t lower32 = (uint32_t)ce;
  184. uint32_t firstHalf = getFirstHalf(p, lower32);
  185. uint32_t secondHalf = getSecondHalf(p, lower32);
  186. if (secondHalf != 0) {
  187. if (offsets_->isEmpty()) {
  188. // When we convert a single 64-bit CE into two 32-bit CEs,
  189. // we need to make this artificial expansion behave like a normal expansion.
  190. // See CollationIterator::previousCE().
  191. offsets_->addElement(iter_->getOffset(), status);
  192. offsets_->addElement(limitOffset, status);
  193. }
  194. otherHalf_ = firstHalf;
  195. return secondHalf | 0xc0; // continuation CE
  196. }
  197. return firstHalf;
  198. }
  199. /**
  200. * Resets the cursor to the beginning of the string.
  201. */
  202. void CollationElementIterator::reset()
  203. {
  204. iter_ ->resetToOffset(0);
  205. otherHalf_ = 0;
  206. dir_ = 0;
  207. }
  208. void CollationElementIterator::setOffset(int32_t newOffset,
  209. UErrorCode& status)
  210. {
  211. if (U_FAILURE(status)) { return; }
  212. if (0 < newOffset && newOffset < string_.length()) {
  213. int32_t offset = newOffset;
  214. do {
  215. char16_t c = string_.charAt(offset);
  216. if (!rbc_->isUnsafe(c) ||
  217. (U16_IS_LEAD(c) && !rbc_->isUnsafe(string_.char32At(offset)))) {
  218. break;
  219. }
  220. // Back up to before this unsafe character.
  221. --offset;
  222. } while (offset > 0);
  223. if (offset < newOffset) {
  224. // We might have backed up more than necessary.
  225. // For example, contractions "ch" and "cu" make both 'h' and 'u' unsafe,
  226. // but for text "chu" setOffset(2) should remain at 2
  227. // although we initially back up to offset 0.
  228. // Find the last safe offset no greater than newOffset by iterating forward.
  229. int32_t lastSafeOffset = offset;
  230. do {
  231. iter_->resetToOffset(lastSafeOffset);
  232. do {
  233. iter_->nextCE(status);
  234. if (U_FAILURE(status)) { return; }
  235. } while ((offset = iter_->getOffset()) == lastSafeOffset);
  236. if (offset <= newOffset) {
  237. lastSafeOffset = offset;
  238. }
  239. } while (offset < newOffset);
  240. newOffset = lastSafeOffset;
  241. }
  242. }
  243. iter_->resetToOffset(newOffset);
  244. otherHalf_ = 0;
  245. dir_ = 1;
  246. }
  247. /**
  248. * Sets the source to the new source string.
  249. */
  250. void CollationElementIterator::setText(const UnicodeString& source,
  251. UErrorCode& status)
  252. {
  253. if (U_FAILURE(status)) {
  254. return;
  255. }
  256. string_ = source;
  257. const char16_t *s = string_.getBuffer();
  258. CollationIterator *newIter;
  259. UBool numeric = rbc_->settings->isNumeric();
  260. if (rbc_->settings->dontCheckFCD()) {
  261. newIter = new UTF16CollationIterator(rbc_->data, numeric, s, s, s + string_.length());
  262. } else {
  263. newIter = new FCDUTF16CollationIterator(rbc_->data, numeric, s, s, s + string_.length());
  264. }
  265. if (newIter == nullptr) {
  266. status = U_MEMORY_ALLOCATION_ERROR;
  267. return;
  268. }
  269. delete iter_;
  270. iter_ = newIter;
  271. otherHalf_ = 0;
  272. dir_ = 0;
  273. }
  274. // Sets the source to the new character iterator.
  275. void CollationElementIterator::setText(CharacterIterator& source,
  276. UErrorCode& status)
  277. {
  278. if (U_FAILURE(status))
  279. return;
  280. source.getText(string_);
  281. setText(string_, status);
  282. }
  283. int32_t CollationElementIterator::strengthOrder(int32_t order) const
  284. {
  285. UColAttributeValue s = (UColAttributeValue)rbc_->settings->getStrength();
  286. // Mask off the unwanted differences.
  287. if (s == UCOL_PRIMARY) {
  288. order &= 0xffff0000;
  289. }
  290. else if (s == UCOL_SECONDARY) {
  291. order &= 0xffffff00;
  292. }
  293. return order;
  294. }
  295. /* CollationElementIterator private constructors/destructors --------------- */
  296. /**
  297. * This is the "real" constructor for this class; it constructs an iterator
  298. * over the source text using the specified collator
  299. */
  300. CollationElementIterator::CollationElementIterator(
  301. const UnicodeString &source,
  302. const RuleBasedCollator *coll,
  303. UErrorCode &status)
  304. : iter_(nullptr), rbc_(coll), otherHalf_(0), dir_(0), offsets_(nullptr) {
  305. setText(source, status);
  306. }
  307. /**
  308. * This is the "real" constructor for this class; it constructs an iterator over
  309. * the source text using the specified collator
  310. */
  311. CollationElementIterator::CollationElementIterator(
  312. const CharacterIterator &source,
  313. const RuleBasedCollator *coll,
  314. UErrorCode &status)
  315. : iter_(nullptr), rbc_(coll), otherHalf_(0), dir_(0), offsets_(nullptr) {
  316. // We only call source.getText() which should be const anyway.
  317. setText(const_cast<CharacterIterator &>(source), status);
  318. }
  319. /* CollationElementIterator private methods -------------------------------- */
  320. const CollationElementIterator& CollationElementIterator::operator=(
  321. const CollationElementIterator& other)
  322. {
  323. if (this == &other) {
  324. return *this;
  325. }
  326. CollationIterator *newIter;
  327. const FCDUTF16CollationIterator *otherFCDIter =
  328. dynamic_cast<const FCDUTF16CollationIterator *>(other.iter_);
  329. if(otherFCDIter != nullptr) {
  330. newIter = new FCDUTF16CollationIterator(*otherFCDIter, string_.getBuffer());
  331. } else {
  332. const UTF16CollationIterator *otherIter =
  333. dynamic_cast<const UTF16CollationIterator *>(other.iter_);
  334. if(otherIter != nullptr) {
  335. newIter = new UTF16CollationIterator(*otherIter, string_.getBuffer());
  336. } else {
  337. newIter = nullptr;
  338. }
  339. }
  340. if(newIter != nullptr) {
  341. delete iter_;
  342. iter_ = newIter;
  343. rbc_ = other.rbc_;
  344. otherHalf_ = other.otherHalf_;
  345. dir_ = other.dir_;
  346. string_ = other.string_;
  347. }
  348. if(other.dir_ < 0 && other.offsets_ != nullptr && !other.offsets_->isEmpty()) {
  349. UErrorCode errorCode = U_ZERO_ERROR;
  350. if(offsets_ == nullptr) {
  351. offsets_ = new UVector32(other.offsets_->size(), errorCode);
  352. }
  353. if(offsets_ != nullptr) {
  354. offsets_->assign(*other.offsets_, errorCode);
  355. }
  356. }
  357. return *this;
  358. }
  359. namespace {
  360. class MaxExpSink : public ContractionsAndExpansions::CESink {
  361. public:
  362. MaxExpSink(UHashtable *h, UErrorCode &ec) : maxExpansions(h), errorCode(ec) {}
  363. virtual ~MaxExpSink();
  364. virtual void handleCE(int64_t /*ce*/) override {}
  365. virtual void handleExpansion(const int64_t ces[], int32_t length) override {
  366. if (length <= 1) {
  367. // We do not need to add single CEs into the map.
  368. return;
  369. }
  370. int32_t count = 0; // number of CE "halves"
  371. for (int32_t i = 0; i < length; ++i) {
  372. count += ceNeedsTwoParts(ces[i]) ? 2 : 1;
  373. }
  374. // last "half" of the last CE
  375. int64_t ce = ces[length - 1];
  376. uint32_t p = (uint32_t)(ce >> 32);
  377. uint32_t lower32 = (uint32_t)ce;
  378. uint32_t lastHalf = getSecondHalf(p, lower32);
  379. if (lastHalf == 0) {
  380. lastHalf = getFirstHalf(p, lower32);
  381. U_ASSERT(lastHalf != 0);
  382. } else {
  383. lastHalf |= 0xc0; // old-style continuation CE
  384. }
  385. if (count > uhash_igeti(maxExpansions, (int32_t)lastHalf)) {
  386. uhash_iputi(maxExpansions, (int32_t)lastHalf, count, &errorCode);
  387. }
  388. }
  389. private:
  390. UHashtable *maxExpansions;
  391. UErrorCode &errorCode;
  392. };
  393. MaxExpSink::~MaxExpSink() {}
  394. } // namespace
  395. UHashtable *
  396. CollationElementIterator::computeMaxExpansions(const CollationData *data, UErrorCode &errorCode) {
  397. if (U_FAILURE(errorCode)) { return nullptr; }
  398. UHashtable *maxExpansions = uhash_open(uhash_hashLong, uhash_compareLong,
  399. uhash_compareLong, &errorCode);
  400. if (U_FAILURE(errorCode)) { return nullptr; }
  401. MaxExpSink sink(maxExpansions, errorCode);
  402. ContractionsAndExpansions(nullptr, nullptr, &sink, true).forData(data, errorCode);
  403. if (U_FAILURE(errorCode)) {
  404. uhash_close(maxExpansions);
  405. return nullptr;
  406. }
  407. return maxExpansions;
  408. }
  409. int32_t
  410. CollationElementIterator::getMaxExpansion(int32_t order) const {
  411. return getMaxExpansion(rbc_->tailoring->maxExpansions, order);
  412. }
  413. int32_t
  414. CollationElementIterator::getMaxExpansion(const UHashtable *maxExpansions, int32_t order) {
  415. if (order == 0) { return 1; }
  416. int32_t max;
  417. if(maxExpansions != nullptr && (max = uhash_igeti(maxExpansions, order)) != 0) {
  418. return max;
  419. }
  420. if ((order & 0xc0) == 0xc0) {
  421. // old-style continuation CE
  422. return 2;
  423. } else {
  424. return 1;
  425. }
  426. }
  427. U_NAMESPACE_END
  428. #endif /* #if !UCONFIG_NO_COLLATION */