123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473 |
- // © 2016 and later: Unicode, Inc. and others.
- // License & terms of use: http://www.unicode.org/copyright.html
- /*
- *******************************************************************************
- * Copyright (C) 1996-2014, International Business Machines Corporation and
- * others. All Rights Reserved.
- *******************************************************************************
- */
- /*
- * File coleitr.cpp
- *
- * Created by: Helena Shih
- *
- * Modification History:
- *
- * Date Name Description
- *
- * 6/23/97 helena Adding comments to make code more readable.
- * 08/03/98 erm Synched with 1.2 version of CollationElementIterator.java
- * 12/10/99 aliu Ported Thai collation support from Java.
- * 01/25/01 swquek Modified to a C++ wrapper calling C APIs (ucoliter.h)
- * 02/19/01 swquek Removed CollationElementIterator() since it is
- * private constructor and no calls are made to it
- * 2012-2014 markus Rewritten in C++ again.
- */
- #include "unicode/utypes.h"
- #if !UCONFIG_NO_COLLATION
- #include "unicode/chariter.h"
- #include "unicode/coleitr.h"
- #include "unicode/tblcoll.h"
- #include "unicode/ustring.h"
- #include "cmemory.h"
- #include "collation.h"
- #include "collationdata.h"
- #include "collationiterator.h"
- #include "collationsets.h"
- #include "collationtailoring.h"
- #include "uassert.h"
- #include "uhash.h"
- #include "utf16collationiterator.h"
- #include "uvectr32.h"
- /* Constants --------------------------------------------------------------- */
- U_NAMESPACE_BEGIN
- UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CollationElementIterator)
- /* CollationElementIterator public constructor/destructor ------------------ */
- CollationElementIterator::CollationElementIterator(
- const CollationElementIterator& other)
- : UObject(other), iter_(nullptr), rbc_(nullptr), otherHalf_(0), dir_(0), offsets_(nullptr) {
- *this = other;
- }
- CollationElementIterator::~CollationElementIterator()
- {
- delete iter_;
- delete offsets_;
- }
- /* CollationElementIterator public methods --------------------------------- */
- namespace {
// Builds the first old-style 32-bit CE from a 64-bit CE that has been split
// into its upper 32 bits (p) and its lower 32 bits (lower32).
// Result layout: bits 31..16 of p, then bits 31..24 of lower32,
// then bits 15..8 of lower32.
uint32_t getFirstHalf(uint32_t p, uint32_t lower32) {
    const uint32_t top16 = p & 0xffff0000;
    const uint32_t middle8 = (lower32 >> 16) & 0xff00;
    const uint32_t bottom8 = (lower32 >> 8) & 0xff;
    return top16 | middle8 | bottom8;
}
// Builds the second old-style 32-bit CE from a 64-bit CE that has been split
// into its upper 32 bits (p) and its lower 32 bits (lower32).
// Result layout: bits 15..0 of p, then bits 23..16 of lower32,
// then bits 5..0 of lower32 in the lowest byte.
uint32_t getSecondHalf(uint32_t p, uint32_t lower32) {
    const uint32_t top16 = p << 16;
    const uint32_t middle8 = (lower32 >> 8) & 0xff00;
    const uint32_t bottom6 = lower32 & 0x3f;
    return top16 | middle8 | bottom6;
}
- UBool ceNeedsTwoParts(int64_t ce) {
- return (ce & INT64_C(0xffff00ff003f)) != 0;
- }
- } // namespace
- int32_t CollationElementIterator::getOffset() const
- {
- if (dir_ < 0 && offsets_ != nullptr && !offsets_->isEmpty()) {
- // CollationIterator::previousCE() decrements the CEs length
- // while it pops CEs from its internal buffer.
- int32_t i = iter_->getCEsLength();
- if (otherHalf_ != 0) {
- // Return the trailing CE offset while we are in the middle of a 64-bit CE.
- ++i;
- }
- U_ASSERT(i < offsets_->size());
- return offsets_->elementAti(i);
- }
- return iter_->getOffset();
- }
- /**
- * Get the ordering priority of the next character in the string.
- * @return the next character's ordering. Returns NULLORDER if an error has
- * occurred or if the end of string has been reached
- */
- int32_t CollationElementIterator::next(UErrorCode& status)
- {
- if (U_FAILURE(status)) { return NULLORDER; }
- if (dir_ > 1) {
- // Continue forward iteration. Test this first.
- if (otherHalf_ != 0) {
- uint32_t oh = otherHalf_;
- otherHalf_ = 0;
- return oh;
- }
- } else if (dir_ == 1) {
- // next() after setOffset()
- dir_ = 2;
- } else if (dir_ == 0) {
- // The iter_ is already reset to the start of the text.
- dir_ = 2;
- } else /* dir_ < 0 */ {
- // illegal change of direction
- status = U_INVALID_STATE_ERROR;
- return NULLORDER;
- }
- // No need to keep all CEs in the buffer when we iterate.
- iter_->clearCEsIfNoneRemaining();
- int64_t ce = iter_->nextCE(status);
- if (ce == Collation::NO_CE) { return NULLORDER; }
- // Turn the 64-bit CE into two old-style 32-bit CEs, without quaternary bits.
- uint32_t p = (uint32_t)(ce >> 32);
- uint32_t lower32 = (uint32_t)ce;
- uint32_t firstHalf = getFirstHalf(p, lower32);
- uint32_t secondHalf = getSecondHalf(p, lower32);
- if (secondHalf != 0) {
- otherHalf_ = secondHalf | 0xc0; // continuation CE
- }
- return firstHalf;
- }
- bool CollationElementIterator::operator!=(
- const CollationElementIterator& other) const
- {
- return !(*this == other);
- }
- bool CollationElementIterator::operator==(
- const CollationElementIterator& that) const
- {
- if (this == &that) {
- return true;
- }
- return
- (rbc_ == that.rbc_ || *rbc_ == *that.rbc_) &&
- otherHalf_ == that.otherHalf_ &&
- normalizeDir() == that.normalizeDir() &&
- string_ == that.string_ &&
- *iter_ == *that.iter_;
- }
- /**
- * Get the ordering priority of the previous collation element in the string.
- * @param status the error code status.
- * @return the previous element's ordering. Returns NULLORDER if an error has
- * occurred or if the start of string has been reached.
- */
- int32_t CollationElementIterator::previous(UErrorCode& status)
- {
- if (U_FAILURE(status)) { return NULLORDER; }
- if (dir_ < 0) {
- // Continue backwards iteration. Test this first.
- if (otherHalf_ != 0) {
- uint32_t oh = otherHalf_;
- otherHalf_ = 0;
- return oh;
- }
- } else if (dir_ == 0) {
- iter_->resetToOffset(string_.length());
- dir_ = -1;
- } else if (dir_ == 1) {
- // previous() after setOffset()
- dir_ = -1;
- } else /* dir_ > 1 */ {
- // illegal change of direction
- status = U_INVALID_STATE_ERROR;
- return NULLORDER;
- }
- if (offsets_ == nullptr) {
- offsets_ = new UVector32(status);
- if (offsets_ == nullptr) {
- status = U_MEMORY_ALLOCATION_ERROR;
- return NULLORDER;
- }
- }
- // If we already have expansion CEs, then we also have offsets.
- // Otherwise remember the trailing offset in case we need to
- // write offsets for an artificial expansion.
- int32_t limitOffset = iter_->getCEsLength() == 0 ? iter_->getOffset() : 0;
- int64_t ce = iter_->previousCE(*offsets_, status);
- if (ce == Collation::NO_CE) { return NULLORDER; }
- // Turn the 64-bit CE into two old-style 32-bit CEs, without quaternary bits.
- uint32_t p = (uint32_t)(ce >> 32);
- uint32_t lower32 = (uint32_t)ce;
- uint32_t firstHalf = getFirstHalf(p, lower32);
- uint32_t secondHalf = getSecondHalf(p, lower32);
- if (secondHalf != 0) {
- if (offsets_->isEmpty()) {
- // When we convert a single 64-bit CE into two 32-bit CEs,
- // we need to make this artificial expansion behave like a normal expansion.
- // See CollationIterator::previousCE().
- offsets_->addElement(iter_->getOffset(), status);
- offsets_->addElement(limitOffset, status);
- }
- otherHalf_ = firstHalf;
- return secondHalf | 0xc0; // continuation CE
- }
- return firstHalf;
- }
- /**
- * Resets the cursor to the beginning of the string.
- */
- void CollationElementIterator::reset()
- {
- iter_ ->resetToOffset(0);
- otherHalf_ = 0;
- dir_ = 0;
- }
- void CollationElementIterator::setOffset(int32_t newOffset,
- UErrorCode& status)
- {
- if (U_FAILURE(status)) { return; }
- if (0 < newOffset && newOffset < string_.length()) {
- int32_t offset = newOffset;
- do {
- char16_t c = string_.charAt(offset);
- if (!rbc_->isUnsafe(c) ||
- (U16_IS_LEAD(c) && !rbc_->isUnsafe(string_.char32At(offset)))) {
- break;
- }
- // Back up to before this unsafe character.
- --offset;
- } while (offset > 0);
- if (offset < newOffset) {
- // We might have backed up more than necessary.
- // For example, contractions "ch" and "cu" make both 'h' and 'u' unsafe,
- // but for text "chu" setOffset(2) should remain at 2
- // although we initially back up to offset 0.
- // Find the last safe offset no greater than newOffset by iterating forward.
- int32_t lastSafeOffset = offset;
- do {
- iter_->resetToOffset(lastSafeOffset);
- do {
- iter_->nextCE(status);
- if (U_FAILURE(status)) { return; }
- } while ((offset = iter_->getOffset()) == lastSafeOffset);
- if (offset <= newOffset) {
- lastSafeOffset = offset;
- }
- } while (offset < newOffset);
- newOffset = lastSafeOffset;
- }
- }
- iter_->resetToOffset(newOffset);
- otherHalf_ = 0;
- dir_ = 1;
- }
- /**
- * Sets the source to the new source string.
- */
- void CollationElementIterator::setText(const UnicodeString& source,
- UErrorCode& status)
- {
- if (U_FAILURE(status)) {
- return;
- }
- string_ = source;
- const char16_t *s = string_.getBuffer();
- CollationIterator *newIter;
- UBool numeric = rbc_->settings->isNumeric();
- if (rbc_->settings->dontCheckFCD()) {
- newIter = new UTF16CollationIterator(rbc_->data, numeric, s, s, s + string_.length());
- } else {
- newIter = new FCDUTF16CollationIterator(rbc_->data, numeric, s, s, s + string_.length());
- }
- if (newIter == nullptr) {
- status = U_MEMORY_ALLOCATION_ERROR;
- return;
- }
- delete iter_;
- iter_ = newIter;
- otherHalf_ = 0;
- dir_ = 0;
- }
- // Sets the source to the new character iterator.
- void CollationElementIterator::setText(CharacterIterator& source,
- UErrorCode& status)
- {
- if (U_FAILURE(status))
- return;
- source.getText(string_);
- setText(string_, status);
- }
- int32_t CollationElementIterator::strengthOrder(int32_t order) const
- {
- UColAttributeValue s = (UColAttributeValue)rbc_->settings->getStrength();
- // Mask off the unwanted differences.
- if (s == UCOL_PRIMARY) {
- order &= 0xffff0000;
- }
- else if (s == UCOL_SECONDARY) {
- order &= 0xffffff00;
- }
- return order;
- }
- /* CollationElementIterator private constructors/destructors --------------- */
- /**
- * This is the "real" constructor for this class; it constructs an iterator
- * over the source text using the specified collator
- */
- CollationElementIterator::CollationElementIterator(
- const UnicodeString &source,
- const RuleBasedCollator *coll,
- UErrorCode &status)
- : iter_(nullptr), rbc_(coll), otherHalf_(0), dir_(0), offsets_(nullptr) {
- setText(source, status);
- }
- /**
- * This is the "real" constructor for this class; it constructs an iterator over
- * the source text using the specified collator
- */
- CollationElementIterator::CollationElementIterator(
- const CharacterIterator &source,
- const RuleBasedCollator *coll,
- UErrorCode &status)
- : iter_(nullptr), rbc_(coll), otherHalf_(0), dir_(0), offsets_(nullptr) {
- // We only call source.getText() which should be const anyway.
- setText(const_cast<CharacterIterator &>(source), status);
- }
- /* CollationElementIterator private methods -------------------------------- */
- const CollationElementIterator& CollationElementIterator::operator=(
- const CollationElementIterator& other)
- {
- if (this == &other) {
- return *this;
- }
- CollationIterator *newIter;
- const FCDUTF16CollationIterator *otherFCDIter =
- dynamic_cast<const FCDUTF16CollationIterator *>(other.iter_);
- if(otherFCDIter != nullptr) {
- newIter = new FCDUTF16CollationIterator(*otherFCDIter, string_.getBuffer());
- } else {
- const UTF16CollationIterator *otherIter =
- dynamic_cast<const UTF16CollationIterator *>(other.iter_);
- if(otherIter != nullptr) {
- newIter = new UTF16CollationIterator(*otherIter, string_.getBuffer());
- } else {
- newIter = nullptr;
- }
- }
- if(newIter != nullptr) {
- delete iter_;
- iter_ = newIter;
- rbc_ = other.rbc_;
- otherHalf_ = other.otherHalf_;
- dir_ = other.dir_;
- string_ = other.string_;
- }
- if(other.dir_ < 0 && other.offsets_ != nullptr && !other.offsets_->isEmpty()) {
- UErrorCode errorCode = U_ZERO_ERROR;
- if(offsets_ == nullptr) {
- offsets_ = new UVector32(other.offsets_->size(), errorCode);
- }
- if(offsets_ != nullptr) {
- offsets_->assign(*other.offsets_, errorCode);
- }
- }
- return *this;
- }
- namespace {
- class MaxExpSink : public ContractionsAndExpansions::CESink {
- public:
- MaxExpSink(UHashtable *h, UErrorCode &ec) : maxExpansions(h), errorCode(ec) {}
- virtual ~MaxExpSink();
- virtual void handleCE(int64_t /*ce*/) override {}
- virtual void handleExpansion(const int64_t ces[], int32_t length) override {
- if (length <= 1) {
- // We do not need to add single CEs into the map.
- return;
- }
- int32_t count = 0; // number of CE "halves"
- for (int32_t i = 0; i < length; ++i) {
- count += ceNeedsTwoParts(ces[i]) ? 2 : 1;
- }
- // last "half" of the last CE
- int64_t ce = ces[length - 1];
- uint32_t p = (uint32_t)(ce >> 32);
- uint32_t lower32 = (uint32_t)ce;
- uint32_t lastHalf = getSecondHalf(p, lower32);
- if (lastHalf == 0) {
- lastHalf = getFirstHalf(p, lower32);
- U_ASSERT(lastHalf != 0);
- } else {
- lastHalf |= 0xc0; // old-style continuation CE
- }
- if (count > uhash_igeti(maxExpansions, (int32_t)lastHalf)) {
- uhash_iputi(maxExpansions, (int32_t)lastHalf, count, &errorCode);
- }
- }
- private:
- UHashtable *maxExpansions;
- UErrorCode &errorCode;
- };
- MaxExpSink::~MaxExpSink() {}
- } // namespace
- UHashtable *
- CollationElementIterator::computeMaxExpansions(const CollationData *data, UErrorCode &errorCode) {
- if (U_FAILURE(errorCode)) { return nullptr; }
- UHashtable *maxExpansions = uhash_open(uhash_hashLong, uhash_compareLong,
- uhash_compareLong, &errorCode);
- if (U_FAILURE(errorCode)) { return nullptr; }
- MaxExpSink sink(maxExpansions, errorCode);
- ContractionsAndExpansions(nullptr, nullptr, &sink, true).forData(data, errorCode);
- if (U_FAILURE(errorCode)) {
- uhash_close(maxExpansions);
- return nullptr;
- }
- return maxExpansions;
- }
- int32_t
- CollationElementIterator::getMaxExpansion(int32_t order) const {
- return getMaxExpansion(rbc_->tailoring->maxExpansions, order);
- }
- int32_t
- CollationElementIterator::getMaxExpansion(const UHashtable *maxExpansions, int32_t order) {
- if (order == 0) { return 1; }
- int32_t max;
- if(maxExpansions != nullptr && (max = uhash_igeti(maxExpansions, order)) != 0) {
- return max;
- }
- if ((order & 0xc0) == 0xc0) {
- // old-style continuation CE
- return 2;
- } else {
- return 1;
- }
- }
- U_NAMESPACE_END
- #endif /* #if !UCONFIG_NO_COLLATION */
|