123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656 |
- // © 2016 and later: Unicode, Inc. and others.
- // License & terms of use: http://www.unicode.org/copyright.html
- /*
- *******************************************************************************
- * Copyright (C) 1996-2015, International Business Machines
- * Corporation and others. All Rights Reserved.
- *******************************************************************************
- * rulebasedcollator.cpp
- *
- * (replaced the former tblcoll.cpp)
- *
- * created on: 2012feb14 with new and old collation code
- * created by: Markus W. Scherer
- */
- #include "unicode/utypes.h"
- #if !UCONFIG_NO_COLLATION
- #include "unicode/coll.h"
- #include "unicode/coleitr.h"
- #include "unicode/localpointer.h"
- #include "unicode/locid.h"
- #include "unicode/sortkey.h"
- #include "unicode/tblcoll.h"
- #include "unicode/ucol.h"
- #include "unicode/uiter.h"
- #include "unicode/uloc.h"
- #include "unicode/uniset.h"
- #include "unicode/unistr.h"
- #include "unicode/usetiter.h"
- #include "unicode/utf8.h"
- #include "unicode/uversion.h"
- #include "bocsu.h"
- #include "charstr.h"
- #include "cmemory.h"
- #include "collation.h"
- #include "collationcompare.h"
- #include "collationdata.h"
- #include "collationdatareader.h"
- #include "collationfastlatin.h"
- #include "collationiterator.h"
- #include "collationkeys.h"
- #include "collationroot.h"
- #include "collationsets.h"
- #include "collationsettings.h"
- #include "collationtailoring.h"
- #include "cstring.h"
- #include "uassert.h"
- #include "ucol_imp.h"
- #include "uhash.h"
- #include "uitercollationiterator.h"
- #include "ustr_imp.h"
- #include "utf16collationiterator.h"
- #include "utf8collationiterator.h"
- #include "uvectr64.h"
- U_NAMESPACE_BEGIN
- namespace {
- class FixedSortKeyByteSink : public SortKeyByteSink {
- public:
- FixedSortKeyByteSink(char *dest, int32_t destCapacity)
- : SortKeyByteSink(dest, destCapacity) {}
- virtual ~FixedSortKeyByteSink();
- private:
- virtual void AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length) override;
- virtual UBool Resize(int32_t appendCapacity, int32_t length) override;
- };
- FixedSortKeyByteSink::~FixedSortKeyByteSink() {}
- void
- FixedSortKeyByteSink::AppendBeyondCapacity(const char *bytes, int32_t /*n*/, int32_t length) {
- // buffer_ != nullptr && bytes != nullptr && n > 0 && appended_ > capacity_
- // Fill the buffer completely.
- int32_t available = capacity_ - length;
- if (available > 0) {
- uprv_memcpy(buffer_ + length, bytes, available);
- }
- }
- UBool
- FixedSortKeyByteSink::Resize(int32_t /*appendCapacity*/, int32_t /*length*/) {
- return false;
- }
- } // namespace
- // Not in an anonymous namespace, so that it can be a friend of CollationKey.
- class CollationKeyByteSink : public SortKeyByteSink {
- public:
- CollationKeyByteSink(CollationKey &key)
- : SortKeyByteSink(reinterpret_cast<char *>(key.getBytes()), key.getCapacity()),
- key_(key) {}
- virtual ~CollationKeyByteSink();
- private:
- virtual void AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length) override;
- virtual UBool Resize(int32_t appendCapacity, int32_t length) override;
- CollationKey &key_;
- };
- CollationKeyByteSink::~CollationKeyByteSink() {}
- void
- CollationKeyByteSink::AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length) {
- // buffer_ != nullptr && bytes != nullptr && n > 0 && appended_ > capacity_
- if (Resize(n, length)) {
- uprv_memcpy(buffer_ + length, bytes, n);
- }
- }
- UBool
- CollationKeyByteSink::Resize(int32_t appendCapacity, int32_t length) {
- if (buffer_ == nullptr) {
- return false; // allocation failed before already
- }
- int32_t newCapacity = 2 * capacity_;
- int32_t altCapacity = length + 2 * appendCapacity;
- if (newCapacity < altCapacity) {
- newCapacity = altCapacity;
- }
- if (newCapacity < 200) {
- newCapacity = 200;
- }
- uint8_t *newBuffer = key_.reallocate(newCapacity, length);
- if (newBuffer == nullptr) {
- SetNotOk();
- return false;
- }
- buffer_ = reinterpret_cast<char *>(newBuffer);
- capacity_ = newCapacity;
- return true;
- }
- RuleBasedCollator::RuleBasedCollator(const RuleBasedCollator &other)
- : Collator(other),
- data(other.data),
- settings(other.settings),
- tailoring(other.tailoring),
- cacheEntry(other.cacheEntry),
- validLocale(other.validLocale),
- explicitlySetAttributes(other.explicitlySetAttributes),
- actualLocaleIsSameAsValid(other.actualLocaleIsSameAsValid) {
- settings->addRef();
- cacheEntry->addRef();
- }
- RuleBasedCollator::RuleBasedCollator(const uint8_t *bin, int32_t length,
- const RuleBasedCollator *base, UErrorCode &errorCode)
- : data(nullptr),
- settings(nullptr),
- tailoring(nullptr),
- cacheEntry(nullptr),
- validLocale(""),
- explicitlySetAttributes(0),
- actualLocaleIsSameAsValid(false) {
- if(U_FAILURE(errorCode)) { return; }
- if(bin == nullptr || length == 0 || base == nullptr) {
- errorCode = U_ILLEGAL_ARGUMENT_ERROR;
- return;
- }
- const CollationTailoring *root = CollationRoot::getRoot(errorCode);
- if(U_FAILURE(errorCode)) { return; }
- if(base->tailoring != root) {
- errorCode = U_UNSUPPORTED_ERROR;
- return;
- }
- LocalPointer<CollationTailoring> t(new CollationTailoring(base->tailoring->settings));
- if(t.isNull() || t->isBogus()) {
- errorCode = U_MEMORY_ALLOCATION_ERROR;
- return;
- }
- CollationDataReader::read(base->tailoring, bin, length, *t, errorCode);
- if(U_FAILURE(errorCode)) { return; }
- t->actualLocale.setToBogus();
- adoptTailoring(t.orphan(), errorCode);
- }
- RuleBasedCollator::RuleBasedCollator(const CollationCacheEntry *entry)
- : data(entry->tailoring->data),
- settings(entry->tailoring->settings),
- tailoring(entry->tailoring),
- cacheEntry(entry),
- validLocale(entry->validLocale),
- explicitlySetAttributes(0),
- actualLocaleIsSameAsValid(false) {
- settings->addRef();
- cacheEntry->addRef();
- }
- RuleBasedCollator::~RuleBasedCollator() {
- SharedObject::clearPtr(settings);
- SharedObject::clearPtr(cacheEntry);
- }
- void
- RuleBasedCollator::adoptTailoring(CollationTailoring *t, UErrorCode &errorCode) {
- if(U_FAILURE(errorCode)) {
- t->deleteIfZeroRefCount();
- return;
- }
- U_ASSERT(settings == nullptr && data == nullptr && tailoring == nullptr && cacheEntry == nullptr);
- cacheEntry = new CollationCacheEntry(t->actualLocale, t);
- if(cacheEntry == nullptr) {
- errorCode = U_MEMORY_ALLOCATION_ERROR;
- t->deleteIfZeroRefCount();
- return;
- }
- data = t->data;
- settings = t->settings;
- settings->addRef();
- tailoring = t;
- cacheEntry->addRef();
- validLocale = t->actualLocale;
- actualLocaleIsSameAsValid = false;
- }
- RuleBasedCollator *
- RuleBasedCollator::clone() const {
- return new RuleBasedCollator(*this);
- }
- RuleBasedCollator &RuleBasedCollator::operator=(const RuleBasedCollator &other) {
- if(this == &other) { return *this; }
- SharedObject::copyPtr(other.settings, settings);
- tailoring = other.tailoring;
- SharedObject::copyPtr(other.cacheEntry, cacheEntry);
- data = tailoring->data;
- validLocale = other.validLocale;
- explicitlySetAttributes = other.explicitlySetAttributes;
- actualLocaleIsSameAsValid = other.actualLocaleIsSameAsValid;
- return *this;
- }
- UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedCollator)
- bool
- RuleBasedCollator::operator==(const Collator& other) const {
- if(this == &other) { return true; }
- if(!Collator::operator==(other)) { return false; }
- const RuleBasedCollator &o = static_cast<const RuleBasedCollator &>(other);
- if(*settings != *o.settings) { return false; }
- if(data == o.data) { return true; }
- UBool thisIsRoot = data->base == nullptr;
- UBool otherIsRoot = o.data->base == nullptr;
- U_ASSERT(!thisIsRoot || !otherIsRoot); // otherwise their data pointers should be ==
- if(thisIsRoot != otherIsRoot) { return false; }
- if((thisIsRoot || !tailoring->rules.isEmpty()) &&
- (otherIsRoot || !o.tailoring->rules.isEmpty())) {
- // Shortcut: If both collators have valid rule strings, then compare those.
- if(tailoring->rules == o.tailoring->rules) { return true; }
- }
- // Different rule strings can result in the same or equivalent tailoring.
- // The rule strings are optional in ICU resource bundles, although included by default.
- // cloneBinary() drops the rule string.
- UErrorCode errorCode = U_ZERO_ERROR;
- LocalPointer<UnicodeSet> thisTailored(getTailoredSet(errorCode));
- LocalPointer<UnicodeSet> otherTailored(o.getTailoredSet(errorCode));
- if(U_FAILURE(errorCode)) { return false; }
- if(*thisTailored != *otherTailored) { return false; }
- // For completeness, we should compare all of the mappings;
- // or we should create a list of strings, sort it with one collator,
- // and check if both collators compare adjacent strings the same
- // (order & strength, down to quaternary); or similar.
- // Testing equality of collators seems unusual.
- return true;
- }
- int32_t
- RuleBasedCollator::hashCode() const {
- int32_t h = settings->hashCode();
- if(data->base == nullptr) { return h; } // root collator
- // Do not rely on the rule string, see comments in operator==().
- UErrorCode errorCode = U_ZERO_ERROR;
- LocalPointer<UnicodeSet> set(getTailoredSet(errorCode));
- if(U_FAILURE(errorCode)) { return 0; }
- UnicodeSetIterator iter(*set);
- while(iter.next() && !iter.isString()) {
- h ^= data->getCE32(iter.getCodepoint());
- }
- return h;
- }
- void
- RuleBasedCollator::setLocales(const Locale &requested, const Locale &valid,
- const Locale &actual) {
- if(actual == tailoring->actualLocale) {
- actualLocaleIsSameAsValid = false;
- } else {
- U_ASSERT(actual == valid);
- actualLocaleIsSameAsValid = true;
- }
- // Do not modify tailoring.actualLocale:
- // We cannot be sure that that would be thread-safe.
- validLocale = valid;
- (void)requested; // Ignore, see also ticket #10477.
- }
- Locale
- RuleBasedCollator::getLocale(ULocDataLocaleType type, UErrorCode& errorCode) const {
- if(U_FAILURE(errorCode)) {
- return Locale::getRoot();
- }
- switch(type) {
- case ULOC_ACTUAL_LOCALE:
- return actualLocaleIsSameAsValid ? validLocale : tailoring->actualLocale;
- case ULOC_VALID_LOCALE:
- return validLocale;
- case ULOC_REQUESTED_LOCALE:
- default:
- errorCode = U_ILLEGAL_ARGUMENT_ERROR;
- return Locale::getRoot();
- }
- }
- const char *
- RuleBasedCollator::internalGetLocaleID(ULocDataLocaleType type, UErrorCode &errorCode) const {
- if(U_FAILURE(errorCode)) {
- return nullptr;
- }
- const Locale *result;
- switch(type) {
- case ULOC_ACTUAL_LOCALE:
- result = actualLocaleIsSameAsValid ? &validLocale : &tailoring->actualLocale;
- break;
- case ULOC_VALID_LOCALE:
- result = &validLocale;
- break;
- case ULOC_REQUESTED_LOCALE:
- default:
- errorCode = U_ILLEGAL_ARGUMENT_ERROR;
- return nullptr;
- }
- if(result->isBogus()) { return nullptr; }
- const char *id = result->getName();
- return id[0] == 0 ? "root" : id;
- }
- const UnicodeString&
- RuleBasedCollator::getRules() const {
- return tailoring->rules;
- }
- void
- RuleBasedCollator::getRules(UColRuleOption delta, UnicodeString &buffer) const {
- if(delta == UCOL_TAILORING_ONLY) {
- buffer = tailoring->rules;
- return;
- }
- // UCOL_FULL_RULES
- buffer.remove();
- CollationLoader::appendRootRules(buffer);
- buffer.append(tailoring->rules).getTerminatedBuffer();
- }
- void
- RuleBasedCollator::getVersion(UVersionInfo version) const {
- uprv_memcpy(version, tailoring->version, U_MAX_VERSION_LENGTH);
- version[0] += (UCOL_RUNTIME_VERSION << 4) + (UCOL_RUNTIME_VERSION >> 4);
- }
- UnicodeSet *
- RuleBasedCollator::getTailoredSet(UErrorCode &errorCode) const {
- if(U_FAILURE(errorCode)) { return nullptr; }
- UnicodeSet *tailored = new UnicodeSet();
- if(tailored == nullptr) {
- errorCode = U_MEMORY_ALLOCATION_ERROR;
- return nullptr;
- }
- if(data->base != nullptr) {
- TailoredSet(tailored).forData(data, errorCode);
- if(U_FAILURE(errorCode)) {
- delete tailored;
- return nullptr;
- }
- }
- return tailored;
- }
- void
- RuleBasedCollator::internalGetContractionsAndExpansions(
- UnicodeSet *contractions, UnicodeSet *expansions,
- UBool addPrefixes, UErrorCode &errorCode) const {
- if(U_FAILURE(errorCode)) { return; }
- if(contractions != nullptr) {
- contractions->clear();
- }
- if(expansions != nullptr) {
- expansions->clear();
- }
- ContractionsAndExpansions(contractions, expansions, nullptr, addPrefixes).forData(data, errorCode);
- }
- void
- RuleBasedCollator::internalAddContractions(UChar32 c, UnicodeSet &set, UErrorCode &errorCode) const {
- if(U_FAILURE(errorCode)) { return; }
- ContractionsAndExpansions(&set, nullptr, nullptr, false).forCodePoint(data, c, errorCode);
- }
- const CollationSettings &
- RuleBasedCollator::getDefaultSettings() const {
- return *tailoring->settings;
- }
- UColAttributeValue
- RuleBasedCollator::getAttribute(UColAttribute attr, UErrorCode &errorCode) const {
- if(U_FAILURE(errorCode)) { return UCOL_DEFAULT; }
- int32_t option;
- switch(attr) {
- case UCOL_FRENCH_COLLATION:
- option = CollationSettings::BACKWARD_SECONDARY;
- break;
- case UCOL_ALTERNATE_HANDLING:
- return settings->getAlternateHandling();
- case UCOL_CASE_FIRST:
- return settings->getCaseFirst();
- case UCOL_CASE_LEVEL:
- option = CollationSettings::CASE_LEVEL;
- break;
- case UCOL_NORMALIZATION_MODE:
- option = CollationSettings::CHECK_FCD;
- break;
- case UCOL_STRENGTH:
- return (UColAttributeValue)settings->getStrength();
- case UCOL_HIRAGANA_QUATERNARY_MODE:
- // Deprecated attribute, unsettable.
- return UCOL_OFF;
- case UCOL_NUMERIC_COLLATION:
- option = CollationSettings::NUMERIC;
- break;
- default:
- errorCode = U_ILLEGAL_ARGUMENT_ERROR;
- return UCOL_DEFAULT;
- }
- return ((settings->options & option) == 0) ? UCOL_OFF : UCOL_ON;
- }
- void
- RuleBasedCollator::setAttribute(UColAttribute attr, UColAttributeValue value,
- UErrorCode &errorCode) {
- UColAttributeValue oldValue = getAttribute(attr, errorCode);
- if(U_FAILURE(errorCode)) { return; }
- if(value == oldValue) {
- setAttributeExplicitly(attr);
- return;
- }
- const CollationSettings &defaultSettings = getDefaultSettings();
- if(settings == &defaultSettings) {
- if(value == UCOL_DEFAULT) {
- setAttributeDefault(attr);
- return;
- }
- }
- CollationSettings *ownedSettings = SharedObject::copyOnWrite(settings);
- if(ownedSettings == nullptr) {
- errorCode = U_MEMORY_ALLOCATION_ERROR;
- return;
- }
- switch(attr) {
- case UCOL_FRENCH_COLLATION:
- ownedSettings->setFlag(CollationSettings::BACKWARD_SECONDARY, value,
- defaultSettings.options, errorCode);
- break;
- case UCOL_ALTERNATE_HANDLING:
- ownedSettings->setAlternateHandling(value, defaultSettings.options, errorCode);
- break;
- case UCOL_CASE_FIRST:
- ownedSettings->setCaseFirst(value, defaultSettings.options, errorCode);
- break;
- case UCOL_CASE_LEVEL:
- ownedSettings->setFlag(CollationSettings::CASE_LEVEL, value,
- defaultSettings.options, errorCode);
- break;
- case UCOL_NORMALIZATION_MODE:
- ownedSettings->setFlag(CollationSettings::CHECK_FCD, value,
- defaultSettings.options, errorCode);
- break;
- case UCOL_STRENGTH:
- ownedSettings->setStrength(value, defaultSettings.options, errorCode);
- break;
- case UCOL_HIRAGANA_QUATERNARY_MODE:
- // Deprecated attribute. Check for valid values but do not change anything.
- if(value != UCOL_OFF && value != UCOL_ON && value != UCOL_DEFAULT) {
- errorCode = U_ILLEGAL_ARGUMENT_ERROR;
- }
- break;
- case UCOL_NUMERIC_COLLATION:
- ownedSettings->setFlag(CollationSettings::NUMERIC, value, defaultSettings.options, errorCode);
- break;
- default:
- errorCode = U_ILLEGAL_ARGUMENT_ERROR;
- break;
- }
- if(U_FAILURE(errorCode)) { return; }
- setFastLatinOptions(*ownedSettings);
- if(value == UCOL_DEFAULT) {
- setAttributeDefault(attr);
- } else {
- setAttributeExplicitly(attr);
- }
- }
- Collator &
- RuleBasedCollator::setMaxVariable(UColReorderCode group, UErrorCode &errorCode) {
- if(U_FAILURE(errorCode)) { return *this; }
- // Convert the reorder code into a MaxVariable number, or UCOL_DEFAULT=-1.
- int32_t value;
- if(group == UCOL_REORDER_CODE_DEFAULT) {
- value = UCOL_DEFAULT;
- } else if(UCOL_REORDER_CODE_FIRST <= group && group <= UCOL_REORDER_CODE_CURRENCY) {
- value = group - UCOL_REORDER_CODE_FIRST;
- } else {
- errorCode = U_ILLEGAL_ARGUMENT_ERROR;
- return *this;
- }
- CollationSettings::MaxVariable oldValue = settings->getMaxVariable();
- if(value == oldValue) {
- setAttributeExplicitly(ATTR_VARIABLE_TOP);
- return *this;
- }
- const CollationSettings &defaultSettings = getDefaultSettings();
- if(settings == &defaultSettings) {
- if(value == UCOL_DEFAULT) {
- setAttributeDefault(ATTR_VARIABLE_TOP);
- return *this;
- }
- }
- CollationSettings *ownedSettings = SharedObject::copyOnWrite(settings);
- if(ownedSettings == nullptr) {
- errorCode = U_MEMORY_ALLOCATION_ERROR;
- return *this;
- }
- if(group == UCOL_REORDER_CODE_DEFAULT) {
- group = (UColReorderCode)(
- UCOL_REORDER_CODE_FIRST + int32_t{defaultSettings.getMaxVariable()});
- }
- uint32_t varTop = data->getLastPrimaryForGroup(group);
- U_ASSERT(varTop != 0);
- ownedSettings->setMaxVariable(value, defaultSettings.options, errorCode);
- if(U_FAILURE(errorCode)) { return *this; }
- ownedSettings->variableTop = varTop;
- setFastLatinOptions(*ownedSettings);
- if(value == UCOL_DEFAULT) {
- setAttributeDefault(ATTR_VARIABLE_TOP);
- } else {
- setAttributeExplicitly(ATTR_VARIABLE_TOP);
- }
- return *this;
- }
- UColReorderCode
- RuleBasedCollator::getMaxVariable() const {
- return (UColReorderCode)(UCOL_REORDER_CODE_FIRST + int32_t{settings->getMaxVariable()});
- }
- uint32_t
- RuleBasedCollator::getVariableTop(UErrorCode & /*errorCode*/) const {
- return settings->variableTop;
- }
- uint32_t
- RuleBasedCollator::setVariableTop(const char16_t *varTop, int32_t len, UErrorCode &errorCode) {
- if(U_FAILURE(errorCode)) { return 0; }
- if(varTop == nullptr && len !=0) {
- errorCode = U_ILLEGAL_ARGUMENT_ERROR;
- return 0;
- }
- if(len < 0) { len = u_strlen(varTop); }
- if(len == 0) {
- errorCode = U_ILLEGAL_ARGUMENT_ERROR;
- return 0;
- }
- UBool numeric = settings->isNumeric();
- int64_t ce1, ce2;
- if(settings->dontCheckFCD()) {
- UTF16CollationIterator ci(data, numeric, varTop, varTop, varTop + len);
- ce1 = ci.nextCE(errorCode);
- ce2 = ci.nextCE(errorCode);
- } else {
- FCDUTF16CollationIterator ci(data, numeric, varTop, varTop, varTop + len);
- ce1 = ci.nextCE(errorCode);
- ce2 = ci.nextCE(errorCode);
- }
- if(ce1 == Collation::NO_CE || ce2 != Collation::NO_CE) {
- errorCode = U_CE_NOT_FOUND_ERROR;
- return 0;
- }
- setVariableTop((uint32_t)(ce1 >> 32), errorCode);
- return settings->variableTop;
- }
- uint32_t
- RuleBasedCollator::setVariableTop(const UnicodeString &varTop, UErrorCode &errorCode) {
- return setVariableTop(varTop.getBuffer(), varTop.length(), errorCode);
- }
- void
- RuleBasedCollator::setVariableTop(uint32_t varTop, UErrorCode &errorCode) {
- if(U_FAILURE(errorCode)) { return; }
- if(varTop != settings->variableTop) {
- // Pin the variable top to the end of the reordering group which contains it.
- // Only a few special groups are supported.
- int32_t group = data->getGroupForPrimary(varTop);
- if(group < UCOL_REORDER_CODE_FIRST || UCOL_REORDER_CODE_CURRENCY < group) {
- errorCode = U_ILLEGAL_ARGUMENT_ERROR;
- return;
- }
- uint32_t v = data->getLastPrimaryForGroup(group);
- U_ASSERT(v != 0 && v >= varTop);
- varTop = v;
- if(varTop != settings->variableTop) {
- CollationSettings *ownedSettings = SharedObject::copyOnWrite(settings);
- if(ownedSettings == nullptr) {
- errorCode = U_MEMORY_ALLOCATION_ERROR;
- return;
- }
- ownedSettings->setMaxVariable(group - UCOL_REORDER_CODE_FIRST,
- getDefaultSettings().options, errorCode);
- if(U_FAILURE(errorCode)) { return; }
- ownedSettings->variableTop = varTop;
- setFastLatinOptions(*ownedSettings);
- }
- }
- if(varTop == getDefaultSettings().variableTop) {
- setAttributeDefault(ATTR_VARIABLE_TOP);
- } else {
- setAttributeExplicitly(ATTR_VARIABLE_TOP);
- }
- }
- int32_t
- RuleBasedCollator::getReorderCodes(int32_t *dest, int32_t capacity,
- UErrorCode &errorCode) const {
- if(U_FAILURE(errorCode)) { return 0; }
- if(capacity < 0 || (dest == nullptr && capacity > 0)) {
- errorCode = U_ILLEGAL_ARGUMENT_ERROR;
- return 0;
- }
- int32_t length = settings->reorderCodesLength;
- if(length == 0) { return 0; }
- if(length > capacity) {
- errorCode = U_BUFFER_OVERFLOW_ERROR;
- return length;
- }
- uprv_memcpy(dest, settings->reorderCodes, length * 4);
- return length;
- }
- void
- RuleBasedCollator::setReorderCodes(const int32_t *reorderCodes, int32_t length,
- UErrorCode &errorCode) {
- if(U_FAILURE(errorCode)) { return; }
- if(length < 0 || (reorderCodes == nullptr && length > 0)) {
- errorCode = U_ILLEGAL_ARGUMENT_ERROR;
- return;
- }
- if(length == 1 && reorderCodes[0] == UCOL_REORDER_CODE_NONE) {
- length = 0;
- }
- if(length == settings->reorderCodesLength &&
- uprv_memcmp(reorderCodes, settings->reorderCodes, length * 4) == 0) {
- return;
- }
- const CollationSettings &defaultSettings = getDefaultSettings();
- if(length == 1 && reorderCodes[0] == UCOL_REORDER_CODE_DEFAULT) {
- if(settings != &defaultSettings) {
- CollationSettings *ownedSettings = SharedObject::copyOnWrite(settings);
- if(ownedSettings == nullptr) {
- errorCode = U_MEMORY_ALLOCATION_ERROR;
- return;
- }
- ownedSettings->copyReorderingFrom(defaultSettings, errorCode);
- setFastLatinOptions(*ownedSettings);
- }
- return;
- }
- CollationSettings *ownedSettings = SharedObject::copyOnWrite(settings);
- if(ownedSettings == nullptr) {
- errorCode = U_MEMORY_ALLOCATION_ERROR;
- return;
- }
- ownedSettings->setReordering(*data, reorderCodes, length, errorCode);
- setFastLatinOptions(*ownedSettings);
- }
- void
- RuleBasedCollator::setFastLatinOptions(CollationSettings &ownedSettings) const {
- ownedSettings.fastLatinOptions = CollationFastLatin::getOptions(
- data, ownedSettings,
- ownedSettings.fastLatinPrimaries, UPRV_LENGTHOF(ownedSettings.fastLatinPrimaries));
- }
- UCollationResult
- RuleBasedCollator::compare(const UnicodeString &left, const UnicodeString &right,
- UErrorCode &errorCode) const {
- if(U_FAILURE(errorCode)) { return UCOL_EQUAL; }
- return doCompare(left.getBuffer(), left.length(),
- right.getBuffer(), right.length(), errorCode);
- }
- UCollationResult
- RuleBasedCollator::compare(const UnicodeString &left, const UnicodeString &right,
- int32_t length, UErrorCode &errorCode) const {
- if(U_FAILURE(errorCode) || length == 0) { return UCOL_EQUAL; }
- if(length < 0) {
- errorCode = U_ILLEGAL_ARGUMENT_ERROR;
- return UCOL_EQUAL;
- }
- int32_t leftLength = left.length();
- int32_t rightLength = right.length();
- if(leftLength > length) { leftLength = length; }
- if(rightLength > length) { rightLength = length; }
- return doCompare(left.getBuffer(), leftLength,
- right.getBuffer(), rightLength, errorCode);
- }
- UCollationResult
- RuleBasedCollator::compare(const char16_t *left, int32_t leftLength,
- const char16_t *right, int32_t rightLength,
- UErrorCode &errorCode) const {
- if(U_FAILURE(errorCode)) { return UCOL_EQUAL; }
- if((left == nullptr && leftLength != 0) || (right == nullptr && rightLength != 0)) {
- errorCode = U_ILLEGAL_ARGUMENT_ERROR;
- return UCOL_EQUAL;
- }
- // Make sure both or neither strings have a known length.
- // We do not optimize for mixed length/termination.
- if(leftLength >= 0) {
- if(rightLength < 0) { rightLength = u_strlen(right); }
- } else {
- if(rightLength >= 0) { leftLength = u_strlen(left); }
- }
- return doCompare(left, leftLength, right, rightLength, errorCode);
- }
- UCollationResult
- RuleBasedCollator::compareUTF8(const StringPiece &left, const StringPiece &right,
- UErrorCode &errorCode) const {
- if(U_FAILURE(errorCode)) { return UCOL_EQUAL; }
- const uint8_t *leftBytes = reinterpret_cast<const uint8_t *>(left.data());
- const uint8_t *rightBytes = reinterpret_cast<const uint8_t *>(right.data());
- if((leftBytes == nullptr && !left.empty()) || (rightBytes == nullptr && !right.empty())) {
- errorCode = U_ILLEGAL_ARGUMENT_ERROR;
- return UCOL_EQUAL;
- }
- return doCompare(leftBytes, left.length(), rightBytes, right.length(), errorCode);
- }
- UCollationResult
- RuleBasedCollator::internalCompareUTF8(const char *left, int32_t leftLength,
- const char *right, int32_t rightLength,
- UErrorCode &errorCode) const {
- if(U_FAILURE(errorCode)) { return UCOL_EQUAL; }
- if((left == nullptr && leftLength != 0) || (right == nullptr && rightLength != 0)) {
- errorCode = U_ILLEGAL_ARGUMENT_ERROR;
- return UCOL_EQUAL;
- }
- // Make sure both or neither strings have a known length.
- // We do not optimize for mixed length/termination.
- if(leftLength >= 0) {
- if(rightLength < 0) { rightLength = static_cast<int32_t>(uprv_strlen(right)); }
- } else {
- if(rightLength >= 0) { leftLength = static_cast<int32_t>(uprv_strlen(left)); }
- }
- return doCompare(reinterpret_cast<const uint8_t *>(left), leftLength,
- reinterpret_cast<const uint8_t *>(right), rightLength, errorCode);
- }
- namespace {
- /**
- * Abstract iterator for identical-level string comparisons.
- * Returns FCD code points and handles temporary switching to NFD.
- */
- class NFDIterator : public UObject {
- public:
- NFDIterator() : index(-1), length(0) {}
- virtual ~NFDIterator() {}
- /**
- * Returns the next code point from the internal normalization buffer,
- * or else the next text code point.
- * Returns -1 at the end of the text.
- */
- UChar32 nextCodePoint() {
- if(index >= 0) {
- if(index == length) {
- index = -1;
- } else {
- UChar32 c;
- U16_NEXT_UNSAFE(decomp, index, c);
- return c;
- }
- }
- return nextRawCodePoint();
- }
- /**
- * @param nfcImpl
- * @param c the last code point returned by nextCodePoint() or nextDecomposedCodePoint()
- * @return the first code point in c's decomposition,
- * or c itself if it was decomposed already or if it does not decompose
- */
- UChar32 nextDecomposedCodePoint(const Normalizer2Impl &nfcImpl, UChar32 c) {
- if(index >= 0) { return c; }
- decomp = nfcImpl.getDecomposition(c, buffer, length);
- if(decomp == nullptr) { return c; }
- index = 0;
- U16_NEXT_UNSAFE(decomp, index, c);
- return c;
- }
- protected:
- /**
- * Returns the next text code point in FCD order.
- * Returns -1 at the end of the text.
- */
- virtual UChar32 nextRawCodePoint() = 0;
- private:
- const char16_t *decomp;
- char16_t buffer[4];
- int32_t index;
- int32_t length;
- };
- class UTF16NFDIterator : public NFDIterator {
- public:
- UTF16NFDIterator(const char16_t *text, const char16_t *textLimit) : s(text), limit(textLimit) {}
- protected:
- virtual UChar32 nextRawCodePoint() override {
- if(s == limit) { return U_SENTINEL; }
- UChar32 c = *s++;
- if(limit == nullptr && c == 0) {
- s = nullptr;
- return U_SENTINEL;
- }
- char16_t trail;
- if(U16_IS_LEAD(c) && s != limit && U16_IS_TRAIL(trail = *s)) {
- ++s;
- c = U16_GET_SUPPLEMENTARY(c, trail);
- }
- return c;
- }
- const char16_t *s;
- const char16_t *limit;
- };
- class FCDUTF16NFDIterator : public UTF16NFDIterator {
- public:
- FCDUTF16NFDIterator(const Normalizer2Impl &nfcImpl, const char16_t *text, const char16_t *textLimit)
- : UTF16NFDIterator(nullptr, nullptr) {
- UErrorCode errorCode = U_ZERO_ERROR;
- const char16_t *spanLimit = nfcImpl.makeFCD(text, textLimit, nullptr, errorCode);
- if(U_FAILURE(errorCode)) { return; }
- if(spanLimit == textLimit || (textLimit == nullptr && *spanLimit == 0)) {
- s = text;
- limit = spanLimit;
- } else {
- str.setTo(text, (int32_t)(spanLimit - text));
- {
- ReorderingBuffer r_buffer(nfcImpl, str);
- if(r_buffer.init(str.length(), errorCode)) {
- nfcImpl.makeFCD(spanLimit, textLimit, &r_buffer, errorCode);
- }
- }
- if(U_SUCCESS(errorCode)) {
- s = str.getBuffer();
- limit = s + str.length();
- }
- }
- }
- private:
- UnicodeString str;
- };
- class UTF8NFDIterator : public NFDIterator {
- public:
- UTF8NFDIterator(const uint8_t *text, int32_t textLength)
- : s(text), pos(0), length(textLength) {}
- protected:
- virtual UChar32 nextRawCodePoint() override {
- if(pos == length || (s[pos] == 0 && length < 0)) { return U_SENTINEL; }
- UChar32 c;
- U8_NEXT_OR_FFFD(s, pos, length, c);
- return c;
- }
- const uint8_t *s;
- int32_t pos;
- int32_t length;
- };
- class FCDUTF8NFDIterator : public NFDIterator {
- public:
- FCDUTF8NFDIterator(const CollationData *data, const uint8_t *text, int32_t textLength)
- : u8ci(data, false, text, 0, textLength) {}
- protected:
- virtual UChar32 nextRawCodePoint() override {
- UErrorCode errorCode = U_ZERO_ERROR;
- return u8ci.nextCodePoint(errorCode);
- }
- private:
- FCDUTF8CollationIterator u8ci;
- };
- class UIterNFDIterator : public NFDIterator {
- public:
- UIterNFDIterator(UCharIterator &it) : iter(it) {}
- protected:
- virtual UChar32 nextRawCodePoint() override {
- return uiter_next32(&iter);
- }
- private:
- UCharIterator &iter;
- };
- class FCDUIterNFDIterator : public NFDIterator {
- public:
- FCDUIterNFDIterator(const CollationData *data, UCharIterator &it, int32_t startIndex)
- : uici(data, false, it, startIndex) {}
- protected:
- virtual UChar32 nextRawCodePoint() override {
- UErrorCode errorCode = U_ZERO_ERROR;
- return uici.nextCodePoint(errorCode);
- }
- private:
- FCDUIterCollationIterator uici;
- };
- UCollationResult compareNFDIter(const Normalizer2Impl &nfcImpl,
- NFDIterator &left, NFDIterator &right) {
- for(;;) {
- // Fetch the next FCD code point from each string.
- UChar32 leftCp = left.nextCodePoint();
- UChar32 rightCp = right.nextCodePoint();
- if(leftCp == rightCp) {
- if(leftCp < 0) { break; }
- continue;
- }
- // If they are different, then decompose each and compare again.
- if(leftCp < 0) {
- leftCp = -2; // end of string
- } else if(leftCp == 0xfffe) {
- leftCp = -1; // U+FFFE: merge separator
- } else {
- leftCp = left.nextDecomposedCodePoint(nfcImpl, leftCp);
- }
- if(rightCp < 0) {
- rightCp = -2; // end of string
- } else if(rightCp == 0xfffe) {
- rightCp = -1; // U+FFFE: merge separator
- } else {
- rightCp = right.nextDecomposedCodePoint(nfcImpl, rightCp);
- }
- if(leftCp < rightCp) { return UCOL_LESS; }
- if(leftCp > rightCp) { return UCOL_GREATER; }
- }
- return UCOL_EQUAL;
- }
- } // namespace
- UCollationResult
- RuleBasedCollator::doCompare(const char16_t *left, int32_t leftLength,
- const char16_t *right, int32_t rightLength,
- UErrorCode &errorCode) const {
- // U_FAILURE(errorCode) checked by caller.
- if(left == right && leftLength == rightLength) {
- return UCOL_EQUAL;
- }
- // Identical-prefix test.
- const char16_t *leftLimit;
- const char16_t *rightLimit;
- int32_t equalPrefixLength = 0;
- if(leftLength < 0) {
- leftLimit = nullptr;
- rightLimit = nullptr;
- char16_t c;
- while((c = left[equalPrefixLength]) == right[equalPrefixLength]) {
- if(c == 0) { return UCOL_EQUAL; }
- ++equalPrefixLength;
- }
- } else {
- leftLimit = left + leftLength;
- rightLimit = right + rightLength;
- for(;;) {
- if(equalPrefixLength == leftLength) {
- if(equalPrefixLength == rightLength) { return UCOL_EQUAL; }
- break;
- } else if(equalPrefixLength == rightLength ||
- left[equalPrefixLength] != right[equalPrefixLength]) {
- break;
- }
- ++equalPrefixLength;
- }
- }
- UBool numeric = settings->isNumeric();
- if(equalPrefixLength > 0) {
- if((equalPrefixLength != leftLength &&
- data->isUnsafeBackward(left[equalPrefixLength], numeric)) ||
- (equalPrefixLength != rightLength &&
- data->isUnsafeBackward(right[equalPrefixLength], numeric))) {
- // Identical prefix: Back up to the start of a contraction or reordering sequence.
- while(--equalPrefixLength > 0 &&
- data->isUnsafeBackward(left[equalPrefixLength], numeric)) {}
- }
- // Notes:
- // - A longer string can compare equal to a prefix of it if only ignorables follow.
- // - With a backward level, a longer string can compare less-than a prefix of it.
- // Pass the actual start of each string into the CollationIterators,
- // plus the equalPrefixLength position,
- // so that prefix matches back into the equal prefix work.
- }
- int32_t result;
- int32_t fastLatinOptions = settings->fastLatinOptions;
- if(fastLatinOptions >= 0 &&
- (equalPrefixLength == leftLength ||
- left[equalPrefixLength] <= CollationFastLatin::LATIN_MAX) &&
- (equalPrefixLength == rightLength ||
- right[equalPrefixLength] <= CollationFastLatin::LATIN_MAX)) {
- if(leftLength >= 0) {
- result = CollationFastLatin::compareUTF16(data->fastLatinTable,
- settings->fastLatinPrimaries,
- fastLatinOptions,
- left + equalPrefixLength,
- leftLength - equalPrefixLength,
- right + equalPrefixLength,
- rightLength - equalPrefixLength);
- } else {
- result = CollationFastLatin::compareUTF16(data->fastLatinTable,
- settings->fastLatinPrimaries,
- fastLatinOptions,
- left + equalPrefixLength, -1,
- right + equalPrefixLength, -1);
- }
- } else {
- result = CollationFastLatin::BAIL_OUT_RESULT;
- }
- if(result == CollationFastLatin::BAIL_OUT_RESULT) {
- if(settings->dontCheckFCD()) {
- UTF16CollationIterator leftIter(data, numeric,
- left, left + equalPrefixLength, leftLimit);
- UTF16CollationIterator rightIter(data, numeric,
- right, right + equalPrefixLength, rightLimit);
- result = CollationCompare::compareUpToQuaternary(leftIter, rightIter, *settings, errorCode);
- } else {
- FCDUTF16CollationIterator leftIter(data, numeric,
- left, left + equalPrefixLength, leftLimit);
- FCDUTF16CollationIterator rightIter(data, numeric,
- right, right + equalPrefixLength, rightLimit);
- result = CollationCompare::compareUpToQuaternary(leftIter, rightIter, *settings, errorCode);
- }
- }
- if(result != UCOL_EQUAL || settings->getStrength() < UCOL_IDENTICAL || U_FAILURE(errorCode)) {
- return (UCollationResult)result;
- }
- // Note: If NUL-terminated, we could get the actual limits from the iterators now.
- // That would complicate the iterators a bit, NUL-terminated strings are only a C convenience,
- // and the benefit seems unlikely to be measurable.
- // Compare identical level.
- const Normalizer2Impl &nfcImpl = data->nfcImpl;
- left += equalPrefixLength;
- right += equalPrefixLength;
- if(settings->dontCheckFCD()) {
- UTF16NFDIterator leftIter(left, leftLimit);
- UTF16NFDIterator rightIter(right, rightLimit);
- return compareNFDIter(nfcImpl, leftIter, rightIter);
- } else {
- FCDUTF16NFDIterator leftIter(nfcImpl, left, leftLimit);
- FCDUTF16NFDIterator rightIter(nfcImpl, right, rightLimit);
- return compareNFDIter(nfcImpl, leftIter, rightIter);
- }
- }
- UCollationResult
- RuleBasedCollator::doCompare(const uint8_t *left, int32_t leftLength,
- const uint8_t *right, int32_t rightLength,
- UErrorCode &errorCode) const {
- // U_FAILURE(errorCode) checked by caller.
- if(left == right && leftLength == rightLength) {
- return UCOL_EQUAL;
- }
- // Identical-prefix test.
- int32_t equalPrefixLength = 0;
- if(leftLength < 0) {
- uint8_t c;
- while((c = left[equalPrefixLength]) == right[equalPrefixLength]) {
- if(c == 0) { return UCOL_EQUAL; }
- ++equalPrefixLength;
- }
- } else {
- for(;;) {
- if(equalPrefixLength == leftLength) {
- if(equalPrefixLength == rightLength) { return UCOL_EQUAL; }
- break;
- } else if(equalPrefixLength == rightLength ||
- left[equalPrefixLength] != right[equalPrefixLength]) {
- break;
- }
- ++equalPrefixLength;
- }
- }
- // Back up to the start of a partially-equal code point.
- if(equalPrefixLength > 0 &&
- ((equalPrefixLength != leftLength && U8_IS_TRAIL(left[equalPrefixLength])) ||
- (equalPrefixLength != rightLength && U8_IS_TRAIL(right[equalPrefixLength])))) {
- while(--equalPrefixLength > 0 && U8_IS_TRAIL(left[equalPrefixLength])) {}
- }
- UBool numeric = settings->isNumeric();
- if(equalPrefixLength > 0) {
- UBool unsafe = false;
- if(equalPrefixLength != leftLength) {
- int32_t i = equalPrefixLength;
- UChar32 c;
- U8_NEXT_OR_FFFD(left, i, leftLength, c);
- unsafe = data->isUnsafeBackward(c, numeric);
- }
- if(!unsafe && equalPrefixLength != rightLength) {
- int32_t i = equalPrefixLength;
- UChar32 c;
- U8_NEXT_OR_FFFD(right, i, rightLength, c);
- unsafe = data->isUnsafeBackward(c, numeric);
- }
- if(unsafe) {
- // Identical prefix: Back up to the start of a contraction or reordering sequence.
- UChar32 c;
- do {
- U8_PREV_OR_FFFD(left, 0, equalPrefixLength, c);
- } while(equalPrefixLength > 0 && data->isUnsafeBackward(c, numeric));
- }
- // See the notes in the UTF-16 version.
- // Pass the actual start of each string into the CollationIterators,
- // plus the equalPrefixLength position,
- // so that prefix matches back into the equal prefix work.
- }
- int32_t result;
- int32_t fastLatinOptions = settings->fastLatinOptions;
- if(fastLatinOptions >= 0 &&
- (equalPrefixLength == leftLength ||
- left[equalPrefixLength] <= CollationFastLatin::LATIN_MAX_UTF8_LEAD) &&
- (equalPrefixLength == rightLength ||
- right[equalPrefixLength] <= CollationFastLatin::LATIN_MAX_UTF8_LEAD)) {
- if(leftLength >= 0) {
- result = CollationFastLatin::compareUTF8(data->fastLatinTable,
- settings->fastLatinPrimaries,
- fastLatinOptions,
- left + equalPrefixLength,
- leftLength - equalPrefixLength,
- right + equalPrefixLength,
- rightLength - equalPrefixLength);
- } else {
- result = CollationFastLatin::compareUTF8(data->fastLatinTable,
- settings->fastLatinPrimaries,
- fastLatinOptions,
- left + equalPrefixLength, -1,
- right + equalPrefixLength, -1);
- }
- } else {
- result = CollationFastLatin::BAIL_OUT_RESULT;
- }
- if(result == CollationFastLatin::BAIL_OUT_RESULT) {
- if(settings->dontCheckFCD()) {
- UTF8CollationIterator leftIter(data, numeric, left, equalPrefixLength, leftLength);
- UTF8CollationIterator rightIter(data, numeric, right, equalPrefixLength, rightLength);
- result = CollationCompare::compareUpToQuaternary(leftIter, rightIter, *settings, errorCode);
- } else {
- FCDUTF8CollationIterator leftIter(data, numeric, left, equalPrefixLength, leftLength);
- FCDUTF8CollationIterator rightIter(data, numeric, right, equalPrefixLength, rightLength);
- result = CollationCompare::compareUpToQuaternary(leftIter, rightIter, *settings, errorCode);
- }
- }
- if(result != UCOL_EQUAL || settings->getStrength() < UCOL_IDENTICAL || U_FAILURE(errorCode)) {
- return (UCollationResult)result;
- }
- // Note: If NUL-terminated, we could get the actual limits from the iterators now.
- // That would complicate the iterators a bit, NUL-terminated strings are only a C convenience,
- // and the benefit seems unlikely to be measurable.
- // Compare identical level.
- const Normalizer2Impl &nfcImpl = data->nfcImpl;
- left += equalPrefixLength;
- right += equalPrefixLength;
- if(leftLength > 0) {
- leftLength -= equalPrefixLength;
- rightLength -= equalPrefixLength;
- }
- if(settings->dontCheckFCD()) {
- UTF8NFDIterator leftIter(left, leftLength);
- UTF8NFDIterator rightIter(right, rightLength);
- return compareNFDIter(nfcImpl, leftIter, rightIter);
- } else {
- FCDUTF8NFDIterator leftIter(data, left, leftLength);
- FCDUTF8NFDIterator rightIter(data, right, rightLength);
- return compareNFDIter(nfcImpl, leftIter, rightIter);
- }
- }
- UCollationResult
- RuleBasedCollator::compare(UCharIterator &left, UCharIterator &right,
- UErrorCode &errorCode) const {
- if(U_FAILURE(errorCode) || &left == &right) { return UCOL_EQUAL; }
- UBool numeric = settings->isNumeric();
- // Identical-prefix test.
- int32_t equalPrefixLength = 0;
- {
- UChar32 leftUnit;
- UChar32 rightUnit;
- while((leftUnit = left.next(&left)) == (rightUnit = right.next(&right))) {
- if(leftUnit < 0) { return UCOL_EQUAL; }
- ++equalPrefixLength;
- }
- // Back out the code units that differed, for the real collation comparison.
- if(leftUnit >= 0) { left.previous(&left); }
- if(rightUnit >= 0) { right.previous(&right); }
- if(equalPrefixLength > 0) {
- if((leftUnit >= 0 && data->isUnsafeBackward(leftUnit, numeric)) ||
- (rightUnit >= 0 && data->isUnsafeBackward(rightUnit, numeric))) {
- // Identical prefix: Back up to the start of a contraction or reordering sequence.
- do {
- --equalPrefixLength;
- leftUnit = left.previous(&left);
- right.previous(&right);
- } while(equalPrefixLength > 0 && data->isUnsafeBackward(leftUnit, numeric));
- }
- // See the notes in the UTF-16 version.
- }
- }
- UCollationResult result;
- if(settings->dontCheckFCD()) {
- UIterCollationIterator leftIter(data, numeric, left);
- UIterCollationIterator rightIter(data, numeric, right);
- result = CollationCompare::compareUpToQuaternary(leftIter, rightIter, *settings, errorCode);
- } else {
- FCDUIterCollationIterator leftIter(data, numeric, left, equalPrefixLength);
- FCDUIterCollationIterator rightIter(data, numeric, right, equalPrefixLength);
- result = CollationCompare::compareUpToQuaternary(leftIter, rightIter, *settings, errorCode);
- }
- if(result != UCOL_EQUAL || settings->getStrength() < UCOL_IDENTICAL || U_FAILURE(errorCode)) {
- return result;
- }
- // Compare identical level.
- left.move(&left, equalPrefixLength, UITER_ZERO);
- right.move(&right, equalPrefixLength, UITER_ZERO);
- const Normalizer2Impl &nfcImpl = data->nfcImpl;
- if(settings->dontCheckFCD()) {
- UIterNFDIterator leftIter(left);
- UIterNFDIterator rightIter(right);
- return compareNFDIter(nfcImpl, leftIter, rightIter);
- } else {
- FCDUIterNFDIterator leftIter(data, left, equalPrefixLength);
- FCDUIterNFDIterator rightIter(data, right, equalPrefixLength);
- return compareNFDIter(nfcImpl, leftIter, rightIter);
- }
- }
- CollationKey &
- RuleBasedCollator::getCollationKey(const UnicodeString &s, CollationKey &key,
- UErrorCode &errorCode) const {
- return getCollationKey(s.getBuffer(), s.length(), key, errorCode);
- }
- CollationKey &
- RuleBasedCollator::getCollationKey(const char16_t *s, int32_t length, CollationKey& key,
- UErrorCode &errorCode) const {
- if(U_FAILURE(errorCode)) {
- return key.setToBogus();
- }
- if(s == nullptr && length != 0) {
- errorCode = U_ILLEGAL_ARGUMENT_ERROR;
- return key.setToBogus();
- }
- key.reset(); // resets the "bogus" state
- CollationKeyByteSink sink(key);
- writeSortKey(s, length, sink, errorCode);
- if(U_FAILURE(errorCode)) {
- key.setToBogus();
- } else if(key.isBogus()) {
- errorCode = U_MEMORY_ALLOCATION_ERROR;
- } else {
- key.setLength(sink.NumberOfBytesAppended());
- }
- return key;
- }
- int32_t
- RuleBasedCollator::getSortKey(const UnicodeString &s,
- uint8_t *dest, int32_t capacity) const {
- return getSortKey(s.getBuffer(), s.length(), dest, capacity);
- }
- int32_t
- RuleBasedCollator::getSortKey(const char16_t *s, int32_t length,
- uint8_t *dest, int32_t capacity) const {
- if((s == nullptr && length != 0) || capacity < 0 || (dest == nullptr && capacity > 0)) {
- return 0;
- }
- uint8_t noDest[1] = { 0 };
- if(dest == nullptr) {
- // Distinguish pure preflighting from an allocation error.
- dest = noDest;
- capacity = 0;
- }
- FixedSortKeyByteSink sink(reinterpret_cast<char *>(dest), capacity);
- UErrorCode errorCode = U_ZERO_ERROR;
- writeSortKey(s, length, sink, errorCode);
- return U_SUCCESS(errorCode) ? sink.NumberOfBytesAppended() : 0;
- }
- void
- RuleBasedCollator::writeSortKey(const char16_t *s, int32_t length,
- SortKeyByteSink &sink, UErrorCode &errorCode) const {
- if(U_FAILURE(errorCode)) { return; }
- const char16_t *limit = (length >= 0) ? s + length : nullptr;
- UBool numeric = settings->isNumeric();
- CollationKeys::LevelCallback callback;
- if(settings->dontCheckFCD()) {
- UTF16CollationIterator iter(data, numeric, s, s, limit);
- CollationKeys::writeSortKeyUpToQuaternary(iter, data->compressibleBytes, *settings,
- sink, Collation::PRIMARY_LEVEL,
- callback, true, errorCode);
- } else {
- FCDUTF16CollationIterator iter(data, numeric, s, s, limit);
- CollationKeys::writeSortKeyUpToQuaternary(iter, data->compressibleBytes, *settings,
- sink, Collation::PRIMARY_LEVEL,
- callback, true, errorCode);
- }
- if(settings->getStrength() == UCOL_IDENTICAL) {
- writeIdenticalLevel(s, limit, sink, errorCode);
- }
- static const char terminator = 0; // TERMINATOR_BYTE
- sink.Append(&terminator, 1);
- }
- void
- RuleBasedCollator::writeIdenticalLevel(const char16_t *s, const char16_t *limit,
- SortKeyByteSink &sink, UErrorCode &errorCode) const {
- // NFD quick check
- const char16_t *nfdQCYesLimit = data->nfcImpl.decompose(s, limit, nullptr, errorCode);
- if(U_FAILURE(errorCode)) { return; }
- sink.Append(Collation::LEVEL_SEPARATOR_BYTE);
- UChar32 prev = 0;
- if(nfdQCYesLimit != s) {
- prev = u_writeIdenticalLevelRun(prev, s, (int32_t)(nfdQCYesLimit - s), sink);
- }
- // Is there non-NFD text?
- int32_t destLengthEstimate;
- if(limit != nullptr) {
- if(nfdQCYesLimit == limit) { return; }
- destLengthEstimate = (int32_t)(limit - nfdQCYesLimit);
- } else {
- // s is NUL-terminated
- if(*nfdQCYesLimit == 0) { return; }
- destLengthEstimate = -1;
- }
- UnicodeString nfd;
- data->nfcImpl.decompose(nfdQCYesLimit, limit, nfd, destLengthEstimate, errorCode);
- u_writeIdenticalLevelRun(prev, nfd.getBuffer(), nfd.length(), sink);
- }
- namespace {
- /**
- * internalNextSortKeyPart() calls CollationKeys::writeSortKeyUpToQuaternary()
- * with an instance of this callback class.
- * When another level is about to be written, the callback
- * records the level and the number of bytes that will be written until
- * the sink (which is actually a FixedSortKeyByteSink) fills up.
- *
- * When internalNextSortKeyPart() is called again, it restarts with the last level
- * and ignores as many bytes as were written previously for that level.
- */
- class PartLevelCallback : public CollationKeys::LevelCallback {
- public:
- PartLevelCallback(const SortKeyByteSink &s)
- : sink(s), level(Collation::PRIMARY_LEVEL) {
- levelCapacity = sink.GetRemainingCapacity();
- }
- virtual ~PartLevelCallback() {}
- virtual UBool needToWrite(Collation::Level l) override {
- if(!sink.Overflowed()) {
- // Remember a level that will be at least partially written.
- level = l;
- levelCapacity = sink.GetRemainingCapacity();
- return true;
- } else {
- return false;
- }
- }
- Collation::Level getLevel() const { return level; }
- int32_t getLevelCapacity() const { return levelCapacity; }
- private:
- const SortKeyByteSink &sink;
- Collation::Level level;
- int32_t levelCapacity;
- };
- } // namespace
- int32_t
- RuleBasedCollator::internalNextSortKeyPart(UCharIterator *iter, uint32_t state[2],
- uint8_t *dest, int32_t count, UErrorCode &errorCode) const {
- if(U_FAILURE(errorCode)) { return 0; }
- if(iter == nullptr || state == nullptr || count < 0 || (count > 0 && dest == nullptr)) {
- errorCode = U_ILLEGAL_ARGUMENT_ERROR;
- return 0;
- }
- if(count == 0) { return 0; }
- FixedSortKeyByteSink sink(reinterpret_cast<char *>(dest), count);
- sink.IgnoreBytes((int32_t)state[1]);
- iter->move(iter, 0, UITER_START);
- Collation::Level level = (Collation::Level)state[0];
- if(level <= Collation::QUATERNARY_LEVEL) {
- UBool numeric = settings->isNumeric();
- PartLevelCallback callback(sink);
- if(settings->dontCheckFCD()) {
- UIterCollationIterator ci(data, numeric, *iter);
- CollationKeys::writeSortKeyUpToQuaternary(ci, data->compressibleBytes, *settings,
- sink, level, callback, false, errorCode);
- } else {
- FCDUIterCollationIterator ci(data, numeric, *iter, 0);
- CollationKeys::writeSortKeyUpToQuaternary(ci, data->compressibleBytes, *settings,
- sink, level, callback, false, errorCode);
- }
- if(U_FAILURE(errorCode)) { return 0; }
- if(sink.NumberOfBytesAppended() > count) {
- state[0] = (uint32_t)callback.getLevel();
- state[1] = (uint32_t)callback.getLevelCapacity();
- return count;
- }
- // All of the normal levels are done.
- if(settings->getStrength() == UCOL_IDENTICAL) {
- level = Collation::IDENTICAL_LEVEL;
- iter->move(iter, 0, UITER_START);
- }
- // else fall through to setting ZERO_LEVEL
- }
- if(level == Collation::IDENTICAL_LEVEL) {
- int32_t levelCapacity = sink.GetRemainingCapacity();
- UnicodeString s;
- for(;;) {
- UChar32 c = iter->next(iter);
- if(c < 0) { break; }
- s.append((char16_t)c);
- }
- const char16_t *sArray = s.getBuffer();
- writeIdenticalLevel(sArray, sArray + s.length(), sink, errorCode);
- if(U_FAILURE(errorCode)) { return 0; }
- if(sink.NumberOfBytesAppended() > count) {
- state[0] = (uint32_t)level;
- state[1] = (uint32_t)levelCapacity;
- return count;
- }
- }
- // ZERO_LEVEL: Fill the remainder of dest with 00 bytes.
- state[0] = (uint32_t)Collation::ZERO_LEVEL;
- state[1] = 0;
- int32_t length = sink.NumberOfBytesAppended();
- int32_t i = length;
- while(i < count) { dest[i++] = 0; }
- return length;
- }
- void
- RuleBasedCollator::internalGetCEs(const UnicodeString &str, UVector64 &ces,
- UErrorCode &errorCode) const {
- if(U_FAILURE(errorCode)) { return; }
- const char16_t *s = str.getBuffer();
- const char16_t *limit = s + str.length();
- UBool numeric = settings->isNumeric();
- if(settings->dontCheckFCD()) {
- UTF16CollationIterator iter(data, numeric, s, s, limit);
- int64_t ce;
- while((ce = iter.nextCE(errorCode)) != Collation::NO_CE) {
- ces.addElement(ce, errorCode);
- }
- } else {
- FCDUTF16CollationIterator iter(data, numeric, s, s, limit);
- int64_t ce;
- while((ce = iter.nextCE(errorCode)) != Collation::NO_CE) {
- ces.addElement(ce, errorCode);
- }
- }
- }
- namespace {
- void appendSubtag(CharString &s, char letter, const char *subtag, int32_t length,
- UErrorCode &errorCode) {
- if(U_FAILURE(errorCode) || length == 0) { return; }
- if(!s.isEmpty()) {
- s.append('_', errorCode);
- }
- s.append(letter, errorCode);
- for(int32_t i = 0; i < length; ++i) {
- s.append(uprv_toupper(subtag[i]), errorCode);
- }
- }
- void appendAttribute(CharString &s, char letter, UColAttributeValue value,
- UErrorCode &errorCode) {
- if(U_FAILURE(errorCode)) { return; }
- if(!s.isEmpty()) {
- s.append('_', errorCode);
- }
- static const char *valueChars = "1234...........IXO..SN..LU......";
- s.append(letter, errorCode);
- s.append(valueChars[value], errorCode);
- }
- } // namespace
- int32_t
- RuleBasedCollator::internalGetShortDefinitionString(const char *locale,
- char *buffer, int32_t capacity,
- UErrorCode &errorCode) const {
- if(U_FAILURE(errorCode)) { return 0; }
- if(buffer == nullptr ? capacity != 0 : capacity < 0) {
- errorCode = U_ILLEGAL_ARGUMENT_ERROR;
- return 0;
- }
- if(locale == nullptr) {
- locale = internalGetLocaleID(ULOC_VALID_LOCALE, errorCode);
- }
- char resultLocale[ULOC_FULLNAME_CAPACITY + 1];
- int32_t length = ucol_getFunctionalEquivalent(resultLocale, ULOC_FULLNAME_CAPACITY,
- "collation", locale,
- nullptr, &errorCode);
- if(U_FAILURE(errorCode)) { return 0; }
- resultLocale[length] = 0;
- // Append items in alphabetic order of their short definition letters.
- CharString result;
- char subtag[ULOC_KEYWORD_AND_VALUES_CAPACITY];
- if(attributeHasBeenSetExplicitly(UCOL_ALTERNATE_HANDLING)) {
- appendAttribute(result, 'A', getAttribute(UCOL_ALTERNATE_HANDLING, errorCode), errorCode);
- }
- // ATTR_VARIABLE_TOP not supported because 'B' was broken.
- // See ICU tickets #10372 and #10386.
- if(attributeHasBeenSetExplicitly(UCOL_CASE_FIRST)) {
- appendAttribute(result, 'C', getAttribute(UCOL_CASE_FIRST, errorCode), errorCode);
- }
- if(attributeHasBeenSetExplicitly(UCOL_NUMERIC_COLLATION)) {
- appendAttribute(result, 'D', getAttribute(UCOL_NUMERIC_COLLATION, errorCode), errorCode);
- }
- if(attributeHasBeenSetExplicitly(UCOL_CASE_LEVEL)) {
- appendAttribute(result, 'E', getAttribute(UCOL_CASE_LEVEL, errorCode), errorCode);
- }
- if(attributeHasBeenSetExplicitly(UCOL_FRENCH_COLLATION)) {
- appendAttribute(result, 'F', getAttribute(UCOL_FRENCH_COLLATION, errorCode), errorCode);
- }
- // Note: UCOL_HIRAGANA_QUATERNARY_MODE is deprecated and never changes away from default.
- length = uloc_getKeywordValue(resultLocale, "collation", subtag, UPRV_LENGTHOF(subtag), &errorCode);
- appendSubtag(result, 'K', subtag, length, errorCode);
- length = uloc_getLanguage(resultLocale, subtag, UPRV_LENGTHOF(subtag), &errorCode);
- if (length == 0) {
- appendSubtag(result, 'L', "root", 4, errorCode);
- } else {
- appendSubtag(result, 'L', subtag, length, errorCode);
- }
- if(attributeHasBeenSetExplicitly(UCOL_NORMALIZATION_MODE)) {
- appendAttribute(result, 'N', getAttribute(UCOL_NORMALIZATION_MODE, errorCode), errorCode);
- }
- length = uloc_getCountry(resultLocale, subtag, UPRV_LENGTHOF(subtag), &errorCode);
- appendSubtag(result, 'R', subtag, length, errorCode);
- if(attributeHasBeenSetExplicitly(UCOL_STRENGTH)) {
- appendAttribute(result, 'S', getAttribute(UCOL_STRENGTH, errorCode), errorCode);
- }
- length = uloc_getVariant(resultLocale, subtag, UPRV_LENGTHOF(subtag), &errorCode);
- appendSubtag(result, 'V', subtag, length, errorCode);
- length = uloc_getScript(resultLocale, subtag, UPRV_LENGTHOF(subtag), &errorCode);
- appendSubtag(result, 'Z', subtag, length, errorCode);
- if(U_FAILURE(errorCode)) { return 0; }
- return result.extract(buffer, capacity, errorCode);
- }
- UBool
- RuleBasedCollator::isUnsafe(UChar32 c) const {
- return data->isUnsafeBackward(c, settings->isNumeric());
- }
- void U_CALLCONV
- RuleBasedCollator::computeMaxExpansions(const CollationTailoring *t, UErrorCode &errorCode) {
- t->maxExpansions = CollationElementIterator::computeMaxExpansions(t->data, errorCode);
- }
- UBool
- RuleBasedCollator::initMaxExpansions(UErrorCode &errorCode) const {
- umtx_initOnce(tailoring->maxExpansionsInitOnce, computeMaxExpansions, tailoring, errorCode);
- return U_SUCCESS(errorCode);
- }
- CollationElementIterator *
- RuleBasedCollator::createCollationElementIterator(const UnicodeString& source) const {
- UErrorCode errorCode = U_ZERO_ERROR;
- if(!initMaxExpansions(errorCode)) { return nullptr; }
- CollationElementIterator *cei = new CollationElementIterator(source, this, errorCode);
- if(U_FAILURE(errorCode)) {
- delete cei;
- return nullptr;
- }
- return cei;
- }
- CollationElementIterator *
- RuleBasedCollator::createCollationElementIterator(const CharacterIterator& source) const {
- UErrorCode errorCode = U_ZERO_ERROR;
- if(!initMaxExpansions(errorCode)) { return nullptr; }
- CollationElementIterator *cei = new CollationElementIterator(source, this, errorCode);
- if(U_FAILURE(errorCode)) {
- delete cei;
- return nullptr;
- }
- return cei;
- }
- int32_t
- RuleBasedCollator::getMaxExpansion(int32_t order) const {
- UErrorCode errorCode = U_ZERO_ERROR;
- (void)initMaxExpansions(errorCode);
- return CollationElementIterator::getMaxExpansion(tailoring->maxExpansions, order);
- }
- U_NAMESPACE_END
- #endif // !UCONFIG_NO_COLLATION
|