rulebasedcollator.cpp 60 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. *******************************************************************************
  5. * Copyright (C) 1996-2015, International Business Machines
  6. * Corporation and others. All Rights Reserved.
  7. *******************************************************************************
  8. * rulebasedcollator.cpp
  9. *
  10. * (replaced the former tblcoll.cpp)
  11. *
  12. * created on: 2012feb14 with new and old collation code
  13. * created by: Markus W. Scherer
  14. */
  15. #include "unicode/utypes.h"
  16. #if !UCONFIG_NO_COLLATION
  17. #include "unicode/coll.h"
  18. #include "unicode/coleitr.h"
  19. #include "unicode/localpointer.h"
  20. #include "unicode/locid.h"
  21. #include "unicode/sortkey.h"
  22. #include "unicode/tblcoll.h"
  23. #include "unicode/ucol.h"
  24. #include "unicode/uiter.h"
  25. #include "unicode/uloc.h"
  26. #include "unicode/uniset.h"
  27. #include "unicode/unistr.h"
  28. #include "unicode/usetiter.h"
  29. #include "unicode/utf8.h"
  30. #include "unicode/uversion.h"
  31. #include "bocsu.h"
  32. #include "charstr.h"
  33. #include "cmemory.h"
  34. #include "collation.h"
  35. #include "collationcompare.h"
  36. #include "collationdata.h"
  37. #include "collationdatareader.h"
  38. #include "collationfastlatin.h"
  39. #include "collationiterator.h"
  40. #include "collationkeys.h"
  41. #include "collationroot.h"
  42. #include "collationsets.h"
  43. #include "collationsettings.h"
  44. #include "collationtailoring.h"
  45. #include "cstring.h"
  46. #include "uassert.h"
  47. #include "ucol_imp.h"
  48. #include "uhash.h"
  49. #include "uitercollationiterator.h"
  50. #include "ustr_imp.h"
  51. #include "utf16collationiterator.h"
  52. #include "utf8collationiterator.h"
  53. #include "uvectr64.h"
  54. U_NAMESPACE_BEGIN
  55. namespace {
  56. class FixedSortKeyByteSink : public SortKeyByteSink {
  57. public:
  58. FixedSortKeyByteSink(char *dest, int32_t destCapacity)
  59. : SortKeyByteSink(dest, destCapacity) {}
  60. virtual ~FixedSortKeyByteSink();
  61. private:
  62. virtual void AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length) override;
  63. virtual UBool Resize(int32_t appendCapacity, int32_t length) override;
  64. };
  65. FixedSortKeyByteSink::~FixedSortKeyByteSink() {}
  66. void
  67. FixedSortKeyByteSink::AppendBeyondCapacity(const char *bytes, int32_t /*n*/, int32_t length) {
  68. // buffer_ != nullptr && bytes != nullptr && n > 0 && appended_ > capacity_
  69. // Fill the buffer completely.
  70. int32_t available = capacity_ - length;
  71. if (available > 0) {
  72. uprv_memcpy(buffer_ + length, bytes, available);
  73. }
  74. }
  75. UBool
  76. FixedSortKeyByteSink::Resize(int32_t /*appendCapacity*/, int32_t /*length*/) {
  77. return false;
  78. }
  79. } // namespace
  80. // Not in an anonymous namespace, so that it can be a friend of CollationKey.
  81. class CollationKeyByteSink : public SortKeyByteSink {
  82. public:
  83. CollationKeyByteSink(CollationKey &key)
  84. : SortKeyByteSink(reinterpret_cast<char *>(key.getBytes()), key.getCapacity()),
  85. key_(key) {}
  86. virtual ~CollationKeyByteSink();
  87. private:
  88. virtual void AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length) override;
  89. virtual UBool Resize(int32_t appendCapacity, int32_t length) override;
  90. CollationKey &key_;
  91. };
  92. CollationKeyByteSink::~CollationKeyByteSink() {}
  93. void
  94. CollationKeyByteSink::AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length) {
  95. // buffer_ != nullptr && bytes != nullptr && n > 0 && appended_ > capacity_
  96. if (Resize(n, length)) {
  97. uprv_memcpy(buffer_ + length, bytes, n);
  98. }
  99. }
  100. UBool
  101. CollationKeyByteSink::Resize(int32_t appendCapacity, int32_t length) {
  102. if (buffer_ == nullptr) {
  103. return false; // allocation failed before already
  104. }
  105. int32_t newCapacity = 2 * capacity_;
  106. int32_t altCapacity = length + 2 * appendCapacity;
  107. if (newCapacity < altCapacity) {
  108. newCapacity = altCapacity;
  109. }
  110. if (newCapacity < 200) {
  111. newCapacity = 200;
  112. }
  113. uint8_t *newBuffer = key_.reallocate(newCapacity, length);
  114. if (newBuffer == nullptr) {
  115. SetNotOk();
  116. return false;
  117. }
  118. buffer_ = reinterpret_cast<char *>(newBuffer);
  119. capacity_ = newCapacity;
  120. return true;
  121. }
  122. RuleBasedCollator::RuleBasedCollator(const RuleBasedCollator &other)
  123. : Collator(other),
  124. data(other.data),
  125. settings(other.settings),
  126. tailoring(other.tailoring),
  127. cacheEntry(other.cacheEntry),
  128. validLocale(other.validLocale),
  129. explicitlySetAttributes(other.explicitlySetAttributes),
  130. actualLocaleIsSameAsValid(other.actualLocaleIsSameAsValid) {
  131. settings->addRef();
  132. cacheEntry->addRef();
  133. }
  134. RuleBasedCollator::RuleBasedCollator(const uint8_t *bin, int32_t length,
  135. const RuleBasedCollator *base, UErrorCode &errorCode)
  136. : data(nullptr),
  137. settings(nullptr),
  138. tailoring(nullptr),
  139. cacheEntry(nullptr),
  140. validLocale(""),
  141. explicitlySetAttributes(0),
  142. actualLocaleIsSameAsValid(false) {
  143. if(U_FAILURE(errorCode)) { return; }
  144. if(bin == nullptr || length == 0 || base == nullptr) {
  145. errorCode = U_ILLEGAL_ARGUMENT_ERROR;
  146. return;
  147. }
  148. const CollationTailoring *root = CollationRoot::getRoot(errorCode);
  149. if(U_FAILURE(errorCode)) { return; }
  150. if(base->tailoring != root) {
  151. errorCode = U_UNSUPPORTED_ERROR;
  152. return;
  153. }
  154. LocalPointer<CollationTailoring> t(new CollationTailoring(base->tailoring->settings));
  155. if(t.isNull() || t->isBogus()) {
  156. errorCode = U_MEMORY_ALLOCATION_ERROR;
  157. return;
  158. }
  159. CollationDataReader::read(base->tailoring, bin, length, *t, errorCode);
  160. if(U_FAILURE(errorCode)) { return; }
  161. t->actualLocale.setToBogus();
  162. adoptTailoring(t.orphan(), errorCode);
  163. }
  164. RuleBasedCollator::RuleBasedCollator(const CollationCacheEntry *entry)
  165. : data(entry->tailoring->data),
  166. settings(entry->tailoring->settings),
  167. tailoring(entry->tailoring),
  168. cacheEntry(entry),
  169. validLocale(entry->validLocale),
  170. explicitlySetAttributes(0),
  171. actualLocaleIsSameAsValid(false) {
  172. settings->addRef();
  173. cacheEntry->addRef();
  174. }
  175. RuleBasedCollator::~RuleBasedCollator() {
  176. SharedObject::clearPtr(settings);
  177. SharedObject::clearPtr(cacheEntry);
  178. }
  179. void
  180. RuleBasedCollator::adoptTailoring(CollationTailoring *t, UErrorCode &errorCode) {
  181. if(U_FAILURE(errorCode)) {
  182. t->deleteIfZeroRefCount();
  183. return;
  184. }
  185. U_ASSERT(settings == nullptr && data == nullptr && tailoring == nullptr && cacheEntry == nullptr);
  186. cacheEntry = new CollationCacheEntry(t->actualLocale, t);
  187. if(cacheEntry == nullptr) {
  188. errorCode = U_MEMORY_ALLOCATION_ERROR;
  189. t->deleteIfZeroRefCount();
  190. return;
  191. }
  192. data = t->data;
  193. settings = t->settings;
  194. settings->addRef();
  195. tailoring = t;
  196. cacheEntry->addRef();
  197. validLocale = t->actualLocale;
  198. actualLocaleIsSameAsValid = false;
  199. }
  200. RuleBasedCollator *
  201. RuleBasedCollator::clone() const {
  202. return new RuleBasedCollator(*this);
  203. }
  204. RuleBasedCollator &RuleBasedCollator::operator=(const RuleBasedCollator &other) {
  205. if(this == &other) { return *this; }
  206. SharedObject::copyPtr(other.settings, settings);
  207. tailoring = other.tailoring;
  208. SharedObject::copyPtr(other.cacheEntry, cacheEntry);
  209. data = tailoring->data;
  210. validLocale = other.validLocale;
  211. explicitlySetAttributes = other.explicitlySetAttributes;
  212. actualLocaleIsSameAsValid = other.actualLocaleIsSameAsValid;
  213. return *this;
  214. }
  215. UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedCollator)
  216. bool
  217. RuleBasedCollator::operator==(const Collator& other) const {
  218. if(this == &other) { return true; }
  219. if(!Collator::operator==(other)) { return false; }
  220. const RuleBasedCollator &o = static_cast<const RuleBasedCollator &>(other);
  221. if(*settings != *o.settings) { return false; }
  222. if(data == o.data) { return true; }
  223. UBool thisIsRoot = data->base == nullptr;
  224. UBool otherIsRoot = o.data->base == nullptr;
  225. U_ASSERT(!thisIsRoot || !otherIsRoot); // otherwise their data pointers should be ==
  226. if(thisIsRoot != otherIsRoot) { return false; }
  227. if((thisIsRoot || !tailoring->rules.isEmpty()) &&
  228. (otherIsRoot || !o.tailoring->rules.isEmpty())) {
  229. // Shortcut: If both collators have valid rule strings, then compare those.
  230. if(tailoring->rules == o.tailoring->rules) { return true; }
  231. }
  232. // Different rule strings can result in the same or equivalent tailoring.
  233. // The rule strings are optional in ICU resource bundles, although included by default.
  234. // cloneBinary() drops the rule string.
  235. UErrorCode errorCode = U_ZERO_ERROR;
  236. LocalPointer<UnicodeSet> thisTailored(getTailoredSet(errorCode));
  237. LocalPointer<UnicodeSet> otherTailored(o.getTailoredSet(errorCode));
  238. if(U_FAILURE(errorCode)) { return false; }
  239. if(*thisTailored != *otherTailored) { return false; }
  240. // For completeness, we should compare all of the mappings;
  241. // or we should create a list of strings, sort it with one collator,
  242. // and check if both collators compare adjacent strings the same
  243. // (order & strength, down to quaternary); or similar.
  244. // Testing equality of collators seems unusual.
  245. return true;
  246. }
  247. int32_t
  248. RuleBasedCollator::hashCode() const {
  249. int32_t h = settings->hashCode();
  250. if(data->base == nullptr) { return h; } // root collator
  251. // Do not rely on the rule string, see comments in operator==().
  252. UErrorCode errorCode = U_ZERO_ERROR;
  253. LocalPointer<UnicodeSet> set(getTailoredSet(errorCode));
  254. if(U_FAILURE(errorCode)) { return 0; }
  255. UnicodeSetIterator iter(*set);
  256. while(iter.next() && !iter.isString()) {
  257. h ^= data->getCE32(iter.getCodepoint());
  258. }
  259. return h;
  260. }
  261. void
  262. RuleBasedCollator::setLocales(const Locale &requested, const Locale &valid,
  263. const Locale &actual) {
  264. if(actual == tailoring->actualLocale) {
  265. actualLocaleIsSameAsValid = false;
  266. } else {
  267. U_ASSERT(actual == valid);
  268. actualLocaleIsSameAsValid = true;
  269. }
  270. // Do not modify tailoring.actualLocale:
  271. // We cannot be sure that that would be thread-safe.
  272. validLocale = valid;
  273. (void)requested; // Ignore, see also ticket #10477.
  274. }
  275. Locale
  276. RuleBasedCollator::getLocale(ULocDataLocaleType type, UErrorCode& errorCode) const {
  277. if(U_FAILURE(errorCode)) {
  278. return Locale::getRoot();
  279. }
  280. switch(type) {
  281. case ULOC_ACTUAL_LOCALE:
  282. return actualLocaleIsSameAsValid ? validLocale : tailoring->actualLocale;
  283. case ULOC_VALID_LOCALE:
  284. return validLocale;
  285. case ULOC_REQUESTED_LOCALE:
  286. default:
  287. errorCode = U_ILLEGAL_ARGUMENT_ERROR;
  288. return Locale::getRoot();
  289. }
  290. }
  291. const char *
  292. RuleBasedCollator::internalGetLocaleID(ULocDataLocaleType type, UErrorCode &errorCode) const {
  293. if(U_FAILURE(errorCode)) {
  294. return nullptr;
  295. }
  296. const Locale *result;
  297. switch(type) {
  298. case ULOC_ACTUAL_LOCALE:
  299. result = actualLocaleIsSameAsValid ? &validLocale : &tailoring->actualLocale;
  300. break;
  301. case ULOC_VALID_LOCALE:
  302. result = &validLocale;
  303. break;
  304. case ULOC_REQUESTED_LOCALE:
  305. default:
  306. errorCode = U_ILLEGAL_ARGUMENT_ERROR;
  307. return nullptr;
  308. }
  309. if(result->isBogus()) { return nullptr; }
  310. const char *id = result->getName();
  311. return id[0] == 0 ? "root" : id;
  312. }
  313. const UnicodeString&
  314. RuleBasedCollator::getRules() const {
  315. return tailoring->rules;
  316. }
  317. void
  318. RuleBasedCollator::getRules(UColRuleOption delta, UnicodeString &buffer) const {
  319. if(delta == UCOL_TAILORING_ONLY) {
  320. buffer = tailoring->rules;
  321. return;
  322. }
  323. // UCOL_FULL_RULES
  324. buffer.remove();
  325. CollationLoader::appendRootRules(buffer);
  326. buffer.append(tailoring->rules).getTerminatedBuffer();
  327. }
  328. void
  329. RuleBasedCollator::getVersion(UVersionInfo version) const {
  330. uprv_memcpy(version, tailoring->version, U_MAX_VERSION_LENGTH);
  331. version[0] += (UCOL_RUNTIME_VERSION << 4) + (UCOL_RUNTIME_VERSION >> 4);
  332. }
  333. UnicodeSet *
  334. RuleBasedCollator::getTailoredSet(UErrorCode &errorCode) const {
  335. if(U_FAILURE(errorCode)) { return nullptr; }
  336. UnicodeSet *tailored = new UnicodeSet();
  337. if(tailored == nullptr) {
  338. errorCode = U_MEMORY_ALLOCATION_ERROR;
  339. return nullptr;
  340. }
  341. if(data->base != nullptr) {
  342. TailoredSet(tailored).forData(data, errorCode);
  343. if(U_FAILURE(errorCode)) {
  344. delete tailored;
  345. return nullptr;
  346. }
  347. }
  348. return tailored;
  349. }
  350. void
  351. RuleBasedCollator::internalGetContractionsAndExpansions(
  352. UnicodeSet *contractions, UnicodeSet *expansions,
  353. UBool addPrefixes, UErrorCode &errorCode) const {
  354. if(U_FAILURE(errorCode)) { return; }
  355. if(contractions != nullptr) {
  356. contractions->clear();
  357. }
  358. if(expansions != nullptr) {
  359. expansions->clear();
  360. }
  361. ContractionsAndExpansions(contractions, expansions, nullptr, addPrefixes).forData(data, errorCode);
  362. }
  363. void
  364. RuleBasedCollator::internalAddContractions(UChar32 c, UnicodeSet &set, UErrorCode &errorCode) const {
  365. if(U_FAILURE(errorCode)) { return; }
  366. ContractionsAndExpansions(&set, nullptr, nullptr, false).forCodePoint(data, c, errorCode);
  367. }
  368. const CollationSettings &
  369. RuleBasedCollator::getDefaultSettings() const {
  370. return *tailoring->settings;
  371. }
  372. UColAttributeValue
  373. RuleBasedCollator::getAttribute(UColAttribute attr, UErrorCode &errorCode) const {
  374. if(U_FAILURE(errorCode)) { return UCOL_DEFAULT; }
  375. int32_t option;
  376. switch(attr) {
  377. case UCOL_FRENCH_COLLATION:
  378. option = CollationSettings::BACKWARD_SECONDARY;
  379. break;
  380. case UCOL_ALTERNATE_HANDLING:
  381. return settings->getAlternateHandling();
  382. case UCOL_CASE_FIRST:
  383. return settings->getCaseFirst();
  384. case UCOL_CASE_LEVEL:
  385. option = CollationSettings::CASE_LEVEL;
  386. break;
  387. case UCOL_NORMALIZATION_MODE:
  388. option = CollationSettings::CHECK_FCD;
  389. break;
  390. case UCOL_STRENGTH:
  391. return (UColAttributeValue)settings->getStrength();
  392. case UCOL_HIRAGANA_QUATERNARY_MODE:
  393. // Deprecated attribute, unsettable.
  394. return UCOL_OFF;
  395. case UCOL_NUMERIC_COLLATION:
  396. option = CollationSettings::NUMERIC;
  397. break;
  398. default:
  399. errorCode = U_ILLEGAL_ARGUMENT_ERROR;
  400. return UCOL_DEFAULT;
  401. }
  402. return ((settings->options & option) == 0) ? UCOL_OFF : UCOL_ON;
  403. }
  404. void
  405. RuleBasedCollator::setAttribute(UColAttribute attr, UColAttributeValue value,
  406. UErrorCode &errorCode) {
  407. UColAttributeValue oldValue = getAttribute(attr, errorCode);
  408. if(U_FAILURE(errorCode)) { return; }
  409. if(value == oldValue) {
  410. setAttributeExplicitly(attr);
  411. return;
  412. }
  413. const CollationSettings &defaultSettings = getDefaultSettings();
  414. if(settings == &defaultSettings) {
  415. if(value == UCOL_DEFAULT) {
  416. setAttributeDefault(attr);
  417. return;
  418. }
  419. }
  420. CollationSettings *ownedSettings = SharedObject::copyOnWrite(settings);
  421. if(ownedSettings == nullptr) {
  422. errorCode = U_MEMORY_ALLOCATION_ERROR;
  423. return;
  424. }
  425. switch(attr) {
  426. case UCOL_FRENCH_COLLATION:
  427. ownedSettings->setFlag(CollationSettings::BACKWARD_SECONDARY, value,
  428. defaultSettings.options, errorCode);
  429. break;
  430. case UCOL_ALTERNATE_HANDLING:
  431. ownedSettings->setAlternateHandling(value, defaultSettings.options, errorCode);
  432. break;
  433. case UCOL_CASE_FIRST:
  434. ownedSettings->setCaseFirst(value, defaultSettings.options, errorCode);
  435. break;
  436. case UCOL_CASE_LEVEL:
  437. ownedSettings->setFlag(CollationSettings::CASE_LEVEL, value,
  438. defaultSettings.options, errorCode);
  439. break;
  440. case UCOL_NORMALIZATION_MODE:
  441. ownedSettings->setFlag(CollationSettings::CHECK_FCD, value,
  442. defaultSettings.options, errorCode);
  443. break;
  444. case UCOL_STRENGTH:
  445. ownedSettings->setStrength(value, defaultSettings.options, errorCode);
  446. break;
  447. case UCOL_HIRAGANA_QUATERNARY_MODE:
  448. // Deprecated attribute. Check for valid values but do not change anything.
  449. if(value != UCOL_OFF && value != UCOL_ON && value != UCOL_DEFAULT) {
  450. errorCode = U_ILLEGAL_ARGUMENT_ERROR;
  451. }
  452. break;
  453. case UCOL_NUMERIC_COLLATION:
  454. ownedSettings->setFlag(CollationSettings::NUMERIC, value, defaultSettings.options, errorCode);
  455. break;
  456. default:
  457. errorCode = U_ILLEGAL_ARGUMENT_ERROR;
  458. break;
  459. }
  460. if(U_FAILURE(errorCode)) { return; }
  461. setFastLatinOptions(*ownedSettings);
  462. if(value == UCOL_DEFAULT) {
  463. setAttributeDefault(attr);
  464. } else {
  465. setAttributeExplicitly(attr);
  466. }
  467. }
  468. Collator &
  469. RuleBasedCollator::setMaxVariable(UColReorderCode group, UErrorCode &errorCode) {
  470. if(U_FAILURE(errorCode)) { return *this; }
  471. // Convert the reorder code into a MaxVariable number, or UCOL_DEFAULT=-1.
  472. int32_t value;
  473. if(group == UCOL_REORDER_CODE_DEFAULT) {
  474. value = UCOL_DEFAULT;
  475. } else if(UCOL_REORDER_CODE_FIRST <= group && group <= UCOL_REORDER_CODE_CURRENCY) {
  476. value = group - UCOL_REORDER_CODE_FIRST;
  477. } else {
  478. errorCode = U_ILLEGAL_ARGUMENT_ERROR;
  479. return *this;
  480. }
  481. CollationSettings::MaxVariable oldValue = settings->getMaxVariable();
  482. if(value == oldValue) {
  483. setAttributeExplicitly(ATTR_VARIABLE_TOP);
  484. return *this;
  485. }
  486. const CollationSettings &defaultSettings = getDefaultSettings();
  487. if(settings == &defaultSettings) {
  488. if(value == UCOL_DEFAULT) {
  489. setAttributeDefault(ATTR_VARIABLE_TOP);
  490. return *this;
  491. }
  492. }
  493. CollationSettings *ownedSettings = SharedObject::copyOnWrite(settings);
  494. if(ownedSettings == nullptr) {
  495. errorCode = U_MEMORY_ALLOCATION_ERROR;
  496. return *this;
  497. }
  498. if(group == UCOL_REORDER_CODE_DEFAULT) {
  499. group = (UColReorderCode)(
  500. UCOL_REORDER_CODE_FIRST + int32_t{defaultSettings.getMaxVariable()});
  501. }
  502. uint32_t varTop = data->getLastPrimaryForGroup(group);
  503. U_ASSERT(varTop != 0);
  504. ownedSettings->setMaxVariable(value, defaultSettings.options, errorCode);
  505. if(U_FAILURE(errorCode)) { return *this; }
  506. ownedSettings->variableTop = varTop;
  507. setFastLatinOptions(*ownedSettings);
  508. if(value == UCOL_DEFAULT) {
  509. setAttributeDefault(ATTR_VARIABLE_TOP);
  510. } else {
  511. setAttributeExplicitly(ATTR_VARIABLE_TOP);
  512. }
  513. return *this;
  514. }
  515. UColReorderCode
  516. RuleBasedCollator::getMaxVariable() const {
  517. return (UColReorderCode)(UCOL_REORDER_CODE_FIRST + int32_t{settings->getMaxVariable()});
  518. }
  519. uint32_t
  520. RuleBasedCollator::getVariableTop(UErrorCode & /*errorCode*/) const {
  521. return settings->variableTop;
  522. }
  523. uint32_t
  524. RuleBasedCollator::setVariableTop(const char16_t *varTop, int32_t len, UErrorCode &errorCode) {
  525. if(U_FAILURE(errorCode)) { return 0; }
  526. if(varTop == nullptr && len !=0) {
  527. errorCode = U_ILLEGAL_ARGUMENT_ERROR;
  528. return 0;
  529. }
  530. if(len < 0) { len = u_strlen(varTop); }
  531. if(len == 0) {
  532. errorCode = U_ILLEGAL_ARGUMENT_ERROR;
  533. return 0;
  534. }
  535. UBool numeric = settings->isNumeric();
  536. int64_t ce1, ce2;
  537. if(settings->dontCheckFCD()) {
  538. UTF16CollationIterator ci(data, numeric, varTop, varTop, varTop + len);
  539. ce1 = ci.nextCE(errorCode);
  540. ce2 = ci.nextCE(errorCode);
  541. } else {
  542. FCDUTF16CollationIterator ci(data, numeric, varTop, varTop, varTop + len);
  543. ce1 = ci.nextCE(errorCode);
  544. ce2 = ci.nextCE(errorCode);
  545. }
  546. if(ce1 == Collation::NO_CE || ce2 != Collation::NO_CE) {
  547. errorCode = U_CE_NOT_FOUND_ERROR;
  548. return 0;
  549. }
  550. setVariableTop((uint32_t)(ce1 >> 32), errorCode);
  551. return settings->variableTop;
  552. }
  553. uint32_t
  554. RuleBasedCollator::setVariableTop(const UnicodeString &varTop, UErrorCode &errorCode) {
  555. return setVariableTop(varTop.getBuffer(), varTop.length(), errorCode);
  556. }
  557. void
  558. RuleBasedCollator::setVariableTop(uint32_t varTop, UErrorCode &errorCode) {
  559. if(U_FAILURE(errorCode)) { return; }
  560. if(varTop != settings->variableTop) {
  561. // Pin the variable top to the end of the reordering group which contains it.
  562. // Only a few special groups are supported.
  563. int32_t group = data->getGroupForPrimary(varTop);
  564. if(group < UCOL_REORDER_CODE_FIRST || UCOL_REORDER_CODE_CURRENCY < group) {
  565. errorCode = U_ILLEGAL_ARGUMENT_ERROR;
  566. return;
  567. }
  568. uint32_t v = data->getLastPrimaryForGroup(group);
  569. U_ASSERT(v != 0 && v >= varTop);
  570. varTop = v;
  571. if(varTop != settings->variableTop) {
  572. CollationSettings *ownedSettings = SharedObject::copyOnWrite(settings);
  573. if(ownedSettings == nullptr) {
  574. errorCode = U_MEMORY_ALLOCATION_ERROR;
  575. return;
  576. }
  577. ownedSettings->setMaxVariable(group - UCOL_REORDER_CODE_FIRST,
  578. getDefaultSettings().options, errorCode);
  579. if(U_FAILURE(errorCode)) { return; }
  580. ownedSettings->variableTop = varTop;
  581. setFastLatinOptions(*ownedSettings);
  582. }
  583. }
  584. if(varTop == getDefaultSettings().variableTop) {
  585. setAttributeDefault(ATTR_VARIABLE_TOP);
  586. } else {
  587. setAttributeExplicitly(ATTR_VARIABLE_TOP);
  588. }
  589. }
  590. int32_t
  591. RuleBasedCollator::getReorderCodes(int32_t *dest, int32_t capacity,
  592. UErrorCode &errorCode) const {
  593. if(U_FAILURE(errorCode)) { return 0; }
  594. if(capacity < 0 || (dest == nullptr && capacity > 0)) {
  595. errorCode = U_ILLEGAL_ARGUMENT_ERROR;
  596. return 0;
  597. }
  598. int32_t length = settings->reorderCodesLength;
  599. if(length == 0) { return 0; }
  600. if(length > capacity) {
  601. errorCode = U_BUFFER_OVERFLOW_ERROR;
  602. return length;
  603. }
  604. uprv_memcpy(dest, settings->reorderCodes, length * 4);
  605. return length;
  606. }
  607. void
  608. RuleBasedCollator::setReorderCodes(const int32_t *reorderCodes, int32_t length,
  609. UErrorCode &errorCode) {
  610. if(U_FAILURE(errorCode)) { return; }
  611. if(length < 0 || (reorderCodes == nullptr && length > 0)) {
  612. errorCode = U_ILLEGAL_ARGUMENT_ERROR;
  613. return;
  614. }
  615. if(length == 1 && reorderCodes[0] == UCOL_REORDER_CODE_NONE) {
  616. length = 0;
  617. }
  618. if(length == settings->reorderCodesLength &&
  619. uprv_memcmp(reorderCodes, settings->reorderCodes, length * 4) == 0) {
  620. return;
  621. }
  622. const CollationSettings &defaultSettings = getDefaultSettings();
  623. if(length == 1 && reorderCodes[0] == UCOL_REORDER_CODE_DEFAULT) {
  624. if(settings != &defaultSettings) {
  625. CollationSettings *ownedSettings = SharedObject::copyOnWrite(settings);
  626. if(ownedSettings == nullptr) {
  627. errorCode = U_MEMORY_ALLOCATION_ERROR;
  628. return;
  629. }
  630. ownedSettings->copyReorderingFrom(defaultSettings, errorCode);
  631. setFastLatinOptions(*ownedSettings);
  632. }
  633. return;
  634. }
  635. CollationSettings *ownedSettings = SharedObject::copyOnWrite(settings);
  636. if(ownedSettings == nullptr) {
  637. errorCode = U_MEMORY_ALLOCATION_ERROR;
  638. return;
  639. }
  640. ownedSettings->setReordering(*data, reorderCodes, length, errorCode);
  641. setFastLatinOptions(*ownedSettings);
  642. }
  643. void
  644. RuleBasedCollator::setFastLatinOptions(CollationSettings &ownedSettings) const {
  645. ownedSettings.fastLatinOptions = CollationFastLatin::getOptions(
  646. data, ownedSettings,
  647. ownedSettings.fastLatinPrimaries, UPRV_LENGTHOF(ownedSettings.fastLatinPrimaries));
  648. }
  649. UCollationResult
  650. RuleBasedCollator::compare(const UnicodeString &left, const UnicodeString &right,
  651. UErrorCode &errorCode) const {
  652. if(U_FAILURE(errorCode)) { return UCOL_EQUAL; }
  653. return doCompare(left.getBuffer(), left.length(),
  654. right.getBuffer(), right.length(), errorCode);
  655. }
  656. UCollationResult
  657. RuleBasedCollator::compare(const UnicodeString &left, const UnicodeString &right,
  658. int32_t length, UErrorCode &errorCode) const {
  659. if(U_FAILURE(errorCode) || length == 0) { return UCOL_EQUAL; }
  660. if(length < 0) {
  661. errorCode = U_ILLEGAL_ARGUMENT_ERROR;
  662. return UCOL_EQUAL;
  663. }
  664. int32_t leftLength = left.length();
  665. int32_t rightLength = right.length();
  666. if(leftLength > length) { leftLength = length; }
  667. if(rightLength > length) { rightLength = length; }
  668. return doCompare(left.getBuffer(), leftLength,
  669. right.getBuffer(), rightLength, errorCode);
  670. }
  671. UCollationResult
  672. RuleBasedCollator::compare(const char16_t *left, int32_t leftLength,
  673. const char16_t *right, int32_t rightLength,
  674. UErrorCode &errorCode) const {
  675. if(U_FAILURE(errorCode)) { return UCOL_EQUAL; }
  676. if((left == nullptr && leftLength != 0) || (right == nullptr && rightLength != 0)) {
  677. errorCode = U_ILLEGAL_ARGUMENT_ERROR;
  678. return UCOL_EQUAL;
  679. }
  680. // Make sure both or neither strings have a known length.
  681. // We do not optimize for mixed length/termination.
  682. if(leftLength >= 0) {
  683. if(rightLength < 0) { rightLength = u_strlen(right); }
  684. } else {
  685. if(rightLength >= 0) { leftLength = u_strlen(left); }
  686. }
  687. return doCompare(left, leftLength, right, rightLength, errorCode);
  688. }
  689. UCollationResult
  690. RuleBasedCollator::compareUTF8(const StringPiece &left, const StringPiece &right,
  691. UErrorCode &errorCode) const {
  692. if(U_FAILURE(errorCode)) { return UCOL_EQUAL; }
  693. const uint8_t *leftBytes = reinterpret_cast<const uint8_t *>(left.data());
  694. const uint8_t *rightBytes = reinterpret_cast<const uint8_t *>(right.data());
  695. if((leftBytes == nullptr && !left.empty()) || (rightBytes == nullptr && !right.empty())) {
  696. errorCode = U_ILLEGAL_ARGUMENT_ERROR;
  697. return UCOL_EQUAL;
  698. }
  699. return doCompare(leftBytes, left.length(), rightBytes, right.length(), errorCode);
  700. }
  701. UCollationResult
  702. RuleBasedCollator::internalCompareUTF8(const char *left, int32_t leftLength,
  703. const char *right, int32_t rightLength,
  704. UErrorCode &errorCode) const {
  705. if(U_FAILURE(errorCode)) { return UCOL_EQUAL; }
  706. if((left == nullptr && leftLength != 0) || (right == nullptr && rightLength != 0)) {
  707. errorCode = U_ILLEGAL_ARGUMENT_ERROR;
  708. return UCOL_EQUAL;
  709. }
  710. // Make sure both or neither strings have a known length.
  711. // We do not optimize for mixed length/termination.
  712. if(leftLength >= 0) {
  713. if(rightLength < 0) { rightLength = static_cast<int32_t>(uprv_strlen(right)); }
  714. } else {
  715. if(rightLength >= 0) { leftLength = static_cast<int32_t>(uprv_strlen(left)); }
  716. }
  717. return doCompare(reinterpret_cast<const uint8_t *>(left), leftLength,
  718. reinterpret_cast<const uint8_t *>(right), rightLength, errorCode);
  719. }
  720. namespace {
  721. /**
  722. * Abstract iterator for identical-level string comparisons.
  723. * Returns FCD code points and handles temporary switching to NFD.
  724. */
  725. class NFDIterator : public UObject {
  726. public:
  727. NFDIterator() : index(-1), length(0) {}
  728. virtual ~NFDIterator() {}
  729. /**
  730. * Returns the next code point from the internal normalization buffer,
  731. * or else the next text code point.
  732. * Returns -1 at the end of the text.
  733. */
  734. UChar32 nextCodePoint() {
  735. if(index >= 0) {
  736. if(index == length) {
  737. index = -1;
  738. } else {
  739. UChar32 c;
  740. U16_NEXT_UNSAFE(decomp, index, c);
  741. return c;
  742. }
  743. }
  744. return nextRawCodePoint();
  745. }
  746. /**
  747. * @param nfcImpl
  748. * @param c the last code point returned by nextCodePoint() or nextDecomposedCodePoint()
  749. * @return the first code point in c's decomposition,
  750. * or c itself if it was decomposed already or if it does not decompose
  751. */
  752. UChar32 nextDecomposedCodePoint(const Normalizer2Impl &nfcImpl, UChar32 c) {
  753. if(index >= 0) { return c; }
  754. decomp = nfcImpl.getDecomposition(c, buffer, length);
  755. if(decomp == nullptr) { return c; }
  756. index = 0;
  757. U16_NEXT_UNSAFE(decomp, index, c);
  758. return c;
  759. }
  760. protected:
  761. /**
  762. * Returns the next text code point in FCD order.
  763. * Returns -1 at the end of the text.
  764. */
  765. virtual UChar32 nextRawCodePoint() = 0;
  766. private:
  767. const char16_t *decomp;
  768. char16_t buffer[4];
  769. int32_t index;
  770. int32_t length;
  771. };
  772. class UTF16NFDIterator : public NFDIterator {
  773. public:
  774. UTF16NFDIterator(const char16_t *text, const char16_t *textLimit) : s(text), limit(textLimit) {}
  775. protected:
  776. virtual UChar32 nextRawCodePoint() override {
  777. if(s == limit) { return U_SENTINEL; }
  778. UChar32 c = *s++;
  779. if(limit == nullptr && c == 0) {
  780. s = nullptr;
  781. return U_SENTINEL;
  782. }
  783. char16_t trail;
  784. if(U16_IS_LEAD(c) && s != limit && U16_IS_TRAIL(trail = *s)) {
  785. ++s;
  786. c = U16_GET_SUPPLEMENTARY(c, trail);
  787. }
  788. return c;
  789. }
  790. const char16_t *s;
  791. const char16_t *limit;
  792. };
  793. class FCDUTF16NFDIterator : public UTF16NFDIterator {
  794. public:
  795. FCDUTF16NFDIterator(const Normalizer2Impl &nfcImpl, const char16_t *text, const char16_t *textLimit)
  796. : UTF16NFDIterator(nullptr, nullptr) {
  797. UErrorCode errorCode = U_ZERO_ERROR;
  798. const char16_t *spanLimit = nfcImpl.makeFCD(text, textLimit, nullptr, errorCode);
  799. if(U_FAILURE(errorCode)) { return; }
  800. if(spanLimit == textLimit || (textLimit == nullptr && *spanLimit == 0)) {
  801. s = text;
  802. limit = spanLimit;
  803. } else {
  804. str.setTo(text, (int32_t)(spanLimit - text));
  805. {
  806. ReorderingBuffer r_buffer(nfcImpl, str);
  807. if(r_buffer.init(str.length(), errorCode)) {
  808. nfcImpl.makeFCD(spanLimit, textLimit, &r_buffer, errorCode);
  809. }
  810. }
  811. if(U_SUCCESS(errorCode)) {
  812. s = str.getBuffer();
  813. limit = s + str.length();
  814. }
  815. }
  816. }
  817. private:
  818. UnicodeString str;
  819. };
  820. class UTF8NFDIterator : public NFDIterator {
  821. public:
  822. UTF8NFDIterator(const uint8_t *text, int32_t textLength)
  823. : s(text), pos(0), length(textLength) {}
  824. protected:
  825. virtual UChar32 nextRawCodePoint() override {
  826. if(pos == length || (s[pos] == 0 && length < 0)) { return U_SENTINEL; }
  827. UChar32 c;
  828. U8_NEXT_OR_FFFD(s, pos, length, c);
  829. return c;
  830. }
  831. const uint8_t *s;
  832. int32_t pos;
  833. int32_t length;
  834. };
  835. class FCDUTF8NFDIterator : public NFDIterator {
  836. public:
  837. FCDUTF8NFDIterator(const CollationData *data, const uint8_t *text, int32_t textLength)
  838. : u8ci(data, false, text, 0, textLength) {}
  839. protected:
  840. virtual UChar32 nextRawCodePoint() override {
  841. UErrorCode errorCode = U_ZERO_ERROR;
  842. return u8ci.nextCodePoint(errorCode);
  843. }
  844. private:
  845. FCDUTF8CollationIterator u8ci;
  846. };
  847. class UIterNFDIterator : public NFDIterator {
  848. public:
  849. UIterNFDIterator(UCharIterator &it) : iter(it) {}
  850. protected:
  851. virtual UChar32 nextRawCodePoint() override {
  852. return uiter_next32(&iter);
  853. }
  854. private:
  855. UCharIterator &iter;
  856. };
  857. class FCDUIterNFDIterator : public NFDIterator {
  858. public:
  859. FCDUIterNFDIterator(const CollationData *data, UCharIterator &it, int32_t startIndex)
  860. : uici(data, false, it, startIndex) {}
  861. protected:
  862. virtual UChar32 nextRawCodePoint() override {
  863. UErrorCode errorCode = U_ZERO_ERROR;
  864. return uici.nextCodePoint(errorCode);
  865. }
  866. private:
  867. FCDUIterCollationIterator uici;
  868. };
  869. UCollationResult compareNFDIter(const Normalizer2Impl &nfcImpl,
  870. NFDIterator &left, NFDIterator &right) {
  871. for(;;) {
  872. // Fetch the next FCD code point from each string.
  873. UChar32 leftCp = left.nextCodePoint();
  874. UChar32 rightCp = right.nextCodePoint();
  875. if(leftCp == rightCp) {
  876. if(leftCp < 0) { break; }
  877. continue;
  878. }
  879. // If they are different, then decompose each and compare again.
  880. if(leftCp < 0) {
  881. leftCp = -2; // end of string
  882. } else if(leftCp == 0xfffe) {
  883. leftCp = -1; // U+FFFE: merge separator
  884. } else {
  885. leftCp = left.nextDecomposedCodePoint(nfcImpl, leftCp);
  886. }
  887. if(rightCp < 0) {
  888. rightCp = -2; // end of string
  889. } else if(rightCp == 0xfffe) {
  890. rightCp = -1; // U+FFFE: merge separator
  891. } else {
  892. rightCp = right.nextDecomposedCodePoint(nfcImpl, rightCp);
  893. }
  894. if(leftCp < rightCp) { return UCOL_LESS; }
  895. if(leftCp > rightCp) { return UCOL_GREATER; }
  896. }
  897. return UCOL_EQUAL;
  898. }
  899. } // namespace
  900. UCollationResult
  901. RuleBasedCollator::doCompare(const char16_t *left, int32_t leftLength,
  902. const char16_t *right, int32_t rightLength,
  903. UErrorCode &errorCode) const {
  904. // U_FAILURE(errorCode) checked by caller.
  905. if(left == right && leftLength == rightLength) {
  906. return UCOL_EQUAL;
  907. }
  908. // Identical-prefix test.
  909. const char16_t *leftLimit;
  910. const char16_t *rightLimit;
  911. int32_t equalPrefixLength = 0;
  912. if(leftLength < 0) {
  913. leftLimit = nullptr;
  914. rightLimit = nullptr;
  915. char16_t c;
  916. while((c = left[equalPrefixLength]) == right[equalPrefixLength]) {
  917. if(c == 0) { return UCOL_EQUAL; }
  918. ++equalPrefixLength;
  919. }
  920. } else {
  921. leftLimit = left + leftLength;
  922. rightLimit = right + rightLength;
  923. for(;;) {
  924. if(equalPrefixLength == leftLength) {
  925. if(equalPrefixLength == rightLength) { return UCOL_EQUAL; }
  926. break;
  927. } else if(equalPrefixLength == rightLength ||
  928. left[equalPrefixLength] != right[equalPrefixLength]) {
  929. break;
  930. }
  931. ++equalPrefixLength;
  932. }
  933. }
  934. UBool numeric = settings->isNumeric();
  935. if(equalPrefixLength > 0) {
  936. if((equalPrefixLength != leftLength &&
  937. data->isUnsafeBackward(left[equalPrefixLength], numeric)) ||
  938. (equalPrefixLength != rightLength &&
  939. data->isUnsafeBackward(right[equalPrefixLength], numeric))) {
  940. // Identical prefix: Back up to the start of a contraction or reordering sequence.
  941. while(--equalPrefixLength > 0 &&
  942. data->isUnsafeBackward(left[equalPrefixLength], numeric)) {}
  943. }
  944. // Notes:
  945. // - A longer string can compare equal to a prefix of it if only ignorables follow.
  946. // - With a backward level, a longer string can compare less-than a prefix of it.
  947. // Pass the actual start of each string into the CollationIterators,
  948. // plus the equalPrefixLength position,
  949. // so that prefix matches back into the equal prefix work.
  950. }
  951. int32_t result;
  952. int32_t fastLatinOptions = settings->fastLatinOptions;
  953. if(fastLatinOptions >= 0 &&
  954. (equalPrefixLength == leftLength ||
  955. left[equalPrefixLength] <= CollationFastLatin::LATIN_MAX) &&
  956. (equalPrefixLength == rightLength ||
  957. right[equalPrefixLength] <= CollationFastLatin::LATIN_MAX)) {
  958. if(leftLength >= 0) {
  959. result = CollationFastLatin::compareUTF16(data->fastLatinTable,
  960. settings->fastLatinPrimaries,
  961. fastLatinOptions,
  962. left + equalPrefixLength,
  963. leftLength - equalPrefixLength,
  964. right + equalPrefixLength,
  965. rightLength - equalPrefixLength);
  966. } else {
  967. result = CollationFastLatin::compareUTF16(data->fastLatinTable,
  968. settings->fastLatinPrimaries,
  969. fastLatinOptions,
  970. left + equalPrefixLength, -1,
  971. right + equalPrefixLength, -1);
  972. }
  973. } else {
  974. result = CollationFastLatin::BAIL_OUT_RESULT;
  975. }
  976. if(result == CollationFastLatin::BAIL_OUT_RESULT) {
  977. if(settings->dontCheckFCD()) {
  978. UTF16CollationIterator leftIter(data, numeric,
  979. left, left + equalPrefixLength, leftLimit);
  980. UTF16CollationIterator rightIter(data, numeric,
  981. right, right + equalPrefixLength, rightLimit);
  982. result = CollationCompare::compareUpToQuaternary(leftIter, rightIter, *settings, errorCode);
  983. } else {
  984. FCDUTF16CollationIterator leftIter(data, numeric,
  985. left, left + equalPrefixLength, leftLimit);
  986. FCDUTF16CollationIterator rightIter(data, numeric,
  987. right, right + equalPrefixLength, rightLimit);
  988. result = CollationCompare::compareUpToQuaternary(leftIter, rightIter, *settings, errorCode);
  989. }
  990. }
  991. if(result != UCOL_EQUAL || settings->getStrength() < UCOL_IDENTICAL || U_FAILURE(errorCode)) {
  992. return (UCollationResult)result;
  993. }
  994. // Note: If NUL-terminated, we could get the actual limits from the iterators now.
  995. // That would complicate the iterators a bit, NUL-terminated strings are only a C convenience,
  996. // and the benefit seems unlikely to be measurable.
  997. // Compare identical level.
  998. const Normalizer2Impl &nfcImpl = data->nfcImpl;
  999. left += equalPrefixLength;
  1000. right += equalPrefixLength;
  1001. if(settings->dontCheckFCD()) {
  1002. UTF16NFDIterator leftIter(left, leftLimit);
  1003. UTF16NFDIterator rightIter(right, rightLimit);
  1004. return compareNFDIter(nfcImpl, leftIter, rightIter);
  1005. } else {
  1006. FCDUTF16NFDIterator leftIter(nfcImpl, left, leftLimit);
  1007. FCDUTF16NFDIterator rightIter(nfcImpl, right, rightLimit);
  1008. return compareNFDIter(nfcImpl, leftIter, rightIter);
  1009. }
  1010. }
  1011. UCollationResult
  1012. RuleBasedCollator::doCompare(const uint8_t *left, int32_t leftLength,
  1013. const uint8_t *right, int32_t rightLength,
  1014. UErrorCode &errorCode) const {
  1015. // U_FAILURE(errorCode) checked by caller.
  1016. if(left == right && leftLength == rightLength) {
  1017. return UCOL_EQUAL;
  1018. }
  1019. // Identical-prefix test.
  1020. int32_t equalPrefixLength = 0;
  1021. if(leftLength < 0) {
  1022. uint8_t c;
  1023. while((c = left[equalPrefixLength]) == right[equalPrefixLength]) {
  1024. if(c == 0) { return UCOL_EQUAL; }
  1025. ++equalPrefixLength;
  1026. }
  1027. } else {
  1028. for(;;) {
  1029. if(equalPrefixLength == leftLength) {
  1030. if(equalPrefixLength == rightLength) { return UCOL_EQUAL; }
  1031. break;
  1032. } else if(equalPrefixLength == rightLength ||
  1033. left[equalPrefixLength] != right[equalPrefixLength]) {
  1034. break;
  1035. }
  1036. ++equalPrefixLength;
  1037. }
  1038. }
  1039. // Back up to the start of a partially-equal code point.
  1040. if(equalPrefixLength > 0 &&
  1041. ((equalPrefixLength != leftLength && U8_IS_TRAIL(left[equalPrefixLength])) ||
  1042. (equalPrefixLength != rightLength && U8_IS_TRAIL(right[equalPrefixLength])))) {
  1043. while(--equalPrefixLength > 0 && U8_IS_TRAIL(left[equalPrefixLength])) {}
  1044. }
  1045. UBool numeric = settings->isNumeric();
  1046. if(equalPrefixLength > 0) {
  1047. UBool unsafe = false;
  1048. if(equalPrefixLength != leftLength) {
  1049. int32_t i = equalPrefixLength;
  1050. UChar32 c;
  1051. U8_NEXT_OR_FFFD(left, i, leftLength, c);
  1052. unsafe = data->isUnsafeBackward(c, numeric);
  1053. }
  1054. if(!unsafe && equalPrefixLength != rightLength) {
  1055. int32_t i = equalPrefixLength;
  1056. UChar32 c;
  1057. U8_NEXT_OR_FFFD(right, i, rightLength, c);
  1058. unsafe = data->isUnsafeBackward(c, numeric);
  1059. }
  1060. if(unsafe) {
  1061. // Identical prefix: Back up to the start of a contraction or reordering sequence.
  1062. UChar32 c;
  1063. do {
  1064. U8_PREV_OR_FFFD(left, 0, equalPrefixLength, c);
  1065. } while(equalPrefixLength > 0 && data->isUnsafeBackward(c, numeric));
  1066. }
  1067. // See the notes in the UTF-16 version.
  1068. // Pass the actual start of each string into the CollationIterators,
  1069. // plus the equalPrefixLength position,
  1070. // so that prefix matches back into the equal prefix work.
  1071. }
  1072. int32_t result;
  1073. int32_t fastLatinOptions = settings->fastLatinOptions;
  1074. if(fastLatinOptions >= 0 &&
  1075. (equalPrefixLength == leftLength ||
  1076. left[equalPrefixLength] <= CollationFastLatin::LATIN_MAX_UTF8_LEAD) &&
  1077. (equalPrefixLength == rightLength ||
  1078. right[equalPrefixLength] <= CollationFastLatin::LATIN_MAX_UTF8_LEAD)) {
  1079. if(leftLength >= 0) {
  1080. result = CollationFastLatin::compareUTF8(data->fastLatinTable,
  1081. settings->fastLatinPrimaries,
  1082. fastLatinOptions,
  1083. left + equalPrefixLength,
  1084. leftLength - equalPrefixLength,
  1085. right + equalPrefixLength,
  1086. rightLength - equalPrefixLength);
  1087. } else {
  1088. result = CollationFastLatin::compareUTF8(data->fastLatinTable,
  1089. settings->fastLatinPrimaries,
  1090. fastLatinOptions,
  1091. left + equalPrefixLength, -1,
  1092. right + equalPrefixLength, -1);
  1093. }
  1094. } else {
  1095. result = CollationFastLatin::BAIL_OUT_RESULT;
  1096. }
  1097. if(result == CollationFastLatin::BAIL_OUT_RESULT) {
  1098. if(settings->dontCheckFCD()) {
  1099. UTF8CollationIterator leftIter(data, numeric, left, equalPrefixLength, leftLength);
  1100. UTF8CollationIterator rightIter(data, numeric, right, equalPrefixLength, rightLength);
  1101. result = CollationCompare::compareUpToQuaternary(leftIter, rightIter, *settings, errorCode);
  1102. } else {
  1103. FCDUTF8CollationIterator leftIter(data, numeric, left, equalPrefixLength, leftLength);
  1104. FCDUTF8CollationIterator rightIter(data, numeric, right, equalPrefixLength, rightLength);
  1105. result = CollationCompare::compareUpToQuaternary(leftIter, rightIter, *settings, errorCode);
  1106. }
  1107. }
  1108. if(result != UCOL_EQUAL || settings->getStrength() < UCOL_IDENTICAL || U_FAILURE(errorCode)) {
  1109. return (UCollationResult)result;
  1110. }
  1111. // Note: If NUL-terminated, we could get the actual limits from the iterators now.
  1112. // That would complicate the iterators a bit, NUL-terminated strings are only a C convenience,
  1113. // and the benefit seems unlikely to be measurable.
  1114. // Compare identical level.
  1115. const Normalizer2Impl &nfcImpl = data->nfcImpl;
  1116. left += equalPrefixLength;
  1117. right += equalPrefixLength;
  1118. if(leftLength > 0) {
  1119. leftLength -= equalPrefixLength;
  1120. rightLength -= equalPrefixLength;
  1121. }
  1122. if(settings->dontCheckFCD()) {
  1123. UTF8NFDIterator leftIter(left, leftLength);
  1124. UTF8NFDIterator rightIter(right, rightLength);
  1125. return compareNFDIter(nfcImpl, leftIter, rightIter);
  1126. } else {
  1127. FCDUTF8NFDIterator leftIter(data, left, leftLength);
  1128. FCDUTF8NFDIterator rightIter(data, right, rightLength);
  1129. return compareNFDIter(nfcImpl, leftIter, rightIter);
  1130. }
  1131. }
  1132. UCollationResult
  1133. RuleBasedCollator::compare(UCharIterator &left, UCharIterator &right,
  1134. UErrorCode &errorCode) const {
  1135. if(U_FAILURE(errorCode) || &left == &right) { return UCOL_EQUAL; }
  1136. UBool numeric = settings->isNumeric();
  1137. // Identical-prefix test.
  1138. int32_t equalPrefixLength = 0;
  1139. {
  1140. UChar32 leftUnit;
  1141. UChar32 rightUnit;
  1142. while((leftUnit = left.next(&left)) == (rightUnit = right.next(&right))) {
  1143. if(leftUnit < 0) { return UCOL_EQUAL; }
  1144. ++equalPrefixLength;
  1145. }
  1146. // Back out the code units that differed, for the real collation comparison.
  1147. if(leftUnit >= 0) { left.previous(&left); }
  1148. if(rightUnit >= 0) { right.previous(&right); }
  1149. if(equalPrefixLength > 0) {
  1150. if((leftUnit >= 0 && data->isUnsafeBackward(leftUnit, numeric)) ||
  1151. (rightUnit >= 0 && data->isUnsafeBackward(rightUnit, numeric))) {
  1152. // Identical prefix: Back up to the start of a contraction or reordering sequence.
  1153. do {
  1154. --equalPrefixLength;
  1155. leftUnit = left.previous(&left);
  1156. right.previous(&right);
  1157. } while(equalPrefixLength > 0 && data->isUnsafeBackward(leftUnit, numeric));
  1158. }
  1159. // See the notes in the UTF-16 version.
  1160. }
  1161. }
  1162. UCollationResult result;
  1163. if(settings->dontCheckFCD()) {
  1164. UIterCollationIterator leftIter(data, numeric, left);
  1165. UIterCollationIterator rightIter(data, numeric, right);
  1166. result = CollationCompare::compareUpToQuaternary(leftIter, rightIter, *settings, errorCode);
  1167. } else {
  1168. FCDUIterCollationIterator leftIter(data, numeric, left, equalPrefixLength);
  1169. FCDUIterCollationIterator rightIter(data, numeric, right, equalPrefixLength);
  1170. result = CollationCompare::compareUpToQuaternary(leftIter, rightIter, *settings, errorCode);
  1171. }
  1172. if(result != UCOL_EQUAL || settings->getStrength() < UCOL_IDENTICAL || U_FAILURE(errorCode)) {
  1173. return result;
  1174. }
  1175. // Compare identical level.
  1176. left.move(&left, equalPrefixLength, UITER_ZERO);
  1177. right.move(&right, equalPrefixLength, UITER_ZERO);
  1178. const Normalizer2Impl &nfcImpl = data->nfcImpl;
  1179. if(settings->dontCheckFCD()) {
  1180. UIterNFDIterator leftIter(left);
  1181. UIterNFDIterator rightIter(right);
  1182. return compareNFDIter(nfcImpl, leftIter, rightIter);
  1183. } else {
  1184. FCDUIterNFDIterator leftIter(data, left, equalPrefixLength);
  1185. FCDUIterNFDIterator rightIter(data, right, equalPrefixLength);
  1186. return compareNFDIter(nfcImpl, leftIter, rightIter);
  1187. }
  1188. }
  1189. CollationKey &
  1190. RuleBasedCollator::getCollationKey(const UnicodeString &s, CollationKey &key,
  1191. UErrorCode &errorCode) const {
  1192. return getCollationKey(s.getBuffer(), s.length(), key, errorCode);
  1193. }
  1194. CollationKey &
  1195. RuleBasedCollator::getCollationKey(const char16_t *s, int32_t length, CollationKey& key,
  1196. UErrorCode &errorCode) const {
  1197. if(U_FAILURE(errorCode)) {
  1198. return key.setToBogus();
  1199. }
  1200. if(s == nullptr && length != 0) {
  1201. errorCode = U_ILLEGAL_ARGUMENT_ERROR;
  1202. return key.setToBogus();
  1203. }
  1204. key.reset(); // resets the "bogus" state
  1205. CollationKeyByteSink sink(key);
  1206. writeSortKey(s, length, sink, errorCode);
  1207. if(U_FAILURE(errorCode)) {
  1208. key.setToBogus();
  1209. } else if(key.isBogus()) {
  1210. errorCode = U_MEMORY_ALLOCATION_ERROR;
  1211. } else {
  1212. key.setLength(sink.NumberOfBytesAppended());
  1213. }
  1214. return key;
  1215. }
  1216. int32_t
  1217. RuleBasedCollator::getSortKey(const UnicodeString &s,
  1218. uint8_t *dest, int32_t capacity) const {
  1219. return getSortKey(s.getBuffer(), s.length(), dest, capacity);
  1220. }
  1221. int32_t
  1222. RuleBasedCollator::getSortKey(const char16_t *s, int32_t length,
  1223. uint8_t *dest, int32_t capacity) const {
  1224. if((s == nullptr && length != 0) || capacity < 0 || (dest == nullptr && capacity > 0)) {
  1225. return 0;
  1226. }
  1227. uint8_t noDest[1] = { 0 };
  1228. if(dest == nullptr) {
  1229. // Distinguish pure preflighting from an allocation error.
  1230. dest = noDest;
  1231. capacity = 0;
  1232. }
  1233. FixedSortKeyByteSink sink(reinterpret_cast<char *>(dest), capacity);
  1234. UErrorCode errorCode = U_ZERO_ERROR;
  1235. writeSortKey(s, length, sink, errorCode);
  1236. return U_SUCCESS(errorCode) ? sink.NumberOfBytesAppended() : 0;
  1237. }
  1238. void
  1239. RuleBasedCollator::writeSortKey(const char16_t *s, int32_t length,
  1240. SortKeyByteSink &sink, UErrorCode &errorCode) const {
  1241. if(U_FAILURE(errorCode)) { return; }
  1242. const char16_t *limit = (length >= 0) ? s + length : nullptr;
  1243. UBool numeric = settings->isNumeric();
  1244. CollationKeys::LevelCallback callback;
  1245. if(settings->dontCheckFCD()) {
  1246. UTF16CollationIterator iter(data, numeric, s, s, limit);
  1247. CollationKeys::writeSortKeyUpToQuaternary(iter, data->compressibleBytes, *settings,
  1248. sink, Collation::PRIMARY_LEVEL,
  1249. callback, true, errorCode);
  1250. } else {
  1251. FCDUTF16CollationIterator iter(data, numeric, s, s, limit);
  1252. CollationKeys::writeSortKeyUpToQuaternary(iter, data->compressibleBytes, *settings,
  1253. sink, Collation::PRIMARY_LEVEL,
  1254. callback, true, errorCode);
  1255. }
  1256. if(settings->getStrength() == UCOL_IDENTICAL) {
  1257. writeIdenticalLevel(s, limit, sink, errorCode);
  1258. }
  1259. static const char terminator = 0; // TERMINATOR_BYTE
  1260. sink.Append(&terminator, 1);
  1261. }
  1262. void
  1263. RuleBasedCollator::writeIdenticalLevel(const char16_t *s, const char16_t *limit,
  1264. SortKeyByteSink &sink, UErrorCode &errorCode) const {
  1265. // NFD quick check
  1266. const char16_t *nfdQCYesLimit = data->nfcImpl.decompose(s, limit, nullptr, errorCode);
  1267. if(U_FAILURE(errorCode)) { return; }
  1268. sink.Append(Collation::LEVEL_SEPARATOR_BYTE);
  1269. UChar32 prev = 0;
  1270. if(nfdQCYesLimit != s) {
  1271. prev = u_writeIdenticalLevelRun(prev, s, (int32_t)(nfdQCYesLimit - s), sink);
  1272. }
  1273. // Is there non-NFD text?
  1274. int32_t destLengthEstimate;
  1275. if(limit != nullptr) {
  1276. if(nfdQCYesLimit == limit) { return; }
  1277. destLengthEstimate = (int32_t)(limit - nfdQCYesLimit);
  1278. } else {
  1279. // s is NUL-terminated
  1280. if(*nfdQCYesLimit == 0) { return; }
  1281. destLengthEstimate = -1;
  1282. }
  1283. UnicodeString nfd;
  1284. data->nfcImpl.decompose(nfdQCYesLimit, limit, nfd, destLengthEstimate, errorCode);
  1285. u_writeIdenticalLevelRun(prev, nfd.getBuffer(), nfd.length(), sink);
  1286. }
  1287. namespace {
  1288. /**
  1289. * internalNextSortKeyPart() calls CollationKeys::writeSortKeyUpToQuaternary()
  1290. * with an instance of this callback class.
  1291. * When another level is about to be written, the callback
  1292. * records the level and the number of bytes that will be written until
  1293. * the sink (which is actually a FixedSortKeyByteSink) fills up.
  1294. *
  1295. * When internalNextSortKeyPart() is called again, it restarts with the last level
  1296. * and ignores as many bytes as were written previously for that level.
  1297. */
  1298. class PartLevelCallback : public CollationKeys::LevelCallback {
  1299. public:
  1300. PartLevelCallback(const SortKeyByteSink &s)
  1301. : sink(s), level(Collation::PRIMARY_LEVEL) {
  1302. levelCapacity = sink.GetRemainingCapacity();
  1303. }
  1304. virtual ~PartLevelCallback() {}
  1305. virtual UBool needToWrite(Collation::Level l) override {
  1306. if(!sink.Overflowed()) {
  1307. // Remember a level that will be at least partially written.
  1308. level = l;
  1309. levelCapacity = sink.GetRemainingCapacity();
  1310. return true;
  1311. } else {
  1312. return false;
  1313. }
  1314. }
  1315. Collation::Level getLevel() const { return level; }
  1316. int32_t getLevelCapacity() const { return levelCapacity; }
  1317. private:
  1318. const SortKeyByteSink &sink;
  1319. Collation::Level level;
  1320. int32_t levelCapacity;
  1321. };
  1322. } // namespace
  1323. int32_t
  1324. RuleBasedCollator::internalNextSortKeyPart(UCharIterator *iter, uint32_t state[2],
  1325. uint8_t *dest, int32_t count, UErrorCode &errorCode) const {
  1326. if(U_FAILURE(errorCode)) { return 0; }
  1327. if(iter == nullptr || state == nullptr || count < 0 || (count > 0 && dest == nullptr)) {
  1328. errorCode = U_ILLEGAL_ARGUMENT_ERROR;
  1329. return 0;
  1330. }
  1331. if(count == 0) { return 0; }
  1332. FixedSortKeyByteSink sink(reinterpret_cast<char *>(dest), count);
  1333. sink.IgnoreBytes((int32_t)state[1]);
  1334. iter->move(iter, 0, UITER_START);
  1335. Collation::Level level = (Collation::Level)state[0];
  1336. if(level <= Collation::QUATERNARY_LEVEL) {
  1337. UBool numeric = settings->isNumeric();
  1338. PartLevelCallback callback(sink);
  1339. if(settings->dontCheckFCD()) {
  1340. UIterCollationIterator ci(data, numeric, *iter);
  1341. CollationKeys::writeSortKeyUpToQuaternary(ci, data->compressibleBytes, *settings,
  1342. sink, level, callback, false, errorCode);
  1343. } else {
  1344. FCDUIterCollationIterator ci(data, numeric, *iter, 0);
  1345. CollationKeys::writeSortKeyUpToQuaternary(ci, data->compressibleBytes, *settings,
  1346. sink, level, callback, false, errorCode);
  1347. }
  1348. if(U_FAILURE(errorCode)) { return 0; }
  1349. if(sink.NumberOfBytesAppended() > count) {
  1350. state[0] = (uint32_t)callback.getLevel();
  1351. state[1] = (uint32_t)callback.getLevelCapacity();
  1352. return count;
  1353. }
  1354. // All of the normal levels are done.
  1355. if(settings->getStrength() == UCOL_IDENTICAL) {
  1356. level = Collation::IDENTICAL_LEVEL;
  1357. iter->move(iter, 0, UITER_START);
  1358. }
  1359. // else fall through to setting ZERO_LEVEL
  1360. }
  1361. if(level == Collation::IDENTICAL_LEVEL) {
  1362. int32_t levelCapacity = sink.GetRemainingCapacity();
  1363. UnicodeString s;
  1364. for(;;) {
  1365. UChar32 c = iter->next(iter);
  1366. if(c < 0) { break; }
  1367. s.append((char16_t)c);
  1368. }
  1369. const char16_t *sArray = s.getBuffer();
  1370. writeIdenticalLevel(sArray, sArray + s.length(), sink, errorCode);
  1371. if(U_FAILURE(errorCode)) { return 0; }
  1372. if(sink.NumberOfBytesAppended() > count) {
  1373. state[0] = (uint32_t)level;
  1374. state[1] = (uint32_t)levelCapacity;
  1375. return count;
  1376. }
  1377. }
  1378. // ZERO_LEVEL: Fill the remainder of dest with 00 bytes.
  1379. state[0] = (uint32_t)Collation::ZERO_LEVEL;
  1380. state[1] = 0;
  1381. int32_t length = sink.NumberOfBytesAppended();
  1382. int32_t i = length;
  1383. while(i < count) { dest[i++] = 0; }
  1384. return length;
  1385. }
  1386. void
  1387. RuleBasedCollator::internalGetCEs(const UnicodeString &str, UVector64 &ces,
  1388. UErrorCode &errorCode) const {
  1389. if(U_FAILURE(errorCode)) { return; }
  1390. const char16_t *s = str.getBuffer();
  1391. const char16_t *limit = s + str.length();
  1392. UBool numeric = settings->isNumeric();
  1393. if(settings->dontCheckFCD()) {
  1394. UTF16CollationIterator iter(data, numeric, s, s, limit);
  1395. int64_t ce;
  1396. while((ce = iter.nextCE(errorCode)) != Collation::NO_CE) {
  1397. ces.addElement(ce, errorCode);
  1398. }
  1399. } else {
  1400. FCDUTF16CollationIterator iter(data, numeric, s, s, limit);
  1401. int64_t ce;
  1402. while((ce = iter.nextCE(errorCode)) != Collation::NO_CE) {
  1403. ces.addElement(ce, errorCode);
  1404. }
  1405. }
  1406. }
  1407. namespace {
  1408. void appendSubtag(CharString &s, char letter, const char *subtag, int32_t length,
  1409. UErrorCode &errorCode) {
  1410. if(U_FAILURE(errorCode) || length == 0) { return; }
  1411. if(!s.isEmpty()) {
  1412. s.append('_', errorCode);
  1413. }
  1414. s.append(letter, errorCode);
  1415. for(int32_t i = 0; i < length; ++i) {
  1416. s.append(uprv_toupper(subtag[i]), errorCode);
  1417. }
  1418. }
  1419. void appendAttribute(CharString &s, char letter, UColAttributeValue value,
  1420. UErrorCode &errorCode) {
  1421. if(U_FAILURE(errorCode)) { return; }
  1422. if(!s.isEmpty()) {
  1423. s.append('_', errorCode);
  1424. }
  1425. static const char *valueChars = "1234...........IXO..SN..LU......";
  1426. s.append(letter, errorCode);
  1427. s.append(valueChars[value], errorCode);
  1428. }
  1429. } // namespace
  1430. int32_t
  1431. RuleBasedCollator::internalGetShortDefinitionString(const char *locale,
  1432. char *buffer, int32_t capacity,
  1433. UErrorCode &errorCode) const {
  1434. if(U_FAILURE(errorCode)) { return 0; }
  1435. if(buffer == nullptr ? capacity != 0 : capacity < 0) {
  1436. errorCode = U_ILLEGAL_ARGUMENT_ERROR;
  1437. return 0;
  1438. }
  1439. if(locale == nullptr) {
  1440. locale = internalGetLocaleID(ULOC_VALID_LOCALE, errorCode);
  1441. }
  1442. char resultLocale[ULOC_FULLNAME_CAPACITY + 1];
  1443. int32_t length = ucol_getFunctionalEquivalent(resultLocale, ULOC_FULLNAME_CAPACITY,
  1444. "collation", locale,
  1445. nullptr, &errorCode);
  1446. if(U_FAILURE(errorCode)) { return 0; }
  1447. resultLocale[length] = 0;
  1448. // Append items in alphabetic order of their short definition letters.
  1449. CharString result;
  1450. char subtag[ULOC_KEYWORD_AND_VALUES_CAPACITY];
  1451. if(attributeHasBeenSetExplicitly(UCOL_ALTERNATE_HANDLING)) {
  1452. appendAttribute(result, 'A', getAttribute(UCOL_ALTERNATE_HANDLING, errorCode), errorCode);
  1453. }
  1454. // ATTR_VARIABLE_TOP not supported because 'B' was broken.
  1455. // See ICU tickets #10372 and #10386.
  1456. if(attributeHasBeenSetExplicitly(UCOL_CASE_FIRST)) {
  1457. appendAttribute(result, 'C', getAttribute(UCOL_CASE_FIRST, errorCode), errorCode);
  1458. }
  1459. if(attributeHasBeenSetExplicitly(UCOL_NUMERIC_COLLATION)) {
  1460. appendAttribute(result, 'D', getAttribute(UCOL_NUMERIC_COLLATION, errorCode), errorCode);
  1461. }
  1462. if(attributeHasBeenSetExplicitly(UCOL_CASE_LEVEL)) {
  1463. appendAttribute(result, 'E', getAttribute(UCOL_CASE_LEVEL, errorCode), errorCode);
  1464. }
  1465. if(attributeHasBeenSetExplicitly(UCOL_FRENCH_COLLATION)) {
  1466. appendAttribute(result, 'F', getAttribute(UCOL_FRENCH_COLLATION, errorCode), errorCode);
  1467. }
  1468. // Note: UCOL_HIRAGANA_QUATERNARY_MODE is deprecated and never changes away from default.
  1469. length = uloc_getKeywordValue(resultLocale, "collation", subtag, UPRV_LENGTHOF(subtag), &errorCode);
  1470. appendSubtag(result, 'K', subtag, length, errorCode);
  1471. length = uloc_getLanguage(resultLocale, subtag, UPRV_LENGTHOF(subtag), &errorCode);
  1472. if (length == 0) {
  1473. appendSubtag(result, 'L', "root", 4, errorCode);
  1474. } else {
  1475. appendSubtag(result, 'L', subtag, length, errorCode);
  1476. }
  1477. if(attributeHasBeenSetExplicitly(UCOL_NORMALIZATION_MODE)) {
  1478. appendAttribute(result, 'N', getAttribute(UCOL_NORMALIZATION_MODE, errorCode), errorCode);
  1479. }
  1480. length = uloc_getCountry(resultLocale, subtag, UPRV_LENGTHOF(subtag), &errorCode);
  1481. appendSubtag(result, 'R', subtag, length, errorCode);
  1482. if(attributeHasBeenSetExplicitly(UCOL_STRENGTH)) {
  1483. appendAttribute(result, 'S', getAttribute(UCOL_STRENGTH, errorCode), errorCode);
  1484. }
  1485. length = uloc_getVariant(resultLocale, subtag, UPRV_LENGTHOF(subtag), &errorCode);
  1486. appendSubtag(result, 'V', subtag, length, errorCode);
  1487. length = uloc_getScript(resultLocale, subtag, UPRV_LENGTHOF(subtag), &errorCode);
  1488. appendSubtag(result, 'Z', subtag, length, errorCode);
  1489. if(U_FAILURE(errorCode)) { return 0; }
  1490. return result.extract(buffer, capacity, errorCode);
  1491. }
  1492. UBool
  1493. RuleBasedCollator::isUnsafe(UChar32 c) const {
  1494. return data->isUnsafeBackward(c, settings->isNumeric());
  1495. }
  1496. void U_CALLCONV
  1497. RuleBasedCollator::computeMaxExpansions(const CollationTailoring *t, UErrorCode &errorCode) {
  1498. t->maxExpansions = CollationElementIterator::computeMaxExpansions(t->data, errorCode);
  1499. }
  1500. UBool
  1501. RuleBasedCollator::initMaxExpansions(UErrorCode &errorCode) const {
  1502. umtx_initOnce(tailoring->maxExpansionsInitOnce, computeMaxExpansions, tailoring, errorCode);
  1503. return U_SUCCESS(errorCode);
  1504. }
  1505. CollationElementIterator *
  1506. RuleBasedCollator::createCollationElementIterator(const UnicodeString& source) const {
  1507. UErrorCode errorCode = U_ZERO_ERROR;
  1508. if(!initMaxExpansions(errorCode)) { return nullptr; }
  1509. CollationElementIterator *cei = new CollationElementIterator(source, this, errorCode);
  1510. if(U_FAILURE(errorCode)) {
  1511. delete cei;
  1512. return nullptr;
  1513. }
  1514. return cei;
  1515. }
  1516. CollationElementIterator *
  1517. RuleBasedCollator::createCollationElementIterator(const CharacterIterator& source) const {
  1518. UErrorCode errorCode = U_ZERO_ERROR;
  1519. if(!initMaxExpansions(errorCode)) { return nullptr; }
  1520. CollationElementIterator *cei = new CollationElementIterator(source, this, errorCode);
  1521. if(U_FAILURE(errorCode)) {
  1522. delete cei;
  1523. return nullptr;
  1524. }
  1525. return cei;
  1526. }
  1527. int32_t
  1528. RuleBasedCollator::getMaxExpansion(int32_t order) const {
  1529. UErrorCode errorCode = U_ZERO_ERROR;
  1530. (void)initMaxExpansions(errorCode);
  1531. return CollationElementIterator::getMaxExpansion(tailoring->maxExpansions, order);
  1532. }
  1533. U_NAMESPACE_END
  1534. #endif // !UCONFIG_NO_COLLATION