// ustrcase.cpp (62 KB)
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. *******************************************************************************
  5. *
  6. * Copyright (C) 2001-2015, International Business Machines
  7. * Corporation and others. All Rights Reserved.
  8. *
  9. *******************************************************************************
  10. * file name: ustrcase.cpp
  11. * encoding: UTF-8
  12. * tab size: 8 (not used)
  13. * indentation:4
  14. *
  15. * created on: 2002feb20
  16. * created by: Markus W. Scherer
  17. *
  18. * Implementation file for string casing C API functions.
  19. * Uses functions from uchar.c for basic functionality that requires access
  20. * to the Unicode Character Database (uprops.dat).
  21. */
  22. #include "unicode/utypes.h"
  23. #include "unicode/brkiter.h"
  24. #include "unicode/casemap.h"
  25. #include "unicode/edits.h"
  26. #include "unicode/stringoptions.h"
  27. #include "unicode/ustring.h"
  28. #include "unicode/ucasemap.h"
  29. #include "unicode/ubrk.h"
  30. #include "unicode/utf.h"
  31. #include "unicode/utf16.h"
  32. #include "cmemory.h"
  33. #include "ucase.h"
  34. #include "ucasemap_imp.h"
  35. #include "ustr_imp.h"
  36. #include "uassert.h"
  37. /**
  38. * Code point for COMBINING ACUTE ACCENT
  39. * @internal
  40. */
  41. #define ACUTE u'\u0301'
  42. U_NAMESPACE_BEGIN
  43. namespace {
  44. int32_t checkOverflowAndEditsError(int32_t destIndex, int32_t destCapacity,
  45. Edits *edits, UErrorCode &errorCode) {
  46. if (U_SUCCESS(errorCode)) {
  47. if (destIndex > destCapacity) {
  48. errorCode = U_BUFFER_OVERFLOW_ERROR;
  49. } else if (edits != nullptr) {
  50. edits->copyErrorTo(errorCode);
  51. }
  52. }
  53. return destIndex;
  54. }
  55. /* Appends a full case mapping result, see UCASE_MAX_STRING_LENGTH. */
  56. inline int32_t
  57. appendResult(char16_t *dest, int32_t destIndex, int32_t destCapacity,
  58. int32_t result, const char16_t *s,
  59. int32_t cpLength, uint32_t options, icu::Edits *edits) {
  60. UChar32 c;
  61. int32_t length;
  62. /* decode the result */
  63. if(result<0) {
  64. /* (not) original code point */
  65. if(edits!=nullptr) {
  66. edits->addUnchanged(cpLength);
  67. }
  68. if(options & U_OMIT_UNCHANGED_TEXT) {
  69. return destIndex;
  70. }
  71. c=~result;
  72. if(destIndex<destCapacity && c<=0xffff) { // BMP slightly-fastpath
  73. dest[destIndex++] = static_cast<char16_t>(c);
  74. return destIndex;
  75. }
  76. length=cpLength;
  77. } else {
  78. if(result<=UCASE_MAX_STRING_LENGTH) {
  79. c=U_SENTINEL;
  80. length=result;
  81. } else if(destIndex<destCapacity && result<=0xffff) { // BMP slightly-fastpath
  82. dest[destIndex++] = static_cast<char16_t>(result);
  83. if(edits!=nullptr) {
  84. edits->addReplace(cpLength, 1);
  85. }
  86. return destIndex;
  87. } else {
  88. c=result;
  89. length=U16_LENGTH(c);
  90. }
  91. if(edits!=nullptr) {
  92. edits->addReplace(cpLength, length);
  93. }
  94. }
  95. if(length>(INT32_MAX-destIndex)) {
  96. return -1; // integer overflow
  97. }
  98. if(destIndex<destCapacity) {
  99. /* append the result */
  100. if(c>=0) {
  101. /* code point */
  102. UBool isError=false;
  103. U16_APPEND(dest, destIndex, destCapacity, c, isError);
  104. if(isError) {
  105. /* overflow, nothing written */
  106. destIndex+=length;
  107. }
  108. } else {
  109. /* string */
  110. if((destIndex+length)<=destCapacity) {
  111. while(length>0) {
  112. dest[destIndex++]=*s++;
  113. --length;
  114. }
  115. } else {
  116. /* overflow */
  117. destIndex+=length;
  118. }
  119. }
  120. } else {
  121. /* preflight */
  122. destIndex+=length;
  123. }
  124. return destIndex;
  125. }
  /**
   * Appends one UTF-16 code unit.
   *
   * The unit is stored only when destIndex is inside the buffer; otherwise the
   * call just counts it.
   *
   * @return destIndex + 1, or -1 if destIndex is already INT32_MAX
   */
  inline int32_t
  appendUChar(char16_t *dest, int32_t destIndex, int32_t destCapacity, char16_t c) {
      if (destIndex < destCapacity) {
          dest[destIndex] = c;
          return destIndex + 1;
      }
      // Out of room: report integer overflow instead of wrapping around.
      return (destIndex == INT32_MAX) ? -1 : destIndex + 1;
  }
  135. int32_t
  136. appendNonEmptyUnchanged(char16_t *dest, int32_t destIndex, int32_t destCapacity,
  137. const char16_t *s, int32_t length, uint32_t options, icu::Edits *edits) {
  138. if(edits!=nullptr) {
  139. edits->addUnchanged(length);
  140. }
  141. if(options & U_OMIT_UNCHANGED_TEXT) {
  142. return destIndex;
  143. }
  144. if(length>(INT32_MAX-destIndex)) {
  145. return -1; // integer overflow
  146. }
  147. if((destIndex+length)<=destCapacity) {
  148. u_memcpy(dest+destIndex, s, length);
  149. }
  150. return destIndex + length;
  151. }
  152. inline int32_t
  153. appendUnchanged(char16_t *dest, int32_t destIndex, int32_t destCapacity,
  154. const char16_t *s, int32_t length, uint32_t options, icu::Edits *edits) {
  155. if (length <= 0) {
  156. return destIndex;
  157. }
  158. return appendNonEmptyUnchanged(dest, destIndex, destCapacity, s, length, options, edits);
  159. }
  160. UChar32 U_CALLCONV
  161. utf16_caseContextIterator(void *context, int8_t dir) {
  162. UCaseContext* csc = static_cast<UCaseContext*>(context);
  163. UChar32 c;
  164. if(dir<0) {
  165. /* reset for backward iteration */
  166. csc->index=csc->cpStart;
  167. csc->dir=dir;
  168. } else if(dir>0) {
  169. /* reset for forward iteration */
  170. csc->index=csc->cpLimit;
  171. csc->dir=dir;
  172. } else {
  173. /* continue current iteration direction */
  174. dir=csc->dir;
  175. }
  176. if(dir<0) {
  177. if(csc->start<csc->index) {
  178. U16_PREV((const char16_t *)csc->p, csc->start, csc->index, c);
  179. return c;
  180. }
  181. } else {
  182. if(csc->index<csc->limit) {
  183. U16_NEXT((const char16_t *)csc->p, csc->index, csc->limit, c);
  184. return c;
  185. }
  186. }
  187. return U_SENTINEL;
  188. }
  189. /**
  190. * caseLocale >= 0: Lowercases [srcStart..srcLimit[ but takes context [0..srcLength[ into account.
  191. * caseLocale < 0: Case-folds [srcStart..srcLimit[.
  192. */
  193. int32_t toLower(int32_t caseLocale, uint32_t options,
  194. char16_t *dest, int32_t destCapacity,
  195. const char16_t *src, UCaseContext *csc, int32_t srcStart, int32_t srcLimit,
  196. icu::Edits *edits, UErrorCode &errorCode) {
  197. const int8_t *latinToLower;
  198. if (caseLocale == UCASE_LOC_ROOT ||
  199. (caseLocale >= 0 ?
  200. !(caseLocale == UCASE_LOC_TURKISH || caseLocale == UCASE_LOC_LITHUANIAN) :
  201. (options & _FOLD_CASE_OPTIONS_MASK) == U_FOLD_CASE_DEFAULT)) {
  202. latinToLower = LatinCase::TO_LOWER_NORMAL;
  203. } else {
  204. latinToLower = LatinCase::TO_LOWER_TR_LT;
  205. }
  206. const UTrie2 *trie = ucase_getTrie();
  207. int32_t destIndex = 0;
  208. int32_t prev = srcStart;
  209. int32_t srcIndex = srcStart;
  210. for (;;) {
  211. // fast path for simple cases
  212. char16_t lead = 0;
  213. while (srcIndex < srcLimit) {
  214. lead = src[srcIndex];
  215. int32_t delta;
  216. if (lead < LatinCase::LONG_S) {
  217. int8_t d = latinToLower[lead];
  218. if (d == LatinCase::EXC) { break; }
  219. ++srcIndex;
  220. if (d == 0) { continue; }
  221. delta = d;
  222. } else if (lead >= 0xd800) {
  223. break; // surrogate or higher
  224. } else {
  225. uint16_t props = UTRIE2_GET16_FROM_U16_SINGLE_LEAD(trie, lead);
  226. if (UCASE_HAS_EXCEPTION(props)) { break; }
  227. ++srcIndex;
  228. if (!UCASE_IS_UPPER_OR_TITLE(props) || (delta = UCASE_GET_DELTA(props)) == 0) {
  229. continue;
  230. }
  231. }
  232. lead += static_cast<char16_t>(delta);
  233. destIndex = appendUnchanged(dest, destIndex, destCapacity,
  234. src + prev, srcIndex - 1 - prev, options, edits);
  235. if (destIndex >= 0) {
  236. destIndex = appendUChar(dest, destIndex, destCapacity, lead);
  237. if (edits != nullptr) {
  238. edits->addReplace(1, 1);
  239. }
  240. }
  241. if (destIndex < 0) {
  242. errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
  243. return 0;
  244. }
  245. prev = srcIndex;
  246. }
  247. if (srcIndex >= srcLimit) {
  248. break;
  249. }
  250. // slow path
  251. int32_t cpStart = srcIndex++;
  252. char16_t trail;
  253. UChar32 c;
  254. if (U16_IS_LEAD(lead) && srcIndex < srcLimit && U16_IS_TRAIL(trail = src[srcIndex])) {
  255. c = U16_GET_SUPPLEMENTARY(lead, trail);
  256. ++srcIndex;
  257. } else {
  258. c = lead;
  259. }
  260. const char16_t *s = nullptr;
  261. if (caseLocale >= 0) {
  262. csc->cpStart = cpStart;
  263. csc->cpLimit = srcIndex;
  264. c = ucase_toFullLower(c, utf16_caseContextIterator, csc, &s, caseLocale);
  265. } else {
  266. c = ucase_toFullFolding(c, &s, options);
  267. }
  268. if (c >= 0) {
  269. destIndex = appendUnchanged(dest, destIndex, destCapacity,
  270. src + prev, cpStart - prev, options, edits);
  271. if (destIndex >= 0) {
  272. destIndex = appendResult(dest, destIndex, destCapacity, c, s,
  273. srcIndex - cpStart, options, edits);
  274. }
  275. if (destIndex < 0) {
  276. errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
  277. return 0;
  278. }
  279. prev = srcIndex;
  280. }
  281. }
  282. destIndex = appendUnchanged(dest, destIndex, destCapacity,
  283. src + prev, srcIndex - prev, options, edits);
  284. if (destIndex < 0) {
  285. errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
  286. return 0;
  287. }
  288. return destIndex;
  289. }
  290. int32_t toUpper(int32_t caseLocale, uint32_t options,
  291. char16_t *dest, int32_t destCapacity,
  292. const char16_t *src, UCaseContext *csc, int32_t srcLength,
  293. icu::Edits *edits, UErrorCode &errorCode) {
  294. const int8_t *latinToUpper;
  295. if (caseLocale == UCASE_LOC_TURKISH) {
  296. latinToUpper = LatinCase::TO_UPPER_TR;
  297. } else {
  298. latinToUpper = LatinCase::TO_UPPER_NORMAL;
  299. }
  300. const UTrie2 *trie = ucase_getTrie();
  301. int32_t destIndex = 0;
  302. int32_t prev = 0;
  303. int32_t srcIndex = 0;
  304. for (;;) {
  305. // fast path for simple cases
  306. char16_t lead = 0;
  307. while (srcIndex < srcLength) {
  308. lead = src[srcIndex];
  309. int32_t delta;
  310. if (lead < LatinCase::LONG_S) {
  311. int8_t d = latinToUpper[lead];
  312. if (d == LatinCase::EXC) { break; }
  313. ++srcIndex;
  314. if (d == 0) { continue; }
  315. delta = d;
  316. } else if (lead >= 0xd800) {
  317. break; // surrogate or higher
  318. } else {
  319. uint16_t props = UTRIE2_GET16_FROM_U16_SINGLE_LEAD(trie, lead);
  320. if (UCASE_HAS_EXCEPTION(props)) { break; }
  321. ++srcIndex;
  322. if (UCASE_GET_TYPE(props) != UCASE_LOWER || (delta = UCASE_GET_DELTA(props)) == 0) {
  323. continue;
  324. }
  325. }
  326. lead += static_cast<char16_t>(delta);
  327. destIndex = appendUnchanged(dest, destIndex, destCapacity,
  328. src + prev, srcIndex - 1 - prev, options, edits);
  329. if (destIndex >= 0) {
  330. destIndex = appendUChar(dest, destIndex, destCapacity, lead);
  331. if (edits != nullptr) {
  332. edits->addReplace(1, 1);
  333. }
  334. }
  335. if (destIndex < 0) {
  336. errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
  337. return 0;
  338. }
  339. prev = srcIndex;
  340. }
  341. if (srcIndex >= srcLength) {
  342. break;
  343. }
  344. // slow path
  345. int32_t cpStart;
  346. csc->cpStart = cpStart = srcIndex++;
  347. char16_t trail;
  348. UChar32 c;
  349. if (U16_IS_LEAD(lead) && srcIndex < srcLength && U16_IS_TRAIL(trail = src[srcIndex])) {
  350. c = U16_GET_SUPPLEMENTARY(lead, trail);
  351. ++srcIndex;
  352. } else {
  353. c = lead;
  354. }
  355. csc->cpLimit = srcIndex;
  356. const char16_t *s = nullptr;
  357. c = ucase_toFullUpper(c, utf16_caseContextIterator, csc, &s, caseLocale);
  358. if (c >= 0) {
  359. destIndex = appendUnchanged(dest, destIndex, destCapacity,
  360. src + prev, cpStart - prev, options, edits);
  361. if (destIndex >= 0) {
  362. destIndex = appendResult(dest, destIndex, destCapacity, c, s,
  363. srcIndex - cpStart, options, edits);
  364. }
  365. if (destIndex < 0) {
  366. errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
  367. return 0;
  368. }
  369. prev = srcIndex;
  370. }
  371. }
  372. destIndex = appendUnchanged(dest, destIndex, destCapacity,
  373. src + prev, srcIndex - prev, options, edits);
  374. if (destIndex < 0) {
  375. errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
  376. return 0;
  377. }
  378. return destIndex;
  379. }
  380. } // namespace
  381. U_NAMESPACE_END
  382. U_NAMESPACE_USE
  383. #if !UCONFIG_NO_BREAK_ITERATION
  384. namespace {
  385. /**
  386. * Input: c is a letter I with or without acute accent.
  387. * start is the index in src after c, and is less than segmentLimit.
  388. * If a plain i/I is followed by a plain j/J,
  389. * or an i/I with acute (precomposed or decomposed) is followed by a j/J with acute,
  390. * then we output accordingly.
  391. *
  392. * @return the src index after the titlecased sequence, or the start index if no Dutch IJ
  393. */
  394. int32_t maybeTitleDutchIJ(const char16_t *src, UChar32 c, int32_t start, int32_t segmentLimit,
  395. char16_t *dest, int32_t &destIndex, int32_t destCapacity, uint32_t options,
  396. icu::Edits *edits) {
  397. U_ASSERT(start < segmentLimit);
  398. int32_t index = start;
  399. bool withAcute = false;
  400. // If the conditions are met, then the following variables tell us what to output.
  401. int32_t unchanged1 = 0; // code units before the j, or the whole sequence (0..3)
  402. bool doTitleJ = false; // true if the j needs to be titlecased
  403. int32_t unchanged2 = 0; // after the j (0 or 1)
  404. // next character after the first letter
  405. char16_t c2 = src[index++];
  406. // Is the first letter an i/I with accent?
  407. if (c == u'I') {
  408. if (c2 == ACUTE) {
  409. withAcute = true;
  410. unchanged1 = 1;
  411. if (index == segmentLimit) { return start; }
  412. c2 = src[index++];
  413. }
  414. } else { // Í
  415. withAcute = true;
  416. }
  417. // Is the next character a j/J?
  418. if (c2 == u'j') {
  419. doTitleJ = true;
  420. } else if (c2 == u'J') {
  421. ++unchanged1;
  422. } else {
  423. return start;
  424. }
  425. // A plain i/I must be followed by a plain j/J.
  426. // An i/I with acute must be followed by a j/J with acute.
  427. if (withAcute) {
  428. if (index == segmentLimit || src[index++] != ACUTE) { return start; }
  429. if (doTitleJ) {
  430. unchanged2 = 1;
  431. } else {
  432. ++unchanged1;
  433. }
  434. }
  435. // There must not be another combining mark.
  436. if (index < segmentLimit) {
  437. int32_t cp;
  438. int32_t i = index;
  439. U16_NEXT(src, i, segmentLimit, cp);
  440. uint32_t typeMask = U_GET_GC_MASK(cp);
  441. if ((typeMask & U_GC_M_MASK) != 0) {
  442. return start;
  443. }
  444. }
  445. // Output the rest of the Dutch IJ.
  446. destIndex = appendUnchanged(dest, destIndex, destCapacity, src + start, unchanged1, options, edits);
  447. start += unchanged1;
  448. if (doTitleJ) {
  449. destIndex = appendUChar(dest, destIndex, destCapacity, u'J');
  450. if (edits != nullptr) {
  451. edits->addReplace(1, 1);
  452. }
  453. ++start;
  454. }
  455. destIndex = appendUnchanged(dest, destIndex, destCapacity, src + start, unchanged2, options, edits);
  456. U_ASSERT(start + unchanged2 == index);
  457. return index;
  458. }
  459. } // namespace
  460. U_CFUNC int32_t U_CALLCONV
  461. ustrcase_internalToTitle(int32_t caseLocale, uint32_t options, BreakIterator *iter,
  462. char16_t *dest, int32_t destCapacity,
  463. const char16_t *src, int32_t srcLength,
  464. icu::Edits *edits,
  465. UErrorCode &errorCode) {
  466. if (!ustrcase_checkTitleAdjustmentOptions(options, errorCode)) {
  467. return 0;
  468. }
  469. /* set up local variables */
  470. UCaseContext csc=UCASECONTEXT_INITIALIZER;
  471. csc.p=(void *)src;
  472. csc.limit=srcLength;
  473. int32_t destIndex=0;
  474. int32_t prev=0;
  475. bool isFirstIndex=true;
  476. /* titlecasing loop */
  477. while(prev<srcLength) {
  478. /* find next index where to titlecase */
  479. int32_t index;
  480. if(isFirstIndex) {
  481. isFirstIndex=false;
  482. index=iter->first();
  483. } else {
  484. index=iter->next();
  485. }
  486. if(index==UBRK_DONE || index>srcLength) {
  487. index=srcLength;
  488. }
  489. /*
  490. * Segment [prev..index[ into 3 parts:
  491. * a) skipped characters (copy as-is) [prev..titleStart[
  492. * b) first letter (titlecase) [titleStart..titleLimit[
  493. * c) subsequent characters (lowercase) [titleLimit..index[
  494. */
  495. if(prev<index) {
  496. // Find and copy skipped characters [prev..titleStart[
  497. int32_t titleStart=prev;
  498. int32_t titleLimit=prev;
  499. UChar32 c;
  500. U16_NEXT(src, titleLimit, index, c);
  501. if ((options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0) {
  502. // Adjust the titlecasing index to the next cased character,
  503. // or to the next letter/number/symbol/private use.
  504. // Stop with titleStart<titleLimit<=index
  505. // if there is a character to be titlecased,
  506. // or else stop with titleStart==titleLimit==index.
  507. bool toCased = (options&U_TITLECASE_ADJUST_TO_CASED) != 0;
  508. while (toCased ? UCASE_NONE==ucase_getType(c) : !ustrcase_isLNS(c)) {
  509. titleStart=titleLimit;
  510. if(titleLimit==index) {
  511. break;
  512. }
  513. U16_NEXT(src, titleLimit, index, c);
  514. }
  515. if (prev < titleStart) {
  516. destIndex=appendUnchanged(dest, destIndex, destCapacity,
  517. src+prev, titleStart-prev, options, edits);
  518. if(destIndex<0) {
  519. errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
  520. return 0;
  521. }
  522. }
  523. }
  524. if(titleStart<titleLimit) {
  525. /* titlecase c which is from [titleStart..titleLimit[ */
  526. csc.cpStart=titleStart;
  527. csc.cpLimit=titleLimit;
  528. const char16_t *s;
  529. c=ucase_toFullTitle(c, utf16_caseContextIterator, &csc, &s, caseLocale);
  530. destIndex=appendResult(dest, destIndex, destCapacity, c, s,
  531. titleLimit-titleStart, options, edits);
  532. if(destIndex<0) {
  533. errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
  534. return 0;
  535. }
  536. /* Special case Dutch IJ titlecasing */
  537. if (titleStart+1 < index &&
  538. caseLocale == UCASE_LOC_DUTCH) {
  539. if (c < 0) {
  540. c = ~c;
  541. }
  542. if (c == u'I' || c == u'Í') {
  543. titleLimit = maybeTitleDutchIJ(src, c, titleStart + 1, index,
  544. dest, destIndex, destCapacity, options,
  545. edits);
  546. }
  547. }
  548. /* lowercase [titleLimit..index[ */
  549. if(titleLimit<index) {
  550. if((options&U_TITLECASE_NO_LOWERCASE)==0) {
  551. /* Normal operation: Lowercase the rest of the word. */
  552. destIndex+=
  553. toLower(
  554. caseLocale, options,
  555. (dest==nullptr) ? nullptr: dest+destIndex, destCapacity-destIndex,
  556. src, &csc, titleLimit, index,
  557. edits, errorCode);
  558. if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
  559. errorCode=U_ZERO_ERROR;
  560. }
  561. if(U_FAILURE(errorCode)) {
  562. return destIndex;
  563. }
  564. } else {
  565. /* Optionally just copy the rest of the word unchanged. */
  566. destIndex=appendUnchanged(dest, destIndex, destCapacity,
  567. src+titleLimit, index-titleLimit, options, edits);
  568. if(destIndex<0) {
  569. errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
  570. return 0;
  571. }
  572. }
  573. }
  574. }
  575. }
  576. prev=index;
  577. }
  578. return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode);
  579. }
  580. #endif // !UCONFIG_NO_BREAK_ITERATION
  581. U_NAMESPACE_BEGIN
  582. namespace GreekUpper {
  583. // Data generated by prototype code, see
  584. // https://icu.unicode.org/design/case/greek-upper
  585. // TODO: Move this data into ucase.icu.
  586. static const uint16_t data0370[] = {
  587. // U+0370..03FF
  588. 0x0370,
  589. 0x0370,
  590. 0x0372,
  591. 0x0372,
  592. 0,
  593. 0,
  594. 0x0376,
  595. 0x0376,
  596. 0,
  597. 0,
  598. 0x037A,
  599. 0x03FD,
  600. 0x03FE,
  601. 0x03FF,
  602. 0,
  603. 0x037F,
  604. 0,
  605. 0,
  606. 0,
  607. 0,
  608. 0,
  609. 0,
  610. 0x0391 | HAS_VOWEL | HAS_ACCENT,
  611. 0,
  612. 0x0395 | HAS_VOWEL | HAS_ACCENT,
  613. 0x0397 | HAS_VOWEL | HAS_ACCENT,
  614. 0x0399 | HAS_VOWEL | HAS_ACCENT,
  615. 0,
  616. 0x039F | HAS_VOWEL | HAS_ACCENT,
  617. 0,
  618. 0x03A5 | HAS_VOWEL | HAS_ACCENT,
  619. 0x03A9 | HAS_VOWEL | HAS_ACCENT,
  620. 0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
  621. 0x0391 | HAS_VOWEL,
  622. 0x0392,
  623. 0x0393,
  624. 0x0394,
  625. 0x0395 | HAS_VOWEL,
  626. 0x0396,
  627. 0x0397 | HAS_VOWEL,
  628. 0x0398,
  629. 0x0399 | HAS_VOWEL,
  630. 0x039A,
  631. 0x039B,
  632. 0x039C,
  633. 0x039D,
  634. 0x039E,
  635. 0x039F | HAS_VOWEL,
  636. 0x03A0,
  637. 0x03A1,
  638. 0,
  639. 0x03A3,
  640. 0x03A4,
  641. 0x03A5 | HAS_VOWEL,
  642. 0x03A6,
  643. 0x03A7,
  644. 0x03A8,
  645. 0x03A9 | HAS_VOWEL,
  646. 0x0399 | HAS_VOWEL | HAS_DIALYTIKA,
  647. 0x03A5 | HAS_VOWEL | HAS_DIALYTIKA,
  648. 0x0391 | HAS_VOWEL | HAS_ACCENT,
  649. 0x0395 | HAS_VOWEL | HAS_ACCENT,
  650. 0x0397 | HAS_VOWEL | HAS_ACCENT,
  651. 0x0399 | HAS_VOWEL | HAS_ACCENT,
  652. 0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
  653. 0x0391 | HAS_VOWEL,
  654. 0x0392,
  655. 0x0393,
  656. 0x0394,
  657. 0x0395 | HAS_VOWEL,
  658. 0x0396,
  659. 0x0397 | HAS_VOWEL,
  660. 0x0398,
  661. 0x0399 | HAS_VOWEL,
  662. 0x039A,
  663. 0x039B,
  664. 0x039C,
  665. 0x039D,
  666. 0x039E,
  667. 0x039F | HAS_VOWEL,
  668. 0x03A0,
  669. 0x03A1,
  670. 0x03A3,
  671. 0x03A3,
  672. 0x03A4,
  673. 0x03A5 | HAS_VOWEL,
  674. 0x03A6,
  675. 0x03A7,
  676. 0x03A8,
  677. 0x03A9 | HAS_VOWEL,
  678. 0x0399 | HAS_VOWEL | HAS_DIALYTIKA,
  679. 0x03A5 | HAS_VOWEL | HAS_DIALYTIKA,
  680. 0x039F | HAS_VOWEL | HAS_ACCENT,
  681. 0x03A5 | HAS_VOWEL | HAS_ACCENT,
  682. 0x03A9 | HAS_VOWEL | HAS_ACCENT,
  683. 0x03CF,
  684. 0x0392,
  685. 0x0398,
  686. 0x03D2,
  687. 0x03D2 | HAS_ACCENT,
  688. 0x03D2 | HAS_DIALYTIKA,
  689. 0x03A6,
  690. 0x03A0,
  691. 0x03CF,
  692. 0x03D8,
  693. 0x03D8,
  694. 0x03DA,
  695. 0x03DA,
  696. 0x03DC,
  697. 0x03DC,
  698. 0x03DE,
  699. 0x03DE,
  700. 0x03E0,
  701. 0x03E0,
  702. 0,
  703. 0,
  704. 0,
  705. 0,
  706. 0,
  707. 0,
  708. 0,
  709. 0,
  710. 0,
  711. 0,
  712. 0,
  713. 0,
  714. 0,
  715. 0,
  716. 0x039A,
  717. 0x03A1,
  718. 0x03F9,
  719. 0x037F,
  720. 0x03F4,
  721. 0x0395 | HAS_VOWEL,
  722. 0,
  723. 0x03F7,
  724. 0x03F7,
  725. 0x03F9,
  726. 0x03FA,
  727. 0x03FA,
  728. 0x03FC,
  729. 0x03FD,
  730. 0x03FE,
  731. 0x03FF,
  732. };
  733. static const uint16_t data1F00[] = {
  734. // U+1F00..1FFF
  735. 0x0391 | HAS_VOWEL,
  736. 0x0391 | HAS_VOWEL,
  737. 0x0391 | HAS_VOWEL | HAS_ACCENT,
  738. 0x0391 | HAS_VOWEL | HAS_ACCENT,
  739. 0x0391 | HAS_VOWEL | HAS_ACCENT,
  740. 0x0391 | HAS_VOWEL | HAS_ACCENT,
  741. 0x0391 | HAS_VOWEL | HAS_ACCENT,
  742. 0x0391 | HAS_VOWEL | HAS_ACCENT,
  743. 0x0391 | HAS_VOWEL,
  744. 0x0391 | HAS_VOWEL,
  745. 0x0391 | HAS_VOWEL | HAS_ACCENT,
  746. 0x0391 | HAS_VOWEL | HAS_ACCENT,
  747. 0x0391 | HAS_VOWEL | HAS_ACCENT,
  748. 0x0391 | HAS_VOWEL | HAS_ACCENT,
  749. 0x0391 | HAS_VOWEL | HAS_ACCENT,
  750. 0x0391 | HAS_VOWEL | HAS_ACCENT,
  751. 0x0395 | HAS_VOWEL,
  752. 0x0395 | HAS_VOWEL,
  753. 0x0395 | HAS_VOWEL | HAS_ACCENT,
  754. 0x0395 | HAS_VOWEL | HAS_ACCENT,
  755. 0x0395 | HAS_VOWEL | HAS_ACCENT,
  756. 0x0395 | HAS_VOWEL | HAS_ACCENT,
  757. 0,
  758. 0,
  759. 0x0395 | HAS_VOWEL,
  760. 0x0395 | HAS_VOWEL,
  761. 0x0395 | HAS_VOWEL | HAS_ACCENT,
  762. 0x0395 | HAS_VOWEL | HAS_ACCENT,
  763. 0x0395 | HAS_VOWEL | HAS_ACCENT,
  764. 0x0395 | HAS_VOWEL | HAS_ACCENT,
  765. 0,
  766. 0,
  767. 0x0397 | HAS_VOWEL,
  768. 0x0397 | HAS_VOWEL,
  769. 0x0397 | HAS_VOWEL | HAS_ACCENT,
  770. 0x0397 | HAS_VOWEL | HAS_ACCENT,
  771. 0x0397 | HAS_VOWEL | HAS_ACCENT,
  772. 0x0397 | HAS_VOWEL | HAS_ACCENT,
  773. 0x0397 | HAS_VOWEL | HAS_ACCENT,
  774. 0x0397 | HAS_VOWEL | HAS_ACCENT,
  775. 0x0397 | HAS_VOWEL,
  776. 0x0397 | HAS_VOWEL,
  777. 0x0397 | HAS_VOWEL | HAS_ACCENT,
  778. 0x0397 | HAS_VOWEL | HAS_ACCENT,
  779. 0x0397 | HAS_VOWEL | HAS_ACCENT,
  780. 0x0397 | HAS_VOWEL | HAS_ACCENT,
  781. 0x0397 | HAS_VOWEL | HAS_ACCENT,
  782. 0x0397 | HAS_VOWEL | HAS_ACCENT,
  783. 0x0399 | HAS_VOWEL,
  784. 0x0399 | HAS_VOWEL,
  785. 0x0399 | HAS_VOWEL | HAS_ACCENT,
  786. 0x0399 | HAS_VOWEL | HAS_ACCENT,
  787. 0x0399 | HAS_VOWEL | HAS_ACCENT,
  788. 0x0399 | HAS_VOWEL | HAS_ACCENT,
  789. 0x0399 | HAS_VOWEL | HAS_ACCENT,
  790. 0x0399 | HAS_VOWEL | HAS_ACCENT,
  791. 0x0399 | HAS_VOWEL,
  792. 0x0399 | HAS_VOWEL,
  793. 0x0399 | HAS_VOWEL | HAS_ACCENT,
  794. 0x0399 | HAS_VOWEL | HAS_ACCENT,
  795. 0x0399 | HAS_VOWEL | HAS_ACCENT,
  796. 0x0399 | HAS_VOWEL | HAS_ACCENT,
  797. 0x0399 | HAS_VOWEL | HAS_ACCENT,
  798. 0x0399 | HAS_VOWEL | HAS_ACCENT,
  799. 0x039F | HAS_VOWEL,
  800. 0x039F | HAS_VOWEL,
  801. 0x039F | HAS_VOWEL | HAS_ACCENT,
  802. 0x039F | HAS_VOWEL | HAS_ACCENT,
  803. 0x039F | HAS_VOWEL | HAS_ACCENT,
  804. 0x039F | HAS_VOWEL | HAS_ACCENT,
  805. 0,
  806. 0,
  807. 0x039F | HAS_VOWEL,
  808. 0x039F | HAS_VOWEL,
  809. 0x039F | HAS_VOWEL | HAS_ACCENT,
  810. 0x039F | HAS_VOWEL | HAS_ACCENT,
  811. 0x039F | HAS_VOWEL | HAS_ACCENT,
  812. 0x039F | HAS_VOWEL | HAS_ACCENT,
  813. 0,
  814. 0,
  815. 0x03A5 | HAS_VOWEL,
  816. 0x03A5 | HAS_VOWEL,
  817. 0x03A5 | HAS_VOWEL | HAS_ACCENT,
  818. 0x03A5 | HAS_VOWEL | HAS_ACCENT,
  819. 0x03A5 | HAS_VOWEL | HAS_ACCENT,
  820. 0x03A5 | HAS_VOWEL | HAS_ACCENT,
  821. 0x03A5 | HAS_VOWEL | HAS_ACCENT,
  822. 0x03A5 | HAS_VOWEL | HAS_ACCENT,
  823. 0,
  824. 0x03A5 | HAS_VOWEL,
  825. 0,
  826. 0x03A5 | HAS_VOWEL | HAS_ACCENT,
  827. 0,
  828. 0x03A5 | HAS_VOWEL | HAS_ACCENT,
  829. 0,
  830. 0x03A5 | HAS_VOWEL | HAS_ACCENT,
  831. 0x03A9 | HAS_VOWEL,
  832. 0x03A9 | HAS_VOWEL,
  833. 0x03A9 | HAS_VOWEL | HAS_ACCENT,
  834. 0x03A9 | HAS_VOWEL | HAS_ACCENT,
  835. 0x03A9 | HAS_VOWEL | HAS_ACCENT,
  836. 0x03A9 | HAS_VOWEL | HAS_ACCENT,
  837. 0x03A9 | HAS_VOWEL | HAS_ACCENT,
  838. 0x03A9 | HAS_VOWEL | HAS_ACCENT,
  839. 0x03A9 | HAS_VOWEL,
  840. 0x03A9 | HAS_VOWEL,
  841. 0x03A9 | HAS_VOWEL | HAS_ACCENT,
  842. 0x03A9 | HAS_VOWEL | HAS_ACCENT,
  843. 0x03A9 | HAS_VOWEL | HAS_ACCENT,
  844. 0x03A9 | HAS_VOWEL | HAS_ACCENT,
  845. 0x03A9 | HAS_VOWEL | HAS_ACCENT,
  846. 0x03A9 | HAS_VOWEL | HAS_ACCENT,
  847. 0x0391 | HAS_VOWEL | HAS_ACCENT,
  848. 0x0391 | HAS_VOWEL | HAS_ACCENT,
  849. 0x0395 | HAS_VOWEL | HAS_ACCENT,
  850. 0x0395 | HAS_VOWEL | HAS_ACCENT,
  851. 0x0397 | HAS_VOWEL | HAS_ACCENT,
  852. 0x0397 | HAS_VOWEL | HAS_ACCENT,
  853. 0x0399 | HAS_VOWEL | HAS_ACCENT,
  854. 0x0399 | HAS_VOWEL | HAS_ACCENT,
  855. 0x039F | HAS_VOWEL | HAS_ACCENT,
  856. 0x039F | HAS_VOWEL | HAS_ACCENT,
  857. 0x03A5 | HAS_VOWEL | HAS_ACCENT,
  858. 0x03A5 | HAS_VOWEL | HAS_ACCENT,
  859. 0x03A9 | HAS_VOWEL | HAS_ACCENT,
  860. 0x03A9 | HAS_VOWEL | HAS_ACCENT,
  861. 0,
  862. 0,
  863. 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
  864. 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
  865. 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
  866. 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
  867. 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
  868. 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
  869. 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
  870. 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
  871. 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
  872. 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
  873. 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
  874. 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
  875. 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
  876. 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
  877. 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
  878. 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
  879. 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
  880. 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
  881. 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
  882. 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
  883. 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
  884. 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
  885. 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
  886. 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
  887. 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
  888. 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
  889. 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
  890. 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
  891. 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
  892. 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
  893. 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
  894. 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
  895. 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
  896. 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
  897. 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
  898. 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
  899. 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
  900. 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
  901. 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
  902. 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
  903. 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
  904. 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
  905. 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
  906. 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
  907. 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
  908. 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
  909. 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
  910. 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
  911. 0x0391 | HAS_VOWEL,
  912. 0x0391 | HAS_VOWEL,
  913. 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
  914. 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
  915. 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
  916. 0,
  917. 0x0391 | HAS_VOWEL | HAS_ACCENT,
  918. 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
  919. 0x0391 | HAS_VOWEL,
  920. 0x0391 | HAS_VOWEL,
  921. 0x0391 | HAS_VOWEL | HAS_ACCENT,
  922. 0x0391 | HAS_VOWEL | HAS_ACCENT,
  923. 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
  924. 0,
  925. 0x0399 | HAS_VOWEL,
  926. 0,
  927. 0,
  928. 0,
  929. 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
  930. 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
  931. 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
  932. 0,
  933. 0x0397 | HAS_VOWEL | HAS_ACCENT,
  934. 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
  935. 0x0395 | HAS_VOWEL | HAS_ACCENT,
  936. 0x0395 | HAS_VOWEL | HAS_ACCENT,
  937. 0x0397 | HAS_VOWEL | HAS_ACCENT,
  938. 0x0397 | HAS_VOWEL | HAS_ACCENT,
  939. 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
  940. 0,
  941. 0,
  942. 0,
  943. 0x0399 | HAS_VOWEL,
  944. 0x0399 | HAS_VOWEL,
  945. 0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
  946. 0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
  947. 0,
  948. 0,
  949. 0x0399 | HAS_VOWEL | HAS_ACCENT,
  950. 0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
  951. 0x0399 | HAS_VOWEL,
  952. 0x0399 | HAS_VOWEL,
  953. 0x0399 | HAS_VOWEL | HAS_ACCENT,
  954. 0x0399 | HAS_VOWEL | HAS_ACCENT,
  955. 0,
  956. 0,
  957. 0,
  958. 0,
  959. 0x03A5 | HAS_VOWEL,
  960. 0x03A5 | HAS_VOWEL,
  961. 0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
  962. 0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
  963. 0x03A1,
  964. 0x03A1,
  965. 0x03A5 | HAS_VOWEL | HAS_ACCENT,
  966. 0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
  967. 0x03A5 | HAS_VOWEL,
  968. 0x03A5 | HAS_VOWEL,
  969. 0x03A5 | HAS_VOWEL | HAS_ACCENT,
  970. 0x03A5 | HAS_VOWEL | HAS_ACCENT,
  971. 0x03A1,
  972. 0,
  973. 0,
  974. 0,
  975. 0,
  976. 0,
  977. 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
  978. 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
  979. 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
  980. 0,
  981. 0x03A9 | HAS_VOWEL | HAS_ACCENT,
  982. 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
  983. 0x039F | HAS_VOWEL | HAS_ACCENT,
  984. 0x039F | HAS_VOWEL | HAS_ACCENT,
  985. 0x03A9 | HAS_VOWEL | HAS_ACCENT,
  986. 0x03A9 | HAS_VOWEL | HAS_ACCENT,
  987. 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
  988. 0,
  989. 0,
  990. 0,
  991. };
  992. // U+2126 Ohm sign
  993. static const uint16_t data2126 = 0x03A9 | HAS_VOWEL;
  994. uint32_t getLetterData(UChar32 c) {
  995. if (c < 0x370 || 0x2126 < c || (0x3ff < c && c < 0x1f00)) {
  996. return 0;
  997. } else if (c <= 0x3ff) {
  998. return data0370[c - 0x370];
  999. } else if (c <= 0x1fff) {
  1000. return data1F00[c - 0x1f00];
  1001. } else if (c == 0x2126) {
  1002. return data2126;
  1003. } else {
  1004. return 0;
  1005. }
  1006. }
  1007. uint32_t getDiacriticData(UChar32 c) {
  1008. switch (c) {
  1009. case 0x0300: // varia
  1010. case 0x0301: // tonos = oxia
  1011. case 0x0342: // perispomeni
  1012. case 0x0302: // circumflex can look like perispomeni
  1013. case 0x0303: // tilde can look like perispomeni
  1014. case 0x0311: // inverted breve can look like perispomeni
  1015. return HAS_ACCENT;
  1016. case 0x0308: // dialytika = diaeresis
  1017. return HAS_COMBINING_DIALYTIKA;
  1018. case 0x0344: // dialytika tonos
  1019. return HAS_COMBINING_DIALYTIKA | HAS_ACCENT;
  1020. case 0x0345: // ypogegrammeni = iota subscript
  1021. return HAS_YPOGEGRAMMENI;
  1022. case 0x0304: // macron
  1023. case 0x0306: // breve
  1024. case 0x0313: // comma above
  1025. case 0x0314: // reversed comma above
  1026. case 0x0343: // koronis
  1027. return HAS_OTHER_GREEK_DIACRITIC;
  1028. default:
  1029. return 0;
  1030. }
  1031. }
  1032. UBool isFollowedByCasedLetter(const char16_t *s, int32_t i, int32_t length) {
  1033. while (i < length) {
  1034. UChar32 c;
  1035. U16_NEXT(s, i, length, c);
  1036. int32_t type = ucase_getTypeOrIgnorable(c);
  1037. if ((type & UCASE_IGNORABLE) != 0) {
  1038. // Case-ignorable, continue with the loop.
  1039. } else if (type != UCASE_NONE) {
  1040. return true; // Followed by cased letter.
  1041. } else {
  1042. return false; // Uncased and not case-ignorable.
  1043. }
  1044. }
  1045. return false; // Not followed by cased letter.
  1046. }
  1047. /**
  1048. * Greek string uppercasing with a state machine.
  1049. * Probably simpler than a stateless function that has to figure out complex context-before
  1050. * for each character.
  1051. * TODO: Try to re-consolidate one way or another with the non-Greek function.
  1052. */
  1053. int32_t toUpper(uint32_t options,
  1054. char16_t *dest, int32_t destCapacity,
  1055. const char16_t *src, int32_t srcLength,
  1056. Edits *edits,
  1057. UErrorCode &errorCode) {
  1058. int32_t destIndex=0;
  1059. uint32_t state = 0;
  1060. for (int32_t i = 0; i < srcLength;) {
  1061. int32_t nextIndex = i;
  1062. UChar32 c;
  1063. U16_NEXT(src, nextIndex, srcLength, c);
  1064. uint32_t nextState = 0;
  1065. int32_t type = ucase_getTypeOrIgnorable(c);
  1066. if ((type & UCASE_IGNORABLE) != 0) {
  1067. // c is case-ignorable
  1068. nextState |= (state & AFTER_CASED);
  1069. } else if (type != UCASE_NONE) {
  1070. // c is cased
  1071. nextState |= AFTER_CASED;
  1072. }
  1073. uint32_t data = getLetterData(c);
  1074. if (data > 0) {
  1075. uint32_t upper = data & UPPER_MASK;
  1076. // Add a dialytika to this iota or ypsilon vowel
  1077. // if we removed a tonos from the previous vowel,
  1078. // and that previous vowel did not also have (or gain) a dialytika.
  1079. // Adding one only to the final vowel in a longer sequence
  1080. // (which does not occur in normal writing) would require lookahead.
  1081. // Set the same flag as for preserving an existing dialytika.
  1082. if ((data & HAS_VOWEL) != 0 &&
  1083. (state & (AFTER_VOWEL_WITH_PRECOMPOSED_ACCENT | AFTER_VOWEL_WITH_COMBINING_ACCENT)) !=
  1084. 0 &&
  1085. (upper == 0x399 || upper == 0x3A5)) {
  1086. data |= (state & AFTER_VOWEL_WITH_PRECOMPOSED_ACCENT) ? HAS_DIALYTIKA
  1087. : HAS_COMBINING_DIALYTIKA;
  1088. }
  1089. int32_t numYpogegrammeni = 0; // Map each one to a trailing, spacing, capital iota.
  1090. if ((data & HAS_YPOGEGRAMMENI) != 0) {
  1091. numYpogegrammeni = 1;
  1092. }
  1093. const UBool hasPrecomposedAccent = (data & HAS_ACCENT) != 0;
  1094. // Skip combining diacritics after this Greek letter.
  1095. while (nextIndex < srcLength) {
  1096. uint32_t diacriticData = getDiacriticData(src[nextIndex]);
  1097. if (diacriticData != 0) {
  1098. data |= diacriticData;
  1099. if ((diacriticData & HAS_YPOGEGRAMMENI) != 0) {
  1100. ++numYpogegrammeni;
  1101. }
  1102. ++nextIndex;
  1103. } else {
  1104. break; // not a Greek diacritic
  1105. }
  1106. }
  1107. if ((data & HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA) == HAS_VOWEL_AND_ACCENT) {
  1108. nextState |= hasPrecomposedAccent ? AFTER_VOWEL_WITH_PRECOMPOSED_ACCENT
  1109. : AFTER_VOWEL_WITH_COMBINING_ACCENT;
  1110. }
  1111. // Map according to Greek rules.
  1112. UBool addTonos = false;
  1113. if (upper == 0x397 &&
  1114. (data & HAS_ACCENT) != 0 &&
  1115. numYpogegrammeni == 0 &&
  1116. (state & AFTER_CASED) == 0 &&
  1117. !isFollowedByCasedLetter(src, nextIndex, srcLength)) {
  1118. // Keep disjunctive "or" with (only) a tonos.
  1119. // We use the same "word boundary" conditions as for the Final_Sigma test.
  1120. if (hasPrecomposedAccent) {
  1121. upper = 0x389; // Preserve the precomposed form.
  1122. } else {
  1123. addTonos = true;
  1124. }
  1125. } else if ((data & HAS_DIALYTIKA) != 0) {
  1126. // Preserve a vowel with dialytika in precomposed form if it exists.
  1127. if (upper == 0x399) {
  1128. upper = 0x3AA;
  1129. data &= ~HAS_EITHER_DIALYTIKA;
  1130. } else if (upper == 0x3A5) {
  1131. upper = 0x3AB;
  1132. data &= ~HAS_EITHER_DIALYTIKA;
  1133. }
  1134. }
  1135. UBool change;
  1136. if (edits == nullptr && (options & U_OMIT_UNCHANGED_TEXT) == 0) {
  1137. change = true; // common, simple usage
  1138. } else {
  1139. // Find out first whether we are changing the text.
  1140. change = src[i] != upper || numYpogegrammeni > 0;
  1141. int32_t i2 = i + 1;
  1142. if ((data & HAS_EITHER_DIALYTIKA) != 0) {
  1143. change |= i2 >= nextIndex || src[i2] != 0x308;
  1144. ++i2;
  1145. }
  1146. if (addTonos) {
  1147. change |= i2 >= nextIndex || src[i2] != 0x301;
  1148. ++i2;
  1149. }
  1150. int32_t oldLength = nextIndex - i;
  1151. int32_t newLength = (i2 - i) + numYpogegrammeni;
  1152. change |= oldLength != newLength;
  1153. if (change) {
  1154. if (edits != nullptr) {
  1155. edits->addReplace(oldLength, newLength);
  1156. }
  1157. } else {
  1158. if (edits != nullptr) {
  1159. edits->addUnchanged(oldLength);
  1160. }
  1161. // Write unchanged text?
  1162. change = (options & U_OMIT_UNCHANGED_TEXT) == 0;
  1163. }
  1164. }
  1165. if (change) {
  1166. destIndex = appendUChar(dest, destIndex, destCapacity, static_cast<char16_t>(upper));
  1167. if (destIndex >= 0 && (data & HAS_EITHER_DIALYTIKA) != 0) {
  1168. destIndex=appendUChar(dest, destIndex, destCapacity, 0x308); // restore or add a dialytika
  1169. }
  1170. if (destIndex >= 0 && addTonos) {
  1171. destIndex=appendUChar(dest, destIndex, destCapacity, 0x301);
  1172. }
  1173. while (destIndex >= 0 && numYpogegrammeni > 0) {
  1174. destIndex=appendUChar(dest, destIndex, destCapacity, 0x399);
  1175. --numYpogegrammeni;
  1176. }
  1177. if(destIndex<0) {
  1178. errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
  1179. return 0;
  1180. }
  1181. }
  1182. } else {
  1183. const char16_t *s;
  1184. c=ucase_toFullUpper(c, nullptr, nullptr, &s, UCASE_LOC_GREEK);
  1185. destIndex = appendResult(dest, destIndex, destCapacity, c, s,
  1186. nextIndex - i, options, edits);
  1187. if (destIndex < 0) {
  1188. errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
  1189. return 0;
  1190. }
  1191. }
  1192. i = nextIndex;
  1193. state = nextState;
  1194. }
  1195. return destIndex;
  1196. }
  1197. } // namespace GreekUpper
  1198. U_NAMESPACE_END
  1199. /* functions available in the common library (for unistr_case.cpp) */
  1200. U_CFUNC int32_t U_CALLCONV
  1201. ustrcase_internalToLower(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
  1202. char16_t *dest, int32_t destCapacity,
  1203. const char16_t *src, int32_t srcLength,
  1204. icu::Edits *edits,
  1205. UErrorCode &errorCode) {
  1206. UCaseContext csc=UCASECONTEXT_INITIALIZER;
  1207. csc.p=(void *)src;
  1208. csc.limit=srcLength;
  1209. int32_t destIndex = toLower(
  1210. caseLocale, options,
  1211. dest, destCapacity,
  1212. src, &csc, 0, srcLength,
  1213. edits, errorCode);
  1214. return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode);
  1215. }
  1216. U_CFUNC int32_t U_CALLCONV
  1217. ustrcase_internalToUpper(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
  1218. char16_t *dest, int32_t destCapacity,
  1219. const char16_t *src, int32_t srcLength,
  1220. icu::Edits *edits,
  1221. UErrorCode &errorCode) {
  1222. int32_t destIndex;
  1223. if (caseLocale == UCASE_LOC_GREEK) {
  1224. destIndex = GreekUpper::toUpper(options, dest, destCapacity,
  1225. src, srcLength, edits, errorCode);
  1226. } else {
  1227. UCaseContext csc=UCASECONTEXT_INITIALIZER;
  1228. csc.p=(void *)src;
  1229. csc.limit=srcLength;
  1230. destIndex = toUpper(
  1231. caseLocale, options,
  1232. dest, destCapacity,
  1233. src, &csc, srcLength,
  1234. edits, errorCode);
  1235. }
  1236. return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode);
  1237. }
  1238. U_CFUNC int32_t U_CALLCONV
  1239. ustrcase_internalFold(int32_t /* caseLocale */, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
  1240. char16_t *dest, int32_t destCapacity,
  1241. const char16_t *src, int32_t srcLength,
  1242. icu::Edits *edits,
  1243. UErrorCode &errorCode) {
  1244. int32_t destIndex = toLower(
  1245. -1, options,
  1246. dest, destCapacity,
  1247. src, nullptr, 0, srcLength,
  1248. edits, errorCode);
  1249. return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode);
  1250. }
  1251. U_CFUNC int32_t
  1252. ustrcase_map(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
  1253. char16_t *dest, int32_t destCapacity,
  1254. const char16_t *src, int32_t srcLength,
  1255. UStringCaseMapper *stringCaseMapper,
  1256. icu::Edits *edits,
  1257. UErrorCode &errorCode) {
  1258. int32_t destLength;
  1259. /* check argument values */
  1260. if(U_FAILURE(errorCode)) {
  1261. return 0;
  1262. }
  1263. if( destCapacity<0 ||
  1264. (dest==nullptr && destCapacity>0) ||
  1265. src==nullptr ||
  1266. srcLength<-1
  1267. ) {
  1268. errorCode=U_ILLEGAL_ARGUMENT_ERROR;
  1269. return 0;
  1270. }
  1271. /* get the string length */
  1272. if(srcLength==-1) {
  1273. srcLength=u_strlen(src);
  1274. }
  1275. /* check for overlapping source and destination */
  1276. if( dest!=nullptr &&
  1277. ((src>=dest && src<(dest+destCapacity)) ||
  1278. (dest>=src && dest<(src+srcLength)))
  1279. ) {
  1280. errorCode=U_ILLEGAL_ARGUMENT_ERROR;
  1281. return 0;
  1282. }
  1283. if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) {
  1284. edits->reset();
  1285. }
  1286. destLength=stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR
  1287. dest, destCapacity, src, srcLength, edits, errorCode);
  1288. return u_terminateUChars(dest, destCapacity, destLength, &errorCode);
  1289. }
  1290. U_CFUNC int32_t
  1291. ustrcase_mapWithOverlap(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
  1292. char16_t *dest, int32_t destCapacity,
  1293. const char16_t *src, int32_t srcLength,
  1294. UStringCaseMapper *stringCaseMapper,
  1295. UErrorCode &errorCode) {
  1296. char16_t buffer[300];
  1297. char16_t *temp;
  1298. int32_t destLength;
  1299. /* check argument values */
  1300. if(U_FAILURE(errorCode)) {
  1301. return 0;
  1302. }
  1303. if( destCapacity<0 ||
  1304. (dest==nullptr && destCapacity>0) ||
  1305. src==nullptr ||
  1306. srcLength<-1
  1307. ) {
  1308. errorCode=U_ILLEGAL_ARGUMENT_ERROR;
  1309. return 0;
  1310. }
  1311. /* get the string length */
  1312. if(srcLength==-1) {
  1313. srcLength=u_strlen(src);
  1314. }
  1315. /* check for overlapping source and destination */
  1316. if( dest!=nullptr &&
  1317. ((src>=dest && src<(dest+destCapacity)) ||
  1318. (dest>=src && dest<(src+srcLength)))
  1319. ) {
  1320. /* overlap: provide a temporary destination buffer and later copy the result */
  1321. if(destCapacity<=UPRV_LENGTHOF(buffer)) {
  1322. /* the stack buffer is large enough */
  1323. temp=buffer;
  1324. } else {
  1325. /* allocate a buffer */
  1326. temp=(char16_t *)uprv_malloc(destCapacity*U_SIZEOF_UCHAR);
  1327. if(temp==nullptr) {
  1328. errorCode=U_MEMORY_ALLOCATION_ERROR;
  1329. return 0;
  1330. }
  1331. }
  1332. } else {
  1333. temp=dest;
  1334. }
  1335. destLength=stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR
  1336. temp, destCapacity, src, srcLength, nullptr, errorCode);
  1337. if(temp!=dest) {
  1338. /* copy the result string to the destination buffer */
  1339. if (U_SUCCESS(errorCode) && 0 < destLength && destLength <= destCapacity) {
  1340. u_memmove(dest, temp, destLength);
  1341. }
  1342. if(temp!=buffer) {
  1343. uprv_free(temp);
  1344. }
  1345. }
  1346. return u_terminateUChars(dest, destCapacity, destLength, &errorCode);
  1347. }
  1348. /* public API functions */
  1349. U_CAPI int32_t U_EXPORT2
  1350. u_strFoldCase(char16_t *dest, int32_t destCapacity,
  1351. const char16_t *src, int32_t srcLength,
  1352. uint32_t options,
  1353. UErrorCode *pErrorCode) {
  1354. return ustrcase_mapWithOverlap(
  1355. UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL
  1356. dest, destCapacity,
  1357. src, srcLength,
  1358. ustrcase_internalFold, *pErrorCode);
  1359. }
  1360. U_NAMESPACE_BEGIN
  1361. int32_t CaseMap::fold(
  1362. uint32_t options,
  1363. const char16_t *src, int32_t srcLength,
  1364. char16_t *dest, int32_t destCapacity, Edits *edits,
  1365. UErrorCode &errorCode) {
  1366. return ustrcase_map(
  1367. UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL
  1368. dest, destCapacity,
  1369. src, srcLength,
  1370. ustrcase_internalFold, edits, errorCode);
  1371. }
  1372. U_NAMESPACE_END
  1373. /* case-insensitive string comparisons -------------------------------------- */
  1374. /*
  1375. * This function is a copy of unorm_cmpEquivFold() minus the parts for
  1376. * canonical equivalence.
  1377. * Keep the functions in sync, and see there for how this works.
  1378. * The duplication is for modularization:
  1379. * It makes caseless (but not canonical caseless) matches independent of
  1380. * the normalization code.
  1381. */
  /*
   * One saved level of source/decomposition pointers: the start of the
   * buffer, the current read position in it, and its limit.
   */
  struct CmpEquivLevel {
      const char16_t *start;
      const char16_t *s;
      const char16_t *limit;
  };
  typedef struct CmpEquivLevel CmpEquivLevel;
  1387. /**
  1388. * Internal implementation code comparing string with case fold.
  1389. * This function is called from u_strcmpFold() and u_caseInsensitivePrefixMatch().
  1390. *
  1391. * @param s1 input string 1
  1392. * @param length1 length of string 1, or -1 (NUL terminated)
  1393. * @param s2 input string 2
  1394. * @param length2 length of string 2, or -1 (NUL terminated)
  1395. * @param options compare options
  1396. * @param matchLen1 (output) length of partial prefix match in s1
  1397. * @param matchLen2 (output) length of partial prefix match in s2
  1398. * @param pErrorCode receives error status
  1399. * @return The result of comparison
  1400. */
  1401. static int32_t _cmpFold(
  1402. const char16_t *s1, int32_t length1,
  1403. const char16_t *s2, int32_t length2,
  1404. uint32_t options,
  1405. int32_t *matchLen1, int32_t *matchLen2,
  1406. UErrorCode *pErrorCode) {
  1407. int32_t cmpRes = 0;
  1408. /* current-level start/limit - s1/s2 as current */
  1409. const char16_t *start1, *start2, *limit1, *limit2;
  1410. /* points to the original start address */
  1411. const char16_t *org1, *org2;
  1412. /* points to the end of match + 1 */
  1413. const char16_t *m1, *m2;
  1414. /* case folding variables */
  1415. const char16_t *p;
  1416. int32_t length;
  1417. /* stacks of previous-level start/current/limit */
  1418. CmpEquivLevel stack1[2], stack2[2];
  1419. /* case folding buffers, only use current-level start/limit */
  1420. char16_t fold1[UCASE_MAX_STRING_LENGTH+1], fold2[UCASE_MAX_STRING_LENGTH+1];
  1421. /* track which is the current level per string */
  1422. int32_t level1, level2;
  1423. /* current code units, and code points for lookups */
  1424. UChar32 c1, c2, cp1, cp2;
  1425. /* no argument error checking because this itself is not an API */
  1426. /*
  1427. * assume that at least the option U_COMPARE_IGNORE_CASE is set
  1428. * otherwise this function would have to behave exactly as uprv_strCompare()
  1429. */
  1430. if(U_FAILURE(*pErrorCode)) {
  1431. return 0;
  1432. }
  1433. /* initialize */
  1434. if(matchLen1) {
  1435. U_ASSERT(matchLen2 !=nullptr);
  1436. *matchLen1=0;
  1437. *matchLen2=0;
  1438. }
  1439. start1=m1=org1=s1;
  1440. if(length1==-1) {
  1441. limit1=nullptr;
  1442. } else {
  1443. limit1=s1+length1;
  1444. }
  1445. start2=m2=org2=s2;
  1446. if(length2==-1) {
  1447. limit2=nullptr;
  1448. } else {
  1449. limit2=s2+length2;
  1450. }
  1451. level1=level2=0;
  1452. c1=c2=-1;
  1453. /* comparison loop */
  1454. for(;;) {
  1455. /*
  1456. * here a code unit value of -1 means "get another code unit"
  1457. * below it will mean "this source is finished"
  1458. */
  1459. if(c1<0) {
  1460. /* get next code unit from string 1, post-increment */
  1461. for(;;) {
  1462. if(s1==limit1 || ((c1=*s1)==0 && (limit1==nullptr || (options&_STRNCMP_STYLE)))) {
  1463. if(level1==0) {
  1464. c1=-1;
  1465. break;
  1466. }
  1467. } else {
  1468. ++s1;
  1469. break;
  1470. }
  1471. /* reached end of level buffer, pop one level */
  1472. do {
  1473. --level1;
  1474. start1=stack1[level1].start; /*Not uninitialized*/
  1475. } while(start1==nullptr);
  1476. s1=stack1[level1].s; /*Not uninitialized*/
  1477. limit1=stack1[level1].limit; /*Not uninitialized*/
  1478. }
  1479. }
  1480. if(c2<0) {
  1481. /* get next code unit from string 2, post-increment */
  1482. for(;;) {
  1483. if(s2==limit2 || ((c2=*s2)==0 && (limit2==nullptr || (options&_STRNCMP_STYLE)))) {
  1484. if(level2==0) {
  1485. c2=-1;
  1486. break;
  1487. }
  1488. } else {
  1489. ++s2;
  1490. break;
  1491. }
  1492. /* reached end of level buffer, pop one level */
  1493. do {
  1494. --level2;
  1495. start2=stack2[level2].start; /*Not uninitialized*/
  1496. } while(start2==nullptr);
  1497. s2=stack2[level2].s; /*Not uninitialized*/
  1498. limit2=stack2[level2].limit; /*Not uninitialized*/
  1499. }
  1500. }
  1501. /*
  1502. * compare c1 and c2
  1503. * either variable c1, c2 is -1 only if the corresponding string is finished
  1504. */
  1505. if(c1==c2) {
  1506. const char16_t *next1, *next2;
  1507. if(c1<0) {
  1508. cmpRes=0; /* c1==c2==-1 indicating end of strings */
  1509. break;
  1510. }
  1511. /*
  1512. * Note: Move the match positions in both strings at the same time
  1513. * only when corresponding code point(s) in the original strings
  1514. * are fully consumed. For example, when comparing s1="Fust" and
  1515. * s2="Fu\u00dfball", s2[2] is folded into "ss", and s1[2] matches
  1516. * the first code point in the case-folded data. But the second "s"
  1517. * has no matching code point in s1, so this implementation returns
  1518. * 2 as the prefix match length ("Fu").
  1519. */
  1520. next1=next2=nullptr;
  1521. if(level1==0) {
  1522. next1=s1;
  1523. } else if(s1==limit1) {
  1524. /* Note: This implementation only use a single level of stack.
  1525. * If this code needs to be changed to use multiple levels
  1526. * of stacks, the code above should check if the current
  1527. * code is at the end of all stacks.
  1528. */
  1529. U_ASSERT(level1==1);
  1530. /* is s1 at the end of the current stack? */
  1531. next1=stack1[0].s;
  1532. }
  1533. if (next1!=nullptr) {
  1534. if(level2==0) {
  1535. next2=s2;
  1536. } else if(s2==limit2) {
  1537. U_ASSERT(level2==1);
  1538. /* is s2 at the end of the current stack? */
  1539. next2=stack2[0].s;
  1540. }
  1541. if(next2!=nullptr) {
  1542. m1=next1;
  1543. m2=next2;
  1544. }
  1545. }
  1546. c1=c2=-1; /* make us fetch new code units */
  1547. continue;
  1548. } else if(c1<0) {
  1549. cmpRes=-1; /* string 1 ends before string 2 */
  1550. break;
  1551. } else if(c2<0) {
  1552. cmpRes=1; /* string 2 ends before string 1 */
  1553. break;
  1554. }
  1555. /* c1!=c2 && c1>=0 && c2>=0 */
  1556. /* get complete code points for c1, c2 for lookups if either is a surrogate */
  1557. cp1=c1;
  1558. if(U_IS_SURROGATE(c1)) {
  1559. char16_t c;
  1560. if(U_IS_SURROGATE_LEAD(c1)) {
  1561. if(s1!=limit1 && U16_IS_TRAIL(c=*s1)) {
  1562. /* advance ++s1; only below if cp1 decomposes/case-folds */
  1563. cp1=U16_GET_SUPPLEMENTARY(c1, c);
  1564. }
  1565. } else /* isTrail(c1) */ {
  1566. if(start1<=(s1-2) && U16_IS_LEAD(c=*(s1-2))) {
  1567. cp1=U16_GET_SUPPLEMENTARY(c, c1);
  1568. }
  1569. }
  1570. }
  1571. cp2=c2;
  1572. if(U_IS_SURROGATE(c2)) {
  1573. char16_t c;
  1574. if(U_IS_SURROGATE_LEAD(c2)) {
  1575. if(s2!=limit2 && U16_IS_TRAIL(c=*s2)) {
  1576. /* advance ++s2; only below if cp2 decomposes/case-folds */
  1577. cp2=U16_GET_SUPPLEMENTARY(c2, c);
  1578. }
  1579. } else /* isTrail(c2) */ {
  1580. if(start2<=(s2-2) && U16_IS_LEAD(c=*(s2-2))) {
  1581. cp2=U16_GET_SUPPLEMENTARY(c, c2);
  1582. }
  1583. }
  1584. }
  1585. /*
  1586. * go down one level for each string
  1587. * continue with the main loop as soon as there is a real change
  1588. */
  1589. if( level1==0 &&
  1590. (length = ucase_toFullFolding(cp1, &p, options)) >= 0
  1591. ) {
  1592. /* cp1 case-folds to the code point "length" or to p[length] */
  1593. if(U_IS_SURROGATE(c1)) {
  1594. if(U_IS_SURROGATE_LEAD(c1)) {
  1595. /* advance beyond source surrogate pair if it case-folds */
  1596. ++s1;
  1597. } else /* isTrail(c1) */ {
  1598. /*
  1599. * we got a supplementary code point when hitting its trail surrogate,
  1600. * therefore the lead surrogate must have been the same as in the other string;
  1601. * compare this decomposition with the lead surrogate in the other string
  1602. * remember that this simulates bulk text replacement:
  1603. * the decomposition would replace the entire code point
  1604. */
  1605. --s2;
  1606. --m2;
  1607. c2=*(s2-1);
  1608. }
  1609. }
  1610. /* push current level pointers */
  1611. stack1[0].start=start1;
  1612. stack1[0].s=s1;
  1613. stack1[0].limit=limit1;
  1614. ++level1;
  1615. /* copy the folding result to fold1[] */
  1616. if(length<=UCASE_MAX_STRING_LENGTH) {
  1617. u_memcpy(fold1, p, length);
  1618. } else {
  1619. int32_t i=0;
  1620. U16_APPEND_UNSAFE(fold1, i, length);
  1621. length=i;
  1622. }
  1623. /* set next level pointers to case folding */
  1624. start1=s1=fold1;
  1625. limit1=fold1+length;
  1626. /* get ready to read from decomposition, continue with loop */
  1627. c1=-1;
  1628. continue;
  1629. }
  1630. if( level2==0 &&
  1631. (length = ucase_toFullFolding(cp2, &p, options)) >= 0
  1632. ) {
  1633. /* cp2 case-folds to the code point "length" or to p[length] */
  1634. if(U_IS_SURROGATE(c2)) {
  1635. if(U_IS_SURROGATE_LEAD(c2)) {
  1636. /* advance beyond source surrogate pair if it case-folds */
  1637. ++s2;
  1638. } else /* isTrail(c2) */ {
  1639. /*
  1640. * we got a supplementary code point when hitting its trail surrogate,
  1641. * therefore the lead surrogate must have been the same as in the other string;
  1642. * compare this decomposition with the lead surrogate in the other string
  1643. * remember that this simulates bulk text replacement:
  1644. * the decomposition would replace the entire code point
  1645. */
  1646. --s1;
  1647. --m2;
  1648. c1=*(s1-1);
  1649. }
  1650. }
  1651. /* push current level pointers */
  1652. stack2[0].start=start2;
  1653. stack2[0].s=s2;
  1654. stack2[0].limit=limit2;
  1655. ++level2;
  1656. /* copy the folding result to fold2[] */
  1657. if(length<=UCASE_MAX_STRING_LENGTH) {
  1658. u_memcpy(fold2, p, length);
  1659. } else {
  1660. int32_t i=0;
  1661. U16_APPEND_UNSAFE(fold2, i, length);
  1662. length=i;
  1663. }
  1664. /* set next level pointers to case folding */
  1665. start2=s2=fold2;
  1666. limit2=fold2+length;
  1667. /* get ready to read from decomposition, continue with loop */
  1668. c2=-1;
  1669. continue;
  1670. }
  1671. /*
  1672. * no decomposition/case folding, max level for both sides:
  1673. * return difference result
  1674. *
  1675. * code point order comparison must not just return cp1-cp2
  1676. * because when single surrogates are present then the surrogate pairs
  1677. * that formed cp1 and cp2 may be from different string indexes
  1678. *
  1679. * example: { d800 d800 dc01 } vs. { d800 dc00 }, compare at second code units
  1680. * c1=d800 cp1=10001 c2=dc00 cp2=10000
  1681. * cp1-cp2>0 but c1-c2<0 and in fact in UTF-32 it is { d800 10001 } < { 10000 }
  1682. *
  1683. * therefore, use same fix-up as in ustring.c/uprv_strCompare()
  1684. * except: uprv_strCompare() fetches c=*s while this functions fetches c=*s++
  1685. * so we have slightly different pointer/start/limit comparisons here
  1686. */
  1687. if(c1>=0xd800 && c2>=0xd800 && (options&U_COMPARE_CODE_POINT_ORDER)) {
  1688. /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
  1689. if(
  1690. (c1<=0xdbff && s1!=limit1 && U16_IS_TRAIL(*s1)) ||
  1691. (U16_IS_TRAIL(c1) && start1!=(s1-1) && U16_IS_LEAD(*(s1-2)))
  1692. ) {
  1693. /* part of a surrogate pair, leave >=d800 */
  1694. } else {
  1695. /* BMP code point - may be surrogate code point - make <d800 */
  1696. c1-=0x2800;
  1697. }
  1698. if(
  1699. (c2<=0xdbff && s2!=limit2 && U16_IS_TRAIL(*s2)) ||
  1700. (U16_IS_TRAIL(c2) && start2!=(s2-1) && U16_IS_LEAD(*(s2-2)))
  1701. ) {
  1702. /* part of a surrogate pair, leave >=d800 */
  1703. } else {
  1704. /* BMP code point - may be surrogate code point - make <d800 */
  1705. c2-=0x2800;
  1706. }
  1707. }
  1708. cmpRes=c1-c2;
  1709. break;
  1710. }
  1711. if(matchLen1) {
  1712. *matchLen1=static_cast<int32_t>(m1-org1);
  1713. *matchLen2=static_cast<int32_t>(m2-org2);
  1714. }
  1715. return cmpRes;
  1716. }
  1717. /* internal function */
  1718. U_CFUNC int32_t
  1719. u_strcmpFold(const char16_t *s1, int32_t length1,
  1720. const char16_t *s2, int32_t length2,
  1721. uint32_t options,
  1722. UErrorCode *pErrorCode) {
  1723. return _cmpFold(s1, length1, s2, length2, options, nullptr, nullptr, pErrorCode);
  1724. }
  1725. /* public API functions */
  1726. U_CAPI int32_t U_EXPORT2
  1727. u_strCaseCompare(const char16_t *s1, int32_t length1,
  1728. const char16_t *s2, int32_t length2,
  1729. uint32_t options,
  1730. UErrorCode *pErrorCode) {
  1731. /* argument checking */
  1732. if (pErrorCode == nullptr || U_FAILURE(*pErrorCode)) {
  1733. return 0;
  1734. }
  1735. if(s1==nullptr || length1<-1 || s2==nullptr || length2<-1) {
  1736. *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
  1737. return 0;
  1738. }
  1739. return u_strcmpFold(s1, length1, s2, length2,
  1740. options|U_COMPARE_IGNORE_CASE,
  1741. pErrorCode);
  1742. }
  1743. U_CAPI int32_t U_EXPORT2
  1744. u_strcasecmp(const char16_t *s1, const char16_t *s2, uint32_t options) {
  1745. UErrorCode errorCode=U_ZERO_ERROR;
  1746. return u_strcmpFold(s1, -1, s2, -1,
  1747. options|U_COMPARE_IGNORE_CASE,
  1748. &errorCode);
  1749. }
  1750. U_CAPI int32_t U_EXPORT2
  1751. u_memcasecmp(const char16_t *s1, const char16_t *s2, int32_t length, uint32_t options) {
  1752. UErrorCode errorCode=U_ZERO_ERROR;
  1753. return u_strcmpFold(s1, length, s2, length,
  1754. options|U_COMPARE_IGNORE_CASE,
  1755. &errorCode);
  1756. }
  1757. U_CAPI int32_t U_EXPORT2
  1758. u_strncasecmp(const char16_t *s1, const char16_t *s2, int32_t n, uint32_t options) {
  1759. UErrorCode errorCode=U_ZERO_ERROR;
  1760. return u_strcmpFold(s1, n, s2, n,
  1761. options|(U_COMPARE_IGNORE_CASE|_STRNCMP_STYLE),
  1762. &errorCode);
  1763. }
  1764. /* internal API - detect length of shared prefix */
  1765. U_CAPI void
  1766. u_caseInsensitivePrefixMatch(const char16_t *s1, int32_t length1,
  1767. const char16_t *s2, int32_t length2,
  1768. uint32_t options,
  1769. int32_t *matchLen1, int32_t *matchLen2,
  1770. UErrorCode *pErrorCode) {
  1771. _cmpFold(s1, length1, s2, length2, options,
  1772. matchLen1, matchLen2, pErrorCode);
  1773. }