ucasemap.cpp 36 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. *******************************************************************************
  5. *
  6. * Copyright (C) 2005-2016, International Business Machines
  7. * Corporation and others. All Rights Reserved.
  8. *
  9. *******************************************************************************
  10. * file name: ucasemap.cpp
  11. * encoding: UTF-8
  12. * tab size: 8 (not used)
  13. * indentation:4
  14. *
  15. * created on: 2005may06
  16. * created by: Markus W. Scherer
  17. *
  18. * Case mapping service object and functions using it.
  19. */
  20. #include "unicode/utypes.h"
  21. #include "unicode/brkiter.h"
  22. #include "unicode/bytestream.h"
  23. #include "unicode/casemap.h"
  24. #include "unicode/edits.h"
  25. #include "unicode/stringoptions.h"
  26. #include "unicode/stringpiece.h"
  27. #include "unicode/ubrk.h"
  28. #include "unicode/uloc.h"
  29. #include "unicode/ustring.h"
  30. #include "unicode/ucasemap.h"
  31. #if !UCONFIG_NO_BREAK_ITERATION
  32. #include "unicode/utext.h"
  33. #endif
  34. #include "unicode/utf.h"
  35. #include "unicode/utf8.h"
  36. #include "unicode/utf16.h"
  37. #include "bytesinkutil.h"
  38. #include "cmemory.h"
  39. #include "cstring.h"
  40. #include "uassert.h"
  41. #include "ucase.h"
  42. #include "ucasemap_imp.h"
  43. #include "ustr_imp.h"
  44. U_NAMESPACE_USE
  45. /* UCaseMap service object -------------------------------------------------- */
/**
 * Constructs a case map with the given option bits and locale.
 * When break iteration is compiled in, the break iterator member starts out null.
 * caseLocale starts as UCASE_LOC_UNKNOWN and is then assigned by ucasemap_setLocale(),
 * which returns without touching the object if *pErrorCode already indicates a failure.
 *
 * @param localeID   locale ID, passed through to ucasemap_setLocale()
 * @param opts       option bits stored in the options member
 * @param pErrorCode in/out ICU error code, passed through to ucasemap_setLocale()
 */
UCaseMap::UCaseMap(const char *localeID, uint32_t opts, UErrorCode *pErrorCode) :
#if !UCONFIG_NO_BREAK_ITERATION
        iter(nullptr),
#endif
        caseLocale(UCASE_LOC_UNKNOWN), options(opts) {
    ucasemap_setLocale(this, localeID, pErrorCode);
}
/**
 * Destroys the case map.
 * When break iteration is compiled in, this also deletes the break iterator member.
 */
UCaseMap::~UCaseMap() {
#if !UCONFIG_NO_BREAK_ITERATION
    delete iter;
#endif
}
  58. U_CAPI UCaseMap * U_EXPORT2
  59. ucasemap_open(const char *locale, uint32_t options, UErrorCode *pErrorCode) {
  60. if(U_FAILURE(*pErrorCode)) {
  61. return nullptr;
  62. }
  63. UCaseMap *csm = new UCaseMap(locale, options, pErrorCode);
  64. if(csm==nullptr) {
  65. *pErrorCode = U_MEMORY_ALLOCATION_ERROR;
  66. return nullptr;
  67. } else if (U_FAILURE(*pErrorCode)) {
  68. delete csm;
  69. return nullptr;
  70. }
  71. return csm;
  72. }
/**
 * Deletes a UCaseMap.
 *
 * @param csm the object to delete; it is passed straight to delete
 */
U_CAPI void U_EXPORT2
ucasemap_close(UCaseMap *csm) {
    delete csm;
}
/**
 * Returns the locale ID buffer stored in the case map.
 * The pointer refers to storage inside *csm; csm is dereferenced without a null check.
 *
 * @param csm the case map to query
 * @return csm->locale
 */
U_CAPI const char * U_EXPORT2
ucasemap_getLocale(const UCaseMap *csm) {
    return csm->locale;
}
/**
 * Returns the option bits stored in the case map.
 * csm is dereferenced without a null check.
 *
 * @param csm the case map to query
 * @return csm->options
 */
U_CAPI uint32_t U_EXPORT2
ucasemap_getOptions(const UCaseMap *csm) {
    return csm->options;
}
  85. U_CAPI void U_EXPORT2
  86. ucasemap_setLocale(UCaseMap *csm, const char *locale, UErrorCode *pErrorCode) {
  87. if(U_FAILURE(*pErrorCode)) {
  88. return;
  89. }
  90. if (locale != nullptr && *locale == 0) {
  91. csm->locale[0] = 0;
  92. csm->caseLocale = UCASE_LOC_ROOT;
  93. return;
  94. }
  95. int32_t length=uloc_getName(locale, csm->locale, (int32_t)sizeof(csm->locale), pErrorCode);
  96. if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR || length==sizeof(csm->locale)) {
  97. *pErrorCode=U_ZERO_ERROR;
  98. /* we only really need the language code for case mappings */
  99. length=uloc_getLanguage(locale, csm->locale, (int32_t)sizeof(csm->locale), pErrorCode);
  100. }
  101. if(length==sizeof(csm->locale)) {
  102. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  103. }
  104. if(U_SUCCESS(*pErrorCode)) {
  105. csm->caseLocale = ucase_getCaseLocale(csm->locale);
  106. } else {
  107. csm->locale[0]=0;
  108. csm->caseLocale = UCASE_LOC_ROOT;
  109. }
  110. }
  111. U_CAPI void U_EXPORT2
  112. ucasemap_setOptions(UCaseMap *csm, uint32_t options, UErrorCode *pErrorCode) {
  113. if(U_FAILURE(*pErrorCode)) {
  114. return;
  115. }
  116. csm->options=options;
  117. }
  118. /* UTF-8 string case mappings ----------------------------------------------- */
  119. /* TODO(markus): Move to a new, separate utf8case.cpp file. */
  120. namespace {
  121. /* append a full case mapping result, see UCASE_MAX_STRING_LENGTH */
  122. inline UBool
  123. appendResult(int32_t cpLength, int32_t result, const char16_t *s,
  124. ByteSink &sink, uint32_t options, icu::Edits *edits, UErrorCode &errorCode) {
  125. U_ASSERT(U_SUCCESS(errorCode));
  126. /* decode the result */
  127. if(result<0) {
  128. /* (not) original code point */
  129. if(edits!=nullptr) {
  130. edits->addUnchanged(cpLength);
  131. }
  132. if((options & U_OMIT_UNCHANGED_TEXT) == 0) {
  133. ByteSinkUtil::appendCodePoint(cpLength, ~result, sink);
  134. }
  135. } else {
  136. if(result<=UCASE_MAX_STRING_LENGTH) {
  137. // string: "result" is the UTF-16 length
  138. return ByteSinkUtil::appendChange(cpLength, s, result, sink, edits, errorCode);
  139. } else {
  140. ByteSinkUtil::appendCodePoint(cpLength, result, sink, edits);
  141. }
  142. }
  143. return true;
  144. }
  145. // See unicode/utf8.h U8_APPEND_UNSAFE().
  146. inline uint8_t getTwoByteLead(UChar32 c) { return (uint8_t)((c >> 6) | 0xc0); }
  147. inline uint8_t getTwoByteTrail(UChar32 c) { return (uint8_t)((c & 0x3f) | 0x80); }
  148. UChar32 U_CALLCONV
  149. utf8_caseContextIterator(void *context, int8_t dir) {
  150. UCaseContext *csc=(UCaseContext *)context;
  151. UChar32 c;
  152. if(dir<0) {
  153. /* reset for backward iteration */
  154. csc->index=csc->cpStart;
  155. csc->dir=dir;
  156. } else if(dir>0) {
  157. /* reset for forward iteration */
  158. csc->index=csc->cpLimit;
  159. csc->dir=dir;
  160. } else {
  161. /* continue current iteration direction */
  162. dir=csc->dir;
  163. }
  164. if(dir<0) {
  165. if(csc->start<csc->index) {
  166. U8_PREV((const uint8_t *)csc->p, csc->start, csc->index, c);
  167. return c;
  168. }
  169. } else {
  170. if(csc->index<csc->limit) {
  171. U8_NEXT((const uint8_t *)csc->p, csc->index, csc->limit, c);
  172. return c;
  173. }
  174. }
  175. return U_SENTINEL;
  176. }
/**
 * caseLocale >= 0: Lowercases [srcStart..srcLimit[ but takes context [0..srcLength[ into account.
 * caseLocale < 0: Case-folds [srcStart..srcLimit[.
 *
 * Unchanged text is not copied byte by byte: prev marks the start of the pending
 * unchanged run, which is flushed with ByteSinkUtil::appendUnchanged() right before
 * each changed code point is written, and once more at the end.
 *
 * @param caseLocale case locale constant, or a negative value for case folding
 * @param options    option bits, passed to the ByteSinkUtil helpers and
 *                   (when folding) to ucase_toFullFolding()
 * @param src        UTF-8 source text
 * @param csc        case context for ucase_toFullLower(); only used when caseLocale >= 0
 * @param srcStart   start index of the range to map
 * @param srcLimit   limit index of the range to map
 * @param sink       output sink
 * @param edits      optional edits recorder, may be nullptr
 * @param errorCode  ICU error code; the loop stops once it indicates a failure
 */
void toLower(int32_t caseLocale, uint32_t options,
             const uint8_t *src, UCaseContext *csc, int32_t srcStart, int32_t srcLimit,
             icu::ByteSink &sink, icu::Edits *edits, UErrorCode &errorCode) {
    // Select the Latin fast-path delta table: the normal one for root, for lowercasing
    // in locales other than Turkish/Lithuanian, and for default case folding;
    // otherwise the Turkish/Lithuanian one.
    const int8_t *latinToLower;
    if (caseLocale == UCASE_LOC_ROOT ||
            (caseLocale >= 0 ?
                !(caseLocale == UCASE_LOC_TURKISH || caseLocale == UCASE_LOC_LITHUANIAN) :
                (options & _FOLD_CASE_OPTIONS_MASK) == U_FOLD_CASE_DEFAULT)) {
        latinToLower = LatinCase::TO_LOWER_NORMAL;
    } else {
        latinToLower = LatinCase::TO_LOWER_TR_LT;
    }
    const UTrie2 *trie = ucase_getTrie();
    int32_t prev = srcStart;  // start of the unchanged run that has not been written yet
    int32_t srcIndex = srcStart;
    for (;;) {
        // fast path for simple cases
        int32_t cpStart;
        UChar32 c;
        // The inner loop ends with c<0 at the end of the input (or on error),
        // or with c>=0 and cpStart set when the code point needs the slow path.
        for (;;) {
            if (U_FAILURE(errorCode) || srcIndex >= srcLimit) {
                c = U_SENTINEL;
                break;
            }
            uint8_t lead = src[srcIndex++];
            if (lead <= 0x7f) {
                // ASCII: table lookup yields a delta, 0 for no change, or EXC for the slow path.
                int8_t d = latinToLower[lead];
                if (d == LatinCase::EXC) {
                    cpStart = srcIndex - 1;
                    c = lead;
                    break;
                }
                if (d == 0) { continue; }
                ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 1 - prev,
                                              sink, options, edits, errorCode);
                char ascii = (char)(lead + d);
                sink.Append(&ascii, 1);
                if (edits != nullptr) {
                    edits->addReplace(1, 1);
                }
                prev = srcIndex;
                continue;
            } else if (lead < 0xe3) {
                uint8_t t;
                // Two-byte sequence with lead byte C2..C5 and a valid trail byte.
                if (0xc2 <= lead && lead <= 0xc5 && srcIndex < srcLimit &&
                        (t = src[srcIndex] - 0x80) <= 0x3f) {
                    // U+0080..U+017F
                    ++srcIndex;
                    c = ((lead - 0xc0) << 6) | t;
                    int8_t d = latinToLower[c];
                    if (d == LatinCase::EXC) {
                        cpStart = srcIndex - 2;
                        break;
                    }
                    if (d == 0) { continue; }
                    ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 2 - prev,
                                                  sink, options, edits, errorCode);
                    ByteSinkUtil::appendTwoBytes(c + d, sink);
                    if (edits != nullptr) {
                        edits->addReplace(2, 2);
                    }
                    prev = srcIndex;
                    continue;
                }
                // Otherwise fall through to the generic decoding below.
            } else if ((lead <= 0xe9 || lead == 0xeb || lead == 0xec) &&
                    (srcIndex + 2) <= srcLimit &&
                    U8_IS_TRAIL(src[srcIndex]) && U8_IS_TRAIL(src[srcIndex + 1])) {
                // most of CJK: no case mappings
                srcIndex += 2;
                continue;
            }
            // Generic path: step back to the lead byte and decode the whole code point.
            cpStart = --srcIndex;
            U8_NEXT(src, srcIndex, srcLimit, c);
            if (c < 0) {
                // ill-formed UTF-8
                // (stays part of the pending unchanged run)
                continue;
            }
            uint16_t props = UTRIE2_GET16(trie, c);
            if (UCASE_HAS_EXCEPTION(props)) { break; }
            int32_t delta;
            if (!UCASE_IS_UPPER_OR_TITLE(props) || (delta = UCASE_GET_DELTA(props)) == 0) {
                continue;
            }
            // Simple mapping: add the delta from the properties.
            ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,
                                          sink, options, edits, errorCode);
            ByteSinkUtil::appendCodePoint(srcIndex - cpStart, c + delta, sink, edits);
            prev = srcIndex;
        }
        if (c < 0) {
            break;
        }
        // slow path
        const char16_t *s;
        if (caseLocale >= 0) {
            // Tell the context iterator where the current code point is.
            csc->cpStart = cpStart;
            csc->cpLimit = srcIndex;
            c = ucase_toFullLower(c, utf8_caseContextIterator, csc, &s, caseLocale);
        } else {
            c = ucase_toFullFolding(c, &s, options);
        }
        if (c >= 0) {
            // A non-negative result is a change; a negative result leaves the
            // code point in the pending unchanged run.
            ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,
                                          sink, options, edits, errorCode);
            appendResult(srcIndex - cpStart, c, s, sink, options, edits, errorCode);
            prev = srcIndex;
        }
    }
    // Flush the trailing unchanged run.
    ByteSinkUtil::appendUnchanged(src + prev, srcIndex - prev,
                                  sink, options, edits, errorCode);
}
/**
 * Uppercases the UTF-8 text [0..srcLength[ and writes the result to the sink.
 *
 * Unchanged text is not copied byte by byte: prev marks the start of the pending
 * unchanged run, which is flushed with ByteSinkUtil::appendUnchanged() right before
 * each changed code point is written, and once more at the end.
 *
 * @param caseLocale case locale constant; UCASE_LOC_TURKISH selects a different
 *                   Latin fast-path table, and the value is passed to ucase_toFullUpper()
 * @param options    option bits, passed to the ByteSinkUtil helpers
 * @param src        UTF-8 source text
 * @param csc        case context used for ucase_toFullUpper()
 * @param srcLength  length of src in bytes
 * @param sink       output sink
 * @param edits      optional edits recorder, may be nullptr
 * @param errorCode  ICU error code; the loop stops once it indicates a failure
 */
void toUpper(int32_t caseLocale, uint32_t options,
             const uint8_t *src, UCaseContext *csc, int32_t srcLength,
             icu::ByteSink &sink, icu::Edits *edits, UErrorCode &errorCode) {
    const int8_t *latinToUpper;
    if (caseLocale == UCASE_LOC_TURKISH) {
        latinToUpper = LatinCase::TO_UPPER_TR;
    } else {
        latinToUpper = LatinCase::TO_UPPER_NORMAL;
    }
    const UTrie2 *trie = ucase_getTrie();
    int32_t prev = 0;  // start of the unchanged run that has not been written yet
    int32_t srcIndex = 0;
    for (;;) {
        // fast path for simple cases
        int32_t cpStart;
        UChar32 c;
        // The inner loop ends with c<0 at the end of the input (or on error),
        // or with c>=0 and cpStart set when the code point needs the slow path.
        for (;;) {
            if (U_FAILURE(errorCode) || srcIndex >= srcLength) {
                c = U_SENTINEL;
                break;
            }
            uint8_t lead = src[srcIndex++];
            if (lead <= 0x7f) {
                // ASCII: table lookup yields a delta, 0 for no change, or EXC for the slow path.
                int8_t d = latinToUpper[lead];
                if (d == LatinCase::EXC) {
                    cpStart = srcIndex - 1;
                    c = lead;
                    break;
                }
                if (d == 0) { continue; }
                ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 1 - prev,
                                              sink, options, edits, errorCode);
                char ascii = (char)(lead + d);
                sink.Append(&ascii, 1);
                if (edits != nullptr) {
                    edits->addReplace(1, 1);
                }
                prev = srcIndex;
                continue;
            } else if (lead < 0xe3) {
                uint8_t t;
                // Two-byte sequence with lead byte C2..C5 and a valid trail byte.
                if (0xc2 <= lead && lead <= 0xc5 && srcIndex < srcLength &&
                        (t = src[srcIndex] - 0x80) <= 0x3f) {
                    // U+0080..U+017F
                    ++srcIndex;
                    c = ((lead - 0xc0) << 6) | t;
                    int8_t d = latinToUpper[c];
                    if (d == LatinCase::EXC) {
                        cpStart = srcIndex - 2;
                        break;
                    }
                    if (d == 0) { continue; }
                    ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 2 - prev,
                                                  sink, options, edits, errorCode);
                    ByteSinkUtil::appendTwoBytes(c + d, sink);
                    if (edits != nullptr) {
                        edits->addReplace(2, 2);
                    }
                    prev = srcIndex;
                    continue;
                }
                // Otherwise fall through to the generic decoding below.
            } else if ((lead <= 0xe9 || lead == 0xeb || lead == 0xec) &&
                    (srcIndex + 2) <= srcLength &&
                    U8_IS_TRAIL(src[srcIndex]) && U8_IS_TRAIL(src[srcIndex + 1])) {
                // most of CJK: no case mappings
                srcIndex += 2;
                continue;
            }
            // Generic path: step back to the lead byte and decode the whole code point.
            cpStart = --srcIndex;
            U8_NEXT(src, srcIndex, srcLength, c);
            if (c < 0) {
                // ill-formed UTF-8
                // (stays part of the pending unchanged run)
                continue;
            }
            uint16_t props = UTRIE2_GET16(trie, c);
            if (UCASE_HAS_EXCEPTION(props)) { break; }
            int32_t delta;
            if (UCASE_GET_TYPE(props) != UCASE_LOWER || (delta = UCASE_GET_DELTA(props)) == 0) {
                continue;
            }
            // Simple mapping: add the delta from the properties.
            ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,
                                          sink, options, edits, errorCode);
            ByteSinkUtil::appendCodePoint(srcIndex - cpStart, c + delta, sink, edits);
            prev = srcIndex;
        }
        if (c < 0) {
            break;
        }
        // slow path
        // Tell the context iterator where the current code point is.
        csc->cpStart = cpStart;
        csc->cpLimit = srcIndex;
        const char16_t *s;
        c = ucase_toFullUpper(c, utf8_caseContextIterator, csc, &s, caseLocale);
        if (c >= 0) {
            // A non-negative result is a change; a negative result leaves the
            // code point in the pending unchanged run.
            ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,
                                          sink, options, edits, errorCode);
            appendResult(srcIndex - cpStart, c, s, sink, options, edits, errorCode);
            prev = srcIndex;
        }
    }
    // Flush the trailing unchanged run.
    ByteSinkUtil::appendUnchanged(src + prev, srcIndex - prev,
                                  sink, options, edits, errorCode);
}
  394. } // namespace
  395. #if !UCONFIG_NO_BREAK_ITERATION
  396. namespace {
// The two UTF-8 bytes of U+0301, used for byte-wise matching
// of a decomposed acute accent.
constexpr uint8_t ACUTE_BYTE0 = u8"\u0301"[0];
constexpr uint8_t ACUTE_BYTE1 = u8"\u0301"[1];
/**
 * Input: c is a letter I with or without acute accent.
 * start is the index in src after c, and is less than segmentLimit.
 * If a plain i/I is followed by a plain j/J,
 * or an i/I with acute (precomposed or decomposed) is followed by a j/J with acute,
 * then we output accordingly.
 *
 * Nothing is written to the sink unless the whole pattern matches.
 *
 * @param src          UTF-8 source text
 * @param c            the already-titlecased first letter, u'I' or u'Í'
 * @param start        index in src right after the first letter
 * @param segmentLimit limit index of the current segment
 * @param sink         output sink
 * @param options      option bits, passed to ByteSinkUtil::appendUnchanged()
 * @param edits        optional edits recorder, may be nullptr
 * @param errorCode    ICU error code
 * @return the src index after the titlecased sequence, or the start index if no Dutch IJ
 */
int32_t maybeTitleDutchIJ(const uint8_t *src, UChar32 c, int32_t start, int32_t segmentLimit,
                          ByteSink &sink, uint32_t options, icu::Edits *edits, UErrorCode &errorCode) {
    U_ASSERT(start < segmentLimit);
    int32_t index = start;
    bool withAcute = false;
    // If the conditions are met, then the following variables tell us what to output.
    int32_t unchanged1 = 0;  // bytes before the j, or the whole sequence (0..5)
    bool doTitleJ = false;  // true if the j needs to be titlecased
    int32_t unchanged2 = 0;  // bytes after the j (0 or 2)
    // next character after the first letter
    // (read as a single byte; only ASCII j/J and the first acute byte are of interest)
    UChar32 c2;
    c2 = src[index++];
    // Is the first letter an i/I with accent?
    if (c == u'I') {
        // Plain I: check for a following decomposed acute (two bytes).
        if (c2 == ACUTE_BYTE0 && index < segmentLimit && src[index++] == ACUTE_BYTE1) {
            withAcute = true;
            unchanged1 = 2;  // ACUTE is 2 code units in UTF-8
            if (index == segmentLimit) { return start; }
            c2 = src[index++];
        }
    } else {  // Í
        withAcute = true;
    }
    // Is the next character a j/J?
    if (c2 == u'j') {
        doTitleJ = true;
    } else if (c2 == u'J') {
        ++unchanged1;
    } else {
        return start;
    }
    // A plain i/I must be followed by a plain j/J.
    // An i/I with acute must be followed by a j/J with acute.
    if (withAcute) {
        // Both acute bytes must lie inside the segment; index advances as they are compared.
        if ((index + 1) >= segmentLimit || src[index++] != ACUTE_BYTE0 || src[index++] != ACUTE_BYTE1) {
            return start;
        }
        if (doTitleJ) {
            unchanged2 = 2;  // ACUTE is 2 code units in UTF-8
        } else {
            unchanged1 = unchanged1 + 2;  // ACUTE is 2 code units in UTF-8
        }
    }
    // There must not be another combining mark.
    if (index < segmentLimit) {
        int32_t cp;
        int32_t i = index;
        U8_NEXT(src, i, segmentLimit, cp);
        uint32_t typeMask = U_GET_GC_MASK(cp);
        if ((typeMask & U_GC_M_MASK) != 0) {
            return start;
        }
    }
    // Output the rest of the Dutch IJ.
    ByteSinkUtil::appendUnchanged(src + start, unchanged1, sink, options, edits, errorCode);
    start += unchanged1;
    if (doTitleJ) {
        ByteSinkUtil::appendCodePoint(1, u'J', sink, edits);
        ++start;
    }
    ByteSinkUtil::appendUnchanged(src + start, unchanged2, sink, options, edits, errorCode);
    U_ASSERT(start + unchanged2 == index);
    return index;
}
  472. } // namespace
/**
 * Titlecases UTF-8 text segment by segment and writes the result to the sink.
 * Segment boundaries come from the break iterator: iter->first() for the first
 * segment, then iter->next(). In each segment, the first relevant character is
 * titlecased and the rest is lowercased or copied, depending on the options.
 *
 * @param caseLocale case locale constant; UCASE_LOC_DUTCH enables the IJ special case
 * @param options    option bits; checked with ustrcase_checkTitleAdjustmentOptions(),
 *                   and tested for U_TITLECASE_NO_BREAK_ADJUSTMENT,
 *                   U_TITLECASE_ADJUST_TO_CASED and U_TITLECASE_NO_LOWERCASE
 * @param iter       break iterator; dereferenced without a null check
 *                   (presumably already set to the text by the caller; verify against callers)
 * @param src        UTF-8 source text
 * @param srcLength  length of src in bytes
 * @param sink       output sink
 * @param edits      optional edits recorder, may be nullptr
 * @param errorCode  ICU error code
 */
U_CFUNC void U_CALLCONV
ucasemap_internalUTF8ToTitle(
        int32_t caseLocale, uint32_t options, BreakIterator *iter,
        const uint8_t *src, int32_t srcLength,
        ByteSink &sink, icu::Edits *edits,
        UErrorCode &errorCode) {
    if (!ustrcase_checkTitleAdjustmentOptions(options, errorCode)) {
        return;
    }
    /* set up local variables */
    UCaseContext csc=UCASECONTEXT_INITIALIZER;
    csc.p=(void *)src;
    csc.limit=srcLength;
    int32_t prev=0;
    UBool isFirstIndex=true;
    /* titlecasing loop */
    while(prev<srcLength) {
        /* find next index where to titlecase */
        int32_t index;
        if(isFirstIndex) {
            isFirstIndex=false;
            index=iter->first();
        } else {
            index=iter->next();
        }
        // Treat "no more boundaries" or an out-of-range boundary as the end of the text.
        if(index==UBRK_DONE || index>srcLength) {
            index=srcLength;
        }
        /*
         * Segment [prev..index[ into 3 parts:
         * a) skipped characters (copy as-is) [prev..titleStart[
         * b) first letter (titlecase) [titleStart..titleLimit[
         * c) subsequent characters (lowercase) [titleLimit..index[
         */
        if(prev<index) {
            /* find and copy skipped characters [prev..titleStart[ */
            int32_t titleStart=prev;
            int32_t titleLimit=prev;
            UChar32 c;
            U8_NEXT(src, titleLimit, index, c);
            if ((options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0) {
                // Adjust the titlecasing index to the next cased character,
                // or to the next letter/number/symbol/private use.
                // Stop with titleStart<titleLimit<=index
                // if there is a character to be titlecased,
                // or else stop with titleStart==titleLimit==index.
                UBool toCased = (options&U_TITLECASE_ADJUST_TO_CASED) != 0;
                while (toCased ? UCASE_NONE==ucase_getType(c) : !ustrcase_isLNS(c)) {
                    titleStart=titleLimit;
                    if(titleLimit==index) {
                        break;
                    }
                    U8_NEXT(src, titleLimit, index, c);
                }
                if (prev < titleStart) {
                    if (!ByteSinkUtil::appendUnchanged(src+prev, titleStart-prev,
                                                       sink, options, edits, errorCode)) {
                        return;
                    }
                }
            }
            if(titleStart<titleLimit) {
                /* titlecase c which is from [titleStart..titleLimit[ */
                if(c>=0) {
                    csc.cpStart=titleStart;
                    csc.cpLimit=titleLimit;
                    const char16_t *s;
                    c=ucase_toFullTitle(c, utf8_caseContextIterator, &csc, &s, caseLocale);
                    if (!appendResult(titleLimit-titleStart, c, s, sink, options, edits, errorCode)) {
                        return;
                    }
                } else {
                    // Malformed UTF-8.
                    if (!ByteSinkUtil::appendUnchanged(src+titleStart, titleLimit-titleStart,
                                                       sink, options, edits, errorCode)) {
                        return;
                    }
                }
                /* Special case Dutch IJ titlecasing */
                if (titleLimit < index &&
                        caseLocale == UCASE_LOC_DUTCH) {
                    // c now holds the mapping result (or the negative value from
                    // malformed input); a negative value is complemented before the comparison.
                    if (c < 0) {
                        c = ~c;
                    }
                    if (c == u'I' || c == u'Í') {
                        titleLimit = maybeTitleDutchIJ(src, c, titleLimit, index, sink, options, edits, errorCode);
                    }
                }
                /* lowercase [titleLimit..index[ */
                if(titleLimit<index) {
                    if((options&U_TITLECASE_NO_LOWERCASE)==0) {
                        /* Normal operation: Lowercase the rest of the word. */
                        toLower(caseLocale, options,
                                src, &csc, titleLimit, index,
                                sink, edits, errorCode);
                        if(U_FAILURE(errorCode)) {
                            return;
                        }
                    } else {
                        /* Optionally just copy the rest of the word unchanged. */
                        if (!ByteSinkUtil::appendUnchanged(src+titleLimit, index-titleLimit,
                                                           sink, options, edits, errorCode)) {
                            return;
                        }
                    }
                }
            }
        }
        prev=index;
    }
}
  584. #endif
  585. U_NAMESPACE_BEGIN
  586. namespace GreekUpper {
  587. UBool isFollowedByCasedLetter(const uint8_t *s, int32_t i, int32_t length) {
  588. while (i < length) {
  589. UChar32 c;
  590. U8_NEXT(s, i, length, c);
  591. int32_t type = ucase_getTypeOrIgnorable(c);
  592. if ((type & UCASE_IGNORABLE) != 0) {
  593. // Case-ignorable, continue with the loop.
  594. } else if (type != UCASE_NONE) {
  595. return true; // Followed by cased letter.
  596. } else {
  597. return false; // Uncased and not case-ignorable.
  598. }
  599. }
  600. return false; // Not followed by cased letter.
  601. }
// Keep this consistent with the UTF-16 version in ustrcase.cpp and the Java version in CaseMap.java.
/**
 * Uppercases UTF-8 text according to Greek rules and writes the result to the sink.
 *
 * Each iteration handles one code point plus any Greek diacritics that follow it.
 * Code points for which getLetterData() returns nonzero are mapped via that data;
 * all other well-formed code points go through ucase_toFullUpper() with UCASE_LOC_GREEK,
 * and malformed UTF-8 is passed through as unchanged text.
 *
 * @param options   option bits; U_OMIT_UNCHANGED_TEXT is tested here,
 *                  and the value is passed on to the append helpers
 * @param src       UTF-8 source text
 * @param srcLength length of src in bytes
 * @param sink      output sink
 * @param edits     optional edits recorder, may be nullptr
 * @param errorCode ICU error code
 */
void toUpper(uint32_t options,
             const uint8_t *src, int32_t srcLength,
             ByteSink &sink, Edits *edits,
             UErrorCode &errorCode) {
    // State carried from the previous character: AFTER_CASED / AFTER_VOWEL_WITH_ACCENT bits.
    uint32_t state = 0;
    for (int32_t i = 0; i < srcLength;) {
        int32_t nextIndex = i;
        UChar32 c;
        U8_NEXT(src, nextIndex, srcLength, c);
        uint32_t nextState = 0;
        int32_t type = ucase_getTypeOrIgnorable(c);
        if ((type & UCASE_IGNORABLE) != 0) {
            // c is case-ignorable
            nextState |= (state & AFTER_CASED);
        } else if (type != UCASE_NONE) {
            // c is cased
            nextState |= AFTER_CASED;
        }
        uint32_t data = getLetterData(c);
        if (data > 0) {
            uint32_t upper = data & UPPER_MASK;
            // Add a dialytika to this iota or ypsilon vowel
            // if we removed a tonos from the previous vowel,
            // and that previous vowel did not also have (or gain) a dialytika.
            // Adding one only to the final vowel in a longer sequence
            // (which does not occur in normal writing) would require lookahead.
            // Set the same flag as for preserving an existing dialytika.
            if ((data & HAS_VOWEL) != 0 && (state & AFTER_VOWEL_WITH_ACCENT) != 0 &&
                    (upper == 0x399 || upper == 0x3A5)) {
                data |= HAS_DIALYTIKA;
            }
            int32_t numYpogegrammeni = 0;  // Map each one to a trailing, spacing, capital iota.
            if ((data & HAS_YPOGEGRAMMENI) != 0) {
                numYpogegrammeni = 1;
            }
            // Skip combining diacritics after this Greek letter.
            // nextNextIndex looks ahead by one code point; nextIndex only advances
            // once that code point turns out to be a Greek diacritic.
            int32_t nextNextIndex = nextIndex;
            while (nextIndex < srcLength) {
                UChar32 c2;
                U8_NEXT(src, nextNextIndex, srcLength, c2);
                uint32_t diacriticData = getDiacriticData(c2);
                if (diacriticData != 0) {
                    data |= diacriticData;
                    if ((diacriticData & HAS_YPOGEGRAMMENI) != 0) {
                        ++numYpogegrammeni;
                    }
                    nextIndex = nextNextIndex;
                } else {
                    break;  // not a Greek diacritic
                }
            }
            if ((data & HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA) == HAS_VOWEL_AND_ACCENT) {
                nextState |= AFTER_VOWEL_WITH_ACCENT;
            }
            // Map according to Greek rules.
            UBool addTonos = false;
            if (upper == 0x397 &&
                    (data & HAS_ACCENT) != 0 &&
                    numYpogegrammeni == 0 &&
                    (state & AFTER_CASED) == 0 &&
                    !isFollowedByCasedLetter(src, nextIndex, srcLength)) {
                // Keep disjunctive "or" with (only) a tonos.
                // We use the same "word boundary" conditions as for the Final_Sigma test.
                // NOTE(review): nextIndex was already advanced past c by U8_NEXT above,
                // so i == nextIndex looks like it can never be true here and the
                // decomposed form (addTonos) is always chosen. Confirm the intent
                // against the UTF-16 and Java versions.
                if (i == nextIndex) {
                    upper = 0x389;  // Preserve the precomposed form.
                } else {
                    addTonos = true;
                }
            } else if ((data & HAS_DIALYTIKA) != 0) {
                // Preserve a vowel with dialytika in precomposed form if it exists.
                if (upper == 0x399) {
                    upper = 0x3AA;
                    data &= ~HAS_EITHER_DIALYTIKA;
                } else if (upper == 0x3A5) {
                    upper = 0x3AB;
                    data &= ~HAS_EITHER_DIALYTIKA;
                }
            }
            UBool change;
            if (edits == nullptr && (options & U_OMIT_UNCHANGED_TEXT) == 0) {
                change = true;  // common, simple usage
            } else {
                // Find out first whether we are changing the text.
                // The source bytes are compared against the bytes that would be written:
                // the two-byte uppercase letter, then optionally U+0308, then optionally U+0301.
                U_ASSERT(0x370 <= upper && upper <= 0x3ff);  // 2-byte UTF-8, main Greek block
                change = (i + 2) > nextIndex ||
                        src[i] != getTwoByteLead(upper) || src[i + 1] != getTwoByteTrail(upper) ||
                        numYpogegrammeni > 0;
                int32_t i2 = i + 2;
                if ((data & HAS_EITHER_DIALYTIKA) != 0) {
                    change |= (i2 + 2) > nextIndex ||
                            src[i2] != (uint8_t)u8"\u0308"[0] ||
                            src[i2 + 1] != (uint8_t)u8"\u0308"[1];
                    i2 += 2;
                }
                if (addTonos) {
                    change |= (i2 + 2) > nextIndex ||
                            src[i2] != (uint8_t)u8"\u0301"[0] ||
                            src[i2 + 1] != (uint8_t)u8"\u0301"[1];
                    i2 += 2;
                }
                int32_t oldLength = nextIndex - i;
                int32_t newLength = (i2 - i) + numYpogegrammeni * 2;  // 2 bytes per U+0399
                change |= oldLength != newLength;
                if (change) {
                    if (edits != nullptr) {
                        edits->addReplace(oldLength, newLength);
                    }
                } else {
                    if (edits != nullptr) {
                        edits->addUnchanged(oldLength);
                    }
                    // Write unchanged text?
                    // From here on, change only means "write the output bytes".
                    change = (options & U_OMIT_UNCHANGED_TEXT) == 0;
                }
            }
            if (change) {
                ByteSinkUtil::appendTwoBytes(upper, sink);
                if ((data & HAS_EITHER_DIALYTIKA) != 0) {
                    sink.AppendU8(u8"\u0308", 2);  // restore or add a dialytika
                }
                if (addTonos) {
                    sink.AppendU8(u8"\u0301", 2);
                }
                while (numYpogegrammeni > 0) {
                    sink.AppendU8(u8"\u0399", 2);
                    --numYpogegrammeni;
                }
            }
        } else if(c>=0) {
            // No Greek letter data for c: use the regular full uppercase mapping, without context.
            const char16_t *s;
            c=ucase_toFullUpper(c, nullptr, nullptr, &s, UCASE_LOC_GREEK);
            if (!appendResult(nextIndex - i, c, s, sink, options, edits, errorCode)) {
                return;
            }
        } else {
            // Malformed UTF-8.
            if (!ByteSinkUtil::appendUnchanged(src+i, nextIndex-i,
                                               sink, options, edits, errorCode)) {
                return;
            }
        }
        i = nextIndex;
        state = nextState;
    }
}
  748. } // namespace GreekUpper
  749. U_NAMESPACE_END
/**
 * Case mapper callback that lowercases the whole UTF-8 string [0..srcLength[.
 * It sets up a case context over the full text and delegates to toLower().
 * The break iterator slot of the callback signature is not used.
 *
 * @param caseLocale case locale constant passed to toLower()
 * @param options    option bits passed to toLower()
 * @param src        UTF-8 source text
 * @param srcLength  length of src in bytes
 * @param sink       output sink
 * @param edits      optional edits recorder, may be nullptr
 * @param errorCode  ICU error code
 */
static void U_CALLCONV
ucasemap_internalUTF8ToLower(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
                             const uint8_t *src, int32_t srcLength,
                             icu::ByteSink &sink, icu::Edits *edits,
                             UErrorCode &errorCode) {
    UCaseContext csc=UCASECONTEXT_INITIALIZER;
    csc.p=(void *)src;
    csc.limit=srcLength;
    toLower(
        caseLocale, options,
        src, &csc, 0, srcLength,
        sink, edits, errorCode);
}
  763. static void U_CALLCONV
  764. ucasemap_internalUTF8ToUpper(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
  765. const uint8_t *src, int32_t srcLength,
  766. icu::ByteSink &sink, icu::Edits *edits,
  767. UErrorCode &errorCode) {
  768. if (caseLocale == UCASE_LOC_GREEK) {
  769. GreekUpper::toUpper(options, src, srcLength, sink, edits, errorCode);
  770. } else {
  771. UCaseContext csc=UCASECONTEXT_INITIALIZER;
  772. csc.p=(void *)src;
  773. csc.limit=srcLength;
  774. toUpper(
  775. caseLocale, options,
  776. src, &csc, srcLength,
  777. sink, edits, errorCode);
  778. }
  779. }
/**
 * Case mapper callback that case-folds the whole UTF-8 string [0..srcLength[.
 * It calls toLower() with a negative case locale, which selects case folding;
 * that path does not use a case context, so nullptr is passed for it.
 * The caseLocale and break iterator slots of the callback signature are not used.
 *
 * @param options   option bits passed to toLower()
 * @param src       UTF-8 source text
 * @param srcLength length of src in bytes
 * @param sink      output sink
 * @param edits     optional edits recorder, may be nullptr
 * @param errorCode ICU error code
 */
static void U_CALLCONV
ucasemap_internalUTF8Fold(int32_t /* caseLocale */, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
                          const uint8_t *src, int32_t srcLength,
                          icu::ByteSink &sink, icu::Edits *edits,
                          UErrorCode &errorCode) {
    toLower(
        -1, options,
        src, nullptr, 0, srcLength,
        sink, edits, errorCode);
}
  790. void
  791. ucasemap_mapUTF8(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
  792. const char *src, int32_t srcLength,
  793. UTF8CaseMapper *stringCaseMapper,
  794. icu::ByteSink &sink, icu::Edits *edits,
  795. UErrorCode &errorCode) {
  796. /* check argument values */
  797. if (U_FAILURE(errorCode)) {
  798. return;
  799. }
  800. if ((src == nullptr && srcLength != 0) || srcLength < -1) {
  801. errorCode = U_ILLEGAL_ARGUMENT_ERROR;
  802. return;
  803. }
  804. // Get the string length.
  805. if (srcLength == -1) {
  806. srcLength = (int32_t)uprv_strlen((const char *)src);
  807. }
  808. if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) {
  809. edits->reset();
  810. }
  811. stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR
  812. (const uint8_t *)src, srcLength, sink, edits, errorCode);
  813. sink.Flush();
  814. if (U_SUCCESS(errorCode)) {
  815. if (edits != nullptr) {
  816. edits->copyErrorTo(errorCode);
  817. }
  818. }
  819. }
  820. int32_t
  821. ucasemap_mapUTF8(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
  822. char *dest, int32_t destCapacity,
  823. const char *src, int32_t srcLength,
  824. UTF8CaseMapper *stringCaseMapper,
  825. icu::Edits *edits,
  826. UErrorCode &errorCode) {
  827. /* check argument values */
  828. if(U_FAILURE(errorCode)) {
  829. return 0;
  830. }
  831. if( destCapacity<0 ||
  832. (dest==nullptr && destCapacity>0) ||
  833. (src==nullptr && srcLength!=0) || srcLength<-1
  834. ) {
  835. errorCode=U_ILLEGAL_ARGUMENT_ERROR;
  836. return 0;
  837. }
  838. /* get the string length */
  839. if(srcLength==-1) {
  840. srcLength=(int32_t)uprv_strlen((const char *)src);
  841. }
  842. /* check for overlapping source and destination */
  843. if( dest!=nullptr &&
  844. ((src>=dest && src<(dest+destCapacity)) ||
  845. (dest>=src && dest<(src+srcLength)))
  846. ) {
  847. errorCode=U_ILLEGAL_ARGUMENT_ERROR;
  848. return 0;
  849. }
  850. CheckedArrayByteSink sink(dest, destCapacity);
  851. if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) {
  852. edits->reset();
  853. }
  854. stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR
  855. (const uint8_t *)src, srcLength, sink, edits, errorCode);
  856. sink.Flush();
  857. if (U_SUCCESS(errorCode)) {
  858. if (sink.Overflowed()) {
  859. errorCode = U_BUFFER_OVERFLOW_ERROR;
  860. } else if (edits != nullptr) {
  861. edits->copyErrorTo(errorCode);
  862. }
  863. }
  864. return u_terminateChars(dest, destCapacity, sink.NumberOfBytesAppended(), &errorCode);
  865. }
  866. /* public API functions */
  867. U_CAPI int32_t U_EXPORT2
  868. ucasemap_utf8ToLower(const UCaseMap *csm,
  869. char *dest, int32_t destCapacity,
  870. const char *src, int32_t srcLength,
  871. UErrorCode *pErrorCode) {
  872. return ucasemap_mapUTF8(
  873. csm->caseLocale, csm->options, UCASEMAP_BREAK_ITERATOR_NULL
  874. dest, destCapacity,
  875. src, srcLength,
  876. ucasemap_internalUTF8ToLower, nullptr, *pErrorCode);
  877. }
  878. U_CAPI int32_t U_EXPORT2
  879. ucasemap_utf8ToUpper(const UCaseMap *csm,
  880. char *dest, int32_t destCapacity,
  881. const char *src, int32_t srcLength,
  882. UErrorCode *pErrorCode) {
  883. return ucasemap_mapUTF8(
  884. csm->caseLocale, csm->options, UCASEMAP_BREAK_ITERATOR_NULL
  885. dest, destCapacity,
  886. src, srcLength,
  887. ucasemap_internalUTF8ToUpper, nullptr, *pErrorCode);
  888. }
  889. U_CAPI int32_t U_EXPORT2
  890. ucasemap_utf8FoldCase(const UCaseMap *csm,
  891. char *dest, int32_t destCapacity,
  892. const char *src, int32_t srcLength,
  893. UErrorCode *pErrorCode) {
  894. return ucasemap_mapUTF8(
  895. UCASE_LOC_ROOT, csm->options, UCASEMAP_BREAK_ITERATOR_NULL
  896. dest, destCapacity,
  897. src, srcLength,
  898. ucasemap_internalUTF8Fold, nullptr, *pErrorCode);
  899. }
  900. U_NAMESPACE_BEGIN
  901. void CaseMap::utf8ToLower(
  902. const char *locale, uint32_t options,
  903. StringPiece src, ByteSink &sink, Edits *edits,
  904. UErrorCode &errorCode) {
  905. ucasemap_mapUTF8(
  906. ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
  907. src.data(), src.length(),
  908. ucasemap_internalUTF8ToLower, sink, edits, errorCode);
  909. }
  910. void CaseMap::utf8ToUpper(
  911. const char *locale, uint32_t options,
  912. StringPiece src, ByteSink &sink, Edits *edits,
  913. UErrorCode &errorCode) {
  914. ucasemap_mapUTF8(
  915. ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
  916. src.data(), src.length(),
  917. ucasemap_internalUTF8ToUpper, sink, edits, errorCode);
  918. }
  919. void CaseMap::utf8Fold(
  920. uint32_t options,
  921. StringPiece src, ByteSink &sink, Edits *edits,
  922. UErrorCode &errorCode) {
  923. ucasemap_mapUTF8(
  924. UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL
  925. src.data(), src.length(),
  926. ucasemap_internalUTF8Fold, sink, edits, errorCode);
  927. }
  928. int32_t CaseMap::utf8ToLower(
  929. const char *locale, uint32_t options,
  930. const char *src, int32_t srcLength,
  931. char *dest, int32_t destCapacity, Edits *edits,
  932. UErrorCode &errorCode) {
  933. return ucasemap_mapUTF8(
  934. ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
  935. dest, destCapacity,
  936. src, srcLength,
  937. ucasemap_internalUTF8ToLower, edits, errorCode);
  938. }
  939. int32_t CaseMap::utf8ToUpper(
  940. const char *locale, uint32_t options,
  941. const char *src, int32_t srcLength,
  942. char *dest, int32_t destCapacity, Edits *edits,
  943. UErrorCode &errorCode) {
  944. return ucasemap_mapUTF8(
  945. ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
  946. dest, destCapacity,
  947. src, srcLength,
  948. ucasemap_internalUTF8ToUpper, edits, errorCode);
  949. }
  950. int32_t CaseMap::utf8Fold(
  951. uint32_t options,
  952. const char *src, int32_t srcLength,
  953. char *dest, int32_t destCapacity, Edits *edits,
  954. UErrorCode &errorCode) {
  955. return ucasemap_mapUTF8(
  956. UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL
  957. dest, destCapacity,
  958. src, srcLength,
  959. ucasemap_internalUTF8Fold, edits, errorCode);
  960. }
  961. U_NAMESPACE_END