uiter.cpp 32 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. *******************************************************************************
  5. *
  6. * Copyright (C) 2002-2012, International Business Machines
  7. * Corporation and others. All Rights Reserved.
  8. *
  9. *******************************************************************************
  10. * file name: uiter.cpp
  11. * encoding: UTF-8
  12. * tab size: 8 (not used)
  13. * indentation:4
  14. *
  15. * created on: 2002jan18
  16. * created by: Markus W. Scherer
  17. */
  18. #include "unicode/utypes.h"
  19. #include "unicode/ustring.h"
  20. #include "unicode/chariter.h"
  21. #include "unicode/rep.h"
  22. #include "unicode/uiter.h"
  23. #include "unicode/utf.h"
  24. #include "unicode/utf8.h"
  25. #include "unicode/utf16.h"
  26. #include "cstring.h"
  27. U_NAMESPACE_USE
  28. #define IS_EVEN(n) (((n)&1)==0)
  29. #define IS_POINTER_EVEN(p) IS_EVEN((size_t)p)
  30. U_CDECL_BEGIN
  31. /* No-Op UCharIterator implementation for illegal input --------------------- */
  32. static int32_t U_CALLCONV
  33. noopGetIndex(UCharIterator * /*iter*/, UCharIteratorOrigin /*origin*/) {
  34. return 0;
  35. }
  36. static int32_t U_CALLCONV
  37. noopMove(UCharIterator * /*iter*/, int32_t /*delta*/, UCharIteratorOrigin /*origin*/) {
  38. return 0;
  39. }
  40. static UBool U_CALLCONV
  41. noopHasNext(UCharIterator * /*iter*/) {
  42. return false;
  43. }
  44. static UChar32 U_CALLCONV
  45. noopCurrent(UCharIterator * /*iter*/) {
  46. return U_SENTINEL;
  47. }
  48. static uint32_t U_CALLCONV
  49. noopGetState(const UCharIterator * /*iter*/) {
  50. return UITER_NO_STATE;
  51. }
  52. static void U_CALLCONV
  53. noopSetState(UCharIterator * /*iter*/, uint32_t /*state*/, UErrorCode *pErrorCode) {
  54. *pErrorCode=U_UNSUPPORTED_ERROR;
  55. }
  56. static const UCharIterator noopIterator={
  57. nullptr, 0, 0, 0, 0, 0,
  58. noopGetIndex,
  59. noopMove,
  60. noopHasNext,
  61. noopHasNext,
  62. noopCurrent,
  63. noopCurrent,
  64. noopCurrent,
  65. nullptr,
  66. noopGetState,
  67. noopSetState
  68. };
  69. /* UCharIterator implementation for simple strings -------------------------- */
  70. /*
  71. * This is an implementation of a code unit (char16_t) iterator
  72. * for char16_t * strings.
  73. *
  74. * The UCharIterator.context field holds a pointer to the string.
  75. */
  76. static int32_t U_CALLCONV
  77. stringIteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) UPRV_NO_SANITIZE_UNDEFINED {
  78. switch(origin) {
  79. case UITER_ZERO:
  80. return 0;
  81. case UITER_START:
  82. return iter->start;
  83. case UITER_CURRENT:
  84. return iter->index;
  85. case UITER_LIMIT:
  86. return iter->limit;
  87. case UITER_LENGTH:
  88. return iter->length;
  89. default:
  90. /* not a valid origin */
  91. /* Should never get here! */
  92. return -1;
  93. }
  94. }
  95. static int32_t U_CALLCONV
  96. stringIteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) UPRV_NO_SANITIZE_UNDEFINED {
  97. int32_t pos;
  98. switch(origin) {
  99. case UITER_ZERO:
  100. pos=delta;
  101. break;
  102. case UITER_START:
  103. pos=iter->start+delta;
  104. break;
  105. case UITER_CURRENT:
  106. pos=iter->index+delta;
  107. break;
  108. case UITER_LIMIT:
  109. pos=iter->limit+delta;
  110. break;
  111. case UITER_LENGTH:
  112. pos=iter->length+delta;
  113. break;
  114. default:
  115. return -1; /* Error */
  116. }
  117. if(pos<iter->start) {
  118. pos=iter->start;
  119. } else if(pos>iter->limit) {
  120. pos=iter->limit;
  121. }
  122. return iter->index=pos;
  123. }
  124. static UBool U_CALLCONV
  125. stringIteratorHasNext(UCharIterator *iter) {
  126. return iter->index<iter->limit;
  127. }
  128. static UBool U_CALLCONV
  129. stringIteratorHasPrevious(UCharIterator *iter) {
  130. return iter->index>iter->start;
  131. }
  132. static UChar32 U_CALLCONV
  133. stringIteratorCurrent(UCharIterator *iter) {
  134. if(iter->index<iter->limit) {
  135. return ((const char16_t *)(iter->context))[iter->index];
  136. } else {
  137. return U_SENTINEL;
  138. }
  139. }
  140. static UChar32 U_CALLCONV
  141. stringIteratorNext(UCharIterator *iter) {
  142. if(iter->index<iter->limit) {
  143. return ((const char16_t *)(iter->context))[iter->index++];
  144. } else {
  145. return U_SENTINEL;
  146. }
  147. }
  148. static UChar32 U_CALLCONV
  149. stringIteratorPrevious(UCharIterator *iter) {
  150. if(iter->index>iter->start) {
  151. return ((const char16_t *)(iter->context))[--iter->index];
  152. } else {
  153. return U_SENTINEL;
  154. }
  155. }
  156. static uint32_t U_CALLCONV
  157. stringIteratorGetState(const UCharIterator *iter) {
  158. return (uint32_t)iter->index;
  159. }
  160. static void U_CALLCONV
  161. stringIteratorSetState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode) {
  162. if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) {
  163. /* do nothing */
  164. } else if(iter==nullptr) {
  165. *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
  166. } else if((int32_t)state<iter->start || iter->limit<(int32_t)state) {
  167. *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
  168. } else {
  169. iter->index=(int32_t)state;
  170. }
  171. }
  172. static const UCharIterator stringIterator={
  173. nullptr, 0, 0, 0, 0, 0,
  174. stringIteratorGetIndex,
  175. stringIteratorMove,
  176. stringIteratorHasNext,
  177. stringIteratorHasPrevious,
  178. stringIteratorCurrent,
  179. stringIteratorNext,
  180. stringIteratorPrevious,
  181. nullptr,
  182. stringIteratorGetState,
  183. stringIteratorSetState
  184. };
  185. U_CAPI void U_EXPORT2
  186. uiter_setString(UCharIterator *iter, const char16_t *s, int32_t length) {
  187. if (iter != nullptr) {
  188. if (s != nullptr && length >= -1) {
  189. *iter=stringIterator;
  190. iter->context=s;
  191. if(length>=0) {
  192. iter->length=length;
  193. } else {
  194. iter->length=u_strlen(s);
  195. }
  196. iter->limit=iter->length;
  197. } else {
  198. *iter=noopIterator;
  199. }
  200. }
  201. }
  202. /* UCharIterator implementation for UTF-16BE strings ------------------------ */
  203. /*
  204. * This is an implementation of a code unit (char16_t) iterator
  205. * for UTF-16BE strings, i.e., strings in byte-vectors where
  206. * each char16_t is stored as a big-endian pair of bytes.
  207. *
  208. * The UCharIterator.context field holds a pointer to the string.
  209. * Everything works just like with a normal char16_t iterator (uiter_setString),
  210. * except that UChars are assembled from byte pairs.
  211. */
  212. /* internal helper function */
  213. static inline UChar32
  214. utf16BEIteratorGet(UCharIterator *iter, int32_t index) {
  215. const uint8_t *p=(const uint8_t *)iter->context;
  216. return ((char16_t)p[2*index]<<8)|(char16_t)p[2*index+1];
  217. }
  218. static UChar32 U_CALLCONV
  219. utf16BEIteratorCurrent(UCharIterator *iter) {
  220. int32_t index;
  221. if((index=iter->index)<iter->limit) {
  222. return utf16BEIteratorGet(iter, index);
  223. } else {
  224. return U_SENTINEL;
  225. }
  226. }
  227. static UChar32 U_CALLCONV
  228. utf16BEIteratorNext(UCharIterator *iter) {
  229. int32_t index;
  230. if((index=iter->index)<iter->limit) {
  231. iter->index=index+1;
  232. return utf16BEIteratorGet(iter, index);
  233. } else {
  234. return U_SENTINEL;
  235. }
  236. }
  237. static UChar32 U_CALLCONV
  238. utf16BEIteratorPrevious(UCharIterator *iter) {
  239. int32_t index;
  240. if((index=iter->index)>iter->start) {
  241. iter->index=--index;
  242. return utf16BEIteratorGet(iter, index);
  243. } else {
  244. return U_SENTINEL;
  245. }
  246. }
  247. static const UCharIterator utf16BEIterator={
  248. nullptr, 0, 0, 0, 0, 0,
  249. stringIteratorGetIndex,
  250. stringIteratorMove,
  251. stringIteratorHasNext,
  252. stringIteratorHasPrevious,
  253. utf16BEIteratorCurrent,
  254. utf16BEIteratorNext,
  255. utf16BEIteratorPrevious,
  256. nullptr,
  257. stringIteratorGetState,
  258. stringIteratorSetState
  259. };
  260. /*
  261. * Count the number of UChars in a UTF-16BE string before a terminating char16_t NUL,
  262. * i.e., before a pair of 0 bytes where the first 0 byte is at an even
  263. * offset from s.
  264. */
  265. static int32_t
  266. utf16BE_strlen(const char *s) {
  267. if(IS_POINTER_EVEN(s)) {
  268. /*
  269. * even-aligned, call u_strlen(s)
  270. * we are probably on a little-endian machine, but searching for char16_t NUL
  271. * does not care about endianness
  272. */
  273. return u_strlen((const char16_t *)s);
  274. } else {
  275. /* odd-aligned, search for pair of 0 bytes */
  276. const char *p=s;
  277. while(!(*p==0 && p[1]==0)) {
  278. p+=2;
  279. }
  280. return (int32_t)((p-s)/2);
  281. }
  282. }
  283. U_CAPI void U_EXPORT2
  284. uiter_setUTF16BE(UCharIterator *iter, const char *s, int32_t length) {
  285. if(iter!=nullptr) {
  286. /* allow only even-length strings (the input length counts bytes) */
  287. if(s!=nullptr && (length==-1 || (length>=0 && IS_EVEN(length)))) {
  288. /* length/=2, except that >>=1 also works for -1 (-1/2==0, -1>>1==-1) */
  289. length>>=1;
  290. if(U_IS_BIG_ENDIAN && IS_POINTER_EVEN(s)) {
  291. /* big-endian machine and 2-aligned UTF-16BE string: use normal char16_t iterator */
  292. uiter_setString(iter, (const char16_t *)s, length);
  293. return;
  294. }
  295. *iter=utf16BEIterator;
  296. iter->context=s;
  297. if(length>=0) {
  298. iter->length=length;
  299. } else {
  300. iter->length=utf16BE_strlen(s);
  301. }
  302. iter->limit=iter->length;
  303. } else {
  304. *iter=noopIterator;
  305. }
  306. }
  307. }
  308. /* UCharIterator wrapper around CharacterIterator --------------------------- */
  309. /*
  310. * This is wrapper code around a C++ CharacterIterator to
  311. * look like a C UCharIterator.
  312. *
  313. * The UCharIterator.context field holds a pointer to the CharacterIterator.
  314. */
  315. static int32_t U_CALLCONV
  316. characterIteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) UPRV_NO_SANITIZE_UNDEFINED {
  317. switch(origin) {
  318. case UITER_ZERO:
  319. return 0;
  320. case UITER_START:
  321. return ((CharacterIterator *)(iter->context))->startIndex();
  322. case UITER_CURRENT:
  323. return ((CharacterIterator *)(iter->context))->getIndex();
  324. case UITER_LIMIT:
  325. return ((CharacterIterator *)(iter->context))->endIndex();
  326. case UITER_LENGTH:
  327. return ((CharacterIterator *)(iter->context))->getLength();
  328. default:
  329. /* not a valid origin */
  330. /* Should never get here! */
  331. return -1;
  332. }
  333. }
  334. static int32_t U_CALLCONV
  335. characterIteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) UPRV_NO_SANITIZE_UNDEFINED {
  336. switch(origin) {
  337. case UITER_ZERO:
  338. ((CharacterIterator *)(iter->context))->setIndex(delta);
  339. return ((CharacterIterator *)(iter->context))->getIndex();
  340. case UITER_START:
  341. case UITER_CURRENT:
  342. case UITER_LIMIT:
  343. return ((CharacterIterator *)(iter->context))->move(delta, (CharacterIterator::EOrigin)origin);
  344. case UITER_LENGTH:
  345. ((CharacterIterator *)(iter->context))->setIndex(((CharacterIterator *)(iter->context))->getLength()+delta);
  346. return ((CharacterIterator *)(iter->context))->getIndex();
  347. default:
  348. /* not a valid origin */
  349. /* Should never get here! */
  350. return -1;
  351. }
  352. }
  353. static UBool U_CALLCONV
  354. characterIteratorHasNext(UCharIterator *iter) {
  355. return ((CharacterIterator *)(iter->context))->hasNext();
  356. }
  357. static UBool U_CALLCONV
  358. characterIteratorHasPrevious(UCharIterator *iter) {
  359. return ((CharacterIterator *)(iter->context))->hasPrevious();
  360. }
  361. static UChar32 U_CALLCONV
  362. characterIteratorCurrent(UCharIterator *iter) {
  363. UChar32 c;
  364. c=((CharacterIterator *)(iter->context))->current();
  365. if(c!=0xffff || ((CharacterIterator *)(iter->context))->hasNext()) {
  366. return c;
  367. } else {
  368. return U_SENTINEL;
  369. }
  370. }
  371. static UChar32 U_CALLCONV
  372. characterIteratorNext(UCharIterator *iter) {
  373. if(((CharacterIterator *)(iter->context))->hasNext()) {
  374. return ((CharacterIterator *)(iter->context))->nextPostInc();
  375. } else {
  376. return U_SENTINEL;
  377. }
  378. }
  379. static UChar32 U_CALLCONV
  380. characterIteratorPrevious(UCharIterator *iter) {
  381. if(((CharacterIterator *)(iter->context))->hasPrevious()) {
  382. return ((CharacterIterator *)(iter->context))->previous();
  383. } else {
  384. return U_SENTINEL;
  385. }
  386. }
  387. static uint32_t U_CALLCONV
  388. characterIteratorGetState(const UCharIterator *iter) {
  389. return ((CharacterIterator *)(iter->context))->getIndex();
  390. }
  391. static void U_CALLCONV
  392. characterIteratorSetState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode) {
  393. if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) {
  394. /* do nothing */
  395. } else if(iter==nullptr || iter->context==nullptr) {
  396. *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
  397. } else if((int32_t)state<((CharacterIterator *)(iter->context))->startIndex() || ((CharacterIterator *)(iter->context))->endIndex()<(int32_t)state) {
  398. *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
  399. } else {
  400. ((CharacterIterator *)(iter->context))->setIndex((int32_t)state);
  401. }
  402. }
  403. static const UCharIterator characterIteratorWrapper={
  404. nullptr, 0, 0, 0, 0, 0,
  405. characterIteratorGetIndex,
  406. characterIteratorMove,
  407. characterIteratorHasNext,
  408. characterIteratorHasPrevious,
  409. characterIteratorCurrent,
  410. characterIteratorNext,
  411. characterIteratorPrevious,
  412. nullptr,
  413. characterIteratorGetState,
  414. characterIteratorSetState
  415. };
  416. U_CAPI void U_EXPORT2
  417. uiter_setCharacterIterator(UCharIterator *iter, CharacterIterator *charIter) {
  418. if (iter != nullptr) {
  419. if (charIter != nullptr) {
  420. *iter=characterIteratorWrapper;
  421. iter->context=charIter;
  422. } else {
  423. *iter=noopIterator;
  424. }
  425. }
  426. }
  427. /* UCharIterator wrapper around Replaceable --------------------------------- */
  428. /*
  429. * This is an implementation of a code unit (char16_t) iterator
  430. * based on a Replaceable object.
  431. *
  432. * The UCharIterator.context field holds a pointer to the Replaceable.
  433. * UCharIterator.length and UCharIterator.index hold Replaceable.length()
  434. * and the iteration index.
  435. */
  436. static UChar32 U_CALLCONV
  437. replaceableIteratorCurrent(UCharIterator *iter) {
  438. if(iter->index<iter->limit) {
  439. return ((Replaceable *)(iter->context))->charAt(iter->index);
  440. } else {
  441. return U_SENTINEL;
  442. }
  443. }
  444. static UChar32 U_CALLCONV
  445. replaceableIteratorNext(UCharIterator *iter) {
  446. if(iter->index<iter->limit) {
  447. return ((Replaceable *)(iter->context))->charAt(iter->index++);
  448. } else {
  449. return U_SENTINEL;
  450. }
  451. }
  452. static UChar32 U_CALLCONV
  453. replaceableIteratorPrevious(UCharIterator *iter) {
  454. if(iter->index>iter->start) {
  455. return ((Replaceable *)(iter->context))->charAt(--iter->index);
  456. } else {
  457. return U_SENTINEL;
  458. }
  459. }
  460. static const UCharIterator replaceableIterator={
  461. nullptr, 0, 0, 0, 0, 0,
  462. stringIteratorGetIndex,
  463. stringIteratorMove,
  464. stringIteratorHasNext,
  465. stringIteratorHasPrevious,
  466. replaceableIteratorCurrent,
  467. replaceableIteratorNext,
  468. replaceableIteratorPrevious,
  469. nullptr,
  470. stringIteratorGetState,
  471. stringIteratorSetState
  472. };
  473. U_CAPI void U_EXPORT2
  474. uiter_setReplaceable(UCharIterator *iter, const Replaceable *rep) {
  475. if (iter != nullptr) {
  476. if (rep != nullptr) {
  477. *iter=replaceableIterator;
  478. iter->context=rep;
  479. iter->limit=iter->length=rep->length();
  480. } else {
  481. *iter=noopIterator;
  482. }
  483. }
  484. }
  485. /* UCharIterator implementation for UTF-8 strings --------------------------- */
  486. /*
  487. * Possible, probably necessary only for an implementation for arbitrary
  488. * converters:
  489. * Maintain a buffer (ring buffer?) for a piece of converted 16-bit text.
  490. * This would require turning reservedFn into a close function and
  491. * to introduce a uiter_close(iter).
  492. */
  493. #define UITER_CNV_CAPACITY 16
  494. /*
  495. * Minimal implementation:
  496. * Maintain a single-char16_t buffer for an additional surrogate.
  497. * The caller must not modify start and limit because they are used internally.
  498. *
  499. * Use UCharIterator fields as follows:
  500. * context pointer to UTF-8 string
  501. * length UTF-16 length of the string; -1 until lazy evaluation
  502. * start current UTF-8 index
  503. * index current UTF-16 index; may be -1="unknown" after setState()
  504. * limit UTF-8 length of the string
  505. * reservedField supplementary code point
  506. *
  507. * Since UCharIterator delivers 16-bit code units, the iteration can be
  508. * currently in the middle of the byte sequence for a supplementary code point.
  509. * In this case, reservedField will contain that code point and start will
  510. * point to after the corresponding byte sequence. The UTF-16 index will be
  511. * one less than what it would otherwise be corresponding to the UTF-8 index.
  512. * Otherwise, reservedField will be 0.
  513. */
  514. /*
  515. * Possible optimization for NUL-terminated UTF-8 and UTF-16 strings:
  516. * Add implementations that do not call strlen() for iteration but check for NUL.
  517. */
  518. static int32_t U_CALLCONV
  519. utf8IteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) UPRV_NO_SANITIZE_UNDEFINED {
  520. switch(origin) {
  521. case UITER_ZERO:
  522. case UITER_START:
  523. return 0;
  524. case UITER_CURRENT:
  525. if(iter->index<0) {
  526. /* the current UTF-16 index is unknown after setState(), count from the beginning */
  527. const uint8_t *s;
  528. UChar32 c;
  529. int32_t i, limit, index;
  530. s=(const uint8_t *)iter->context;
  531. i=index=0;
  532. limit=iter->start; /* count up to the UTF-8 index */
  533. while(i<limit) {
  534. U8_NEXT_OR_FFFD(s, i, limit, c);
  535. index+=U16_LENGTH(c);
  536. }
  537. iter->start=i; /* just in case setState() did not get us to a code point boundary */
  538. if(i==iter->limit) {
  539. iter->length=index; /* in case it was <0 or wrong */
  540. }
  541. if(iter->reservedField!=0) {
  542. --index; /* we are in the middle of a supplementary code point */
  543. }
  544. iter->index=index;
  545. }
  546. return iter->index;
  547. case UITER_LIMIT:
  548. case UITER_LENGTH:
  549. if(iter->length<0) {
  550. const uint8_t *s;
  551. UChar32 c;
  552. int32_t i, limit, length;
  553. s=(const uint8_t *)iter->context;
  554. if(iter->index<0) {
  555. /*
  556. * the current UTF-16 index is unknown after setState(),
  557. * we must first count from the beginning to here
  558. */
  559. i=length=0;
  560. limit=iter->start;
  561. /* count from the beginning to the current index */
  562. while(i<limit) {
  563. U8_NEXT_OR_FFFD(s, i, limit, c);
  564. length+=U16_LENGTH(c);
  565. }
  566. /* assume i==limit==iter->start, set the UTF-16 index */
  567. iter->start=i; /* just in case setState() did not get us to a code point boundary */
  568. iter->index= iter->reservedField!=0 ? length-1 : length;
  569. } else {
  570. i=iter->start;
  571. length=iter->index;
  572. if(iter->reservedField!=0) {
  573. ++length;
  574. }
  575. }
  576. /* count from the current index to the end */
  577. limit=iter->limit;
  578. while(i<limit) {
  579. U8_NEXT_OR_FFFD(s, i, limit, c);
  580. length+=U16_LENGTH(c);
  581. }
  582. iter->length=length;
  583. }
  584. return iter->length;
  585. default:
  586. /* not a valid origin */
  587. /* Should never get here! */
  588. return -1;
  589. }
  590. }
  591. static int32_t U_CALLCONV
  592. utf8IteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) UPRV_NO_SANITIZE_UNDEFINED {
  593. const uint8_t *s;
  594. UChar32 c;
  595. int32_t pos; /* requested UTF-16 index */
  596. int32_t i; /* UTF-8 index */
  597. UBool havePos;
  598. /* calculate the requested UTF-16 index */
  599. switch(origin) {
  600. case UITER_ZERO:
  601. case UITER_START:
  602. pos=delta;
  603. havePos=true;
  604. /* iter->index<0 (unknown) is possible */
  605. break;
  606. case UITER_CURRENT:
  607. if(iter->index>=0) {
  608. pos=iter->index+delta;
  609. havePos=true;
  610. } else {
  611. /* the current UTF-16 index is unknown after setState(), use only delta */
  612. pos=0;
  613. havePos=false;
  614. }
  615. break;
  616. case UITER_LIMIT:
  617. case UITER_LENGTH:
  618. if(iter->length>=0) {
  619. pos=iter->length+delta;
  620. havePos=true;
  621. } else {
  622. /* pin to the end, avoid counting the length */
  623. iter->index=-1;
  624. iter->start=iter->limit;
  625. iter->reservedField=0;
  626. if(delta>=0) {
  627. return UITER_UNKNOWN_INDEX;
  628. } else {
  629. /* the current UTF-16 index is unknown, use only delta */
  630. pos=0;
  631. havePos=false;
  632. }
  633. }
  634. break;
  635. default:
  636. return -1; /* Error */
  637. }
  638. if(havePos) {
  639. /* shortcuts: pinning to the edges of the string */
  640. if(pos<=0) {
  641. iter->index=iter->start=iter->reservedField=0;
  642. return 0;
  643. } else if(iter->length>=0 && pos>=iter->length) {
  644. iter->index=iter->length;
  645. iter->start=iter->limit;
  646. iter->reservedField=0;
  647. return iter->index;
  648. }
  649. /* minimize the number of U8_NEXT/PREV operations */
  650. if(iter->index<0 || pos<iter->index/2) {
  651. /* go forward from the start instead of backward from the current index */
  652. iter->index=iter->start=iter->reservedField=0;
  653. } else if(iter->length>=0 && (iter->length-pos)<(pos-iter->index)) {
  654. /*
  655. * if we have the UTF-16 index and length and the new position is
  656. * closer to the end than the current index,
  657. * then go backward from the end instead of forward from the current index
  658. */
  659. iter->index=iter->length;
  660. iter->start=iter->limit;
  661. iter->reservedField=0;
  662. }
  663. delta=pos-iter->index;
  664. if(delta==0) {
  665. return iter->index; /* nothing to do */
  666. }
  667. } else {
  668. /* move relative to unknown UTF-16 index */
  669. if(delta==0) {
  670. return UITER_UNKNOWN_INDEX; /* nothing to do */
  671. } else if(-delta>=iter->start) {
  672. /* moving backwards by more UChars than there are UTF-8 bytes, pin to 0 */
  673. iter->index=iter->start=iter->reservedField=0;
  674. return 0;
  675. } else if(delta>=(iter->limit-iter->start)) {
  676. /* moving forward by more UChars than the remaining UTF-8 bytes, pin to the end */
  677. iter->index=iter->length; /* may or may not be <0 (unknown) */
  678. iter->start=iter->limit;
  679. iter->reservedField=0;
  680. return iter->index>=0 ? iter->index : (int32_t)UITER_UNKNOWN_INDEX;
  681. }
  682. }
  683. /* delta!=0 */
  684. /* move towards the requested position, pin to the edges of the string */
  685. s=(const uint8_t *)iter->context;
  686. pos=iter->index; /* could be <0 (unknown) */
  687. i=iter->start;
  688. if(delta>0) {
  689. /* go forward */
  690. int32_t limit=iter->limit;
  691. if(iter->reservedField!=0) {
  692. iter->reservedField=0;
  693. ++pos;
  694. --delta;
  695. }
  696. while(delta>0 && i<limit) {
  697. U8_NEXT_OR_FFFD(s, i, limit, c);
  698. if(c<=0xffff) {
  699. ++pos;
  700. --delta;
  701. } else if(delta>=2) {
  702. pos+=2;
  703. delta-=2;
  704. } else /* delta==1 */ {
  705. /* stop in the middle of a supplementary code point */
  706. iter->reservedField=c;
  707. ++pos;
  708. break; /* delta=0; */
  709. }
  710. }
  711. if(i==limit) {
  712. if(iter->length<0 && iter->index>=0) {
  713. iter->length= iter->reservedField==0 ? pos : pos+1;
  714. } else if(iter->index<0 && iter->length>=0) {
  715. iter->index= iter->reservedField==0 ? iter->length : iter->length-1;
  716. }
  717. }
  718. } else /* delta<0 */ {
  719. /* go backward */
  720. if(iter->reservedField!=0) {
  721. iter->reservedField=0;
  722. i-=4; /* we stayed behind the supplementary code point; go before it now */
  723. --pos;
  724. ++delta;
  725. }
  726. while(delta<0 && i>0) {
  727. U8_PREV_OR_FFFD(s, 0, i, c);
  728. if(c<=0xffff) {
  729. --pos;
  730. ++delta;
  731. } else if(delta<=-2) {
  732. pos-=2;
  733. delta+=2;
  734. } else /* delta==-1 */ {
  735. /* stop in the middle of a supplementary code point */
  736. i+=4; /* back to behind this supplementary code point for consistent state */
  737. iter->reservedField=c;
  738. --pos;
  739. break; /* delta=0; */
  740. }
  741. }
  742. }
  743. iter->start=i;
  744. if(iter->index>=0) {
  745. return iter->index=pos;
  746. } else {
  747. /* we started with index<0 (unknown) so pos is bogus */
  748. if(i<=1) {
  749. return iter->index=i; /* reached the beginning */
  750. } else {
  751. /* we still don't know the UTF-16 index */
  752. return UITER_UNKNOWN_INDEX;
  753. }
  754. }
  755. }
  756. static UBool U_CALLCONV
  757. utf8IteratorHasNext(UCharIterator *iter) {
  758. return iter->start<iter->limit || iter->reservedField!=0;
  759. }
  760. static UBool U_CALLCONV
  761. utf8IteratorHasPrevious(UCharIterator *iter) {
  762. return iter->start>0;
  763. }
  764. static UChar32 U_CALLCONV
  765. utf8IteratorCurrent(UCharIterator *iter) {
  766. if(iter->reservedField!=0) {
  767. return U16_TRAIL(iter->reservedField);
  768. } else if(iter->start<iter->limit) {
  769. const uint8_t *s=(const uint8_t *)iter->context;
  770. UChar32 c;
  771. int32_t i=iter->start;
  772. U8_NEXT_OR_FFFD(s, i, iter->limit, c);
  773. if(c<=0xffff) {
  774. return c;
  775. } else {
  776. return U16_LEAD(c);
  777. }
  778. } else {
  779. return U_SENTINEL;
  780. }
  781. }
  782. static UChar32 U_CALLCONV
  783. utf8IteratorNext(UCharIterator *iter) {
  784. int32_t index;
  785. if(iter->reservedField!=0) {
  786. char16_t trail=U16_TRAIL(iter->reservedField);
  787. iter->reservedField=0;
  788. if((index=iter->index)>=0) {
  789. iter->index=index+1;
  790. }
  791. return trail;
  792. } else if(iter->start<iter->limit) {
  793. const uint8_t *s=(const uint8_t *)iter->context;
  794. UChar32 c;
  795. U8_NEXT_OR_FFFD(s, iter->start, iter->limit, c);
  796. if((index=iter->index)>=0) {
  797. iter->index=++index;
  798. if(iter->length<0 && iter->start==iter->limit) {
  799. iter->length= c<=0xffff ? index : index+1;
  800. }
  801. } else if(iter->start==iter->limit && iter->length>=0) {
  802. iter->index= c<=0xffff ? iter->length : iter->length-1;
  803. }
  804. if(c<=0xffff) {
  805. return c;
  806. } else {
  807. iter->reservedField=c;
  808. return U16_LEAD(c);
  809. }
  810. } else {
  811. return U_SENTINEL;
  812. }
  813. }
  814. static UChar32 U_CALLCONV
  815. utf8IteratorPrevious(UCharIterator *iter) {
  816. int32_t index;
  817. if(iter->reservedField!=0) {
  818. char16_t lead=U16_LEAD(iter->reservedField);
  819. iter->reservedField=0;
  820. iter->start-=4; /* we stayed behind the supplementary code point; go before it now */
  821. if((index=iter->index)>0) {
  822. iter->index=index-1;
  823. }
  824. return lead;
  825. } else if(iter->start>0) {
  826. const uint8_t *s=(const uint8_t *)iter->context;
  827. UChar32 c;
  828. U8_PREV_OR_FFFD(s, 0, iter->start, c);
  829. if((index=iter->index)>0) {
  830. iter->index=index-1;
  831. } else if(iter->start<=1) {
  832. iter->index= c<=0xffff ? iter->start : iter->start+1;
  833. }
  834. if(c<=0xffff) {
  835. return c;
  836. } else {
  837. iter->start+=4; /* back to behind this supplementary code point for consistent state */
  838. iter->reservedField=c;
  839. return U16_TRAIL(c);
  840. }
  841. } else {
  842. return U_SENTINEL;
  843. }
  844. }
  845. static uint32_t U_CALLCONV
  846. utf8IteratorGetState(const UCharIterator *iter) {
  847. uint32_t state=(uint32_t)(iter->start<<1);
  848. if(iter->reservedField!=0) {
  849. state|=1;
  850. }
  851. return state;
  852. }
  853. static void U_CALLCONV
  854. utf8IteratorSetState(UCharIterator *iter,
  855. uint32_t state,
  856. UErrorCode *pErrorCode)
  857. {
  858. if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) {
  859. /* do nothing */
  860. } else if(iter==nullptr) {
  861. *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
  862. } else if(state==utf8IteratorGetState(iter)) {
  863. /* setting to the current state: no-op */
  864. } else {
  865. int32_t index=(int32_t)(state>>1); /* UTF-8 index */
  866. state&=1; /* 1 if in surrogate pair, must be index>=4 */
  867. if((state==0 ? index<0 : index<4) || iter->limit<index) {
  868. *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
  869. } else {
  870. iter->start=index; /* restore UTF-8 byte index */
  871. if(index<=1) {
  872. iter->index=index;
  873. } else {
  874. iter->index=-1; /* unknown UTF-16 index */
  875. }
  876. if(state==0) {
  877. iter->reservedField=0;
  878. } else {
  879. /* verified index>=4 above */
  880. UChar32 c;
  881. U8_PREV_OR_FFFD((const uint8_t *)iter->context, 0, index, c);
  882. if(c<=0xffff) {
  883. *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
  884. } else {
  885. iter->reservedField=c;
  886. }
  887. }
  888. }
  889. }
  890. }
  891. static const UCharIterator utf8Iterator={
  892. nullptr, 0, 0, 0, 0, 0,
  893. utf8IteratorGetIndex,
  894. utf8IteratorMove,
  895. utf8IteratorHasNext,
  896. utf8IteratorHasPrevious,
  897. utf8IteratorCurrent,
  898. utf8IteratorNext,
  899. utf8IteratorPrevious,
  900. nullptr,
  901. utf8IteratorGetState,
  902. utf8IteratorSetState
  903. };
  904. U_CAPI void U_EXPORT2
  905. uiter_setUTF8(UCharIterator *iter, const char *s, int32_t length) {
  906. if (iter != nullptr) {
  907. if (s != nullptr && length >= -1) {
  908. *iter=utf8Iterator;
  909. iter->context=s;
  910. if(length>=0) {
  911. iter->limit=length;
  912. } else {
  913. iter->limit=(int32_t)uprv_strlen(s);
  914. }
  915. iter->length= iter->limit<=1 ? iter->limit : -1;
  916. } else {
  917. *iter=noopIterator;
  918. }
  919. }
  920. }
  921. /* Helper functions --------------------------------------------------------- */
  922. U_CAPI UChar32 U_EXPORT2
  923. uiter_current32(UCharIterator *iter) {
  924. UChar32 c, c2;
  925. c=iter->current(iter);
  926. if(U16_IS_SURROGATE(c)) {
  927. if(U16_IS_SURROGATE_LEAD(c)) {
  928. /*
  929. * go to the next code unit
  930. * we know that we are not at the limit because c!=U_SENTINEL
  931. */
  932. iter->move(iter, 1, UITER_CURRENT);
  933. if(U16_IS_TRAIL(c2=iter->current(iter))) {
  934. c=U16_GET_SUPPLEMENTARY(c, c2);
  935. }
  936. /* undo index movement */
  937. iter->move(iter, -1, UITER_CURRENT);
  938. } else {
  939. if(U16_IS_LEAD(c2=iter->previous(iter))) {
  940. c=U16_GET_SUPPLEMENTARY(c2, c);
  941. }
  942. if(c2>=0) {
  943. /* undo index movement */
  944. iter->move(iter, 1, UITER_CURRENT);
  945. }
  946. }
  947. }
  948. return c;
  949. }
  950. U_CAPI UChar32 U_EXPORT2
  951. uiter_next32(UCharIterator *iter) {
  952. UChar32 c, c2;
  953. c=iter->next(iter);
  954. if(U16_IS_LEAD(c)) {
  955. if(U16_IS_TRAIL(c2=iter->next(iter))) {
  956. c=U16_GET_SUPPLEMENTARY(c, c2);
  957. } else if(c2>=0) {
  958. /* unmatched first surrogate, undo index movement */
  959. iter->move(iter, -1, UITER_CURRENT);
  960. }
  961. }
  962. return c;
  963. }
  964. U_CAPI UChar32 U_EXPORT2
  965. uiter_previous32(UCharIterator *iter) {
  966. UChar32 c, c2;
  967. c=iter->previous(iter);
  968. if(U16_IS_TRAIL(c)) {
  969. if(U16_IS_LEAD(c2=iter->previous(iter))) {
  970. c=U16_GET_SUPPLEMENTARY(c2, c);
  971. } else if(c2>=0) {
  972. /* unmatched second surrogate, undo index movement */
  973. iter->move(iter, 1, UITER_CURRENT);
  974. }
  975. }
  976. return c;
  977. }
  978. U_CAPI uint32_t U_EXPORT2
  979. uiter_getState(const UCharIterator *iter) {
  980. if(iter==nullptr || iter->getState==nullptr) {
  981. return UITER_NO_STATE;
  982. } else {
  983. return iter->getState(iter);
  984. }
  985. }
  986. U_CAPI void U_EXPORT2
  987. uiter_setState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode) {
  988. if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) {
  989. /* do nothing */
  990. } else if(iter==nullptr) {
  991. *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
  992. } else if(iter->setState==nullptr) {
  993. *pErrorCode=U_UNSUPPORTED_ERROR;
  994. } else {
  995. iter->setState(iter, state, pErrorCode);
  996. }
  997. }
  998. U_CDECL_END