ucnv_u8.cpp 30 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. **********************************************************************
  5. * Copyright (C) 2002-2016, International Business Machines
  6. * Corporation and others. All Rights Reserved.
  7. **********************************************************************
  8. * file name: ucnv_u8.c
  9. * encoding: UTF-8
  10. * tab size: 8 (not used)
  11. * indentation:4
  12. *
  13. * created on: 2002jul01
  14. * created by: Markus W. Scherer
  15. *
  16. * UTF-8 converter implementation. Used to be in ucnv_utf.c.
  17. *
  18. * Also, CESU-8 implementation, see UTR 26.
  19. * The CESU-8 converter uses all the same functions as the
  20. * UTF-8 converter, with a branch for converting supplementary code points.
  21. */
  22. #include "unicode/utypes.h"
  23. #if !UCONFIG_NO_CONVERSION
  24. #include "unicode/ucnv.h"
  25. #include "unicode/utf.h"
  26. #include "unicode/utf8.h"
  27. #include "unicode/utf16.h"
  28. #include "uassert.h"
  29. #include "ucnv_bld.h"
  30. #include "ucnv_cnv.h"
  31. #include "cmemory.h"
  32. #include "ustr_imp.h"
  33. /* Prototypes --------------------------------------------------------------- */
  34. /* Keep these here to make finicky compilers happy */
  35. U_CFUNC void ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs *args,
  36. UErrorCode *err);
  37. U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs *args,
  38. UErrorCode *err);
  39. /* UTF-8 -------------------------------------------------------------------- */
  /* Largest value the converters below store in a single char16_t. */
  #define MAXIMUM_UCS2 0x0000FFFF

  /*
   * offsetsFromUTF8[n] is the sum of the lead/trail marker bits that pile up
   * when an n-byte sequence is folded with ch = (ch << 6) + nextByte.
   * Subtracting it from the folded value leaves the code point.
   *   n = 2: (0xC0 << 6)  + 0x80
   *   n = 3: (0xE0 << 12) + (0x80 << 6)  + 0x80
   *   n = 4: (0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80
   * Entries 0 and 1 are zero.
   */
  static const uint32_t offsetsFromUTF8[5] = {
      0,
      0x00000000u,
      0x00003080u,
      0x000E2080u,
      0x03C82080u
  };
  45. static UBool hasCESU8Data(const UConverter *cnv)
  46. {
  47. #if UCONFIG_ONLY_HTML_CONVERSION
  48. return false;
  49. #else
  50. return cnv->sharedData == &_CESU8Data;
  51. #endif
  52. }
  53. U_CDECL_BEGIN
  /*
   * Converts UTF-8 bytes (or CESU-8 bytes when hasCESU8Data(cnv) is true) from
   * args->source into UTF-16 code units at args->target. No offsets are written.
   *
   * A multi-byte sequence cut short by the end of the input is saved in the
   * converter (toUBytes = bytes seen, toULength = how many, mode = expected
   * sequence length, toUnicodeStatus = value accumulated so far) and is resumed
   * at the morebytes label on the next call.
   *
   * *err is set to U_ILLEGAL_CHAR_FOUND for an invalid sequence (its bytes are
   * in toUBytes, their count in toULength) and to U_BUFFER_OVERFLOW_ERROR when
   * the target is full while input remains or a trail surrogate did not fit.
   * args->source and args->target are updated before returning.
   */
  54. static void U_CALLCONV ucnv_toUnicode_UTF8 (UConverterToUnicodeArgs * args,
  55. UErrorCode * err)
  56. {
  57. UConverter *cnv = args->converter;
  58. const unsigned char *mySource = (unsigned char *) args->source;
  59. char16_t *myTarget = args->target;
  60. const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
  61. const char16_t *targetLimit = args->targetLimit;
  62. unsigned char *toUBytes = cnv->toUBytes;
  63. UBool isCESU8 = hasCESU8Data(cnv);
  64. uint32_t ch, ch2 = 0;
  65. int32_t i, inBytes;
  66. /* Resume a sequence left unfinished by a previous call (only if there is room to output) */
  67. if (cnv->toULength > 0 && myTarget < targetLimit)
  68. {
  69. inBytes = cnv->mode; /* restore # of bytes to consume */
  70. i = cnv->toULength; /* restore # of bytes consumed */
  71. cnv->toULength = 0;
  72. ch = cnv->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/
  73. cnv->toUnicodeStatus = 0;
  74. goto morebytes;
  75. }
  76. while (mySource < sourceLimit && myTarget < targetLimit)
  77. {
  78. ch = *(mySource++);
  79. if (U8_IS_SINGLE(ch)) /* Simple case */
  80. {
  81. *(myTarget++) = (char16_t) ch;
  82. }
  83. else
  84. {
  85. /* store the first char */
  86. toUBytes[0] = (char)ch;
  87. inBytes = U8_COUNT_BYTES_NON_ASCII(ch); /* lookup current sequence length */
  88. i = 1;
  /* Entry point for both a fresh sequence and one resumed from the saved state above. */
  89. morebytes:
  90. while (i < inBytes)
  91. {
  92. if (mySource < sourceLimit)
  93. {
  94. toUBytes[i] = (char) (ch2 = *mySource);
  /* Reject an invalid trail byte without consuming it. CESU-8 additionally accepts
     any trail byte right after lead 0xED (see the CESU-8 note after this loop). */
  95. if (!icu::UTF8::isValidTrail(ch, static_cast<uint8_t>(ch2), i, inBytes) &&
  96. !(isCESU8 && i == 1 && ch == 0xed && U8_IS_TRAIL(ch2)))
  97. {
  98. break; /* i < inBytes */
  99. }
  100. ch = (ch << 6) + ch2;
  101. ++mySource;
  102. i++;
  103. }
  104. else
  105. {
  106. /* input exhausted mid-sequence: save the partially calculated value and lengths */
  107. cnv->toUnicodeStatus = ch;
  108. cnv->mode = inBytes;
  109. cnv->toULength = (int8_t) i;
  110. goto donefornow;
  111. }
  112. }
  113. // In CESU-8, only surrogates, not supplementary code points, are encoded directly.
  /* So a complete sequence longer than 3 bytes is treated as illegal for CESU-8. */
  114. if (i == inBytes && (!isCESU8 || i <= 3))
  115. {
  116. /* Remove the accumulated high bits */
  117. ch -= offsetsFromUTF8[inBytes];
  118. /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
  119. if (ch <= MAXIMUM_UCS2)
  120. {
  121. /* fits in 16 bits */
  122. *(myTarget++) = (char16_t) ch;
  123. }
  124. else
  125. {
  126. /* write out the surrogates */
  127. *(myTarget++) = U16_LEAD(ch);
  128. ch = U16_TRAIL(ch);
  129. if (myTarget < targetLimit)
  130. {
  131. *(myTarget++) = (char16_t)ch;
  132. }
  133. else
  134. {
  135. /* Target is full: keep the trail surrogate in the converter's UCharErrorBuffer */
  136. cnv->UCharErrorBuffer[0] = (char16_t) ch;
  137. cnv->UCharErrorBufferLength = 1;
  138. *err = U_BUFFER_OVERFLOW_ERROR;
  139. break;
  140. }
  141. }
  142. }
  143. else
  144. {
  /* Illegal sequence: record how many bytes of it are held in toUBytes. */
  145. cnv->toULength = (int8_t)i;
  146. *err = U_ILLEGAL_CHAR_FOUND;
  147. break;
  148. }
  149. }
  150. }
  151. donefornow:
  152. if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
  153. {
  154. /* End of target buffer */
  155. *err = U_BUFFER_OVERFLOW_ERROR;
  156. }
  /* Report how far the source and target were advanced. */
  157. args->target = myTarget;
  158. args->source = (const char *) mySource;
  159. }
  /*
   * Same conversion as ucnv_toUnicode_UTF8, but additionally writes one entry to
   * args->offsets for every char16_t written to the target. The entry is the
   * value of offsetNum when the sequence started; offsetNum begins at 0 on each
   * call and grows by the byte length of every sequence that was converted.
   *
   * Partial sequences at the end of input are saved in the converter and resumed
   * at the morebytes label on the next call. *err is set to U_ILLEGAL_CHAR_FOUND
   * for an invalid sequence and to U_BUFFER_OVERFLOW_ERROR when the target fills.
   * args->source, args->target and args->offsets are updated before returning.
   */
  160. static void U_CALLCONV ucnv_toUnicode_UTF8_OFFSETS_LOGIC (UConverterToUnicodeArgs * args,
  161. UErrorCode * err)
  162. {
  163. UConverter *cnv = args->converter;
  164. const unsigned char *mySource = (unsigned char *) args->source;
  165. char16_t *myTarget = args->target;
  166. int32_t *myOffsets = args->offsets;
  167. int32_t offsetNum = 0;
  168. const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
  169. const char16_t *targetLimit = args->targetLimit;
  170. unsigned char *toUBytes = cnv->toUBytes;
  171. UBool isCESU8 = hasCESU8Data(cnv);
  172. uint32_t ch, ch2 = 0;
  173. int32_t i, inBytes;
  174. /* Resume a sequence left unfinished by a previous call (only if there is room to output) */
  175. if (cnv->toULength > 0 && myTarget < targetLimit)
  176. {
  177. inBytes = cnv->mode; /* restore # of bytes to consume */
  178. i = cnv->toULength; /* restore # of bytes consumed */
  179. cnv->toULength = 0;
  180. ch = cnv->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/
  181. cnv->toUnicodeStatus = 0;
  182. goto morebytes;
  183. }
  184. while (mySource < sourceLimit && myTarget < targetLimit)
  185. {
  186. ch = *(mySource++);
  187. if (U8_IS_SINGLE(ch)) /* Simple case */
  188. {
  189. *(myTarget++) = (char16_t) ch;
  190. *(myOffsets++) = offsetNum++;
  191. }
  192. else
  193. {
  /* Keep the lead byte and look up the sequence length. */
  194. toUBytes[0] = (char)ch;
  195. inBytes = U8_COUNT_BYTES_NON_ASCII(ch);
  196. i = 1;
  /* Entry point for both a fresh sequence and one resumed from the saved state above. */
  197. morebytes:
  198. while (i < inBytes)
  199. {
  200. if (mySource < sourceLimit)
  201. {
  202. toUBytes[i] = (char) (ch2 = *mySource);
  /* Reject an invalid trail byte without consuming it. CESU-8 additionally accepts
     any trail byte right after lead 0xED (see the CESU-8 note after this loop). */
  203. if (!icu::UTF8::isValidTrail(ch, static_cast<uint8_t>(ch2), i, inBytes) &&
  204. !(isCESU8 && i == 1 && ch == 0xed && U8_IS_TRAIL(ch2)))
  205. {
  206. break; /* i < inBytes */
  207. }
  208. ch = (ch << 6) + ch2;
  209. ++mySource;
  210. i++;
  211. }
  212. else
  213. {
  /* Input exhausted mid-sequence: save the accumulated value and lengths for the next call. */
  214. cnv->toUnicodeStatus = ch;
  215. cnv->mode = inBytes;
  216. cnv->toULength = (int8_t)i;
  217. goto donefornow;
  218. }
  219. }
  220. // In CESU-8, only surrogates, not supplementary code points, are encoded directly.
  /* So a complete sequence longer than 3 bytes is treated as illegal for CESU-8. */
  221. if (i == inBytes && (!isCESU8 || i <= 3))
  222. {
  223. /* Remove the accumulated high bits */
  224. ch -= offsetsFromUTF8[inBytes];
  225. /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
  226. if (ch <= MAXIMUM_UCS2)
  227. {
  228. /* fits in 16 bits */
  229. *(myTarget++) = (char16_t) ch;
  230. *(myOffsets++) = offsetNum;
  231. }
  232. else
  233. {
  234. /* write out the surrogates */
  235. *(myTarget++) = U16_LEAD(ch);
  236. *(myOffsets++) = offsetNum;
  237. ch = U16_TRAIL(ch);
  238. if (myTarget < targetLimit)
  239. {
  240. *(myTarget++) = (char16_t)ch;
  241. *(myOffsets++) = offsetNum;
  242. }
  243. else
  244. {
  /* Target is full: keep the trail surrogate in the converter's UCharErrorBuffer.
     There is no break here; the outer loop condition ends the loop. */
  245. cnv->UCharErrorBuffer[0] = (char16_t) ch;
  246. cnv->UCharErrorBufferLength = 1;
  247. *err = U_BUFFER_OVERFLOW_ERROR;
  248. }
  249. }
  /* Advance the running offset by the byte length of the sequence just converted. */
  250. offsetNum += i;
  251. }
  252. else
  253. {
  /* Illegal sequence: record how many bytes of it are held in toUBytes. */
  254. cnv->toULength = (int8_t)i;
  255. *err = U_ILLEGAL_CHAR_FOUND;
  256. break;
  257. }
  258. }
  259. }
  260. donefornow:
  261. if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
  262. { /* End of target buffer */
  263. *err = U_BUFFER_OVERFLOW_ERROR;
  264. }
  /* Report how far the source, target and offsets were advanced. */
  265. args->target = myTarget;
  266. args->source = (const char *) mySource;
  267. args->offsets = myOffsets;
  268. }
  269. U_CDECL_END
  /*
   * Converts UTF-16 code units from args->source into UTF-8 bytes at
   * args->target. No offsets are written.
   *
   * For UTF-8, a lead surrogate followed by a trail surrogate is combined into
   * one supplementary code point and written as 4 bytes. For CESU-8
   * (isNotCESU8 false) the pairing branch is skipped, so each surrogate code
   * unit is written as its own 3-byte sequence.
   *
   * A surrogate that is the last unit of the input is stored in
   * cnv->fromUChar32 and picked up again at the lowsurrogate label on the next
   * call. An unpaired surrogate is stored there too, with *err set to
   * U_ILLEGAL_CHAR_FOUND. Bytes that do not fit in the target are placed in
   * cnv->charErrorBuffer and *err is set to U_BUFFER_OVERFLOW_ERROR.
   * args->source and args->target are updated before returning.
   */
  270. U_CFUNC void U_CALLCONV ucnv_fromUnicode_UTF8 (UConverterFromUnicodeArgs * args,
  271. UErrorCode * err)
  272. {
  273. UConverter *cnv = args->converter;
  274. const char16_t *mySource = args->source;
  275. const char16_t *sourceLimit = args->sourceLimit;
  276. uint8_t *myTarget = (uint8_t *) args->target;
  277. const uint8_t *targetLimit = (uint8_t *) args->targetLimit;
  278. uint8_t *tempPtr;
  279. UChar32 ch;
  280. uint8_t tempBuf[4];
  281. int32_t indexToWrite;
  282. UBool isNotCESU8 = !hasCESU8Data(cnv);
  /* Resume with a surrogate saved by a previous call, if there is room to output. */
  283. if (cnv->fromUChar32 && myTarget < targetLimit)
  284. {
  285. ch = cnv->fromUChar32;
  286. cnv->fromUChar32 = 0;
  287. goto lowsurrogate;
  288. }
  289. while (mySource < sourceLimit && myTarget < targetLimit)
  290. {
  291. ch = *(mySource++);
  292. if (ch < 0x80) /* Single byte */
  293. {
  294. *(myTarget++) = (uint8_t) ch;
  295. }
  296. else if (ch < 0x800) /* Double byte */
  297. {
  298. *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0);
  299. if (myTarget < targetLimit)
  300. {
  301. *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80);
  302. }
  303. else
  304. {
  /* No room for the second byte: park it in the converter's charErrorBuffer. */
  305. cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80);
  306. cnv->charErrorBufferLength = 1;
  307. *err = U_BUFFER_OVERFLOW_ERROR;
  308. }
  309. }
  310. else {
  311. /* Check for surrogates */
  312. if(U16_IS_SURROGATE(ch) && isNotCESU8) {
  313. lowsurrogate:
  314. if (mySource < sourceLimit) {
  315. /* test both code units */
  316. if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(*mySource)) {
  317. /* convert and consume this supplementary code point */
  318. ch=U16_GET_SUPPLEMENTARY(ch, *mySource);
  319. ++mySource;
  320. /* exit this condition tree */
  321. }
  322. else {
  323. /* this is an unpaired trail or lead code unit */
  324. /* callback(illegal) */
  325. cnv->fromUChar32 = ch;
  326. *err = U_ILLEGAL_CHAR_FOUND;
  327. break;
  328. }
  329. }
  330. else {
  331. /* no more input */
  332. cnv->fromUChar32 = ch;
  333. break;
  334. }
  335. }
  336. /* Do we write the buffer directly for speed,
  337. or do we have to be careful about target buffer space? */
  338. tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf);
  /* indexToWrite is the index of the last byte: 2 for a 3-byte, 3 for a 4-byte sequence. */
  339. if (ch <= MAXIMUM_UCS2) {
  340. indexToWrite = 2;
  341. tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0);
  342. }
  343. else {
  344. indexToWrite = 3;
  345. tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0);
  346. tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80);
  347. }
  348. tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80);
  349. tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80);
  350. if (tempPtr == myTarget) {
  351. /* There was enough space to write the codepoint directly. */
  352. myTarget += (indexToWrite + 1);
  353. }
  354. else {
  355. /* We might run out of room soon. Write it slowly. */
  /* Bytes beyond the target limit are appended to charErrorBuffer, starting at its current length. */
  356. for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) {
  357. if (myTarget < targetLimit) {
  358. *(myTarget++) = *tempPtr;
  359. }
  360. else {
  361. cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr;
  362. *err = U_BUFFER_OVERFLOW_ERROR;
  363. }
  364. }
  365. }
  366. }
  367. }
  /* Input remains but the target is full and no other error was reported. */
  368. if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
  369. {
  370. *err = U_BUFFER_OVERFLOW_ERROR;
  371. }
  372. args->target = (char *) myTarget;
  373. args->source = mySource;
  374. }
  /*
   * Same conversion as ucnv_fromUnicode_UTF8, but additionally writes one entry
   * to args->offsets for every byte written to the target. The entry is
   * offsetNum, the index (counted from the start of this call's source) of the
   * UTF-16 unit that began the character. A character resumed from
   * cnv->fromUChar32 gets offset -1.
   *
   * Bytes diverted to cnv->charErrorBuffer on overflow get no offset entry.
   * args->source, args->target and args->offsets are updated before returning.
   */
  375. U_CFUNC void U_CALLCONV ucnv_fromUnicode_UTF8_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args,
  376. UErrorCode * err)
  377. {
  378. UConverter *cnv = args->converter;
  379. const char16_t *mySource = args->source;
  380. int32_t *myOffsets = args->offsets;
  381. const char16_t *sourceLimit = args->sourceLimit;
  382. uint8_t *myTarget = (uint8_t *) args->target;
  383. const uint8_t *targetLimit = (uint8_t *) args->targetLimit;
  384. uint8_t *tempPtr;
  385. UChar32 ch;
  386. int32_t offsetNum, nextSourceIndex;
  387. int32_t indexToWrite;
  388. uint8_t tempBuf[4];
  389. UBool isNotCESU8 = !hasCESU8Data(cnv);
  /* Resume with a surrogate saved by a previous call, if there is room to output. */
  390. if (cnv->fromUChar32 && myTarget < targetLimit)
  391. {
  392. ch = cnv->fromUChar32;
  393. cnv->fromUChar32 = 0;
  /* The saved unit came from before this buffer, hence offset -1; the next new unit is index 0. */
  394. offsetNum = -1;
  395. nextSourceIndex = 0;
  396. goto lowsurrogate;
  397. } else {
  398. offsetNum = 0;
  399. }
  400. while (mySource < sourceLimit && myTarget < targetLimit)
  401. {
  402. ch = *(mySource++);
  403. if (ch < 0x80) /* Single byte */
  404. {
  405. *(myOffsets++) = offsetNum++;
  406. *(myTarget++) = (char) ch;
  407. }
  408. else if (ch < 0x800) /* Double byte */
  409. {
  410. *(myOffsets++) = offsetNum;
  411. *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0);
  412. if (myTarget < targetLimit)
  413. {
  414. *(myOffsets++) = offsetNum++;
  415. *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80);
  416. }
  417. else
  418. {
  /* No room for the second byte: park it in the converter's charErrorBuffer. */
  419. cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80);
  420. cnv->charErrorBufferLength = 1;
  421. *err = U_BUFFER_OVERFLOW_ERROR;
  422. }
  423. }
  424. else
  425. /* Check for surrogates */
  426. {
  /* Source index of the unit after this one; bumped again below if a trail surrogate is consumed. */
  427. nextSourceIndex = offsetNum + 1;
  428. if(U16_IS_SURROGATE(ch) && isNotCESU8) {
  429. lowsurrogate:
  430. if (mySource < sourceLimit) {
  431. /* test both code units */
  432. if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(*mySource)) {
  433. /* convert and consume this supplementary code point */
  434. ch=U16_GET_SUPPLEMENTARY(ch, *mySource);
  435. ++mySource;
  436. ++nextSourceIndex;
  437. /* exit this condition tree */
  438. }
  439. else {
  440. /* this is an unpaired trail or lead code unit */
  441. /* callback(illegal) */
  442. cnv->fromUChar32 = ch;
  443. *err = U_ILLEGAL_CHAR_FOUND;
  444. break;
  445. }
  446. }
  447. else {
  448. /* no more input */
  449. cnv->fromUChar32 = ch;
  450. break;
  451. }
  452. }
  453. /* Do we write the buffer directly for speed,
  454. or do we have to be careful about target buffer space? */
  455. tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf);
  /* indexToWrite is the index of the last byte: 2 for a 3-byte, 3 for a 4-byte sequence. */
  456. if (ch <= MAXIMUM_UCS2) {
  457. indexToWrite = 2;
  458. tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0);
  459. }
  460. else {
  461. indexToWrite = 3;
  462. tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0);
  463. tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80);
  464. }
  465. tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80);
  466. tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80);
  467. if (tempPtr == myTarget) {
  468. /* There was enough space to write the codepoint directly. */
  469. myTarget += (indexToWrite + 1);
  /* Every byte of the sequence gets the same source offset. */
  470. myOffsets[0] = offsetNum;
  471. myOffsets[1] = offsetNum;
  472. myOffsets[2] = offsetNum;
  473. if (indexToWrite >= 3) {
  474. myOffsets[3] = offsetNum;
  475. }
  476. myOffsets += (indexToWrite + 1);
  477. }
  478. else {
  479. /* We might run out of room soon. Write it slowly. */
  /* Bytes beyond the target limit are appended to charErrorBuffer and get no offset entry. */
  480. for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) {
  481. if (myTarget < targetLimit)
  482. {
  483. *(myOffsets++) = offsetNum;
  484. *(myTarget++) = *tempPtr;
  485. }
  486. else
  487. {
  488. cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr;
  489. *err = U_BUFFER_OVERFLOW_ERROR;
  490. }
  491. }
  492. }
  493. offsetNum = nextSourceIndex;
  494. }
  495. }
  /* Input remains but the target is full and no other error was reported. */
  496. if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
  497. {
  498. *err = U_BUFFER_OVERFLOW_ERROR;
  499. }
  500. args->target = (char *) myTarget;
  501. args->source = mySource;
  502. args->offsets = myOffsets;
  503. }
  504. U_CDECL_BEGIN
  505. static UChar32 U_CALLCONV ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs *args,
  506. UErrorCode *err) {
  507. UConverter *cnv;
  508. const uint8_t *sourceInitial;
  509. const uint8_t *source;
  510. uint8_t myByte;
  511. UChar32 ch;
  512. int8_t i;
  513. /* UTF-8 only here, the framework handles CESU-8 to combine surrogate pairs */
  514. cnv = args->converter;
  515. sourceInitial = source = (const uint8_t *)args->source;
  516. if (source >= (const uint8_t *)args->sourceLimit)
  517. {
  518. /* no input */
  519. *err = U_INDEX_OUTOFBOUNDS_ERROR;
  520. return 0xffff;
  521. }
  522. myByte = *(source++);
  523. if (U8_IS_SINGLE(myByte))
  524. {
  525. args->source = (const char *)source;
  526. return (UChar32)myByte;
  527. }
  528. uint16_t countTrailBytes = U8_COUNT_TRAIL_BYTES(myByte);
  529. if (countTrailBytes == 0) {
  530. cnv->toUBytes[0] = myByte;
  531. cnv->toULength = 1;
  532. *err = U_ILLEGAL_CHAR_FOUND;
  533. args->source = (const char *)source;
  534. return 0xffff;
  535. }
  536. /*The byte sequence is longer than the buffer area passed*/
  537. if (((const char *)source + countTrailBytes) > args->sourceLimit)
  538. {
  539. /* check if all of the remaining bytes are trail bytes */
  540. uint16_t extraBytesToWrite = countTrailBytes + 1;
  541. cnv->toUBytes[0] = myByte;
  542. i = 1;
  543. *err = U_TRUNCATED_CHAR_FOUND;
  544. while(source < (const uint8_t *)args->sourceLimit) {
  545. uint8_t b = *source;
  546. if(icu::UTF8::isValidTrail(myByte, b, i, extraBytesToWrite)) {
  547. cnv->toUBytes[i++] = b;
  548. ++source;
  549. } else {
  550. /* error even before we run out of input */
  551. *err = U_ILLEGAL_CHAR_FOUND;
  552. break;
  553. }
  554. }
  555. cnv->toULength = i;
  556. args->source = (const char *)source;
  557. return 0xffff;
  558. }
  559. ch = myByte << 6;
  560. if(countTrailBytes == 2) {
  561. uint8_t t1 = *source, t2;
  562. if(U8_IS_VALID_LEAD3_AND_T1(myByte, t1) && U8_IS_TRAIL(t2 = *++source)) {
  563. args->source = (const char *)(source + 1);
  564. return (((ch + t1) << 6) + t2) - offsetsFromUTF8[3];
  565. }
  566. } else if(countTrailBytes == 1) {
  567. uint8_t t1 = *source;
  568. if(U8_IS_TRAIL(t1)) {
  569. args->source = (const char *)(source + 1);
  570. return (ch + t1) - offsetsFromUTF8[2];
  571. }
  572. } else { // countTrailBytes == 3
  573. uint8_t t1 = *source, t2, t3;
  574. if(U8_IS_VALID_LEAD4_AND_T1(myByte, t1) && U8_IS_TRAIL(t2 = *++source) &&
  575. U8_IS_TRAIL(t3 = *++source)) {
  576. args->source = (const char *)(source + 1);
  577. return (((((ch + t1) << 6) + t2) << 6) + t3) - offsetsFromUTF8[4];
  578. }
  579. }
  580. args->source = (const char *)source;
  581. for(i = 0; sourceInitial < source; ++i) {
  582. cnv->toUBytes[i] = *sourceInitial++;
  583. }
  584. cnv->toULength = i;
  585. *err = U_ILLEGAL_CHAR_FOUND;
  586. return 0xffff;
  587. }
  588. U_CDECL_END
  589. /* UTF-8-from-UTF-8 conversion functions ------------------------------------ */
  590. U_CDECL_BEGIN
  591. /* "Convert" UTF-8 to UTF-8: Validate and copy. Modified from ucnv_DBCSFromUTF8(). */
  /*
   * Validates UTF-8 read from pToUArgs->source and copies it unchanged to
   * pFromUArgs->target.
   *
   * State for a sequence split across buffers lives in pToUArgs->converter:
   * toUBytes/toULength hold the bytes seen so far, mode the expected sequence
   * length, and toUnicodeStatus the value accumulated so far.
   *
   * *pErrorCode is set to
   *  - U_ILLEGAL_CHAR_FOUND for an invalid byte sequence (bytes in toUBytes),
   *  - U_BUFFER_OVERFLOW_ERROR when the target is exactly full and input remains,
   *  - U_USING_DEFAULT_WARNING when this function leaves the remaining work to
   *    the standard conversion path (see the comments at those sites).
   * pToUArgs->source and pFromUArgs->target are updated on every exit except
   * the early U_USING_DEFAULT_WARNING return, where nothing has been consumed.
   */
  592. static void U_CALLCONV
  593. ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
  594. UConverterToUnicodeArgs *pToUArgs,
  595. UErrorCode *pErrorCode) {
  596. UConverter *utf8;
  597. const uint8_t *source, *sourceLimit;
  598. uint8_t *target;
  599. int32_t targetCapacity;
  600. int32_t count;
  601. int8_t oldToULength, toULength, toULimit;
  602. UChar32 c;
  603. uint8_t b, t1, t2;
  604. /* set up the local pointers */
  605. utf8=pToUArgs->converter;
  606. source=(uint8_t *)pToUArgs->source;
  607. sourceLimit=(uint8_t *)pToUArgs->sourceLimit;
  608. target=(uint8_t *)pFromUArgs->target;
  609. targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);
  610. /* get the converter state from the UTF-8 UConverter */
  611. if(utf8->toULength > 0) {
  612. toULength=oldToULength=utf8->toULength;
  613. toULimit=(int8_t)utf8->mode;
  614. c=(UChar32)utf8->toUnicodeStatus;
  615. } else {
  616. toULength=oldToULength=toULimit=0;
  617. c = 0;
  618. }
  /* count = bytes available for the loop, including those already held from the previous buffer */
  619. count=(int32_t)(sourceLimit-source)+oldToULength;
  620. if(count<toULimit) {
  621. /*
  622. * Not enough input to complete the partial character.
  623. * Jump to moreBytes below - it will not output to target.
  624. */
  625. } else if(targetCapacity<toULimit) {
  626. /*
  627. * Not enough target capacity to output the partial character.
  628. * Let the standard converter handle this.
  629. */
  630. *pErrorCode=U_USING_DEFAULT_WARNING;
  631. return;
  632. } else {
  633. // Use a single counter for source and target, counting the minimum of
  634. // the source length and the target capacity.
  635. // Let the standard converter handle edge cases.
  636. if(count>targetCapacity) {
  637. count=targetCapacity;
  638. }
  639. // The conversion loop checks count>0 only once per character.
  640. // If the buffer ends with a truncated sequence,
  641. // then we reduce the count to stop before that,
  642. // and collect the remaining bytes after the conversion loop.
  643. // Do not go back into the bytes that will be read for finishing a partial
  644. // sequence from the previous buffer.
  645. int32_t length=count-toULength;
  646. U8_TRUNCATE_IF_INCOMPLETE(source, 0, length);
  647. count=toULength+length;
  648. }
  /* A nonzero c means a partial character is pending: clear the saved state and continue it. */
  649. if(c!=0) {
  650. utf8->toUnicodeStatus=0;
  651. utf8->toULength=0;
  652. goto moreBytes;
  653. /* See note in ucnv_SBCSFromUTF8() about this goto. */
  654. }
  655. /* conversion loop */
  656. while(count>0) {
  657. b=*source++;
  658. if(U8_IS_SINGLE(b)) {
  659. /* convert ASCII */
  660. *target++=b;
  661. --count;
  662. continue;
  663. } else {
  /* The inline paths read trail bytes without a bounds check; they rely on
     count having been trimmed above to end on a complete sequence. */
  664. if(b>=0xe0) {
  665. if( /* handle U+0800..U+FFFF inline */
  666. b<0xf0 &&
  667. U8_IS_VALID_LEAD3_AND_T1(b, t1=source[0]) &&
  668. U8_IS_TRAIL(t2=source[1])
  669. ) {
  670. source+=2;
  671. *target++=b;
  672. *target++=t1;
  673. *target++=t2;
  674. count-=3;
  675. continue;
  676. }
  677. } else {
  678. if( /* handle U+0080..U+07FF inline */
  679. b>=0xc2 &&
  680. U8_IS_TRAIL(t1=*source)
  681. ) {
  682. ++source;
  683. *target++=b;
  684. *target++=t1;
  685. count-=2;
  686. continue;
  687. }
  688. }
  689. /* handle "complicated" and error cases, and continuing partial characters */
  690. oldToULength=0;
  691. toULength=1;
  692. toULimit=U8_COUNT_BYTES_NON_ASCII(b);
  693. c=b;
  694. moreBytes:
  695. while(toULength<toULimit) {
  696. if(source<sourceLimit) {
  697. b=*source;
  698. if(icu::UTF8::isValidTrail(c, b, toULength, toULimit)) {
  699. ++source;
  700. ++toULength;
  701. c=(c<<6)+b;
  702. } else {
  703. break; /* sequence too short, stop with toULength<toULimit */
  704. }
  705. } else {
  706. /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */
  /* Rewind to the first byte read from this buffer; earlier bytes are already in toUBytes. */
  707. source-=(toULength-oldToULength);
  708. while(oldToULength<toULength) {
  709. utf8->toUBytes[oldToULength++]=*source++;
  710. }
  711. utf8->toUnicodeStatus=c;
  712. utf8->toULength=toULength;
  713. utf8->mode=toULimit;
  714. pToUArgs->source=(char *)source;
  715. pFromUArgs->target=(char *)target;
  716. return;
  717. }
  718. }
  719. if(toULength!=toULimit) {
  720. /* error handling: illegal UTF-8 byte sequence */
  /* Copy the bytes read from this buffer into toUBytes, after any carried over from before. */
  721. source-=(toULength-oldToULength);
  722. while(oldToULength<toULength) {
  723. utf8->toUBytes[oldToULength++]=*source++;
  724. }
  725. utf8->toULength=toULength;
  726. pToUArgs->source=(char *)source;
  727. pFromUArgs->target=(char *)target;
  728. *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  729. return;
  730. }
  731. /* copy the legal byte sequence to the target */
  732. {
  733. int8_t i;
  /* First the bytes carried over in toUBytes, then the ones read from this buffer. */
  734. for(i=0; i<oldToULength; ++i) {
  735. *target++=utf8->toUBytes[i];
  736. }
  737. source-=(toULength-oldToULength);
  738. for(; i<toULength; ++i) {
  739. *target++=*source++;
  740. }
  741. count-=toULength;
  742. }
  743. }
  744. }
  745. U_ASSERT(count>=0);
  /* Input remains after the loop: the target is full, or a truncated sequence ends the source. */
  746. if(U_SUCCESS(*pErrorCode) && source<sourceLimit) {
  747. if(target==(const uint8_t *)pFromUArgs->targetLimit) {
  748. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  749. } else {
  750. b=*source;
  751. toULimit=U8_COUNT_BYTES(b);
  752. if(toULimit>(sourceLimit-source)) {
  753. /* collect a truncated byte sequence */
  754. toULength=0;
  755. c=b;
  756. for(;;) {
  757. utf8->toUBytes[toULength++]=b;
  758. if(++source==sourceLimit) {
  759. /* partial byte sequence at end of source */
  760. utf8->toUnicodeStatus=c;
  761. utf8->toULength=toULength;
  762. utf8->mode=toULimit;
  763. break;
  764. } else if(!icu::UTF8::isValidTrail(c, b=*source, toULength, toULimit)) {
  /* invalid trail byte: report the bytes collected so far as illegal */
  765. utf8->toULength=toULength;
  766. *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  767. break;
  768. }
  769. c=(c<<6)+b;
  770. }
  771. } else {
  772. /* partial-sequence target overflow: fall back to the pivoting implementation */
  773. *pErrorCode=U_USING_DEFAULT_WARNING;
  774. }
  775. }
  776. }
  777. /* write back the updated pointers */
  778. pToUArgs->source=(char *)source;
  779. pFromUArgs->target=(char *)target;
  780. }
  781. U_CDECL_END
  782. /* UTF-8 converter data ----------------------------------------------------- */
  /*
   * Implementation table for the UTF-8 converter. The initializer is positional:
   * which field each slot fills is determined by UConverterImpl, which is
   * declared outside this file. nullptr slots supply no function in this table.
   */
  783. static const UConverterImpl _UTF8Impl={
  784. UCNV_UTF8,
  785. nullptr,
  786. nullptr,
  787. nullptr,
  788. nullptr,
  789. nullptr,
  790. ucnv_toUnicode_UTF8, /* bytes to UTF-16, no offsets */
  791. ucnv_toUnicode_UTF8_OFFSETS_LOGIC, /* bytes to UTF-16, with offsets */
  792. ucnv_fromUnicode_UTF8, /* UTF-16 to bytes, no offsets */
  793. ucnv_fromUnicode_UTF8_OFFSETS_LOGIC, /* UTF-16 to bytes, with offsets */
  794. ucnv_getNextUChar_UTF8, /* single code point reader */
  795. nullptr,
  796. nullptr,
  797. nullptr,
  798. nullptr,
  799. ucnv_getNonSurrogateUnicodeSet,
  /* The same validate-and-copy routine is installed in both of the last two slots. */
  800. ucnv_UTF8FromUTF8,
  801. ucnv_UTF8FromUTF8
  802. };
  /*
   * Static description and shared-data object for the UTF-8 converter.
   * The initializer is positional; field meanings come from
   * UConverterStaticData, which is declared outside this file.
   */
  803. /* CCSID 1208 refers to UTF-8 in any version of Unicode */
  804. static const UConverterStaticData _UTF8StaticData={
  805. sizeof(UConverterStaticData),
  806. "UTF-8",
  807. 1208, UCNV_IBM, UCNV_UTF8,
  808. 1, 3, /* max 3 bytes per char16_t from UTF-8 (4 bytes from surrogate _pair_) */
  /* NOTE(review): EF BF BD followed by 3 looks like the substitution bytes and their length - confirm against UConverterStaticData */
  809. { 0xef, 0xbf, 0xbd, 0 },3,false,false,
  810. 0,
  811. 0,
  812. { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
  813. };
  /* Shared data object that ties the static data to the implementation table. */
  814. const UConverterSharedData _UTF8Data=
  815. UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF8StaticData, &_UTF8Impl);
  816. /* CESU-8 converter data ---------------------------------------------------- */
  /*
   * Implementation table for the CESU-8 converter. It installs the same four
   * conversion functions as _UTF8Impl; those functions branch on
   * hasCESU8Data() for the CESU-8 specifics. The slots where _UTF8Impl has
   * ucnv_getNextUChar_UTF8 and ucnv_UTF8FromUTF8 are nullptr here, and the
   * Unicode-set slot uses ucnv_getCompleteUnicodeSet instead of
   * ucnv_getNonSurrogateUnicodeSet.
   */
  817. static const UConverterImpl _CESU8Impl={
  818. UCNV_CESU8,
  819. nullptr,
  820. nullptr,
  821. nullptr,
  822. nullptr,
  823. nullptr,
  824. ucnv_toUnicode_UTF8, /* bytes to UTF-16, no offsets */
  825. ucnv_toUnicode_UTF8_OFFSETS_LOGIC, /* bytes to UTF-16, with offsets */
  826. ucnv_fromUnicode_UTF8, /* UTF-16 to bytes, no offsets */
  827. ucnv_fromUnicode_UTF8_OFFSETS_LOGIC, /* UTF-16 to bytes, with offsets */
  828. nullptr,
  829. nullptr,
  830. nullptr,
  831. nullptr,
  832. nullptr,
  833. ucnv_getCompleteUnicodeSet,
  834. nullptr,
  835. nullptr
  836. };
  /*
   * Static description and shared-data object for the CESU-8 converter.
   * The initializer is positional; field meanings come from
   * UConverterStaticData, which is declared outside this file.
   * hasCESU8Data() identifies a CESU-8 converter by comparing against the
   * address of _CESU8Data defined here.
   */
  837. static const UConverterStaticData _CESU8StaticData={
  838. sizeof(UConverterStaticData),
  839. "CESU-8",
  840. 9400, /* CCSID for CESU-8 */
  /* NOTE(review): "1, 3" presumably are the min/max bytes per char16_t, as in the UTF-8 entry - confirm */
  841. UCNV_UNKNOWN, UCNV_CESU8, 1, 3,
  /* NOTE(review): EF BF BD followed by 3 looks like the substitution bytes and their length - confirm */
  842. { 0xef, 0xbf, 0xbd, 0 },3,false,false,
  843. 0,
  844. 0,
  845. { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
  846. };
  /* Shared data object that ties the static data to the implementation table. */
  847. const UConverterSharedData _CESU8Data=
  848. UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_CESU8StaticData, &_CESU8Impl);
  849. #endif