ustrtrns.cpp 49 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. ******************************************************************************
  5. *
  6. * Copyright (C) 2001-2016, International Business Machines
  7. * Corporation and others. All Rights Reserved.
  8. *
  9. ******************************************************************************
  10. *
  11. * File ustrtrns.cpp
  12. *
  13. * Modification History:
  14. *
  15. * Date Name Description
  16. * 9/10/2001 Ram Creation.
  17. ******************************************************************************
  18. */
  19. /*******************************************************************************
  20. *
  21. * u_strTo* and u_strFrom* APIs
  22. * WCS functions moved to ustr_wcs.c for better modularization
  23. *
  24. *******************************************************************************
  25. */
  26. #include "unicode/putil.h"
  27. #include "unicode/ustring.h"
  28. #include "unicode/utf.h"
  29. #include "unicode/utf8.h"
  30. #include "unicode/utf16.h"
  31. #include "cstring.h"
  32. #include "cmemory.h"
  33. #include "ustr_imp.h"
  34. #include "uassert.h"
  35. U_CAPI char16_t* U_EXPORT2
  36. u_strFromUTF32WithSub(char16_t *dest,
  37. int32_t destCapacity,
  38. int32_t *pDestLength,
  39. const UChar32 *src,
  40. int32_t srcLength,
  41. UChar32 subchar, int32_t *pNumSubstitutions,
  42. UErrorCode *pErrorCode) {
  43. const UChar32 *srcLimit;
  44. UChar32 ch;
  45. char16_t *destLimit;
  46. char16_t *pDest;
  47. int32_t reqLength;
  48. int32_t numSubstitutions;
  49. /* args check */
  50. if(U_FAILURE(*pErrorCode)){
  51. return nullptr;
  52. }
  53. if( (src==nullptr && srcLength!=0) || srcLength < -1 ||
  54. (destCapacity<0) || (dest == nullptr && destCapacity > 0) ||
  55. subchar > 0x10ffff || U_IS_SURROGATE(subchar)
  56. ) {
  57. *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
  58. return nullptr;
  59. }
  60. if(pNumSubstitutions != nullptr) {
  61. *pNumSubstitutions = 0;
  62. }
  63. pDest = dest;
  64. destLimit = (dest!=nullptr)?(dest + destCapacity):nullptr;
  65. reqLength = 0;
  66. numSubstitutions = 0;
  67. if(srcLength < 0) {
  68. /* simple loop for conversion of a NUL-terminated BMP string */
  69. while((ch=*src) != 0 &&
  70. ((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff))) {
  71. ++src;
  72. if(pDest < destLimit) {
  73. *pDest++ = (char16_t)ch;
  74. } else {
  75. ++reqLength;
  76. }
  77. }
  78. srcLimit = src;
  79. if(ch != 0) {
  80. /* "complicated" case, find the end of the remaining string */
  81. while(*++srcLimit != 0) {}
  82. }
  83. } else {
  84. srcLimit = (src!=nullptr)?(src + srcLength):nullptr;
  85. }
  86. /* convert with length */
  87. while(src < srcLimit) {
  88. ch = *src++;
  89. do {
  90. /* usually "loops" once; twice only for writing subchar */
  91. if((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff)) {
  92. if(pDest < destLimit) {
  93. *pDest++ = (char16_t)ch;
  94. } else {
  95. ++reqLength;
  96. }
  97. break;
  98. } else if(0x10000 <= ch && ch <= 0x10ffff) {
  99. if(pDest!=nullptr && ((pDest + 2) <= destLimit)) {
  100. *pDest++ = U16_LEAD(ch);
  101. *pDest++ = U16_TRAIL(ch);
  102. } else {
  103. reqLength += 2;
  104. }
  105. break;
  106. } else if((ch = subchar) < 0) {
  107. /* surrogate code point, or not a Unicode code point at all */
  108. *pErrorCode = U_INVALID_CHAR_FOUND;
  109. return nullptr;
  110. } else {
  111. ++numSubstitutions;
  112. }
  113. } while(true);
  114. }
  115. reqLength += (int32_t)(pDest - dest);
  116. if(pDestLength) {
  117. *pDestLength = reqLength;
  118. }
  119. if(pNumSubstitutions != nullptr) {
  120. *pNumSubstitutions = numSubstitutions;
  121. }
  122. /* Terminate the buffer */
  123. u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
  124. return dest;
  125. }
  126. U_CAPI char16_t* U_EXPORT2
  127. u_strFromUTF32(char16_t *dest,
  128. int32_t destCapacity,
  129. int32_t *pDestLength,
  130. const UChar32 *src,
  131. int32_t srcLength,
  132. UErrorCode *pErrorCode) {
  133. return u_strFromUTF32WithSub(
  134. dest, destCapacity, pDestLength,
  135. src, srcLength,
  136. U_SENTINEL, nullptr,
  137. pErrorCode);
  138. }
  139. U_CAPI UChar32* U_EXPORT2
  140. u_strToUTF32WithSub(UChar32 *dest,
  141. int32_t destCapacity,
  142. int32_t *pDestLength,
  143. const char16_t *src,
  144. int32_t srcLength,
  145. UChar32 subchar, int32_t *pNumSubstitutions,
  146. UErrorCode *pErrorCode) {
  147. const char16_t *srcLimit;
  148. UChar32 ch;
  149. char16_t ch2;
  150. UChar32 *destLimit;
  151. UChar32 *pDest;
  152. int32_t reqLength;
  153. int32_t numSubstitutions;
  154. /* args check */
  155. if(U_FAILURE(*pErrorCode)){
  156. return nullptr;
  157. }
  158. if( (src==nullptr && srcLength!=0) || srcLength < -1 ||
  159. (destCapacity<0) || (dest == nullptr && destCapacity > 0) ||
  160. subchar > 0x10ffff || U_IS_SURROGATE(subchar)
  161. ) {
  162. *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
  163. return nullptr;
  164. }
  165. if(pNumSubstitutions != nullptr) {
  166. *pNumSubstitutions = 0;
  167. }
  168. pDest = dest;
  169. destLimit = (dest!=nullptr)?(dest + destCapacity):nullptr;
  170. reqLength = 0;
  171. numSubstitutions = 0;
  172. if(srcLength < 0) {
  173. /* simple loop for conversion of a NUL-terminated BMP string */
  174. while((ch=*src) != 0 && !U16_IS_SURROGATE(ch)) {
  175. ++src;
  176. if(pDest < destLimit) {
  177. *pDest++ = ch;
  178. } else {
  179. ++reqLength;
  180. }
  181. }
  182. srcLimit = src;
  183. if(ch != 0) {
  184. /* "complicated" case, find the end of the remaining string */
  185. while(*++srcLimit != 0) {}
  186. }
  187. } else {
  188. srcLimit = (src!=nullptr)?(src + srcLength):nullptr;
  189. }
  190. /* convert with length */
  191. while(src < srcLimit) {
  192. ch = *src++;
  193. if(!U16_IS_SURROGATE(ch)) {
  194. /* write or count ch below */
  195. } else if(U16_IS_SURROGATE_LEAD(ch) && src < srcLimit && U16_IS_TRAIL(ch2 = *src)) {
  196. ++src;
  197. ch = U16_GET_SUPPLEMENTARY(ch, ch2);
  198. } else if((ch = subchar) < 0) {
  199. /* unpaired surrogate */
  200. *pErrorCode = U_INVALID_CHAR_FOUND;
  201. return nullptr;
  202. } else {
  203. ++numSubstitutions;
  204. }
  205. if(pDest < destLimit) {
  206. *pDest++ = ch;
  207. } else {
  208. ++reqLength;
  209. }
  210. }
  211. reqLength += (int32_t)(pDest - dest);
  212. if(pDestLength) {
  213. *pDestLength = reqLength;
  214. }
  215. if(pNumSubstitutions != nullptr) {
  216. *pNumSubstitutions = numSubstitutions;
  217. }
  218. /* Terminate the buffer */
  219. u_terminateUChar32s(dest, destCapacity, reqLength, pErrorCode);
  220. return dest;
  221. }
  222. U_CAPI UChar32* U_EXPORT2
  223. u_strToUTF32(UChar32 *dest,
  224. int32_t destCapacity,
  225. int32_t *pDestLength,
  226. const char16_t *src,
  227. int32_t srcLength,
  228. UErrorCode *pErrorCode) {
  229. return u_strToUTF32WithSub(
  230. dest, destCapacity, pDestLength,
  231. src, srcLength,
  232. U_SENTINEL, nullptr,
  233. pErrorCode);
  234. }
  235. U_CAPI char16_t* U_EXPORT2
  236. u_strFromUTF8WithSub(char16_t *dest,
  237. int32_t destCapacity,
  238. int32_t *pDestLength,
  239. const char* src,
  240. int32_t srcLength,
  241. UChar32 subchar, int32_t *pNumSubstitutions,
  242. UErrorCode *pErrorCode){
  243. /* args check */
  244. if(U_FAILURE(*pErrorCode)) {
  245. return nullptr;
  246. }
  247. if( (src==nullptr && srcLength!=0) || srcLength < -1 ||
  248. (destCapacity<0) || (dest == nullptr && destCapacity > 0) ||
  249. subchar > 0x10ffff || U_IS_SURROGATE(subchar)
  250. ) {
  251. *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
  252. return nullptr;
  253. }
  254. if(pNumSubstitutions!=nullptr) {
  255. *pNumSubstitutions=0;
  256. }
  257. char16_t *pDest = dest;
  258. char16_t *pDestLimit = dest+destCapacity;
  259. int32_t reqLength = 0;
  260. int32_t numSubstitutions=0;
  261. /*
  262. * Inline processing of UTF-8 byte sequences:
  263. *
  264. * Byte sequences for the most common characters are handled inline in
  265. * the conversion loops. In order to reduce the path lengths for those
  266. * characters, the tests are arranged in a kind of binary search.
  267. * ASCII (<=0x7f) is checked first, followed by the dividing point
  268. * between 2- and 3-byte sequences (0xe0).
  269. * The 3-byte branch is tested first to speed up CJK text.
  270. * The compiler should combine the subtractions for the two tests for 0xe0.
  271. * Each branch then tests for the other end of its range.
  272. */
  273. if(srcLength < 0){
  274. /*
  275. * Transform a NUL-terminated string.
  276. * The code explicitly checks for NULs only in the lead byte position.
  277. * A NUL byte in the trail byte position fails the trail byte range check anyway.
  278. */
  279. int32_t i;
  280. UChar32 c;
  281. for(i = 0; (c = (uint8_t)src[i]) != 0 && (pDest < pDestLimit);) {
  282. // modified copy of U8_NEXT()
  283. ++i;
  284. if(U8_IS_SINGLE(c)) {
  285. *pDest++=(char16_t)c;
  286. } else {
  287. uint8_t __t1, __t2;
  288. if( /* handle U+0800..U+FFFF inline */
  289. (0xe0<=(c) && (c)<0xf0) &&
  290. U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
  291. (__t2=src[(i)+1]-0x80)<=0x3f) {
  292. *pDest++ = (((c)&0xf)<<12)|((src[i]&0x3f)<<6)|__t2;
  293. i+=2;
  294. } else if( /* handle U+0080..U+07FF inline */
  295. ((c)<0xe0 && (c)>=0xc2) &&
  296. (__t1=src[i]-0x80)<=0x3f) {
  297. *pDest++ = (((c)&0x1f)<<6)|__t1;
  298. ++(i);
  299. } else {
  300. /* function call for "complicated" and error cases */
  301. (c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), -1, c, -1);
  302. if(c<0 && (++numSubstitutions, c = subchar) < 0) {
  303. *pErrorCode = U_INVALID_CHAR_FOUND;
  304. return nullptr;
  305. } else if(c<=0xFFFF) {
  306. *(pDest++)=(char16_t)c;
  307. } else {
  308. *(pDest++)=U16_LEAD(c);
  309. if(pDest<pDestLimit) {
  310. *(pDest++)=U16_TRAIL(c);
  311. } else {
  312. reqLength++;
  313. break;
  314. }
  315. }
  316. }
  317. }
  318. }
  319. /* Pre-flight the rest of the string. */
  320. while((c = (uint8_t)src[i]) != 0) {
  321. // modified copy of U8_NEXT()
  322. ++i;
  323. if(U8_IS_SINGLE(c)) {
  324. ++reqLength;
  325. } else {
  326. uint8_t __t1, __t2;
  327. if( /* handle U+0800..U+FFFF inline */
  328. (0xe0<=(c) && (c)<0xf0) &&
  329. U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
  330. (__t2=src[(i)+1]-0x80)<=0x3f) {
  331. ++reqLength;
  332. i+=2;
  333. } else if( /* handle U+0080..U+07FF inline */
  334. ((c)<0xe0 && (c)>=0xc2) &&
  335. (__t1=src[i]-0x80)<=0x3f) {
  336. ++reqLength;
  337. ++(i);
  338. } else {
  339. /* function call for "complicated" and error cases */
  340. (c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), -1, c, -1);
  341. if(c<0 && (++numSubstitutions, c = subchar) < 0) {
  342. *pErrorCode = U_INVALID_CHAR_FOUND;
  343. return nullptr;
  344. }
  345. reqLength += U16_LENGTH(c);
  346. }
  347. }
  348. }
  349. } else /* srcLength >= 0 */ {
  350. /* Faster loop without ongoing checking for srcLength and pDestLimit. */
  351. int32_t i = 0;
  352. UChar32 c;
  353. for(;;) {
  354. /*
  355. * Each iteration of the inner loop progresses by at most 3 UTF-8
  356. * bytes and one char16_t, for most characters.
  357. * For supplementary code points (4 & 2), which are rare,
  358. * there is an additional adjustment.
  359. */
  360. int32_t count = (int32_t)(pDestLimit - pDest);
  361. int32_t count2 = (srcLength - i) / 3;
  362. if(count > count2) {
  363. count = count2; /* min(remaining dest, remaining src/3) */
  364. }
  365. if(count < 3) {
  366. /*
  367. * Too much overhead if we get near the end of the string,
  368. * continue with the next loop.
  369. */
  370. break;
  371. }
  372. do {
  373. // modified copy of U8_NEXT()
  374. c = (uint8_t)src[i++];
  375. if(U8_IS_SINGLE(c)) {
  376. *pDest++=(char16_t)c;
  377. } else {
  378. uint8_t __t1, __t2;
  379. if( /* handle U+0800..U+FFFF inline */
  380. (0xe0<=(c) && (c)<0xf0) &&
  381. ((i)+1)<srcLength &&
  382. U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
  383. (__t2=src[(i)+1]-0x80)<=0x3f) {
  384. *pDest++ = (((c)&0xf)<<12)|((src[i]&0x3f)<<6)|__t2;
  385. i+=2;
  386. } else if( /* handle U+0080..U+07FF inline */
  387. ((c)<0xe0 && (c)>=0xc2) &&
  388. ((i)!=srcLength) &&
  389. (__t1=src[i]-0x80)<=0x3f) {
  390. *pDest++ = (((c)&0x1f)<<6)|__t1;
  391. ++(i);
  392. } else {
  393. if(c >= 0xf0 || subchar > 0xffff) {
  394. // We may read up to four bytes and write up to two UChars,
  395. // which we didn't account for with computing count,
  396. // so we adjust it here.
  397. if(--count == 0) {
  398. --i; // back out byte c
  399. break;
  400. }
  401. }
  402. /* function call for "complicated" and error cases */
  403. (c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, c, -1);
  404. if(c<0 && (++numSubstitutions, c = subchar) < 0) {
  405. *pErrorCode = U_INVALID_CHAR_FOUND;
  406. return nullptr;
  407. } else if(c<=0xFFFF) {
  408. *(pDest++)=(char16_t)c;
  409. } else {
  410. *(pDest++)=U16_LEAD(c);
  411. *(pDest++)=U16_TRAIL(c);
  412. }
  413. }
  414. }
  415. } while(--count > 0);
  416. }
  417. while(i < srcLength && (pDest < pDestLimit)) {
  418. // modified copy of U8_NEXT()
  419. c = (uint8_t)src[i++];
  420. if(U8_IS_SINGLE(c)) {
  421. *pDest++=(char16_t)c;
  422. } else {
  423. uint8_t __t1, __t2;
  424. if( /* handle U+0800..U+FFFF inline */
  425. (0xe0<=(c) && (c)<0xf0) &&
  426. ((i)+1)<srcLength &&
  427. U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
  428. (__t2=src[(i)+1]-0x80)<=0x3f) {
  429. *pDest++ = (((c)&0xf)<<12)|((src[i]&0x3f)<<6)|__t2;
  430. i+=2;
  431. } else if( /* handle U+0080..U+07FF inline */
  432. ((c)<0xe0 && (c)>=0xc2) &&
  433. ((i)!=srcLength) &&
  434. (__t1=src[i]-0x80)<=0x3f) {
  435. *pDest++ = (((c)&0x1f)<<6)|__t1;
  436. ++(i);
  437. } else {
  438. /* function call for "complicated" and error cases */
  439. (c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, c, -1);
  440. if(c<0 && (++numSubstitutions, c = subchar) < 0) {
  441. *pErrorCode = U_INVALID_CHAR_FOUND;
  442. return nullptr;
  443. } else if(c<=0xFFFF) {
  444. *(pDest++)=(char16_t)c;
  445. } else {
  446. *(pDest++)=U16_LEAD(c);
  447. if(pDest<pDestLimit) {
  448. *(pDest++)=U16_TRAIL(c);
  449. } else {
  450. reqLength++;
  451. break;
  452. }
  453. }
  454. }
  455. }
  456. }
  457. /* Pre-flight the rest of the string. */
  458. while(i < srcLength) {
  459. // modified copy of U8_NEXT()
  460. c = (uint8_t)src[i++];
  461. if(U8_IS_SINGLE(c)) {
  462. ++reqLength;
  463. } else {
  464. uint8_t __t1, __t2;
  465. if( /* handle U+0800..U+FFFF inline */
  466. (0xe0<=(c) && (c)<0xf0) &&
  467. ((i)+1)<srcLength &&
  468. U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
  469. (__t2=src[(i)+1]-0x80)<=0x3f) {
  470. ++reqLength;
  471. i+=2;
  472. } else if( /* handle U+0080..U+07FF inline */
  473. ((c)<0xe0 && (c)>=0xc2) &&
  474. ((i)!=srcLength) &&
  475. (__t1=src[i]-0x80)<=0x3f) {
  476. ++reqLength;
  477. ++(i);
  478. } else {
  479. /* function call for "complicated" and error cases */
  480. (c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, c, -1);
  481. if(c<0 && (++numSubstitutions, c = subchar) < 0) {
  482. *pErrorCode = U_INVALID_CHAR_FOUND;
  483. return nullptr;
  484. }
  485. reqLength += U16_LENGTH(c);
  486. }
  487. }
  488. }
  489. }
  490. reqLength+=(int32_t)(pDest - dest);
  491. if(pNumSubstitutions!=nullptr) {
  492. *pNumSubstitutions=numSubstitutions;
  493. }
  494. if(pDestLength){
  495. *pDestLength = reqLength;
  496. }
  497. /* Terminate the buffer */
  498. u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);
  499. return dest;
  500. }
  501. U_CAPI char16_t* U_EXPORT2
  502. u_strFromUTF8(char16_t *dest,
  503. int32_t destCapacity,
  504. int32_t *pDestLength,
  505. const char* src,
  506. int32_t srcLength,
  507. UErrorCode *pErrorCode){
  508. return u_strFromUTF8WithSub(
  509. dest, destCapacity, pDestLength,
  510. src, srcLength,
  511. U_SENTINEL, nullptr,
  512. pErrorCode);
  513. }
  514. U_CAPI char16_t * U_EXPORT2
  515. u_strFromUTF8Lenient(char16_t *dest,
  516. int32_t destCapacity,
  517. int32_t *pDestLength,
  518. const char *src,
  519. int32_t srcLength,
  520. UErrorCode *pErrorCode) {
  521. char16_t *pDest = dest;
  522. UChar32 ch;
  523. int32_t reqLength = 0;
  524. uint8_t* pSrc = (uint8_t*) src;
  525. /* args check */
  526. if(U_FAILURE(*pErrorCode)){
  527. return nullptr;
  528. }
  529. if( (src==nullptr && srcLength!=0) || srcLength < -1 ||
  530. (destCapacity<0) || (dest == nullptr && destCapacity > 0)
  531. ) {
  532. *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
  533. return nullptr;
  534. }
  535. if(srcLength < 0) {
  536. /* Transform a NUL-terminated string. */
  537. char16_t *pDestLimit = (dest!=nullptr)?(dest+destCapacity):nullptr;
  538. uint8_t t1, t2, t3; /* trail bytes */
  539. while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) {
  540. if(ch < 0xc0) {
  541. /*
  542. * ASCII, or a trail byte in lead position which is treated like
  543. * a single-byte sequence for better character boundary
  544. * resynchronization after illegal sequences.
  545. */
  546. *pDest++=(char16_t)ch;
  547. ++pSrc;
  548. continue;
  549. } else if(ch < 0xe0) { /* U+0080..U+07FF */
  550. if((t1 = pSrc[1]) != 0) {
  551. /* 0x3080 = (0xc0 << 6) + 0x80 */
  552. *pDest++ = (char16_t)((ch << 6) + t1 - 0x3080);
  553. pSrc += 2;
  554. continue;
  555. }
  556. } else if(ch < 0xf0) { /* U+0800..U+FFFF */
  557. if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0) {
  558. /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (char16_t) */
  559. /* 0x2080 = (0x80 << 6) + 0x80 */
  560. *pDest++ = (char16_t)((ch << 12) + (t1 << 6) + t2 - 0x2080);
  561. pSrc += 3;
  562. continue;
  563. }
  564. } else /* f0..f4 */ { /* U+10000..U+10FFFF */
  565. if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0 && (t3 = pSrc[3]) != 0) {
  566. pSrc += 4;
  567. /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
  568. ch = (ch << 18) + (t1 << 12) + (t2 << 6) + t3 - 0x3c82080;
  569. *(pDest++) = U16_LEAD(ch);
  570. if(pDest < pDestLimit) {
  571. *(pDest++) = U16_TRAIL(ch);
  572. } else {
  573. reqLength = 1;
  574. break;
  575. }
  576. continue;
  577. }
  578. }
  579. /* truncated character at the end */
  580. *pDest++ = 0xfffd;
  581. while(*++pSrc != 0) {}
  582. break;
  583. }
  584. /* Pre-flight the rest of the string. */
  585. while((ch = *pSrc) != 0) {
  586. if(ch < 0xc0) {
  587. /*
  588. * ASCII, or a trail byte in lead position which is treated like
  589. * a single-byte sequence for better character boundary
  590. * resynchronization after illegal sequences.
  591. */
  592. ++reqLength;
  593. ++pSrc;
  594. continue;
  595. } else if(ch < 0xe0) { /* U+0080..U+07FF */
  596. if(pSrc[1] != 0) {
  597. ++reqLength;
  598. pSrc += 2;
  599. continue;
  600. }
  601. } else if(ch < 0xf0) { /* U+0800..U+FFFF */
  602. if(pSrc[1] != 0 && pSrc[2] != 0) {
  603. ++reqLength;
  604. pSrc += 3;
  605. continue;
  606. }
  607. } else /* f0..f4 */ { /* U+10000..U+10FFFF */
  608. if(pSrc[1] != 0 && pSrc[2] != 0 && pSrc[3] != 0) {
  609. reqLength += 2;
  610. pSrc += 4;
  611. continue;
  612. }
  613. }
  614. /* truncated character at the end */
  615. ++reqLength;
  616. break;
  617. }
  618. } else /* srcLength >= 0 */ {
  619. const uint8_t *pSrcLimit = (pSrc!=nullptr)?(pSrc + srcLength):nullptr;
  620. /*
  621. * This function requires that if srcLength is given, then it must be
  622. * destCapatity >= srcLength so that we need not check for
  623. * destination buffer overflow in the loop.
  624. */
  625. if(destCapacity < srcLength) {
  626. if(pDestLength != nullptr) {
  627. *pDestLength = srcLength; /* this likely overestimates the true destLength! */
  628. }
  629. *pErrorCode = U_BUFFER_OVERFLOW_ERROR;
  630. return nullptr;
  631. }
  632. if((pSrcLimit - pSrc) >= 4) {
  633. pSrcLimit -= 3; /* temporarily reduce pSrcLimit */
  634. /* in this loop, we can always access at least 4 bytes, up to pSrc+3 */
  635. do {
  636. ch = *pSrc++;
  637. if(ch < 0xc0) {
  638. /*
  639. * ASCII, or a trail byte in lead position which is treated like
  640. * a single-byte sequence for better character boundary
  641. * resynchronization after illegal sequences.
  642. */
  643. *pDest++=(char16_t)ch;
  644. } else if(ch < 0xe0) { /* U+0080..U+07FF */
  645. /* 0x3080 = (0xc0 << 6) + 0x80 */
  646. *pDest++ = (char16_t)((ch << 6) + *pSrc++ - 0x3080);
  647. } else if(ch < 0xf0) { /* U+0800..U+FFFF */
  648. /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (char16_t) */
  649. /* 0x2080 = (0x80 << 6) + 0x80 */
  650. ch = (ch << 12) + (*pSrc++ << 6);
  651. *pDest++ = (char16_t)(ch + *pSrc++ - 0x2080);
  652. } else /* f0..f4 */ { /* U+10000..U+10FFFF */
  653. /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
  654. ch = (ch << 18) + (*pSrc++ << 12);
  655. ch += *pSrc++ << 6;
  656. ch += *pSrc++ - 0x3c82080;
  657. *(pDest++) = U16_LEAD(ch);
  658. *(pDest++) = U16_TRAIL(ch);
  659. }
  660. } while(pSrc < pSrcLimit);
  661. pSrcLimit += 3; /* restore original pSrcLimit */
  662. }
  663. while(pSrc < pSrcLimit) {
  664. ch = *pSrc++;
  665. if(ch < 0xc0) {
  666. /*
  667. * ASCII, or a trail byte in lead position which is treated like
  668. * a single-byte sequence for better character boundary
  669. * resynchronization after illegal sequences.
  670. */
  671. *pDest++=(char16_t)ch;
  672. continue;
  673. } else if(ch < 0xe0) { /* U+0080..U+07FF */
  674. if(pSrc < pSrcLimit) {
  675. /* 0x3080 = (0xc0 << 6) + 0x80 */
  676. *pDest++ = (char16_t)((ch << 6) + *pSrc++ - 0x3080);
  677. continue;
  678. }
  679. } else if(ch < 0xf0) { /* U+0800..U+FFFF */
  680. if((pSrcLimit - pSrc) >= 2) {
  681. /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (char16_t) */
  682. /* 0x2080 = (0x80 << 6) + 0x80 */
  683. ch = (ch << 12) + (*pSrc++ << 6);
  684. *pDest++ = (char16_t)(ch + *pSrc++ - 0x2080);
  685. pSrc += 3;
  686. continue;
  687. }
  688. } else /* f0..f4 */ { /* U+10000..U+10FFFF */
  689. if((pSrcLimit - pSrc) >= 3) {
  690. /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
  691. ch = (ch << 18) + (*pSrc++ << 12);
  692. ch += *pSrc++ << 6;
  693. ch += *pSrc++ - 0x3c82080;
  694. *(pDest++) = U16_LEAD(ch);
  695. *(pDest++) = U16_TRAIL(ch);
  696. pSrc += 4;
  697. continue;
  698. }
  699. }
  700. /* truncated character at the end */
  701. *pDest++ = 0xfffd;
  702. break;
  703. }
  704. }
  705. reqLength+=(int32_t)(pDest - dest);
  706. if(pDestLength){
  707. *pDestLength = reqLength;
  708. }
  709. /* Terminate the buffer */
  710. u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);
  711. return dest;
  712. }
  713. static inline uint8_t *
  714. _appendUTF8(uint8_t *pDest, UChar32 c) {
  715. /* it is 0<=c<=0x10ffff and not a surrogate if called by a validating function */
  716. if((c)<=0x7f) {
  717. *pDest++ = static_cast<uint8_t>(c);
  718. } else if(c<=0x7ff) {
  719. *pDest++ = static_cast<uint8_t>((c >> 6) | 0xc0);
  720. *pDest++ = static_cast<uint8_t>((c & 0x3f) | 0x80);
  721. } else if(c<=0xffff) {
  722. *pDest++ = static_cast<uint8_t>((c >> 12) | 0xe0);
  723. *pDest++ = static_cast<uint8_t>(((c >> 6) & 0x3f) | 0x80);
  724. *pDest++ = static_cast<uint8_t>(((c) & 0x3f) | 0x80);
  725. } else /* if((uint32_t)(c)<=0x10ffff) */ {
  726. *pDest++ = static_cast<uint8_t>(((c) >> 18) | 0xf0);
  727. *pDest++ = static_cast<uint8_t>((((c) >> 12) & 0x3f) | 0x80);
  728. *pDest++ = static_cast<uint8_t>((((c) >> 6) & 0x3f) | 0x80);
  729. *pDest++ = static_cast<uint8_t>(((c) & 0x3f) | 0x80);
  730. }
  731. return pDest;
  732. }
  733. U_CAPI char* U_EXPORT2
  734. u_strToUTF8WithSub(char *dest,
  735. int32_t destCapacity,
  736. int32_t *pDestLength,
  737. const char16_t *pSrc,
  738. int32_t srcLength,
  739. UChar32 subchar, int32_t *pNumSubstitutions,
  740. UErrorCode *pErrorCode){
  741. int32_t reqLength=0;
  742. uint32_t ch=0,ch2=0;
  743. uint8_t *pDest = (uint8_t *)dest;
  744. uint8_t *pDestLimit = (pDest!=nullptr)?(pDest + destCapacity):nullptr;
  745. int32_t numSubstitutions;
  746. /* args check */
  747. if(U_FAILURE(*pErrorCode)){
  748. return nullptr;
  749. }
  750. if( (pSrc==nullptr && srcLength!=0) || srcLength < -1 ||
  751. (destCapacity<0) || (dest == nullptr && destCapacity > 0) ||
  752. subchar > 0x10ffff || U_IS_SURROGATE(subchar)
  753. ) {
  754. *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
  755. return nullptr;
  756. }
  757. if(pNumSubstitutions!=nullptr) {
  758. *pNumSubstitutions=0;
  759. }
  760. numSubstitutions=0;
  761. if(srcLength==-1) {
  762. while((ch=*pSrc)!=0) {
  763. ++pSrc;
  764. if(ch <= 0x7f) {
  765. if(pDest<pDestLimit) {
  766. *pDest++ = (uint8_t)ch;
  767. } else {
  768. reqLength = 1;
  769. break;
  770. }
  771. } else if(ch <= 0x7ff) {
  772. if((pDestLimit - pDest) >= 2) {
  773. *pDest++=(uint8_t)((ch>>6)|0xc0);
  774. *pDest++=(uint8_t)((ch&0x3f)|0x80);
  775. } else {
  776. reqLength = 2;
  777. break;
  778. }
  779. } else if(ch <= 0xd7ff || ch >= 0xe000) {
  780. if((pDestLimit - pDest) >= 3) {
  781. *pDest++=(uint8_t)((ch>>12)|0xe0);
  782. *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
  783. *pDest++=(uint8_t)((ch&0x3f)|0x80);
  784. } else {
  785. reqLength = 3;
  786. break;
  787. }
  788. } else /* ch is a surrogate */ {
  789. int32_t length;
  790. /*need not check for NUL because NUL fails U16_IS_TRAIL() anyway*/
  791. if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) {
  792. ++pSrc;
  793. ch=U16_GET_SUPPLEMENTARY(ch, ch2);
  794. } else if(subchar>=0) {
  795. ch=subchar;
  796. ++numSubstitutions;
  797. } else {
  798. /* Unicode 3.2 forbids surrogate code points in UTF-8 */
  799. *pErrorCode = U_INVALID_CHAR_FOUND;
  800. return nullptr;
  801. }
  802. length = U8_LENGTH(ch);
  803. if((pDestLimit - pDest) >= length) {
  804. /* convert and append*/
  805. pDest=_appendUTF8(pDest, ch);
  806. } else {
  807. reqLength = length;
  808. break;
  809. }
  810. }
  811. }
  812. while((ch=*pSrc++)!=0) {
  813. if(ch<=0x7f) {
  814. ++reqLength;
  815. } else if(ch<=0x7ff) {
  816. reqLength+=2;
  817. } else if(!U16_IS_SURROGATE(ch)) {
  818. reqLength+=3;
  819. } else if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) {
  820. ++pSrc;
  821. reqLength+=4;
  822. } else if(subchar>=0) {
  823. reqLength+=U8_LENGTH(subchar);
  824. ++numSubstitutions;
  825. } else {
  826. /* Unicode 3.2 forbids surrogate code points in UTF-8 */
  827. *pErrorCode = U_INVALID_CHAR_FOUND;
  828. return nullptr;
  829. }
  830. }
  831. } else {
  832. const char16_t *pSrcLimit = (pSrc!=nullptr)?(pSrc+srcLength):nullptr;
  833. int32_t count;
  834. /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
  835. for(;;) {
  836. /*
  837. * Each iteration of the inner loop progresses by at most 3 UTF-8
  838. * bytes and one char16_t, for most characters.
  839. * For supplementary code points (4 & 2), which are rare,
  840. * there is an additional adjustment.
  841. */
  842. count = (int32_t)((pDestLimit - pDest) / 3);
  843. srcLength = (int32_t)(pSrcLimit - pSrc);
  844. if(count > srcLength) {
  845. count = srcLength; /* min(remaining dest/3, remaining src) */
  846. }
  847. if(count < 3) {
  848. /*
  849. * Too much overhead if we get near the end of the string,
  850. * continue with the next loop.
  851. */
  852. break;
  853. }
  854. do {
  855. ch=*pSrc++;
  856. if(ch <= 0x7f) {
  857. *pDest++ = (uint8_t)ch;
  858. } else if(ch <= 0x7ff) {
  859. *pDest++=(uint8_t)((ch>>6)|0xc0);
  860. *pDest++=(uint8_t)((ch&0x3f)|0x80);
  861. } else if(ch <= 0xd7ff || ch >= 0xe000) {
  862. *pDest++=(uint8_t)((ch>>12)|0xe0);
  863. *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
  864. *pDest++=(uint8_t)((ch&0x3f)|0x80);
  865. } else /* ch is a surrogate */ {
  866. /*
  867. * We will read two UChars and probably output four bytes,
  868. * which we didn't account for with computing count,
  869. * so we adjust it here.
  870. */
  871. if(--count == 0) {
  872. --pSrc; /* undo ch=*pSrc++ for the lead surrogate */
  873. break; /* recompute count */
  874. }
  875. if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) {
  876. ++pSrc;
  877. ch=U16_GET_SUPPLEMENTARY(ch, ch2);
  878. /* writing 4 bytes per 2 UChars is ok */
  879. *pDest++=(uint8_t)((ch>>18)|0xf0);
  880. *pDest++=(uint8_t)(((ch>>12)&0x3f)|0x80);
  881. *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
  882. *pDest++=(uint8_t)((ch&0x3f)|0x80);
  883. } else {
  884. /* Unicode 3.2 forbids surrogate code points in UTF-8 */
  885. if(subchar>=0) {
  886. ch=subchar;
  887. ++numSubstitutions;
  888. } else {
  889. *pErrorCode = U_INVALID_CHAR_FOUND;
  890. return nullptr;
  891. }
  892. /* convert and append*/
  893. pDest=_appendUTF8(pDest, ch);
  894. }
  895. }
  896. } while(--count > 0);
  897. }
  898. while(pSrc<pSrcLimit) {
  899. ch=*pSrc++;
  900. if(ch <= 0x7f) {
  901. if(pDest<pDestLimit) {
  902. *pDest++ = (uint8_t)ch;
  903. } else {
  904. reqLength = 1;
  905. break;
  906. }
  907. } else if(ch <= 0x7ff) {
  908. if((pDestLimit - pDest) >= 2) {
  909. *pDest++=(uint8_t)((ch>>6)|0xc0);
  910. *pDest++=(uint8_t)((ch&0x3f)|0x80);
  911. } else {
  912. reqLength = 2;
  913. break;
  914. }
  915. } else if(ch <= 0xd7ff || ch >= 0xe000) {
  916. if((pDestLimit - pDest) >= 3) {
  917. *pDest++=(uint8_t)((ch>>12)|0xe0);
  918. *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
  919. *pDest++=(uint8_t)((ch&0x3f)|0x80);
  920. } else {
  921. reqLength = 3;
  922. break;
  923. }
  924. } else /* ch is a surrogate */ {
  925. int32_t length;
  926. if(U16_IS_SURROGATE_LEAD(ch) && pSrc<pSrcLimit && U16_IS_TRAIL(ch2=*pSrc)) {
  927. ++pSrc;
  928. ch=U16_GET_SUPPLEMENTARY(ch, ch2);
  929. } else if(subchar>=0) {
  930. ch=subchar;
  931. ++numSubstitutions;
  932. } else {
  933. /* Unicode 3.2 forbids surrogate code points in UTF-8 */
  934. *pErrorCode = U_INVALID_CHAR_FOUND;
  935. return nullptr;
  936. }
  937. length = U8_LENGTH(ch);
  938. if((pDestLimit - pDest) >= length) {
  939. /* convert and append*/
  940. pDest=_appendUTF8(pDest, ch);
  941. } else {
  942. reqLength = length;
  943. break;
  944. }
  945. }
  946. }
  947. while(pSrc<pSrcLimit) {
  948. ch=*pSrc++;
  949. if(ch<=0x7f) {
  950. ++reqLength;
  951. } else if(ch<=0x7ff) {
  952. reqLength+=2;
  953. } else if(!U16_IS_SURROGATE(ch)) {
  954. reqLength+=3;
  955. } else if(U16_IS_SURROGATE_LEAD(ch) && pSrc<pSrcLimit && U16_IS_TRAIL(ch2=*pSrc)) {
  956. ++pSrc;
  957. reqLength+=4;
  958. } else if(subchar>=0) {
  959. reqLength+=U8_LENGTH(subchar);
  960. ++numSubstitutions;
  961. } else {
  962. /* Unicode 3.2 forbids surrogate code points in UTF-8 */
  963. *pErrorCode = U_INVALID_CHAR_FOUND;
  964. return nullptr;
  965. }
  966. }
  967. }
  968. reqLength+=(int32_t)(pDest - (uint8_t *)dest);
  969. if(pNumSubstitutions!=nullptr) {
  970. *pNumSubstitutions=numSubstitutions;
  971. }
  972. if(pDestLength){
  973. *pDestLength = reqLength;
  974. }
  975. /* Terminate the buffer */
  976. u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
  977. return dest;
  978. }
  979. U_CAPI char* U_EXPORT2
  980. u_strToUTF8(char *dest,
  981. int32_t destCapacity,
  982. int32_t *pDestLength,
  983. const char16_t *pSrc,
  984. int32_t srcLength,
  985. UErrorCode *pErrorCode){
  986. return u_strToUTF8WithSub(
  987. dest, destCapacity, pDestLength,
  988. pSrc, srcLength,
  989. U_SENTINEL, nullptr,
  990. pErrorCode);
  991. }
  992. U_CAPI char16_t* U_EXPORT2
  993. u_strFromJavaModifiedUTF8WithSub(
  994. char16_t *dest,
  995. int32_t destCapacity,
  996. int32_t *pDestLength,
  997. const char *src,
  998. int32_t srcLength,
  999. UChar32 subchar, int32_t *pNumSubstitutions,
  1000. UErrorCode *pErrorCode) {
  1001. /* args check */
  1002. if(U_FAILURE(*pErrorCode)) {
  1003. return nullptr;
  1004. }
  1005. if( (src==nullptr && srcLength!=0) || srcLength < -1 ||
  1006. (dest==nullptr && destCapacity!=0) || destCapacity<0 ||
  1007. subchar > 0x10ffff || U_IS_SURROGATE(subchar)
  1008. ) {
  1009. *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
  1010. return nullptr;
  1011. }
  1012. if(pNumSubstitutions!=nullptr) {
  1013. *pNumSubstitutions=0;
  1014. }
  1015. char16_t *pDest = dest;
  1016. char16_t *pDestLimit = dest+destCapacity;
  1017. int32_t reqLength = 0;
  1018. int32_t numSubstitutions=0;
  1019. if(srcLength < 0) {
  1020. /*
  1021. * Transform a NUL-terminated ASCII string.
  1022. * Handle non-ASCII strings with slower code.
  1023. */
  1024. UChar32 c;
  1025. while(((c = (uint8_t)*src) != 0) && c <= 0x7f && (pDest < pDestLimit)) {
  1026. *pDest++=(char16_t)c;
  1027. ++src;
  1028. }
  1029. if(c == 0) {
  1030. reqLength=(int32_t)(pDest - dest);
  1031. if(pDestLength) {
  1032. *pDestLength = reqLength;
  1033. }
  1034. /* Terminate the buffer */
  1035. u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
  1036. return dest;
  1037. }
  1038. srcLength = static_cast<int32_t>(uprv_strlen(src));
  1039. }
  1040. /* Faster loop without ongoing checking for srcLength and pDestLimit. */
  1041. UChar32 ch;
  1042. uint8_t t1, t2;
  1043. int32_t i = 0;
  1044. for(;;) {
  1045. int32_t count = (int32_t)(pDestLimit - pDest);
  1046. int32_t count2 = srcLength - i;
  1047. if(count >= count2 && srcLength > 0 && U8_IS_SINGLE(*src)) {
  1048. /* fast ASCII loop */
  1049. int32_t start = i;
  1050. uint8_t b;
  1051. while(i < srcLength && U8_IS_SINGLE(b = src[i])) {
  1052. *pDest++=b;
  1053. ++i;
  1054. }
  1055. int32_t delta = i - start;
  1056. count -= delta;
  1057. count2 -= delta;
  1058. }
  1059. /*
  1060. * Each iteration of the inner loop progresses by at most 3 UTF-8
  1061. * bytes and one char16_t.
  1062. */
  1063. if(subchar > 0xFFFF) {
  1064. break;
  1065. }
  1066. count2 /= 3;
  1067. if(count > count2) {
  1068. count = count2; /* min(remaining dest, remaining src/3) */
  1069. }
  1070. if(count < 3) {
  1071. /*
  1072. * Too much overhead if we get near the end of the string,
  1073. * continue with the next loop.
  1074. */
  1075. break;
  1076. }
  1077. do {
  1078. ch = (uint8_t)src[i++];
  1079. if(U8_IS_SINGLE(ch)) {
  1080. *pDest++=(char16_t)ch;
  1081. } else {
  1082. if(ch >= 0xe0) {
  1083. if( /* handle U+0000..U+FFFF inline */
  1084. ch <= 0xef &&
  1085. (t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f &&
  1086. (t2 = (uint8_t)(src[i+1] - 0x80)) <= 0x3f
  1087. ) {
  1088. /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (char16_t) */
  1089. *pDest++ = (char16_t)((ch << 12) | (t1 << 6) | t2);
  1090. i += 2;
  1091. continue;
  1092. }
  1093. } else {
  1094. if( /* handle U+0000..U+07FF inline */
  1095. ch >= 0xc0 &&
  1096. (t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f
  1097. ) {
  1098. *pDest++ = (char16_t)(((ch & 0x1f) << 6) | t1);
  1099. ++i;
  1100. continue;
  1101. }
  1102. }
  1103. if(subchar < 0) {
  1104. *pErrorCode = U_INVALID_CHAR_FOUND;
  1105. return nullptr;
  1106. } else if(subchar > 0xffff && --count == 0) {
  1107. /*
  1108. * We need to write two UChars, adjusted count for that,
  1109. * and ran out of space.
  1110. */
  1111. --i; // back out byte ch
  1112. break;
  1113. } else {
  1114. /* function call for error cases */
  1115. utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, ch, -1);
  1116. ++numSubstitutions;
  1117. *(pDest++)=(char16_t)subchar;
  1118. }
  1119. }
  1120. } while(--count > 0);
  1121. }
  1122. while(i < srcLength && (pDest < pDestLimit)) {
  1123. ch = (uint8_t)src[i++];
  1124. if(U8_IS_SINGLE(ch)){
  1125. *pDest++=(char16_t)ch;
  1126. } else {
  1127. if(ch >= 0xe0) {
  1128. if( /* handle U+0000..U+FFFF inline */
  1129. ch <= 0xef &&
  1130. (i+1) < srcLength &&
  1131. (t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f &&
  1132. (t2 = (uint8_t)(src[i+1] - 0x80)) <= 0x3f
  1133. ) {
  1134. /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (char16_t) */
  1135. *pDest++ = (char16_t)((ch << 12) | (t1 << 6) | t2);
  1136. i += 2;
  1137. continue;
  1138. }
  1139. } else {
  1140. if( /* handle U+0000..U+07FF inline */
  1141. ch >= 0xc0 &&
  1142. i < srcLength &&
  1143. (t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f
  1144. ) {
  1145. *pDest++ = (char16_t)(((ch & 0x1f) << 6) | t1);
  1146. ++i;
  1147. continue;
  1148. }
  1149. }
  1150. if(subchar < 0) {
  1151. *pErrorCode = U_INVALID_CHAR_FOUND;
  1152. return nullptr;
  1153. } else {
  1154. /* function call for error cases */
  1155. utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, ch, -1);
  1156. ++numSubstitutions;
  1157. if(subchar<=0xFFFF) {
  1158. *(pDest++)=(char16_t)subchar;
  1159. } else {
  1160. *(pDest++)=U16_LEAD(subchar);
  1161. if(pDest<pDestLimit) {
  1162. *(pDest++)=U16_TRAIL(subchar);
  1163. } else {
  1164. reqLength++;
  1165. break;
  1166. }
  1167. }
  1168. }
  1169. }
  1170. }
  1171. /* Pre-flight the rest of the string. */
  1172. while(i < srcLength) {
  1173. ch = (uint8_t)src[i++];
  1174. if(U8_IS_SINGLE(ch)) {
  1175. reqLength++;
  1176. } else {
  1177. if(ch >= 0xe0) {
  1178. if( /* handle U+0000..U+FFFF inline */
  1179. ch <= 0xef &&
  1180. (i+1) < srcLength &&
  1181. (uint8_t)(src[i] - 0x80) <= 0x3f &&
  1182. (uint8_t)(src[i+1] - 0x80) <= 0x3f
  1183. ) {
  1184. reqLength++;
  1185. i += 2;
  1186. continue;
  1187. }
  1188. } else {
  1189. if( /* handle U+0000..U+07FF inline */
  1190. ch >= 0xc0 &&
  1191. i < srcLength &&
  1192. (uint8_t)(src[i] - 0x80) <= 0x3f
  1193. ) {
  1194. reqLength++;
  1195. ++i;
  1196. continue;
  1197. }
  1198. }
  1199. if(subchar < 0) {
  1200. *pErrorCode = U_INVALID_CHAR_FOUND;
  1201. return nullptr;
  1202. } else {
  1203. /* function call for error cases */
  1204. utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, ch, -1);
  1205. ++numSubstitutions;
  1206. reqLength+=U16_LENGTH(ch);
  1207. }
  1208. }
  1209. }
  1210. if(pNumSubstitutions!=nullptr) {
  1211. *pNumSubstitutions=numSubstitutions;
  1212. }
  1213. reqLength+=(int32_t)(pDest - dest);
  1214. if(pDestLength) {
  1215. *pDestLength = reqLength;
  1216. }
  1217. /* Terminate the buffer */
  1218. u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
  1219. return dest;
  1220. }
  1221. U_CAPI char* U_EXPORT2
  1222. u_strToJavaModifiedUTF8(
  1223. char *dest,
  1224. int32_t destCapacity,
  1225. int32_t *pDestLength,
  1226. const char16_t *src,
  1227. int32_t srcLength,
  1228. UErrorCode *pErrorCode) {
  1229. int32_t reqLength=0;
  1230. uint32_t ch=0;
  1231. const char16_t *pSrcLimit;
  1232. int32_t count;
  1233. /* args check */
  1234. if(U_FAILURE(*pErrorCode)){
  1235. return nullptr;
  1236. }
  1237. if( (src==nullptr && srcLength!=0) || srcLength < -1 ||
  1238. (dest==nullptr && destCapacity!=0) || destCapacity<0
  1239. ) {
  1240. *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
  1241. return nullptr;
  1242. }
  1243. uint8_t *pDest = (uint8_t *)dest;
  1244. uint8_t *pDestLimit = pDest + destCapacity;
  1245. if(srcLength==-1) {
  1246. /* Convert NUL-terminated ASCII, then find the string length. */
  1247. while((ch=*src)<=0x7f && ch != 0 && pDest<pDestLimit) {
  1248. *pDest++ = (uint8_t)ch;
  1249. ++src;
  1250. }
  1251. if(ch == 0) {
  1252. reqLength=(int32_t)(pDest - (uint8_t *)dest);
  1253. if(pDestLength) {
  1254. *pDestLength = reqLength;
  1255. }
  1256. /* Terminate the buffer */
  1257. u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
  1258. return dest;
  1259. }
  1260. srcLength = u_strlen(src);
  1261. }
  1262. /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
  1263. pSrcLimit = (src!=nullptr)?(src+srcLength):nullptr;
  1264. for(;;) {
  1265. count = (int32_t)(pDestLimit - pDest);
  1266. srcLength = (int32_t)(pSrcLimit - src);
  1267. if(count >= srcLength && srcLength > 0 && *src <= 0x7f) {
  1268. /* fast ASCII loop */
  1269. const char16_t *prevSrc = src;
  1270. int32_t delta;
  1271. while(src < pSrcLimit && (ch = *src) <= 0x7f && ch != 0) {
  1272. *pDest++=(uint8_t)ch;
  1273. ++src;
  1274. }
  1275. delta = (int32_t)(src - prevSrc);
  1276. count -= delta;
  1277. srcLength -= delta;
  1278. }
  1279. /*
  1280. * Each iteration of the inner loop progresses by at most 3 UTF-8
  1281. * bytes and one char16_t.
  1282. */
  1283. count /= 3;
  1284. if(count > srcLength) {
  1285. count = srcLength; /* min(remaining dest/3, remaining src) */
  1286. }
  1287. if(count < 3) {
  1288. /*
  1289. * Too much overhead if we get near the end of the string,
  1290. * continue with the next loop.
  1291. */
  1292. break;
  1293. }
  1294. do {
  1295. ch=*src++;
  1296. if(ch <= 0x7f && ch != 0) {
  1297. *pDest++ = (uint8_t)ch;
  1298. } else if(ch <= 0x7ff) {
  1299. *pDest++=(uint8_t)((ch>>6)|0xc0);
  1300. *pDest++=(uint8_t)((ch&0x3f)|0x80);
  1301. } else {
  1302. *pDest++=(uint8_t)((ch>>12)|0xe0);
  1303. *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
  1304. *pDest++=(uint8_t)((ch&0x3f)|0x80);
  1305. }
  1306. } while(--count > 0);
  1307. }
  1308. while(src<pSrcLimit) {
  1309. ch=*src++;
  1310. if(ch <= 0x7f && ch != 0) {
  1311. if(pDest<pDestLimit) {
  1312. *pDest++ = (uint8_t)ch;
  1313. } else {
  1314. reqLength = 1;
  1315. break;
  1316. }
  1317. } else if(ch <= 0x7ff) {
  1318. if((pDestLimit - pDest) >= 2) {
  1319. *pDest++=(uint8_t)((ch>>6)|0xc0);
  1320. *pDest++=(uint8_t)((ch&0x3f)|0x80);
  1321. } else {
  1322. reqLength = 2;
  1323. break;
  1324. }
  1325. } else {
  1326. if((pDestLimit - pDest) >= 3) {
  1327. *pDest++=(uint8_t)((ch>>12)|0xe0);
  1328. *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
  1329. *pDest++=(uint8_t)((ch&0x3f)|0x80);
  1330. } else {
  1331. reqLength = 3;
  1332. break;
  1333. }
  1334. }
  1335. }
  1336. while(src<pSrcLimit) {
  1337. ch=*src++;
  1338. if(ch <= 0x7f && ch != 0) {
  1339. ++reqLength;
  1340. } else if(ch<=0x7ff) {
  1341. reqLength+=2;
  1342. } else {
  1343. reqLength+=3;
  1344. }
  1345. }
  1346. reqLength+=(int32_t)(pDest - (uint8_t *)dest);
  1347. if(pDestLength){
  1348. *pDestLength = reqLength;
  1349. }
  1350. /* Terminate the buffer */
  1351. u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
  1352. return dest;
  1353. }