ucnv_u16.cpp 49 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. **********************************************************************
  5. * Copyright (C) 2002-2015, International Business Machines
  6. * Corporation and others. All Rights Reserved.
  7. **********************************************************************
  8. * file name: ucnv_u16.c
  9. * encoding: UTF-8
  10. * tab size: 8 (not used)
  11. * indentation:4
  12. *
  13. * created on: 2002jul01
  14. * created by: Markus W. Scherer
  15. *
  16. * UTF-16 converter implementation. Used to be in ucnv_utf.c.
  17. */
  18. #include "unicode/utypes.h"
  19. #if !UCONFIG_NO_CONVERSION
  20. #include "unicode/ucnv.h"
  21. #include "unicode/uversion.h"
  22. #include "ucnv_bld.h"
  23. #include "ucnv_cnv.h"
  24. #include "cmemory.h"
  25. enum {
  26. UCNV_NEED_TO_WRITE_BOM=1
  27. };
  28. U_CDECL_BEGIN
  29. /*
  30. * The UTF-16 toUnicode implementation is also used for the Java-specific
  31. * "with BOM" variants of UTF-16BE and UTF-16LE.
  32. */
  33. static void U_CALLCONV
  34. _UTF16ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
  35. UErrorCode *pErrorCode);
  36. /* UTF-16BE ----------------------------------------------------------------- */
  37. #if U_IS_BIG_ENDIAN
  38. # define _UTF16PEFromUnicodeWithOffsets _UTF16BEFromUnicodeWithOffsets
  39. #else
  40. # define _UTF16PEFromUnicodeWithOffsets _UTF16LEFromUnicodeWithOffsets
  41. #endif
  42. static void U_CALLCONV
  43. _UTF16BEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
  44. UErrorCode *pErrorCode) {
  45. UConverter *cnv;
  46. const char16_t *source;
  47. char *target;
  48. int32_t *offsets;
  49. uint32_t targetCapacity, length, sourceIndex;
  50. char16_t c, trail;
  51. char overflow[4];
  52. source=pArgs->source;
  53. length=(int32_t)(pArgs->sourceLimit-source);
  54. if(length<=0) {
  55. /* no input, nothing to do */
  56. return;
  57. }
  58. cnv=pArgs->converter;
  59. /* write the BOM if necessary */
  60. if(cnv->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
  61. static const char bom[]={ (char)0xfeu, (char)0xffu };
  62. ucnv_fromUWriteBytes(cnv,
  63. bom, 2,
  64. &pArgs->target, pArgs->targetLimit,
  65. &pArgs->offsets, -1,
  66. pErrorCode);
  67. cnv->fromUnicodeStatus=0;
  68. }
  69. target=pArgs->target;
  70. if(target >= pArgs->targetLimit) {
  71. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  72. return;
  73. }
  74. targetCapacity=(uint32_t)(pArgs->targetLimit-target);
  75. offsets=pArgs->offsets;
  76. sourceIndex=0;
  77. /* c!=0 indicates in several places outside the main loops that a surrogate was found */
  78. if((c=(char16_t)cnv->fromUChar32)!=0 && U16_IS_TRAIL(trail=*source) && targetCapacity>=4) {
  79. /* the last buffer ended with a lead surrogate, output the surrogate pair */
  80. ++source;
  81. --length;
  82. target[0]=(uint8_t)(c>>8);
  83. target[1]=(uint8_t)c;
  84. target[2]=(uint8_t)(trail>>8);
  85. target[3]=(uint8_t)trail;
  86. target+=4;
  87. targetCapacity-=4;
  88. if(offsets!=nullptr) {
  89. *offsets++=-1;
  90. *offsets++=-1;
  91. *offsets++=-1;
  92. *offsets++=-1;
  93. }
  94. sourceIndex=1;
  95. cnv->fromUChar32=c=0;
  96. }
  97. if(c==0) {
  98. /* copy an even number of bytes for complete UChars */
  99. uint32_t count=2*length;
  100. if(count>targetCapacity) {
  101. count=targetCapacity&~1;
  102. }
  103. /* count is even */
  104. targetCapacity-=count;
  105. count>>=1;
  106. length-=count;
  107. if(offsets==nullptr) {
  108. while(count>0) {
  109. c=*source++;
  110. if(U16_IS_SINGLE(c)) {
  111. target[0]=(uint8_t)(c>>8);
  112. target[1]=(uint8_t)c;
  113. target+=2;
  114. } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
  115. ++source;
  116. --count;
  117. target[0]=(uint8_t)(c>>8);
  118. target[1]=(uint8_t)c;
  119. target[2]=(uint8_t)(trail>>8);
  120. target[3]=(uint8_t)trail;
  121. target+=4;
  122. } else {
  123. break;
  124. }
  125. --count;
  126. }
  127. } else {
  128. while(count>0) {
  129. c=*source++;
  130. if(U16_IS_SINGLE(c)) {
  131. target[0]=(uint8_t)(c>>8);
  132. target[1]=(uint8_t)c;
  133. target+=2;
  134. *offsets++=sourceIndex;
  135. *offsets++=sourceIndex++;
  136. } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
  137. ++source;
  138. --count;
  139. target[0]=(uint8_t)(c>>8);
  140. target[1]=(uint8_t)c;
  141. target[2]=(uint8_t)(trail>>8);
  142. target[3]=(uint8_t)trail;
  143. target+=4;
  144. *offsets++=sourceIndex;
  145. *offsets++=sourceIndex;
  146. *offsets++=sourceIndex;
  147. *offsets++=sourceIndex;
  148. sourceIndex+=2;
  149. } else {
  150. break;
  151. }
  152. --count;
  153. }
  154. }
  155. if(count==0) {
  156. /* done with the loop for complete UChars */
  157. if(length>0 && targetCapacity>0) {
  158. /*
  159. * there is more input and some target capacity -
  160. * it must be targetCapacity==1 because otherwise
  161. * the above would have copied more;
  162. * prepare for overflow output
  163. */
  164. if(U16_IS_SINGLE(c=*source++)) {
  165. overflow[0]=(char)(c>>8);
  166. overflow[1]=(char)c;
  167. length=2; /* 2 bytes to output */
  168. c=0;
  169. /* } else { keep c for surrogate handling, length will be set there */
  170. }
  171. } else {
  172. length=0;
  173. c=0;
  174. }
  175. } else {
  176. /* keep c for surrogate handling, length will be set there */
  177. targetCapacity+=2*count;
  178. }
  179. } else {
  180. length=0; /* from here on, length counts the bytes in overflow[] */
  181. }
  182. if(c!=0) {
  183. /*
  184. * c is a surrogate, and
  185. * - source or target too short
  186. * - or the surrogate is unmatched
  187. */
  188. length=0;
  189. if(U16_IS_SURROGATE_LEAD(c)) {
  190. if(source<pArgs->sourceLimit) {
  191. if(U16_IS_TRAIL(trail=*source)) {
  192. /* output the surrogate pair, will overflow (see conditions comment above) */
  193. ++source;
  194. overflow[0]=(char)(c>>8);
  195. overflow[1]=(char)c;
  196. overflow[2]=(char)(trail>>8);
  197. overflow[3]=(char)trail;
  198. length=4; /* 4 bytes to output */
  199. c=0;
  200. } else {
  201. /* unmatched lead surrogate */
  202. *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  203. }
  204. } else {
  205. /* see if the trail surrogate is in the next buffer */
  206. }
  207. } else {
  208. /* unmatched trail surrogate */
  209. *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  210. }
  211. cnv->fromUChar32=c;
  212. }
  213. if(length>0) {
  214. /* output length bytes with overflow (length>targetCapacity>0) */
  215. ucnv_fromUWriteBytes(cnv,
  216. overflow, length,
  217. &target, pArgs->targetLimit,
  218. &offsets, sourceIndex,
  219. pErrorCode);
  220. targetCapacity = static_cast<uint32_t>(pArgs->targetLimit - target);
  221. }
  222. if(U_SUCCESS(*pErrorCode) && source<pArgs->sourceLimit && targetCapacity==0) {
  223. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  224. }
  225. /* write back the updated pointers */
  226. pArgs->source=source;
  227. pArgs->target = target;
  228. pArgs->offsets=offsets;
  229. }
  230. static void U_CALLCONV
  231. _UTF16BEToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
  232. UErrorCode *pErrorCode) {
  233. UConverter *cnv;
  234. const uint8_t *source;
  235. char16_t *target;
  236. int32_t *offsets;
  237. uint32_t targetCapacity, length, count, sourceIndex;
  238. char16_t c, trail;
  239. if(pArgs->converter->mode<8) {
  240. _UTF16ToUnicodeWithOffsets(pArgs, pErrorCode);
  241. return;
  242. }
  243. cnv=pArgs->converter;
  244. source=(const uint8_t *)pArgs->source;
  245. length=(int32_t)((const uint8_t *)pArgs->sourceLimit-source);
  246. if(length<=0 && cnv->toUnicodeStatus==0) {
  247. /* no input, nothing to do */
  248. return;
  249. }
  250. target=pArgs->target;
  251. if(target >= pArgs->targetLimit) {
  252. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  253. return;
  254. }
  255. targetCapacity=(uint32_t)(pArgs->targetLimit-target);
  256. offsets=pArgs->offsets;
  257. sourceIndex=0;
  258. c=0;
  259. /* complete a partial char16_t or pair from the last call */
  260. if(cnv->toUnicodeStatus!=0) {
  261. /*
  262. * special case: single byte from a previous buffer,
  263. * where the byte turned out not to belong to a trail surrogate
  264. * and the preceding, unmatched lead surrogate was put into toUBytes[]
  265. * for error handling
  266. */
  267. cnv->toUBytes[0]=(uint8_t)cnv->toUnicodeStatus;
  268. cnv->toULength=1;
  269. cnv->toUnicodeStatus=0;
  270. }
  271. if((count=cnv->toULength)!=0) {
  272. uint8_t *p=cnv->toUBytes;
  273. do {
  274. p[count++]=*source++;
  275. ++sourceIndex;
  276. --length;
  277. if(count==2) {
  278. c=((char16_t)p[0]<<8)|p[1];
  279. if(U16_IS_SINGLE(c)) {
  280. /* output the BMP code point */
  281. *target++=c;
  282. if(offsets!=nullptr) {
  283. *offsets++=-1;
  284. }
  285. --targetCapacity;
  286. count=0;
  287. c=0;
  288. break;
  289. } else if(U16_IS_SURROGATE_LEAD(c)) {
  290. /* continue collecting bytes for the trail surrogate */
  291. c=0; /* avoid unnecessary surrogate handling below */
  292. } else {
  293. /* fall through to error handling for an unmatched trail surrogate */
  294. break;
  295. }
  296. } else if(count==4) {
  297. c=((char16_t)p[0]<<8)|p[1];
  298. trail=((char16_t)p[2]<<8)|p[3];
  299. if(U16_IS_TRAIL(trail)) {
  300. /* output the surrogate pair */
  301. *target++=c;
  302. if(targetCapacity>=2) {
  303. *target++=trail;
  304. if(offsets!=nullptr) {
  305. *offsets++=-1;
  306. *offsets++=-1;
  307. }
  308. targetCapacity-=2;
  309. } else /* targetCapacity==1 */ {
  310. targetCapacity=0;
  311. cnv->UCharErrorBuffer[0]=trail;
  312. cnv->UCharErrorBufferLength=1;
  313. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  314. }
  315. count=0;
  316. c=0;
  317. break;
  318. } else {
  319. /* unmatched lead surrogate, handle here for consistent toUBytes[] */
  320. *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  321. /* back out reading the code unit after it */
  322. if(((const uint8_t *)pArgs->source-source)>=2) {
  323. source-=2;
  324. } else {
  325. /*
  326. * if the trail unit's first byte was in a previous buffer, then
  327. * we need to put it into a special place because toUBytes[] will be
  328. * used for the lead unit's bytes
  329. */
  330. cnv->toUnicodeStatus=0x100|p[2];
  331. --source;
  332. }
  333. cnv->toULength=2;
  334. /* write back the updated pointers */
  335. pArgs->source=(const char *)source;
  336. pArgs->target=target;
  337. pArgs->offsets=offsets;
  338. return;
  339. }
  340. }
  341. } while(length>0);
  342. cnv->toULength=(int8_t)count;
  343. }
  344. /* copy an even number of bytes for complete UChars */
  345. count=2*targetCapacity;
  346. if(count>length) {
  347. count=length&~1;
  348. }
  349. if(c==0 && count>0) {
  350. length-=count;
  351. count>>=1;
  352. targetCapacity-=count;
  353. if(offsets==nullptr) {
  354. do {
  355. c=((char16_t)source[0]<<8)|source[1];
  356. source+=2;
  357. if(U16_IS_SINGLE(c)) {
  358. *target++=c;
  359. } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
  360. U16_IS_TRAIL(trail=((char16_t)source[0]<<8)|source[1])
  361. ) {
  362. source+=2;
  363. --count;
  364. *target++=c;
  365. *target++=trail;
  366. } else {
  367. break;
  368. }
  369. } while(--count>0);
  370. } else {
  371. do {
  372. c=((char16_t)source[0]<<8)|source[1];
  373. source+=2;
  374. if(U16_IS_SINGLE(c)) {
  375. *target++=c;
  376. *offsets++=sourceIndex;
  377. sourceIndex+=2;
  378. } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
  379. U16_IS_TRAIL(trail=((char16_t)source[0]<<8)|source[1])
  380. ) {
  381. source+=2;
  382. --count;
  383. *target++=c;
  384. *target++=trail;
  385. *offsets++=sourceIndex;
  386. *offsets++=sourceIndex;
  387. sourceIndex+=4;
  388. } else {
  389. break;
  390. }
  391. } while(--count>0);
  392. }
  393. if(count==0) {
  394. /* done with the loop for complete UChars */
  395. c=0;
  396. } else {
  397. /* keep c for surrogate handling, trail will be set there */
  398. length+=2*(count-1); /* one more byte pair was consumed than count decremented */
  399. targetCapacity+=count;
  400. }
  401. }
  402. if(c!=0) {
  403. /*
  404. * c is a surrogate, and
  405. * - source or target too short
  406. * - or the surrogate is unmatched
  407. */
  408. cnv->toUBytes[0]=(uint8_t)(c>>8);
  409. cnv->toUBytes[1]=(uint8_t)c;
  410. cnv->toULength=2;
  411. if(U16_IS_SURROGATE_LEAD(c)) {
  412. if(length>=2) {
  413. if(U16_IS_TRAIL(trail=((char16_t)source[0]<<8)|source[1])) {
  414. /* output the surrogate pair, will overflow (see conditions comment above) */
  415. source+=2;
  416. length-=2;
  417. *target++=c;
  418. if(offsets!=nullptr) {
  419. *offsets++=sourceIndex;
  420. }
  421. cnv->UCharErrorBuffer[0]=trail;
  422. cnv->UCharErrorBufferLength=1;
  423. cnv->toULength=0;
  424. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  425. } else {
  426. /* unmatched lead surrogate */
  427. *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  428. }
  429. } else {
  430. /* see if the trail surrogate is in the next buffer */
  431. }
  432. } else {
  433. /* unmatched trail surrogate */
  434. *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  435. }
  436. }
  437. if(U_SUCCESS(*pErrorCode)) {
  438. /* check for a remaining source byte */
  439. if(length>0) {
  440. if(targetCapacity==0) {
  441. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  442. } else {
  443. /* it must be length==1 because otherwise the above would have copied more */
  444. cnv->toUBytes[cnv->toULength++]=*source++;
  445. }
  446. }
  447. }
  448. /* write back the updated pointers */
  449. pArgs->source=(const char *)source;
  450. pArgs->target=target;
  451. pArgs->offsets=offsets;
  452. }
  453. static UChar32 U_CALLCONV
  454. _UTF16BEGetNextUChar(UConverterToUnicodeArgs *pArgs, UErrorCode *err) {
  455. const uint8_t *s, *sourceLimit;
  456. UChar32 c;
  457. if(pArgs->converter->mode<8) {
  458. return UCNV_GET_NEXT_UCHAR_USE_TO_U;
  459. }
  460. s=(const uint8_t *)pArgs->source;
  461. sourceLimit=(const uint8_t *)pArgs->sourceLimit;
  462. if(s>=sourceLimit) {
  463. /* no input */
  464. *err=U_INDEX_OUTOFBOUNDS_ERROR;
  465. return 0xffff;
  466. }
  467. if(s+2>sourceLimit) {
  468. /* only one byte: truncated char16_t */
  469. pArgs->converter->toUBytes[0]=*s++;
  470. pArgs->converter->toULength=1;
  471. pArgs->source=(const char *)s;
  472. *err = U_TRUNCATED_CHAR_FOUND;
  473. return 0xffff;
  474. }
  475. /* get one char16_t */
  476. c=((UChar32)*s<<8)|s[1];
  477. s+=2;
  478. /* check for a surrogate pair */
  479. if(U_IS_SURROGATE(c)) {
  480. if(U16_IS_SURROGATE_LEAD(c)) {
  481. if(s+2<=sourceLimit) {
  482. char16_t trail;
  483. /* get a second char16_t and see if it is a trail surrogate */
  484. trail=((char16_t)*s<<8)|s[1];
  485. if(U16_IS_TRAIL(trail)) {
  486. c=U16_GET_SUPPLEMENTARY(c, trail);
  487. s+=2;
  488. } else {
  489. /* unmatched lead surrogate */
  490. c=-2;
  491. }
  492. } else {
  493. /* too few (2 or 3) bytes for a surrogate pair: truncated code point */
  494. uint8_t *bytes=pArgs->converter->toUBytes;
  495. s-=2;
  496. pArgs->converter->toULength=(int8_t)(sourceLimit-s);
  497. do {
  498. *bytes++=*s++;
  499. } while(s<sourceLimit);
  500. c=0xffff;
  501. *err=U_TRUNCATED_CHAR_FOUND;
  502. }
  503. } else {
  504. /* unmatched trail surrogate */
  505. c=-2;
  506. }
  507. if(c<0) {
  508. /* write the unmatched surrogate */
  509. uint8_t *bytes=pArgs->converter->toUBytes;
  510. pArgs->converter->toULength=2;
  511. *bytes=*(s-2);
  512. bytes[1]=*(s-1);
  513. c=0xffff;
  514. *err=U_ILLEGAL_CHAR_FOUND;
  515. }
  516. }
  517. pArgs->source=(const char *)s;
  518. return c;
  519. }
  520. static void U_CALLCONV
  521. _UTF16BEReset(UConverter *cnv, UConverterResetChoice choice) {
  522. if(choice<=UCNV_RESET_TO_UNICODE) {
  523. /* reset toUnicode state */
  524. if(UCNV_GET_VERSION(cnv)==0) {
  525. cnv->mode=8; /* no BOM handling */
  526. } else {
  527. cnv->mode=0; /* Java-specific "UnicodeBig" requires BE BOM or no BOM */
  528. }
  529. }
  530. if(choice!=UCNV_RESET_TO_UNICODE && UCNV_GET_VERSION(cnv)==1) {
  531. /* reset fromUnicode for "UnicodeBig": prepare to output the UTF-16BE BOM */
  532. cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
  533. }
  534. }
  535. static void U_CALLCONV
  536. _UTF16BEOpen(UConverter *cnv,
  537. UConverterLoadArgs *pArgs,
  538. UErrorCode *pErrorCode) {
  539. (void)pArgs;
  540. if(UCNV_GET_VERSION(cnv)<=1) {
  541. _UTF16BEReset(cnv, UCNV_RESET_BOTH);
  542. } else {
  543. *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
  544. }
  545. }
  546. static const char * U_CALLCONV
  547. _UTF16BEGetName(const UConverter *cnv) {
  548. if(UCNV_GET_VERSION(cnv)==0) {
  549. return "UTF-16BE";
  550. } else {
  551. return "UTF-16BE,version=1";
  552. }
  553. }
  554. U_CDECL_END
  555. static const UConverterImpl _UTF16BEImpl={
  556. UCNV_UTF16_BigEndian,
  557. nullptr,
  558. nullptr,
  559. _UTF16BEOpen,
  560. nullptr,
  561. _UTF16BEReset,
  562. _UTF16BEToUnicodeWithOffsets,
  563. _UTF16BEToUnicodeWithOffsets,
  564. _UTF16BEFromUnicodeWithOffsets,
  565. _UTF16BEFromUnicodeWithOffsets,
  566. _UTF16BEGetNextUChar,
  567. nullptr,
  568. _UTF16BEGetName,
  569. nullptr,
  570. nullptr,
  571. ucnv_getNonSurrogateUnicodeSet,
  572. nullptr,
  573. nullptr
  574. };
  575. static const UConverterStaticData _UTF16BEStaticData={
  576. sizeof(UConverterStaticData),
  577. "UTF-16BE",
  578. 1200, UCNV_IBM, UCNV_UTF16_BigEndian, 2, 2,
  579. { 0xff, 0xfd, 0, 0 },2,false,false,
  580. 0,
  581. 0,
  582. { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
  583. };
  584. const UConverterSharedData _UTF16BEData=
  585. UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF16BEStaticData, &_UTF16BEImpl);
  586. /* UTF-16LE ----------------------------------------------------------------- */
  587. U_CDECL_BEGIN
  588. static void U_CALLCONV
  589. _UTF16LEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
  590. UErrorCode *pErrorCode) {
  591. UConverter *cnv;
  592. const char16_t *source;
  593. char *target;
  594. int32_t *offsets;
  595. uint32_t targetCapacity, length, sourceIndex;
  596. char16_t c, trail;
  597. char overflow[4];
  598. source=pArgs->source;
  599. length=(int32_t)(pArgs->sourceLimit-source);
  600. if(length<=0) {
  601. /* no input, nothing to do */
  602. return;
  603. }
  604. cnv=pArgs->converter;
  605. /* write the BOM if necessary */
  606. if(cnv->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
  607. static const char bom[]={ (char)0xffu, (char)0xfeu };
  608. ucnv_fromUWriteBytes(cnv,
  609. bom, 2,
  610. &pArgs->target, pArgs->targetLimit,
  611. &pArgs->offsets, -1,
  612. pErrorCode);
  613. cnv->fromUnicodeStatus=0;
  614. }
  615. target=pArgs->target;
  616. if(target >= pArgs->targetLimit) {
  617. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  618. return;
  619. }
  620. targetCapacity=(uint32_t)(pArgs->targetLimit-pArgs->target);
  621. offsets=pArgs->offsets;
  622. sourceIndex=0;
  623. /* c!=0 indicates in several places outside the main loops that a surrogate was found */
  624. if((c=(char16_t)cnv->fromUChar32)!=0 && U16_IS_TRAIL(trail=*source) && targetCapacity>=4) {
  625. /* the last buffer ended with a lead surrogate, output the surrogate pair */
  626. ++source;
  627. --length;
  628. target[0]=(uint8_t)c;
  629. target[1]=(uint8_t)(c>>8);
  630. target[2]=(uint8_t)trail;
  631. target[3]=(uint8_t)(trail>>8);
  632. target+=4;
  633. targetCapacity-=4;
  634. if(offsets!=nullptr) {
  635. *offsets++=-1;
  636. *offsets++=-1;
  637. *offsets++=-1;
  638. *offsets++=-1;
  639. }
  640. sourceIndex=1;
  641. cnv->fromUChar32=c=0;
  642. }
  643. if(c==0) {
  644. /* copy an even number of bytes for complete UChars */
  645. uint32_t count=2*length;
  646. if(count>targetCapacity) {
  647. count=targetCapacity&~1;
  648. }
  649. /* count is even */
  650. targetCapacity-=count;
  651. count>>=1;
  652. length-=count;
  653. if(offsets==nullptr) {
  654. while(count>0) {
  655. c=*source++;
  656. if(U16_IS_SINGLE(c)) {
  657. target[0]=(uint8_t)c;
  658. target[1]=(uint8_t)(c>>8);
  659. target+=2;
  660. } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
  661. ++source;
  662. --count;
  663. target[0]=(uint8_t)c;
  664. target[1]=(uint8_t)(c>>8);
  665. target[2]=(uint8_t)trail;
  666. target[3]=(uint8_t)(trail>>8);
  667. target+=4;
  668. } else {
  669. break;
  670. }
  671. --count;
  672. }
  673. } else {
  674. while(count>0) {
  675. c=*source++;
  676. if(U16_IS_SINGLE(c)) {
  677. target[0]=(uint8_t)c;
  678. target[1]=(uint8_t)(c>>8);
  679. target+=2;
  680. *offsets++=sourceIndex;
  681. *offsets++=sourceIndex++;
  682. } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
  683. ++source;
  684. --count;
  685. target[0]=(uint8_t)c;
  686. target[1]=(uint8_t)(c>>8);
  687. target[2]=(uint8_t)trail;
  688. target[3]=(uint8_t)(trail>>8);
  689. target+=4;
  690. *offsets++=sourceIndex;
  691. *offsets++=sourceIndex;
  692. *offsets++=sourceIndex;
  693. *offsets++=sourceIndex;
  694. sourceIndex+=2;
  695. } else {
  696. break;
  697. }
  698. --count;
  699. }
  700. }
  701. if(count==0) {
  702. /* done with the loop for complete UChars */
  703. if(length>0 && targetCapacity>0) {
  704. /*
  705. * there is more input and some target capacity -
  706. * it must be targetCapacity==1 because otherwise
  707. * the above would have copied more;
  708. * prepare for overflow output
  709. */
  710. if(U16_IS_SINGLE(c=*source++)) {
  711. overflow[0]=(char)c;
  712. overflow[1]=(char)(c>>8);
  713. length=2; /* 2 bytes to output */
  714. c=0;
  715. /* } else { keep c for surrogate handling, length will be set there */
  716. }
  717. } else {
  718. length=0;
  719. c=0;
  720. }
  721. } else {
  722. /* keep c for surrogate handling, length will be set there */
  723. targetCapacity+=2*count;
  724. }
  725. } else {
  726. length=0; /* from here on, length counts the bytes in overflow[] */
  727. }
  728. if(c!=0) {
  729. /*
  730. * c is a surrogate, and
  731. * - source or target too short
  732. * - or the surrogate is unmatched
  733. */
  734. length=0;
  735. if(U16_IS_SURROGATE_LEAD(c)) {
  736. if(source<pArgs->sourceLimit) {
  737. if(U16_IS_TRAIL(trail=*source)) {
  738. /* output the surrogate pair, will overflow (see conditions comment above) */
  739. ++source;
  740. overflow[0]=(char)c;
  741. overflow[1]=(char)(c>>8);
  742. overflow[2]=(char)trail;
  743. overflow[3]=(char)(trail>>8);
  744. length=4; /* 4 bytes to output */
  745. c=0;
  746. } else {
  747. /* unmatched lead surrogate */
  748. *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  749. }
  750. } else {
  751. /* see if the trail surrogate is in the next buffer */
  752. }
  753. } else {
  754. /* unmatched trail surrogate */
  755. *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  756. }
  757. cnv->fromUChar32=c;
  758. }
  759. if(length>0) {
  760. /* output length bytes with overflow (length>targetCapacity>0) */
  761. ucnv_fromUWriteBytes(cnv,
  762. overflow, length,
  763. &target, pArgs->targetLimit,
  764. &offsets, sourceIndex,
  765. pErrorCode);
  766. targetCapacity = static_cast<uint32_t>(pArgs->targetLimit - target);
  767. }
  768. if(U_SUCCESS(*pErrorCode) && source<pArgs->sourceLimit && targetCapacity==0) {
  769. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  770. }
  771. /* write back the updated pointers */
  772. pArgs->source=source;
  773. pArgs->target=target;
  774. pArgs->offsets=offsets;
  775. }
  776. static void U_CALLCONV
  777. _UTF16LEToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
  778. UErrorCode *pErrorCode) {
  779. UConverter *cnv;
  780. const uint8_t *source;
  781. char16_t *target;
  782. int32_t *offsets;
  783. uint32_t targetCapacity, length, count, sourceIndex;
  784. char16_t c, trail;
  785. if(pArgs->converter->mode<8) {
  786. _UTF16ToUnicodeWithOffsets(pArgs, pErrorCode);
  787. return;
  788. }
  789. cnv=pArgs->converter;
  790. source=(const uint8_t *)pArgs->source;
  791. length=(int32_t)((const uint8_t *)pArgs->sourceLimit-source);
  792. if(length<=0 && cnv->toUnicodeStatus==0) {
  793. /* no input, nothing to do */
  794. return;
  795. }
  796. target=pArgs->target;
  797. if(target >= pArgs->targetLimit) {
  798. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  799. return;
  800. }
  801. targetCapacity=(uint32_t)(pArgs->targetLimit-pArgs->target);
  802. offsets=pArgs->offsets;
  803. sourceIndex=0;
  804. c=0;
  805. /* complete a partial char16_t or pair from the last call */
  806. if(cnv->toUnicodeStatus!=0) {
  807. /*
  808. * special case: single byte from a previous buffer,
  809. * where the byte turned out not to belong to a trail surrogate
  810. * and the preceding, unmatched lead surrogate was put into toUBytes[]
  811. * for error handling
  812. */
  813. cnv->toUBytes[0]=(uint8_t)cnv->toUnicodeStatus;
  814. cnv->toULength=1;
  815. cnv->toUnicodeStatus=0;
  816. }
  817. if((count=cnv->toULength)!=0) {
  818. uint8_t *p=cnv->toUBytes;
  819. do {
  820. p[count++]=*source++;
  821. ++sourceIndex;
  822. --length;
  823. if(count==2) {
  824. c=((char16_t)p[1]<<8)|p[0];
  825. if(U16_IS_SINGLE(c)) {
  826. /* output the BMP code point */
  827. *target++=c;
  828. if(offsets!=nullptr) {
  829. *offsets++=-1;
  830. }
  831. --targetCapacity;
  832. count=0;
  833. c=0;
  834. break;
  835. } else if(U16_IS_SURROGATE_LEAD(c)) {
  836. /* continue collecting bytes for the trail surrogate */
  837. c=0; /* avoid unnecessary surrogate handling below */
  838. } else {
  839. /* fall through to error handling for an unmatched trail surrogate */
  840. break;
  841. }
  842. } else if(count==4) {
  843. c=((char16_t)p[1]<<8)|p[0];
  844. trail=((char16_t)p[3]<<8)|p[2];
  845. if(U16_IS_TRAIL(trail)) {
  846. /* output the surrogate pair */
  847. *target++=c;
  848. if(targetCapacity>=2) {
  849. *target++=trail;
  850. if(offsets!=nullptr) {
  851. *offsets++=-1;
  852. *offsets++=-1;
  853. }
  854. targetCapacity-=2;
  855. } else /* targetCapacity==1 */ {
  856. targetCapacity=0;
  857. cnv->UCharErrorBuffer[0]=trail;
  858. cnv->UCharErrorBufferLength=1;
  859. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  860. }
  861. count=0;
  862. c=0;
  863. break;
  864. } else {
  865. /* unmatched lead surrogate, handle here for consistent toUBytes[] */
  866. *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  867. /* back out reading the code unit after it */
  868. if(((const uint8_t *)pArgs->source-source)>=2) {
  869. source-=2;
  870. } else {
  871. /*
  872. * if the trail unit's first byte was in a previous buffer, then
  873. * we need to put it into a special place because toUBytes[] will be
  874. * used for the lead unit's bytes
  875. */
  876. cnv->toUnicodeStatus=0x100|p[2];
  877. --source;
  878. }
  879. cnv->toULength=2;
  880. /* write back the updated pointers */
  881. pArgs->source=(const char *)source;
  882. pArgs->target=target;
  883. pArgs->offsets=offsets;
  884. return;
  885. }
  886. }
  887. } while(length>0);
  888. cnv->toULength=(int8_t)count;
  889. }
  890. /* copy an even number of bytes for complete UChars */
  891. count=2*targetCapacity;
  892. if(count>length) {
  893. count=length&~1;
  894. }
  895. if(c==0 && count>0) {
  896. length-=count;
  897. count>>=1;
  898. targetCapacity-=count;
  899. if(offsets==nullptr) {
  900. do {
  901. c=((char16_t)source[1]<<8)|source[0];
  902. source+=2;
  903. if(U16_IS_SINGLE(c)) {
  904. *target++=c;
  905. } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
  906. U16_IS_TRAIL(trail=((char16_t)source[1]<<8)|source[0])
  907. ) {
  908. source+=2;
  909. --count;
  910. *target++=c;
  911. *target++=trail;
  912. } else {
  913. break;
  914. }
  915. } while(--count>0);
  916. } else {
  917. do {
  918. c=((char16_t)source[1]<<8)|source[0];
  919. source+=2;
  920. if(U16_IS_SINGLE(c)) {
  921. *target++=c;
  922. *offsets++=sourceIndex;
  923. sourceIndex+=2;
  924. } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
  925. U16_IS_TRAIL(trail=((char16_t)source[1]<<8)|source[0])
  926. ) {
  927. source+=2;
  928. --count;
  929. *target++=c;
  930. *target++=trail;
  931. *offsets++=sourceIndex;
  932. *offsets++=sourceIndex;
  933. sourceIndex+=4;
  934. } else {
  935. break;
  936. }
  937. } while(--count>0);
  938. }
  939. if(count==0) {
  940. /* done with the loop for complete UChars */
  941. c=0;
  942. } else {
  943. /* keep c for surrogate handling, trail will be set there */
  944. length+=2*(count-1); /* one more byte pair was consumed than count decremented */
  945. targetCapacity+=count;
  946. }
  947. }
  948. if(c!=0) {
  949. /*
  950. * c is a surrogate, and
  951. * - source or target too short
  952. * - or the surrogate is unmatched
  953. */
  954. cnv->toUBytes[0]=(uint8_t)c;
  955. cnv->toUBytes[1]=(uint8_t)(c>>8);
  956. cnv->toULength=2;
  957. if(U16_IS_SURROGATE_LEAD(c)) {
  958. if(length>=2) {
  959. if(U16_IS_TRAIL(trail=((char16_t)source[1]<<8)|source[0])) {
  960. /* output the surrogate pair, will overflow (see conditions comment above) */
  961. source+=2;
  962. length-=2;
  963. *target++=c;
  964. if(offsets!=nullptr) {
  965. *offsets++=sourceIndex;
  966. }
  967. cnv->UCharErrorBuffer[0]=trail;
  968. cnv->UCharErrorBufferLength=1;
  969. cnv->toULength=0;
  970. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  971. } else {
  972. /* unmatched lead surrogate */
  973. *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  974. }
  975. } else {
  976. /* see if the trail surrogate is in the next buffer */
  977. }
  978. } else {
  979. /* unmatched trail surrogate */
  980. *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  981. }
  982. }
  983. if(U_SUCCESS(*pErrorCode)) {
  984. /* check for a remaining source byte */
  985. if(length>0) {
  986. if(targetCapacity==0) {
  987. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  988. } else {
  989. /* it must be length==1 because otherwise the above would have copied more */
  990. cnv->toUBytes[cnv->toULength++]=*source++;
  991. }
  992. }
  993. }
  994. /* write back the updated pointers */
  995. pArgs->source=(const char *)source;
  996. pArgs->target=target;
  997. pArgs->offsets=offsets;
  998. }
  999. static UChar32 U_CALLCONV
  1000. _UTF16LEGetNextUChar(UConverterToUnicodeArgs *pArgs, UErrorCode *err) {
  1001. const uint8_t *s, *sourceLimit;
  1002. UChar32 c;
  1003. if(pArgs->converter->mode<8) {
  1004. return UCNV_GET_NEXT_UCHAR_USE_TO_U;
  1005. }
  1006. s=(const uint8_t *)pArgs->source;
  1007. sourceLimit=(const uint8_t *)pArgs->sourceLimit;
  1008. if(s>=sourceLimit) {
  1009. /* no input */
  1010. *err=U_INDEX_OUTOFBOUNDS_ERROR;
  1011. return 0xffff;
  1012. }
  1013. if(s+2>sourceLimit) {
  1014. /* only one byte: truncated char16_t */
  1015. pArgs->converter->toUBytes[0]=*s++;
  1016. pArgs->converter->toULength=1;
  1017. pArgs->source=(const char *)s;
  1018. *err = U_TRUNCATED_CHAR_FOUND;
  1019. return 0xffff;
  1020. }
  1021. /* get one char16_t */
  1022. c=((UChar32)s[1]<<8)|*s;
  1023. s+=2;
  1024. /* check for a surrogate pair */
  1025. if(U_IS_SURROGATE(c)) {
  1026. if(U16_IS_SURROGATE_LEAD(c)) {
  1027. if(s+2<=sourceLimit) {
  1028. char16_t trail;
  1029. /* get a second char16_t and see if it is a trail surrogate */
  1030. trail=((char16_t)s[1]<<8)|*s;
  1031. if(U16_IS_TRAIL(trail)) {
  1032. c=U16_GET_SUPPLEMENTARY(c, trail);
  1033. s+=2;
  1034. } else {
  1035. /* unmatched lead surrogate */
  1036. c=-2;
  1037. }
  1038. } else {
  1039. /* too few (2 or 3) bytes for a surrogate pair: truncated code point */
  1040. uint8_t *bytes=pArgs->converter->toUBytes;
  1041. s-=2;
  1042. pArgs->converter->toULength=(int8_t)(sourceLimit-s);
  1043. do {
  1044. *bytes++=*s++;
  1045. } while(s<sourceLimit);
  1046. c=0xffff;
  1047. *err=U_TRUNCATED_CHAR_FOUND;
  1048. }
  1049. } else {
  1050. /* unmatched trail surrogate */
  1051. c=-2;
  1052. }
  1053. if(c<0) {
  1054. /* write the unmatched surrogate */
  1055. uint8_t *bytes=pArgs->converter->toUBytes;
  1056. pArgs->converter->toULength=2;
  1057. *bytes=*(s-2);
  1058. bytes[1]=*(s-1);
  1059. c=0xffff;
  1060. *err=U_ILLEGAL_CHAR_FOUND;
  1061. }
  1062. }
  1063. pArgs->source=(const char *)s;
  1064. return c;
  1065. }
  1066. static void U_CALLCONV
  1067. _UTF16LEReset(UConverter *cnv, UConverterResetChoice choice) {
  1068. if(choice<=UCNV_RESET_TO_UNICODE) {
  1069. /* reset toUnicode state */
  1070. if(UCNV_GET_VERSION(cnv)==0) {
  1071. cnv->mode=8; /* no BOM handling */
  1072. } else {
  1073. cnv->mode=0; /* Java-specific "UnicodeLittle" requires LE BOM or no BOM */
  1074. }
  1075. }
  1076. if(choice!=UCNV_RESET_TO_UNICODE && UCNV_GET_VERSION(cnv)==1) {
  1077. /* reset fromUnicode for "UnicodeLittle": prepare to output the UTF-16LE BOM */
  1078. cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
  1079. }
  1080. }
  1081. static void U_CALLCONV
  1082. _UTF16LEOpen(UConverter *cnv,
  1083. UConverterLoadArgs *pArgs,
  1084. UErrorCode *pErrorCode) {
  1085. (void)pArgs;
  1086. if(UCNV_GET_VERSION(cnv)<=1) {
  1087. _UTF16LEReset(cnv, UCNV_RESET_BOTH);
  1088. } else {
  1089. *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
  1090. }
  1091. }
  1092. static const char * U_CALLCONV
  1093. _UTF16LEGetName(const UConverter *cnv) {
  1094. if(UCNV_GET_VERSION(cnv)==0) {
  1095. return "UTF-16LE";
  1096. } else {
  1097. return "UTF-16LE,version=1";
  1098. }
  1099. }
  1100. U_CDECL_END
/*
 * Function table for the UTF-16LE converter.
 * The initializer is positional; the meaning of each slot is given by the
 * UConverterImpl declaration, which is not part of this file section.
 * The toUnicode and fromUnicode functions each appear twice in a row
 * (NOTE(review): presumably the plain and the with-offsets slots share one
 * implementation - confirm against the UConverterImpl declaration).
 * nullptr entries are callbacks this converter does not provide.
 */
static const UConverterImpl _UTF16LEImpl={
    UCNV_UTF16_LittleEndian,
    nullptr,
    nullptr,
    _UTF16LEOpen,
    nullptr,
    _UTF16LEReset,
    _UTF16LEToUnicodeWithOffsets,
    _UTF16LEToUnicodeWithOffsets,
    _UTF16LEFromUnicodeWithOffsets,
    _UTF16LEFromUnicodeWithOffsets,
    _UTF16LEGetNextUChar,
    nullptr,
    _UTF16LEGetName,
    nullptr,
    nullptr,
    ucnv_getNonSurrogateUnicodeSet,
    nullptr,
    nullptr
};
/*
 * Static description of the UTF-16LE converter (positional initializer).
 * NOTE(review): the byte array { 0xfd, 0xff } with length 2 looks like
 * U+FFFD in little-endian byte order, presumably the substitution
 * character - confirm against the UConverterStaticData declaration.
 */
static const UConverterStaticData _UTF16LEStaticData={
    sizeof(UConverterStaticData),
    "UTF-16LE",
    1202, UCNV_IBM, UCNV_UTF16_LittleEndian, 2, 2,
    { 0xfd, 0xff, 0, 0 },2,false,false,
    0,
    0,
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
/* Shared data object for UTF-16LE, built from its static data and its function table. */
const UConverterSharedData _UTF16LEData=
    UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF16LEStaticData, &_UTF16LEImpl);
  1132. /* UTF-16 (Detect BOM) ------------------------------------------------------ */
  1133. /*
  1134. * Detect a BOM at the beginning of the stream and select UTF-16BE or UTF-16LE
  1135. * accordingly.
  1136. * This is a simpler version of the UTF-32 converter, with
  1137. * fewer states for shorter BOMs.
  1138. *
  1139. * State values:
  1140. * 0 initial state
  1141. * 1 saw first byte
  1142. * 2..5 -
  1143. * 6..7 see _UTF16ToUnicodeWithOffsets() comments in state 1
  1144. * 8 UTF-16BE mode
  1145. * 9 UTF-16LE mode
  1146. *
  1147. * During detection: state==number of initial bytes seen so far.
  1148. *
  1149. * On output, emit U+FEFF as the first code point.
  1150. *
  1151. * Variants:
  1152. * - UTF-16,version=1 (Java "Unicode" encoding) treats a missing BOM as an error.
  1153. * - UTF-16BE,version=1 (Java "UnicodeBig" encoding) and
  1154. * UTF-16LE,version=1 (Java "UnicodeLittle" encoding) treat a reverse BOM as an error.
  1155. */
  1156. U_CDECL_BEGIN
  1157. static void U_CALLCONV
  1158. _UTF16Reset(UConverter *cnv, UConverterResetChoice choice) {
  1159. if(choice<=UCNV_RESET_TO_UNICODE) {
  1160. /* reset toUnicode: state=0 */
  1161. cnv->mode=0;
  1162. }
  1163. if(choice!=UCNV_RESET_TO_UNICODE) {
  1164. /* reset fromUnicode: prepare to output the UTF-16PE BOM */
  1165. cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
  1166. }
  1167. }
  1168. U_CDECL_END
  1169. extern const UConverterSharedData _UTF16v2Data;
  1170. U_CDECL_BEGIN
  1171. static void U_CALLCONV
  1172. _UTF16Open(UConverter *cnv,
  1173. UConverterLoadArgs *pArgs,
  1174. UErrorCode *pErrorCode) {
  1175. if(UCNV_GET_VERSION(cnv)<=2) {
  1176. if(UCNV_GET_VERSION(cnv)==2 && !pArgs->onlyTestIsLoadable) {
  1177. /*
  1178. * Switch implementation, and switch the staticData that's different
  1179. * and was copied into the UConverter.
  1180. * (See ucnv_createConverterFromSharedData() in ucnv_bld.c.)
  1181. * UTF-16,version=2 fromUnicode() always writes a big-endian byte stream.
  1182. */
  1183. cnv->sharedData=(UConverterSharedData*)&_UTF16v2Data;
  1184. uprv_memcpy(cnv->subChars, _UTF16v2Data.staticData->subChar, UCNV_MAX_SUBCHAR_LEN);
  1185. }
  1186. _UTF16Reset(cnv, UCNV_RESET_BOTH);
  1187. } else {
  1188. *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
  1189. }
  1190. }
  1191. static const char * U_CALLCONV
  1192. _UTF16GetName(const UConverter *cnv) {
  1193. if(UCNV_GET_VERSION(cnv)==0) {
  1194. return "UTF-16";
  1195. } else if(UCNV_GET_VERSION(cnv)==1) {
  1196. return "UTF-16,version=1";
  1197. } else {
  1198. return "UTF-16,version=2";
  1199. }
  1200. }
  1201. U_CDECL_END
  1202. extern const UConverterSharedData _UTF16Data;
  1203. static inline bool IS_UTF16BE(const UConverter *cnv) {
  1204. return ((cnv)->sharedData == &_UTF16BEData);
  1205. }
  1206. static inline bool IS_UTF16LE(const UConverter *cnv) {
  1207. return ((cnv)->sharedData == &_UTF16LEData);
  1208. }
  1209. static inline bool IS_UTF16(const UConverter *cnv) {
  1210. return ((cnv)->sharedData==&_UTF16Data) || ((cnv)->sharedData == &_UTF16v2Data);
  1211. }
  1212. U_CDECL_BEGIN
  1213. static void U_CALLCONV
  1214. _UTF16ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
  1215. UErrorCode *pErrorCode) {
  1216. UConverter *cnv=pArgs->converter;
  1217. const char *source=pArgs->source;
  1218. const char *sourceLimit=pArgs->sourceLimit;
  1219. int32_t *offsets=pArgs->offsets;
  1220. int32_t state, offsetDelta;
  1221. uint8_t b;
  1222. state=cnv->mode;
  1223. /*
  1224. * If we detect a BOM in this buffer, then we must add the BOM size to the
  1225. * offsets because the actual converter function will not see and count the BOM.
  1226. * offsetDelta will have the number of the BOM bytes that are in the current buffer.
  1227. */
  1228. offsetDelta=0;
  1229. while(source<sourceLimit && U_SUCCESS(*pErrorCode)) {
  1230. switch(state) {
  1231. case 0:
  1232. cnv->toUBytes[0]=(uint8_t)*source++;
  1233. cnv->toULength=1;
  1234. state=1;
  1235. break;
  1236. case 1:
  1237. /*
  1238. * Only inside this switch case can the state variable
  1239. * temporarily take two additional values:
  1240. * 6: BOM error, continue with BE
  1241. * 7: BOM error, continue with LE
  1242. */
  1243. b=*source;
  1244. if(cnv->toUBytes[0]==0xfe && b==0xff) {
  1245. if(IS_UTF16LE(cnv)) {
  1246. state=7; /* illegal reverse BOM for Java "UnicodeLittle" */
  1247. } else {
  1248. state=8; /* detect UTF-16BE */
  1249. }
  1250. } else if(cnv->toUBytes[0]==0xff && b==0xfe) {
  1251. if(IS_UTF16BE(cnv)) {
  1252. state=6; /* illegal reverse BOM for Java "UnicodeBig" */
  1253. } else {
  1254. state=9; /* detect UTF-16LE */
  1255. }
  1256. } else if((IS_UTF16(cnv) && UCNV_GET_VERSION(cnv)==1)) {
  1257. state=6; /* illegal missing BOM for Java "Unicode" */
  1258. }
  1259. if(state>=8) {
  1260. /* BOM detected, consume it */
  1261. ++source;
  1262. cnv->toULength=0;
  1263. offsetDelta=(int32_t)(source-pArgs->source);
  1264. } else if(state<6) {
  1265. /* ok: no BOM, and not a reverse BOM */
  1266. if(source!=pArgs->source) {
  1267. /* reset the source for a correct first offset */
  1268. source=pArgs->source;
  1269. cnv->toULength=0;
  1270. }
  1271. if(IS_UTF16LE(cnv)) {
  1272. /* Make Java "UnicodeLittle" default to LE. */
  1273. state=9;
  1274. } else {
  1275. /* Make standard UTF-16 and Java "UnicodeBig" default to BE. */
  1276. state=8;
  1277. }
  1278. } else {
  1279. /*
  1280. * error: missing BOM, or reverse BOM
  1281. * UTF-16,version=1: Java-specific "Unicode" requires a BOM.
  1282. * UTF-16BE,version=1: Java-specific "UnicodeBig" requires a BE BOM or no BOM.
  1283. * UTF-16LE,version=1: Java-specific "UnicodeLittle" requires an LE BOM or no BOM.
  1284. */
  1285. /* report the non-BOM or reverse BOM as an illegal sequence */
  1286. cnv->toUBytes[1]=b;
  1287. cnv->toULength=2;
  1288. pArgs->source=source+1;
  1289. /* continue with conversion if the callback resets the error */
  1290. /*
  1291. * Make Java "Unicode" default to BE like standard UTF-16.
  1292. * Make Java "UnicodeBig" and "UnicodeLittle" default
  1293. * to their normal endiannesses.
  1294. */
  1295. cnv->mode=state+2;
  1296. *pErrorCode=U_ILLEGAL_ESCAPE_SEQUENCE;
  1297. return;
  1298. }
  1299. /* convert the rest of the stream */
  1300. cnv->mode=state;
  1301. continue;
  1302. case 8:
  1303. /* call UTF-16BE */
  1304. pArgs->source=source;
  1305. _UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode);
  1306. source=pArgs->source;
  1307. break;
  1308. case 9:
  1309. /* call UTF-16LE */
  1310. pArgs->source=source;
  1311. _UTF16LEToUnicodeWithOffsets(pArgs, pErrorCode);
  1312. source=pArgs->source;
  1313. break;
  1314. default:
  1315. break; /* does not occur */
  1316. }
  1317. }
  1318. /* add BOM size to offsets - see comment at offsetDelta declaration */
  1319. if(offsets!=nullptr && offsetDelta!=0) {
  1320. int32_t *offsetsLimit=pArgs->offsets;
  1321. while(offsets<offsetsLimit) {
  1322. *offsets++ += offsetDelta;
  1323. }
  1324. }
  1325. pArgs->source=source;
  1326. if(source==sourceLimit && pArgs->flush) {
  1327. /* handle truncated input */
  1328. switch(state) {
  1329. case 0:
  1330. break; /* no input at all, nothing to do */
  1331. case 8:
  1332. _UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode);
  1333. break;
  1334. case 9:
  1335. _UTF16LEToUnicodeWithOffsets(pArgs, pErrorCode);
  1336. break;
  1337. default:
  1338. /* 0<state<8: framework will report truncation, nothing to do here */
  1339. break;
  1340. }
  1341. }
  1342. cnv->mode=state;
  1343. }
  1344. static UChar32 U_CALLCONV
  1345. _UTF16GetNextUChar(UConverterToUnicodeArgs *pArgs,
  1346. UErrorCode *pErrorCode) {
  1347. switch(pArgs->converter->mode) {
  1348. case 8:
  1349. return _UTF16BEGetNextUChar(pArgs, pErrorCode);
  1350. case 9:
  1351. return _UTF16LEGetNextUChar(pArgs, pErrorCode);
  1352. default:
  1353. return UCNV_GET_NEXT_UCHAR_USE_TO_U;
  1354. }
  1355. }
  1356. U_CDECL_END
/*
 * Function table for the BOM-detecting UTF-16 converter (versions 0 and 1).
 * The initializer is positional; slot meanings come from the UConverterImpl
 * declaration, which is not part of this file section.
 * Input goes through _UTF16ToUnicodeWithOffsets; output uses
 * _UTF16PEFromUnicodeWithOffsets.
 * nullptr entries are callbacks this converter does not provide.
 */
static const UConverterImpl _UTF16Impl = {
    UCNV_UTF16,
    nullptr,
    nullptr,
    _UTF16Open,
    nullptr,
    _UTF16Reset,
    _UTF16ToUnicodeWithOffsets,
    _UTF16ToUnicodeWithOffsets,
    _UTF16PEFromUnicodeWithOffsets,
    _UTF16PEFromUnicodeWithOffsets,
    _UTF16GetNextUChar,
    nullptr, /* ### TODO implement getStarters for all Unicode encodings?! */
    _UTF16GetName,
    nullptr,
    nullptr,
    ucnv_getNonSurrogateUnicodeSet,
    nullptr,
    nullptr
};
/*
 * Static description of the BOM-detecting UTF-16 converter (positional
 * initializer). The two-byte array is selected by U_IS_BIG_ENDIAN.
 * NOTE(review): the bytes look like U+FFFD in platform byte order,
 * presumably the substitution character - confirm against the
 * UConverterStaticData declaration.
 */
static const UConverterStaticData _UTF16StaticData = {
    sizeof(UConverterStaticData),
    "UTF-16",
    1204, /* CCSID for BOM sensitive UTF-16 */
    UCNV_IBM, UCNV_UTF16, 2, 2,
#if U_IS_BIG_ENDIAN
    { 0xff, 0xfd, 0, 0 }, 2,
#else
    { 0xfd, 0xff, 0, 0 }, 2,
#endif
    false, false,
    0,
    0,
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
/* Shared data object for BOM-detecting UTF-16, built from its static data and its function table. */
const UConverterSharedData _UTF16Data =
    UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF16StaticData, &_UTF16Impl);
/*
 * Function table for UTF-16,version=2.
 * Same entries as _UTF16Impl except for output, which uses
 * _UTF16BEFromUnicodeWithOffsets instead of the PE function.
 * _UTF16Open switches a converter to this table when version 2 is requested.
 */
static const UConverterImpl _UTF16v2Impl = {
    UCNV_UTF16,
    nullptr,
    nullptr,
    _UTF16Open,
    nullptr,
    _UTF16Reset,
    _UTF16ToUnicodeWithOffsets,
    _UTF16ToUnicodeWithOffsets,
    _UTF16BEFromUnicodeWithOffsets,
    _UTF16BEFromUnicodeWithOffsets,
    _UTF16GetNextUChar,
    nullptr, /* ### TODO implement getStarters for all Unicode encodings?! */
    _UTF16GetName,
    nullptr,
    nullptr,
    ucnv_getNonSurrogateUnicodeSet,
    nullptr,
    nullptr
};
/*
 * Static description of UTF-16,version=2 (positional initializer).
 * Unlike _UTF16StaticData, the two-byte array is fixed at { 0xff, 0xfd }
 * regardless of platform endianness.
 * NOTE(review): presumably U+FFFD in big-endian byte order as the
 * substitution character - confirm against the UConverterStaticData
 * declaration.
 */
static const UConverterStaticData _UTF16v2StaticData = {
    sizeof(UConverterStaticData),
    "UTF-16,version=2",
    1204, /* CCSID for BOM sensitive UTF-16 */
    UCNV_IBM, UCNV_UTF16, 2, 2,
    { 0xff, 0xfd, 0, 0 }, 2,
    false, false,
    0,
    0,
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
/* Shared data object for UTF-16,version=2, built from its static data and its function table. */
const UConverterSharedData _UTF16v2Data =
    UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF16v2StaticData, &_UTF16v2Impl);
  1427. #endif