ucnvbocu.cpp 45 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. ******************************************************************************
  5. *
  6. * Copyright (C) 2002-2016, International Business Machines
  7. * Corporation and others. All Rights Reserved.
  8. *
  9. ******************************************************************************
  10. * file name: ucnvbocu.cpp
  11. * encoding: UTF-8
  12. * tab size: 8 (not used)
  13. * indentation:4
  14. *
  15. * created on: 2002mar27
  16. * created by: Markus W. Scherer
  17. *
  18. * This is an implementation of the Binary Ordered Compression for Unicode,
  19. * in its MIME-friendly form as defined in http://www.unicode.org/notes/tn6/
  20. */
  21. #include "unicode/utypes.h"
  22. #if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
  23. #include "unicode/ucnv.h"
  24. #include "unicode/ucnv_cb.h"
  25. #include "unicode/utf16.h"
  26. #include "putilimp.h"
  27. #include "ucnv_bld.h"
  28. #include "ucnv_cnv.h"
  29. #include "uassert.h"
  30. /* BOCU-1 constants and macros ---------------------------------------------- */
  31. /*
  32. * BOCU-1 encodes the code points of a Unicode string as
  33. * a sequence of byte-encoded differences (slope detection),
  34. * preserving lexical order.
  35. *
  36. * Optimize the difference-taking for runs of Unicode text within
  37. * small scripts:
  38. *
  39. * Most small scripts are allocated within aligned 128-blocks of Unicode
  40. * code points. Lexical order is preserved if the "previous code point" state
  41. * is always moved into the middle of such a block.
  42. *
  43. * Additionally, "prev" is moved from anywhere in the Unihan and Hangul
  44. * areas into the middle of those areas.
  45. *
  46. * C0 control codes and space are encoded with their US-ASCII bytes.
  47. * "prev" is reset for C0 controls but not for space.
  48. */
  49. /* initial value for "prev": middle of the ASCII range */
  50. #define BOCU1_ASCII_PREV 0x40
  51. /* bounding byte values for differences */
  52. #define BOCU1_MIN 0x21
  53. #define BOCU1_MIDDLE 0x90
  54. #define BOCU1_MAX_LEAD 0xfe
  55. #define BOCU1_MAX_TRAIL 0xff
  56. #define BOCU1_RESET 0xff
  57. /* number of lead bytes: 0xfe-0x21+1 == 222 */
  58. #define BOCU1_COUNT (BOCU1_MAX_LEAD-BOCU1_MIN+1)
  59. /* adjust trail byte counts for the use of some C0 control byte values */
  60. #define BOCU1_TRAIL_CONTROLS_COUNT 20
  /* 0x21-20 == 13: added to a trail value >=20 to obtain its external byte value */
  61. #define BOCU1_TRAIL_BYTE_OFFSET (BOCU1_MIN-BOCU1_TRAIL_CONTROLS_COUNT)
  62. /* number of trail bytes: (0xff-0x21+1)+20 == 243 */
  63. #define BOCU1_TRAIL_COUNT ((BOCU1_MAX_TRAIL-BOCU1_MIN+1)+BOCU1_TRAIL_CONTROLS_COUNT)
  64. /*
  65. * number of positive and negative single-byte codes
  66. * (counting 0==BOCU1_MIDDLE among the positive ones)
  67. */
  68. #define BOCU1_SINGLE 64
  69. /* number of lead bytes for positive and negative 2/3/4-byte sequences */
  70. #define BOCU1_LEAD_2 43
  71. #define BOCU1_LEAD_3 3
  72. #define BOCU1_LEAD_4 1
  73. /* The difference value range for single-byters: -64..63 */
  74. #define BOCU1_REACH_POS_1 (BOCU1_SINGLE-1)
  75. #define BOCU1_REACH_NEG_1 (-BOCU1_SINGLE)
  76. /* The difference value range for double-byters: -10513..10512 (64+43*243 values per sign) */
  77. #define BOCU1_REACH_POS_2 (BOCU1_REACH_POS_1+BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)
  78. #define BOCU1_REACH_NEG_2 (BOCU1_REACH_NEG_1-BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)
  79. /* The difference value range for 3-byters: -187660..187659 */
  80. #define BOCU1_REACH_POS_3 \
  81. (BOCU1_REACH_POS_2+BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)
  82. #define BOCU1_REACH_NEG_3 (BOCU1_REACH_NEG_2-BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)
  83. /* The lead byte start values. */
  /*
   * Evaluated: START_POS_2==0xd0, START_POS_3==0xfb, START_POS_4==0xfe,
   *            START_NEG_2==0x50, START_NEG_3==0x25, START_NEG_4==0x22.
   * Lead bytes 0x50..0xcf therefore encode single-byte differences.
   */
  84. #define BOCU1_START_POS_2 (BOCU1_MIDDLE+BOCU1_REACH_POS_1+1)
  85. #define BOCU1_START_POS_3 (BOCU1_START_POS_2+BOCU1_LEAD_2)
  86. #define BOCU1_START_POS_4 (BOCU1_START_POS_3+BOCU1_LEAD_3)
  87. /* BOCU1_START_POS_4==BOCU1_MAX_LEAD */
  88. #define BOCU1_START_NEG_2 (BOCU1_MIDDLE+BOCU1_REACH_NEG_1)
  89. #define BOCU1_START_NEG_3 (BOCU1_START_NEG_2-BOCU1_LEAD_2)
  90. #define BOCU1_START_NEG_4 (BOCU1_START_NEG_3-BOCU1_LEAD_3)
  91. /* BOCU1_START_NEG_4==BOCU1_MIN+1 */
  92. /* The length of a byte sequence, according to the lead byte (!=BOCU1_RESET). */
  /* Note: (lead) is evaluated several times; pass an expression without side effects. */
  93. #define BOCU1_LENGTH_FROM_LEAD(lead) \
  94. ((BOCU1_START_NEG_2<=(lead) && (lead)<BOCU1_START_POS_2) ? 1 : \
  95. (BOCU1_START_NEG_3<=(lead) && (lead)<BOCU1_START_POS_3) ? 2 : \
  96. (BOCU1_START_NEG_4<=(lead) && (lead)<BOCU1_START_POS_4) ? 3 : 4)
  97. /* The length of a byte sequence, according to its packed form. */
  /*
   * For 1..3-byte sequences the top byte of the packed value is the length itself;
   * for 4-byte sequences the top byte is the lead byte (always >0x03).
   */
  98. #define BOCU1_LENGTH_FROM_PACKED(packed) \
  99. ((uint32_t)(packed)<0x04000000 ? (packed)>>24 : 4)
  100. /*
  101. * 12 commonly used C0 control codes (and space) are only used to encode
  102. * themselves directly,
  103. * which makes BOCU-1 MIME-usable and reasonably safe for
  104. * ASCII-oriented software.
  105. *
  106. * These controls are
  107. * 0 NUL
  108. *
  109. * 7 BEL
  110. * 8 BS
  111. *
  112. * 9 TAB
  113. * a LF
  114. * b VT
  115. * c FF
  116. * d CR
  117. *
  118. * e SO
  119. * f SI
  120. *
  121. * 1a SUB
  122. * 1b ESC
  123. *
  124. * The other 20 C0 controls are also encoded directly (to preserve order)
  125. * but are also used as trail bytes in difference encoding
  126. * (for better compression).
  127. */
  /*
   * Map an internal trail value t (as used in the difference calculation)
   * to the external byte value: values >=BOCU1_TRAIL_CONTROLS_COUNT are shifted
   * by BOCU1_TRAIL_BYTE_OFFSET, smaller values are looked up in bocu1TrailToByte[].
   * (t) is evaluated more than once; pass an expression without side effects.
   */
  128. #define BOCU1_TRAIL_TO_BYTE(t) ((t)>=BOCU1_TRAIL_CONTROLS_COUNT ? (t)+BOCU1_TRAIL_BYTE_OFFSET : bocu1TrailToByte[t])
  129. /*
  130. * Byte value map for control codes,
  131. * from external byte values 0x00..0x20
  132. * to trail byte values 0..19 (0..0x13) as used in the difference calculation.
  133. * External byte values that are illegal as trail bytes are mapped to -1.
  134. */
  /*
   * Indexed by the external byte value 0x00..0x20 (BOCU1_MIN entries).
   * The -1 entries are 0x00, 0x07..0x0f, 0x1a, 0x1b and 0x20 (space);
   * all other entries are the consecutive trail values 0x00..0x13.
   */
  135. static const int8_t
  136. bocu1ByteToTrail[BOCU1_MIN]={
  137. /* 0 1 2 3 4 5 6 7 */
  138. -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1,
  139. /* 8 9 a b c d e f */
  140. -1, -1, -1, -1, -1, -1, -1, -1,
  141. /* 10 11 12 13 14 15 16 17 */
  142. 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d,
  143. /* 18 19 1a 1b 1c 1d 1e 1f */
  144. 0x0e, 0x0f, -1, -1, 0x10, 0x11, 0x12, 0x13,
  145. /* 20 */
  146. -1
  147. };
  148. /*
  149. * Byte value map for control codes,
  150. * from trail byte values 0..19 (0..0x13) as used in the difference calculation
  151. * to external byte values 0x00..0x20.
  152. */
  /*
   * Indexed by the trail value 0..0x13; yields the external bytes
   * 0x01..0x06, 0x10..0x19 and 0x1c..0x1f, i.e. the inverse of the
   * non-negative entries of bocu1ByteToTrail[].
   */
  153. static const int8_t
  154. bocu1TrailToByte[BOCU1_TRAIL_CONTROLS_COUNT]={
  155. /* 0 1 2 3 4 5 6 7 */
  156. 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11,
  157. /* 8 9 a b c d e f */
  158. 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
  159. /* 10 11 12 13 */
  160. 0x1c, 0x1d, 0x1e, 0x1f
  161. };
  162. /**
  163. * Integer division and modulo with negative numerators
  164. * yields negative modulo results and quotients that are one more than
  165. * what we need here.
  166. * This macro adjusts the results so that the modulo-value m is always >=0
  * (the quotient is decremented and d is added to the remainder when the
  * remainder comes out negative).
  167. *
  168. * For positive n, the if() condition is always false.
  *
  * All three arguments are evaluated more than once; pass plain variables or
  * constants without side effects.
  169. *
  170. * @param n Number to be split into quotient and rest.
  171. * Will be modified to contain the quotient.
  172. * @param d Divisor.
  173. * @param m Output variable for the rest (modulo result).
  174. */
  175. #define NEGDIVMOD(n, d, m) UPRV_BLOCK_MACRO_BEGIN { \
  176. (m)=(n)%(d); \
  177. (n)/=(d); \
  178. if((m)<0) { \
  179. --(n); \
  180. (m)+=(d); \
  181. } \
  182. } UPRV_BLOCK_MACRO_END
  183. /* Faster versions of packDiff() for single-byte-encoded diff values. */
  184. /** Is a diff value encodable in a single byte? (BOCU1_REACH_NEG_1..BOCU1_REACH_POS_1, i.e. -64..63) */
  185. #define DIFF_IS_SINGLE(diff) (BOCU1_REACH_NEG_1<=(diff) && (diff)<=BOCU1_REACH_POS_1)
  186. /** Encode a diff value in a single byte: the byte is BOCU1_MIDDLE (0x90) plus the diff. */
  187. #define PACK_SINGLE_DIFF(diff) (BOCU1_MIDDLE+(diff))
  188. /** Is a diff value encodable in two bytes? (also true for single-byte diffs; test DIFF_IS_SINGLE first) */
  189. #define DIFF_IS_DOUBLE(diff) (BOCU1_REACH_NEG_2<=(diff) && (diff)<=BOCU1_REACH_POS_2)
  190. /* BOCU-1 implementation functions ------------------------------------------ */
  /* Middle of the aligned 128-block containing c: clear the low 7 bits, then add 0x40. */
  191. #define BOCU1_SIMPLE_PREV(c) (((c)&~0x7f)+BOCU1_ASCII_PREV)
  192. /**
  193. * Compute the next "previous" value for differencing
  194. * from the current code point.
  195. *
  196. * @param c current code point, 0x3040..0xd7a3 (rest handled by macro below)
  197. * @return "previous code point" state value
  198. */
  199. static inline int32_t
  200. bocu1Prev(int32_t c) {
  201. /* compute new prev */
  202. if(/* 0x3040<=c && */ c<=0x309f) {
  203. /* Hiragana is not 128-aligned */
  204. return 0x3070;
  205. } else if(0x4e00<=c && c<=0x9fa5) {
  206. /* CJK Unihan */
  207. return 0x4e00-BOCU1_REACH_NEG_2;
  208. } else if(0xac00<=c /* && c<=0xd7a3 */) {
  209. /* Korean Hangul */
  210. return (0xd7a3+0xac00)/2;
  211. } else {
  212. /* mostly small scripts */
  213. return BOCU1_SIMPLE_PREV(c);
  214. }
  215. }
  216. /** Fast version of bocu1Prev() for most scripts. */
  217. #define BOCU1_PREV(c) ((c)<0x3040 || (c)>0xd7a3 ? BOCU1_SIMPLE_PREV(c) : bocu1Prev(c))
  218. /*
  219. * The BOCU-1 converter uses the standard setup code in ucnv.c/ucnv_bld.c.
  220. * The UConverter fields are used as follows:
  221. *
  222. * fromUnicodeStatus encoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
  223. *
  224. * toUnicodeStatus decoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
  225. * mode decoder's incomplete (diff<<2)|count (ignored when toULength==0)
  226. */
  227. /* BOCU-1-from-Unicode conversion functions --------------------------------- */
  228. /**
  229. * Encode a difference -0x10ffff..0x10ffff in 1..4 bytes
  230. * and return a packed integer with them.
  231. *
  232. * The encoding favors small absolute differences with short encodings
  233. * to compress runs of same-script characters.
  234. *
  235. * Optimized version with unrolled loops and fewer floating-point operations
  236. * than the standard packDiff().
  237. *
  238. * @param diff difference value -0x10ffff..0x10ffff
  239. * @return
  240. * 0x010000zz for 1-byte sequence zz
  241. * 0x0200yyzz for 2-byte sequence yy zz
  242. * 0x03xxyyzz for 3-byte sequence xx yy zz
  243. * 0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03)
  244. */
  245. static int32_t
  246. packDiff(int32_t diff) {
  247. int32_t result, m;
  248. U_ASSERT(!DIFF_IS_SINGLE(diff)); /* assume we won't be called where diff==BOCU1_REACH_NEG_1=-64 */
  249. if(diff>=BOCU1_REACH_NEG_1) {
  250. /* mostly positive differences, and single-byte negative ones */
  251. #if 0 /* single-byte case handled in macros, see below */
  252. if(diff<=BOCU1_REACH_POS_1) {
  253. /* single byte */
  254. return 0x01000000|(BOCU1_MIDDLE+diff);
  255. } else
  256. #endif
  257. if(diff<=BOCU1_REACH_POS_2) {
  258. /* two bytes */
  259. diff-=BOCU1_REACH_POS_1+1;
  260. result=0x02000000;
  261. m=diff%BOCU1_TRAIL_COUNT;
  262. diff/=BOCU1_TRAIL_COUNT;
  263. result|=BOCU1_TRAIL_TO_BYTE(m);
  264. result|=(BOCU1_START_POS_2+diff)<<8;
  265. } else if(diff<=BOCU1_REACH_POS_3) {
  266. /* three bytes */
  267. diff-=BOCU1_REACH_POS_2+1;
  268. result=0x03000000;
  269. m=diff%BOCU1_TRAIL_COUNT;
  270. diff/=BOCU1_TRAIL_COUNT;
  271. result|=BOCU1_TRAIL_TO_BYTE(m);
  272. m=diff%BOCU1_TRAIL_COUNT;
  273. diff/=BOCU1_TRAIL_COUNT;
  274. result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
  275. result|=(BOCU1_START_POS_3+diff)<<16;
  276. } else {
  277. /* four bytes */
  278. diff-=BOCU1_REACH_POS_3+1;
  279. m=diff%BOCU1_TRAIL_COUNT;
  280. diff/=BOCU1_TRAIL_COUNT;
  281. result=BOCU1_TRAIL_TO_BYTE(m);
  282. m=diff%BOCU1_TRAIL_COUNT;
  283. diff/=BOCU1_TRAIL_COUNT;
  284. result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
  285. /*
  286. * We know that / and % would deliver quotient 0 and rest=diff.
  287. * Avoid division and modulo for performance.
  288. */
  289. result|=BOCU1_TRAIL_TO_BYTE(diff)<<16;
  290. result|=((uint32_t)BOCU1_START_POS_4)<<24;
  291. }
  292. } else {
  293. /* two- to four-byte negative differences */
  294. if(diff>=BOCU1_REACH_NEG_2) {
  295. /* two bytes */
  296. diff-=BOCU1_REACH_NEG_1;
  297. result=0x02000000;
  298. NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
  299. result|=BOCU1_TRAIL_TO_BYTE(m);
  300. result|=(BOCU1_START_NEG_2+diff)<<8;
  301. } else if(diff>=BOCU1_REACH_NEG_3) {
  302. /* three bytes */
  303. diff-=BOCU1_REACH_NEG_2;
  304. result=0x03000000;
  305. NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
  306. result|=BOCU1_TRAIL_TO_BYTE(m);
  307. NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
  308. result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
  309. result|=(BOCU1_START_NEG_3+diff)<<16;
  310. } else {
  311. /* four bytes */
  312. diff-=BOCU1_REACH_NEG_3;
  313. NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
  314. result=BOCU1_TRAIL_TO_BYTE(m);
  315. NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
  316. result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
  317. /*
  318. * We know that NEGDIVMOD would deliver
  319. * quotient -1 and rest=diff+BOCU1_TRAIL_COUNT.
  320. * Avoid division and modulo for performance.
  321. */
  322. m=diff+BOCU1_TRAIL_COUNT;
  323. result|=BOCU1_TRAIL_TO_BYTE(m)<<16;
  324. result|=BOCU1_MIN<<24;
  325. }
  326. }
  327. return result;
  328. }
  /**
   * BOCU-1 encoder with offsets: converts UTF-16 code units from pArgs->source
   * into BOCU-1 bytes at pArgs->target and stores, for every output byte, the
   * source index of the character that produced it into pArgs->offsets.
   *
   * Converter state used across calls:
   *   cnv->fromUChar32       pending lead surrogate from the previous buffer (0 if none)
   *   cnv->fromUnicodeStatus the "prev" state (0 is treated as BOCU1_ASCII_PREV)
   *
   * When a multi-byte sequence does not fit completely into the target, the
   * bytes that do not fit are stored in cnv->charErrorBuffer and *pErrorCode
   * is set to U_BUFFER_OVERFLOW_ERROR.
   *
   * NOTE(review): offsets is written unconditionally, so this function
   * presumably must only be called with a non-null pArgs->offsets -- confirm
   * against the caller.
   *
   * @param pArgs      source/target pointers and limits, offsets, and the converter
   * @param pErrorCode set to U_BUFFER_OVERFLOW_ERROR when the target is full
   */
  329. static void U_CALLCONV
  330. _Bocu1FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
  331. UErrorCode *pErrorCode) {
  332. UConverter *cnv;
  333. const char16_t *source, *sourceLimit;
  334. uint8_t *target;
  335. int32_t targetCapacity;
  336. int32_t *offsets;
  337. int32_t prev, c, diff;
  338. int32_t sourceIndex, nextSourceIndex;
  339. /* set up the local pointers */
  340. cnv=pArgs->converter;
  341. source=pArgs->source;
  342. sourceLimit=pArgs->sourceLimit;
  343. target=(uint8_t *)pArgs->target;
  344. targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
  345. offsets=pArgs->offsets;
  346. /* get the converter state from UConverter */
  347. c=cnv->fromUChar32;
  348. prev=(int32_t)cnv->fromUnicodeStatus;
  349. if(prev==0) {
  350. prev=BOCU1_ASCII_PREV;
  351. }
  352. /* sourceIndex=-1 if the current character began in the previous buffer */
  353. sourceIndex= c==0 ? 0 : -1;
  354. nextSourceIndex=0;
  355. /* conversion loop */
  /* a pending lead surrogate (c!=0) is completed first by jumping into the regular loop */
  356. if(c!=0 && targetCapacity>0) {
  357. goto getTrail;
  358. }
  359. fastSingle:
  360. /* fast loop for single-byte differences */
  361. /* use only one loop counter variable, targetCapacity, not also source */
  /* targetCapacity temporarily becomes min(targetCapacity, remaining source units) */
  362. diff=(int32_t)(sourceLimit-source);
  363. if(targetCapacity>diff) {
  364. targetCapacity=diff;
  365. }
  366. while(targetCapacity>0 && (c=*source)<0x3000) {
  367. if(c<=0x20) {
  368. if(c!=0x20) {
  369. prev=BOCU1_ASCII_PREV;
  370. }
  371. *target++=(uint8_t)c;
  372. *offsets++=nextSourceIndex++;
  373. ++source;
  374. --targetCapacity;
  375. } else {
  376. diff=c-prev;
  377. if(DIFF_IS_SINGLE(diff)) {
  378. prev=BOCU1_SIMPLE_PREV(c);
  379. *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
  380. *offsets++=nextSourceIndex++;
  381. ++source;
  382. --targetCapacity;
  383. } else {
  /* multi-byte difference: leave c unconsumed for the regular loop */
  384. break;
  385. }
  386. }
  387. }
  388. /* restore real values */
  389. targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target);
  390. sourceIndex=nextSourceIndex; /* wrong if offsets==nullptr but does not matter */
  391. /* regular loop for all cases */
  392. while(source<sourceLimit) {
  393. if(targetCapacity>0) {
  394. c=*source++;
  395. ++nextSourceIndex;
  396. if(c<=0x20) {
  397. /*
  398. * ISO C0 control & space:
  399. * Encode directly for MIME compatibility,
  400. * and reset state except for space, to not disrupt compression.
  401. */
  402. if(c!=0x20) {
  403. prev=BOCU1_ASCII_PREV;
  404. }
  405. *target++=(uint8_t)c;
  406. *offsets++=sourceIndex;
  407. --targetCapacity;
  408. sourceIndex=nextSourceIndex;
  409. continue;
  410. }
  411. if(U16_IS_LEAD(c)) {
  412. getTrail:
  413. if(source<sourceLimit) {
  414. /* test the following code unit */
  /* if it is not a trail surrogate, c stays the unpaired lead surrogate and is encoded as such */
  415. char16_t trail=*source;
  416. if(U16_IS_TRAIL(trail)) {
  417. ++source;
  418. ++nextSourceIndex;
  419. c=U16_GET_SUPPLEMENTARY(c, trail);
  420. }
  421. } else {
  422. /* no more input */
  423. c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
  424. break;
  425. }
  426. }
  427. /*
  428. * all other Unicode code points c==U+0021..U+10ffff
  429. * are encoded with the difference c-prev
  430. *
  431. * a new prev is computed from c,
  432. * placed in the middle of a 0x80-block (for most small scripts) or
  433. * in the middle of the Unihan and Hangul blocks
  434. * to statistically minimize the following difference
  435. */
  436. diff=c-prev;
  437. prev=BOCU1_PREV(c);
  438. if(DIFF_IS_SINGLE(diff)) {
  439. *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
  440. *offsets++=sourceIndex;
  441. --targetCapacity;
  442. sourceIndex=nextSourceIndex;
  /* below U+3000 the fast single-byte loop can take over again */
  443. if(c<0x3000) {
  444. goto fastSingle;
  445. }
  446. } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) {
  447. /* optimize 2-byte case */
  448. int32_t m;
  449. if(diff>=0) {
  450. diff-=BOCU1_REACH_POS_1+1;
  451. m=diff%BOCU1_TRAIL_COUNT;
  452. diff/=BOCU1_TRAIL_COUNT;
  453. diff+=BOCU1_START_POS_2;
  454. } else {
  455. diff-=BOCU1_REACH_NEG_1;
  456. NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
  457. diff+=BOCU1_START_NEG_2;
  458. }
  /* diff now holds the lead byte, m the internal trail value */
  459. *target++=(uint8_t)diff;
  460. *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m);
  461. *offsets++=sourceIndex;
  462. *offsets++=sourceIndex;
  463. targetCapacity-=2;
  464. sourceIndex=nextSourceIndex;
  465. } else {
  466. int32_t length; /* will be 2..4 */
  /* from here on diff holds the packed byte sequence, not the difference */
  467. diff=packDiff(diff);
  468. length=BOCU1_LENGTH_FROM_PACKED(diff);
  469. /* write the output character bytes from diff and length */
  470. /* from the first if in the loop we know that targetCapacity>0 */
  471. if(length<=targetCapacity) {
  472. switch(length) {
  473. /* each branch falls through to the next one */
  474. case 4:
  475. *target++=(uint8_t)(diff>>24);
  476. *offsets++=sourceIndex;
  477. U_FALLTHROUGH;
  478. case 3:
  479. *target++=(uint8_t)(diff>>16);
  480. *offsets++=sourceIndex;
  481. U_FALLTHROUGH;
  482. case 2:
  483. *target++=(uint8_t)(diff>>8);
  484. *offsets++=sourceIndex;
  485. /* case 1: handled above */
  486. *target++=(uint8_t)diff;
  487. *offsets++=sourceIndex;
  488. U_FALLTHROUGH;
  489. default:
  490. /* will never occur */
  491. break;
  492. }
  493. targetCapacity-=length;
  494. sourceIndex=nextSourceIndex;
  495. } else {
  496. uint8_t *charErrorBuffer;
  497. /*
  498. * We actually do this backwards here:
  499. * In order to save an intermediate variable, we output
  500. * first to the overflow buffer what does not fit into the
  501. * regular target.
  502. */
  503. /* we know that 1<=targetCapacity<length<=4 */
  /* length becomes the number of bytes (1..3) that go to the overflow buffer */
  504. length-=targetCapacity;
  505. charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
  506. switch(length) {
  507. /* each branch falls through to the next one */
  508. case 3:
  509. *charErrorBuffer++=(uint8_t)(diff>>16);
  510. U_FALLTHROUGH;
  511. case 2:
  512. *charErrorBuffer++=(uint8_t)(diff>>8);
  513. U_FALLTHROUGH;
  514. case 1:
  515. *charErrorBuffer=(uint8_t)diff;
  516. U_FALLTHROUGH;
  517. default:
  518. /* will never occur */
  519. break;
  520. }
  521. cnv->charErrorBufferLength=(int8_t)length;
  522. /* now output what fits into the regular target */
  523. diff>>=8*length; /* length was reduced by targetCapacity */
  524. switch(targetCapacity) {
  525. /* each branch falls through to the next one */
  526. case 3:
  527. *target++=(uint8_t)(diff>>16);
  528. *offsets++=sourceIndex;
  529. U_FALLTHROUGH;
  530. case 2:
  531. *target++=(uint8_t)(diff>>8);
  532. *offsets++=sourceIndex;
  533. U_FALLTHROUGH;
  534. case 1:
  535. *target++=(uint8_t)diff;
  536. *offsets++=sourceIndex;
  537. U_FALLTHROUGH;
  538. default:
  539. /* will never occur */
  540. break;
  541. }
  542. /* target overflow */
  543. targetCapacity=0;
  544. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  545. break;
  546. }
  547. }
  548. } else {
  549. /* target is full */
  550. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  551. break;
  552. }
  553. }
  554. /* set the converter state back into UConverter */
  /* c<0 only when the input ended after a lead surrogate; it is saved for the next call */
  555. cnv->fromUChar32= c<0 ? -c : 0;
  556. cnv->fromUnicodeStatus=(uint32_t)prev;
  557. /* write back the updated pointers */
  558. pArgs->source=source;
  559. pArgs->target=(char *)target;
  560. pArgs->offsets=offsets;
  561. }
  562. /*
  563. * Identical to _Bocu1FromUnicodeWithOffsets but without offset handling.
  564. * If a change is made in the original function, then either
  565. * change this function the same way or
  566. * re-copy the original function and remove the variables
  567. * offsets, sourceIndex, and nextSourceIndex.
  *
  * Converter state used across calls:
  *   cnv->fromUChar32       pending lead surrogate from the previous buffer (0 if none)
  *   cnv->fromUnicodeStatus the "prev" state (0 is treated as BOCU1_ASCII_PREV)
  *
  * When a multi-byte sequence does not fit completely into the target, the
  * bytes that do not fit are stored in cnv->charErrorBuffer and *pErrorCode
  * is set to U_BUFFER_OVERFLOW_ERROR.
  568. */
  569. static void U_CALLCONV
  570. _Bocu1FromUnicode(UConverterFromUnicodeArgs *pArgs,
  571. UErrorCode *pErrorCode) {
  572. UConverter *cnv;
  573. const char16_t *source, *sourceLimit;
  574. uint8_t *target;
  575. int32_t targetCapacity;
  576. int32_t prev, c, diff;
  577. /* set up the local pointers */
  578. cnv=pArgs->converter;
  579. source=pArgs->source;
  580. sourceLimit=pArgs->sourceLimit;
  581. target=(uint8_t *)pArgs->target;
  582. targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
  583. /* get the converter state from UConverter */
  584. c=cnv->fromUChar32;
  585. prev=(int32_t)cnv->fromUnicodeStatus;
  586. if(prev==0) {
  587. prev=BOCU1_ASCII_PREV;
  588. }
  589. /* conversion loop */
  /* a pending lead surrogate (c!=0) is completed first by jumping into the regular loop */
  590. if(c!=0 && targetCapacity>0) {
  591. goto getTrail;
  592. }
  593. fastSingle:
  594. /* fast loop for single-byte differences */
  595. /* use only one loop counter variable, targetCapacity, not also source */
  /* targetCapacity temporarily becomes min(targetCapacity, remaining source units) */
  596. diff=(int32_t)(sourceLimit-source);
  597. if(targetCapacity>diff) {
  598. targetCapacity=diff;
  599. }
  600. while(targetCapacity>0 && (c=*source)<0x3000) {
  601. if(c<=0x20) {
  602. if(c!=0x20) {
  603. prev=BOCU1_ASCII_PREV;
  604. }
  605. *target++=(uint8_t)c;
  606. } else {
  607. diff=c-prev;
  608. if(DIFF_IS_SINGLE(diff)) {
  609. prev=BOCU1_SIMPLE_PREV(c);
  610. *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
  611. } else {
  /* multi-byte difference: leave c unconsumed for the regular loop */
  612. break;
  613. }
  614. }
  615. ++source;
  616. --targetCapacity;
  617. }
  618. /* restore real values */
  619. targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target);
  620. /* regular loop for all cases */
  621. while(source<sourceLimit) {
  622. if(targetCapacity>0) {
  623. c=*source++;
  624. if(c<=0x20) {
  625. /*
  626. * ISO C0 control & space:
  627. * Encode directly for MIME compatibility,
  628. * and reset state except for space, to not disrupt compression.
  629. */
  630. if(c!=0x20) {
  631. prev=BOCU1_ASCII_PREV;
  632. }
  633. *target++=(uint8_t)c;
  634. --targetCapacity;
  635. continue;
  636. }
  637. if(U16_IS_LEAD(c)) {
  638. getTrail:
  639. if(source<sourceLimit) {
  640. /* test the following code unit */
  /* if it is not a trail surrogate, c stays the unpaired lead surrogate and is encoded as such */
  641. char16_t trail=*source;
  642. if(U16_IS_TRAIL(trail)) {
  643. ++source;
  644. c=U16_GET_SUPPLEMENTARY(c, trail);
  645. }
  646. } else {
  647. /* no more input */
  648. c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
  649. break;
  650. }
  651. }
  652. /*
  653. * all other Unicode code points c==U+0021..U+10ffff
  654. * are encoded with the difference c-prev
  655. *
  656. * a new prev is computed from c,
  657. * placed in the middle of a 0x80-block (for most small scripts) or
  658. * in the middle of the Unihan and Hangul blocks
  659. * to statistically minimize the following difference
  660. */
  661. diff=c-prev;
  662. prev=BOCU1_PREV(c);
  663. if(DIFF_IS_SINGLE(diff)) {
  664. *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
  665. --targetCapacity;
  /* below U+3000 the fast single-byte loop can take over again */
  666. if(c<0x3000) {
  667. goto fastSingle;
  668. }
  669. } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) {
  670. /* optimize 2-byte case */
  671. int32_t m;
  672. if(diff>=0) {
  673. diff-=BOCU1_REACH_POS_1+1;
  674. m=diff%BOCU1_TRAIL_COUNT;
  675. diff/=BOCU1_TRAIL_COUNT;
  676. diff+=BOCU1_START_POS_2;
  677. } else {
  678. diff-=BOCU1_REACH_NEG_1;
  679. NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
  680. diff+=BOCU1_START_NEG_2;
  681. }
  /* diff now holds the lead byte, m the internal trail value */
  682. *target++=(uint8_t)diff;
  683. *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m);
  684. targetCapacity-=2;
  685. } else {
  686. int32_t length; /* will be 2..4 */
  /* from here on diff holds the packed byte sequence, not the difference */
  687. diff=packDiff(diff);
  688. length=BOCU1_LENGTH_FROM_PACKED(diff);
  689. /* write the output character bytes from diff and length */
  690. /* from the first if in the loop we know that targetCapacity>0 */
  691. if(length<=targetCapacity) {
  692. switch(length) {
  693. /* each branch falls through to the next one */
  694. case 4:
  695. *target++=(uint8_t)(diff>>24);
  696. U_FALLTHROUGH;
  697. case 3:
  698. *target++=(uint8_t)(diff>>16);
  /*
   * No "case 2" label: a 2-byte sequence with room for it is written by the
   * optimized branch above, and one without room fails length<=targetCapacity.
   */
  699. /* case 2: handled above */
  700. *target++=(uint8_t)(diff>>8);
  701. /* case 1: handled above */
  702. *target++=(uint8_t)diff;
  703. U_FALLTHROUGH;
  704. default:
  705. /* will never occur */
  706. break;
  707. }
  708. targetCapacity-=length;
  709. } else {
  710. uint8_t *charErrorBuffer;
  711. /*
  712. * We actually do this backwards here:
  713. * In order to save an intermediate variable, we output
  714. * first to the overflow buffer what does not fit into the
  715. * regular target.
  716. */
  717. /* we know that 1<=targetCapacity<length<=4 */
  /* length becomes the number of bytes (1..3) that go to the overflow buffer */
  718. length-=targetCapacity;
  719. charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
  720. switch(length) {
  721. /* each branch falls through to the next one */
  722. case 3:
  723. *charErrorBuffer++=(uint8_t)(diff>>16);
  724. U_FALLTHROUGH;
  725. case 2:
  726. *charErrorBuffer++=(uint8_t)(diff>>8);
  727. U_FALLTHROUGH;
  728. case 1:
  729. *charErrorBuffer=(uint8_t)diff;
  730. U_FALLTHROUGH;
  731. default:
  732. /* will never occur */
  733. break;
  734. }
  735. cnv->charErrorBufferLength=(int8_t)length;
  736. /* now output what fits into the regular target */
  737. diff>>=8*length; /* length was reduced by targetCapacity */
  738. switch(targetCapacity) {
  739. /* each branch falls through to the next one */
  740. case 3:
  741. *target++=(uint8_t)(diff>>16);
  742. U_FALLTHROUGH;
  743. case 2:
  744. *target++=(uint8_t)(diff>>8);
  745. U_FALLTHROUGH;
  746. case 1:
  747. *target++=(uint8_t)diff;
  748. U_FALLTHROUGH;
  749. default:
  750. /* will never occur */
  751. break;
  752. }
  753. /* target overflow */
  754. targetCapacity=0;
  755. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  756. break;
  757. }
  758. }
  759. } else {
  760. /* target is full */
  761. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  762. break;
  763. }
  764. }
  765. /* set the converter state back into UConverter */
  /* c<0 only when the input ended after a lead surrogate; it is saved for the next call */
  766. cnv->fromUChar32= c<0 ? -c : 0;
  767. cnv->fromUnicodeStatus=(uint32_t)prev;
  768. /* write back the updated pointers */
  769. pArgs->source=source;
  770. pArgs->target=(char *)target;
  771. }
  772. /* BOCU-1-to-Unicode conversion functions ----------------------------------- */
  773. /**
  774. * Function for BOCU-1 decoder; handles multi-byte lead bytes.
  775. *
  776. * @param b lead byte;
  777. * BOCU1_MIN<=b<BOCU1_START_NEG_2 or BOCU1_START_POS_2<=b<BOCU1_MAX_LEAD
  778. * @return (diff<<2)|count
  779. */
  780. static inline int32_t
  781. decodeBocu1LeadByte(int32_t b) {
  782. int32_t diff, count;
  783. if(b>=BOCU1_START_NEG_2) {
  784. /* positive difference */
  785. if(b<BOCU1_START_POS_3) {
  786. /* two bytes */
  787. diff=((int32_t)b-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
  788. count=1;
  789. } else if(b<BOCU1_START_POS_4) {
  790. /* three bytes */
  791. diff=((int32_t)b-BOCU1_START_POS_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_2+1;
  792. count=2;
  793. } else {
  794. /* four bytes */
  795. diff=BOCU1_REACH_POS_3+1;
  796. count=3;
  797. }
  798. } else {
  799. /* negative difference */
  800. if(b>=BOCU1_START_NEG_3) {
  801. /* two bytes */
  802. diff=((int32_t)b-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
  803. count=1;
  804. } else if(b>BOCU1_MIN) {
  805. /* three bytes */
  806. diff=((int32_t)b-BOCU1_START_NEG_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_2;
  807. count=2;
  808. } else {
  809. /* four bytes */
  810. diff=-BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_3;
  811. count=3;
  812. }
  813. }
  814. /* return the state for decoding the trail byte(s) */
  815. return ((uint32_t)diff<<2)|count;
  816. }
  817. /**
  818. * Function for BOCU-1 decoder; handles multi-byte trail bytes.
  819. *
  820. * @param count number of remaining trail bytes including this one
  821. * @param b trail byte
  822. * @return new delta for diff including b - <0 indicates an error
  823. *
  824. * @see decodeBocu1
  825. */
  826. static inline int32_t
  827. decodeBocu1TrailByte(int32_t count, int32_t b) {
  828. if(b<=0x20) {
  829. /* skip some C0 controls and make the trail byte range contiguous */
  830. b=bocu1ByteToTrail[b];
  831. /* b<0 for an illegal trail byte value will result in return<0 below */
  832. #if BOCU1_MAX_TRAIL<0xff
  833. } else if(b>BOCU1_MAX_TRAIL) {
  834. return -99;
  835. #endif
  836. } else {
  837. b-=BOCU1_TRAIL_BYTE_OFFSET;
  838. }
  839. /* add trail byte into difference and decrement count */
  840. if(count==1) {
  841. return b;
  842. } else if(count==2) {
  843. return b*BOCU1_TRAIL_COUNT;
  844. } else /* count==3 */ {
  845. return b*(BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT);
  846. }
  847. }
  848. static void U_CALLCONV
  849. _Bocu1ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
  850. UErrorCode *pErrorCode) {
  851. UConverter *cnv;
  852. const uint8_t *source, *sourceLimit;
  853. char16_t *target;
  854. const char16_t *targetLimit;
  855. int32_t *offsets;
  856. int32_t prev, count, diff, c;
  857. int8_t byteIndex;
  858. uint8_t *bytes;
  859. int32_t sourceIndex, nextSourceIndex;
  860. /* set up the local pointers */
  861. cnv=pArgs->converter;
  862. source=(const uint8_t *)pArgs->source;
  863. sourceLimit=(const uint8_t *)pArgs->sourceLimit;
  864. target=pArgs->target;
  865. targetLimit=pArgs->targetLimit;
  866. offsets=pArgs->offsets;
  867. /* get the converter state from UConverter */
  868. prev=(int32_t)cnv->toUnicodeStatus;
  869. if(prev==0) {
  870. prev=BOCU1_ASCII_PREV;
  871. }
  872. diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */
  873. count=diff&3;
  874. diff>>=2;
  875. byteIndex=cnv->toULength;
  876. bytes=cnv->toUBytes;
  877. /* sourceIndex=-1 if the current character began in the previous buffer */
  878. sourceIndex=byteIndex==0 ? 0 : -1;
  879. nextSourceIndex=0;
  880. /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
  881. if(count>0 && byteIndex>0 && target<targetLimit) {
  882. goto getTrail;
  883. }
  884. fastSingle:
  885. /* fast loop for single-byte differences */
  886. /* use count as the only loop counter variable */
  887. diff=(int32_t)(sourceLimit-source);
  888. count=(int32_t)(pArgs->targetLimit-target);
  889. if(count>diff) {
  890. count=diff;
  891. }
  892. while(count>0) {
  893. if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) {
  894. c=prev+(c-BOCU1_MIDDLE);
  895. if(c<0x3000) {
  896. *target++=(char16_t)c;
  897. *offsets++=nextSourceIndex++;
  898. prev=BOCU1_SIMPLE_PREV(c);
  899. } else {
  900. break;
  901. }
  902. } else if(c<=0x20) {
  903. if(c!=0x20) {
  904. prev=BOCU1_ASCII_PREV;
  905. }
  906. *target++=(char16_t)c;
  907. *offsets++=nextSourceIndex++;
  908. } else {
  909. break;
  910. }
  911. ++source;
  912. --count;
  913. }
  914. sourceIndex=nextSourceIndex; /* wrong if offsets==nullptr but does not matter */
  915. /* decode a sequence of single and lead bytes */
  916. while(source<sourceLimit) {
  917. if(target>=targetLimit) {
  918. /* target is full */
  919. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  920. break;
  921. }
  922. ++nextSourceIndex;
  923. c=*source++;
  924. if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) {
  925. /* Write a code point directly from a single-byte difference. */
  926. c=prev+(c-BOCU1_MIDDLE);
  927. if(c<0x3000) {
  928. *target++=(char16_t)c;
  929. *offsets++=sourceIndex;
  930. prev=BOCU1_SIMPLE_PREV(c);
  931. sourceIndex=nextSourceIndex;
  932. goto fastSingle;
  933. }
  934. } else if(c<=0x20) {
  935. /*
  936. * Direct-encoded C0 control code or space.
  937. * Reset prev for C0 control codes but not for space.
  938. */
  939. if(c!=0x20) {
  940. prev=BOCU1_ASCII_PREV;
  941. }
  942. *target++=(char16_t)c;
  943. *offsets++=sourceIndex;
  944. sourceIndex=nextSourceIndex;
  945. continue;
  946. } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) {
  947. /* Optimize two-byte case. */
  948. if(c>=BOCU1_MIDDLE) {
  949. diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
  950. } else {
  951. diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
  952. }
  953. /* trail byte */
  954. ++nextSourceIndex;
  955. c=decodeBocu1TrailByte(1, *source++);
  956. if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) {
  957. bytes[0]=source[-2];
  958. bytes[1]=source[-1];
  959. byteIndex=2;
  960. *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  961. break;
  962. }
  963. } else if(c==BOCU1_RESET) {
  964. /* only reset the state, no code point */
  965. prev=BOCU1_ASCII_PREV;
  966. sourceIndex=nextSourceIndex;
  967. continue;
  968. } else {
  969. /*
  970. * For multi-byte difference lead bytes, set the decoder state
  971. * with the partial difference value from the lead byte and
  972. * with the number of trail bytes.
  973. */
  974. bytes[0]=(uint8_t)c;
  975. byteIndex=1;
  976. diff=decodeBocu1LeadByte(c);
  977. count=diff&3;
  978. diff>>=2;
  979. getTrail:
  980. for(;;) {
  981. if(source>=sourceLimit) {
  982. goto endloop;
  983. }
  984. ++nextSourceIndex;
  985. c=bytes[byteIndex++]=*source++;
  986. /* trail byte in any position */
  987. c=decodeBocu1TrailByte(count, c);
  988. if(c<0) {
  989. *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  990. goto endloop;
  991. }
  992. diff+=c;
  993. if(--count==0) {
  994. /* final trail byte, deliver a code point */
  995. byteIndex=0;
  996. c=prev+diff;
  997. if((uint32_t)c>0x10ffff) {
  998. *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  999. goto endloop;
  1000. }
  1001. break;
  1002. }
  1003. }
  1004. }
  1005. /* calculate the next prev and output c */
  1006. prev=BOCU1_PREV(c);
  1007. if(c<=0xffff) {
  1008. *target++=(char16_t)c;
  1009. *offsets++=sourceIndex;
  1010. } else {
  1011. /* output surrogate pair */
  1012. *target++=U16_LEAD(c);
  1013. if(target<targetLimit) {
  1014. *target++=U16_TRAIL(c);
  1015. *offsets++=sourceIndex;
  1016. *offsets++=sourceIndex;
  1017. } else {
  1018. /* target overflow */
  1019. *offsets++=sourceIndex;
  1020. cnv->UCharErrorBuffer[0]=U16_TRAIL(c);
  1021. cnv->UCharErrorBufferLength=1;
  1022. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  1023. break;
  1024. }
  1025. }
  1026. sourceIndex=nextSourceIndex;
  1027. }
  1028. endloop:
  1029. if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) {
  1030. /* set the converter state in UConverter to deal with the next character */
  1031. cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
  1032. cnv->mode=0;
  1033. } else {
  1034. /* set the converter state back into UConverter */
  1035. cnv->toUnicodeStatus=(uint32_t)prev;
  1036. cnv->mode=(int32_t)((uint32_t)diff<<2)|count;
  1037. }
  1038. cnv->toULength=byteIndex;
  1039. /* write back the updated pointers */
  1040. pArgs->source=(const char *)source;
  1041. pArgs->target=target;
  1042. pArgs->offsets=offsets;
  1043. return;
  1044. }
  1045. /*
  1046. * Identical to _Bocu1ToUnicodeWithOffsets but without offset handling.
  1047. * If a change is made in the original function, then either
  1048. * change this function the same way or
  1049. * re-copy the original function and remove the variables
  1050. * offsets, sourceIndex, and nextSourceIndex.
  1051. */
  1052. static void U_CALLCONV
  1053. _Bocu1ToUnicode(UConverterToUnicodeArgs *pArgs,
  1054. UErrorCode *pErrorCode) {
  1055. UConverter *cnv;
  1056. const uint8_t *source, *sourceLimit;
  1057. char16_t *target;
  1058. const char16_t *targetLimit;
  1059. int32_t prev, count, diff, c;
  1060. int8_t byteIndex;
  1061. uint8_t *bytes;
  1062. /* set up the local pointers */
  1063. cnv=pArgs->converter;
  1064. source=(const uint8_t *)pArgs->source;
  1065. sourceLimit=(const uint8_t *)pArgs->sourceLimit;
  1066. target=pArgs->target;
  1067. targetLimit=pArgs->targetLimit;
  1068. /* get the converter state from UConverter */
  1069. prev=(int32_t)cnv->toUnicodeStatus;
  1070. if(prev==0) {
  1071. prev=BOCU1_ASCII_PREV;
  1072. }
  1073. diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */
  1074. count=diff&3;
  1075. diff>>=2;
  1076. byteIndex=cnv->toULength;
  1077. bytes=cnv->toUBytes;
  1078. /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
  1079. if(count>0 && byteIndex>0 && target<targetLimit) {
  1080. goto getTrail;
  1081. }
  1082. fastSingle:
  1083. /* fast loop for single-byte differences */
  1084. /* use count as the only loop counter variable */
  1085. diff=(int32_t)(sourceLimit-source);
  1086. count=(int32_t)(pArgs->targetLimit-target);
  1087. if(count>diff) {
  1088. count=diff;
  1089. }
  1090. while(count>0) {
  1091. if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) {
  1092. c=prev+(c-BOCU1_MIDDLE);
  1093. if(c<0x3000) {
  1094. *target++=(char16_t)c;
  1095. prev=BOCU1_SIMPLE_PREV(c);
  1096. } else {
  1097. break;
  1098. }
  1099. } else if(c<=0x20) {
  1100. if(c!=0x20) {
  1101. prev=BOCU1_ASCII_PREV;
  1102. }
  1103. *target++=(char16_t)c;
  1104. } else {
  1105. break;
  1106. }
  1107. ++source;
  1108. --count;
  1109. }
  1110. /* decode a sequence of single and lead bytes */
  1111. while(source<sourceLimit) {
  1112. if(target>=targetLimit) {
  1113. /* target is full */
  1114. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  1115. break;
  1116. }
  1117. c=*source++;
  1118. if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) {
  1119. /* Write a code point directly from a single-byte difference. */
  1120. c=prev+(c-BOCU1_MIDDLE);
  1121. if(c<0x3000) {
  1122. *target++=(char16_t)c;
  1123. prev=BOCU1_SIMPLE_PREV(c);
  1124. goto fastSingle;
  1125. }
  1126. } else if(c<=0x20) {
  1127. /*
  1128. * Direct-encoded C0 control code or space.
  1129. * Reset prev for C0 control codes but not for space.
  1130. */
  1131. if(c!=0x20) {
  1132. prev=BOCU1_ASCII_PREV;
  1133. }
  1134. *target++=(char16_t)c;
  1135. continue;
  1136. } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) {
  1137. /* Optimize two-byte case. */
  1138. if(c>=BOCU1_MIDDLE) {
  1139. diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
  1140. } else {
  1141. diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
  1142. }
  1143. /* trail byte */
  1144. c=decodeBocu1TrailByte(1, *source++);
  1145. if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) {
  1146. bytes[0]=source[-2];
  1147. bytes[1]=source[-1];
  1148. byteIndex=2;
  1149. *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  1150. break;
  1151. }
  1152. } else if(c==BOCU1_RESET) {
  1153. /* only reset the state, no code point */
  1154. prev=BOCU1_ASCII_PREV;
  1155. continue;
  1156. } else {
  1157. /*
  1158. * For multi-byte difference lead bytes, set the decoder state
  1159. * with the partial difference value from the lead byte and
  1160. * with the number of trail bytes.
  1161. */
  1162. bytes[0]=(uint8_t)c;
  1163. byteIndex=1;
  1164. diff=decodeBocu1LeadByte(c);
  1165. count=diff&3;
  1166. diff>>=2;
  1167. getTrail:
  1168. for(;;) {
  1169. if(source>=sourceLimit) {
  1170. goto endloop;
  1171. }
  1172. c=bytes[byteIndex++]=*source++;
  1173. /* trail byte in any position */
  1174. c=decodeBocu1TrailByte(count, c);
  1175. if(c<0) {
  1176. *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  1177. goto endloop;
  1178. }
  1179. diff+=c;
  1180. if(--count==0) {
  1181. /* final trail byte, deliver a code point */
  1182. byteIndex=0;
  1183. c=prev+diff;
  1184. if((uint32_t)c>0x10ffff) {
  1185. *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  1186. goto endloop;
  1187. }
  1188. break;
  1189. }
  1190. }
  1191. }
  1192. /* calculate the next prev and output c */
  1193. prev=BOCU1_PREV(c);
  1194. if(c<=0xffff) {
  1195. *target++=(char16_t)c;
  1196. } else {
  1197. /* output surrogate pair */
  1198. *target++=U16_LEAD(c);
  1199. if(target<targetLimit) {
  1200. *target++=U16_TRAIL(c);
  1201. } else {
  1202. /* target overflow */
  1203. cnv->UCharErrorBuffer[0]=U16_TRAIL(c);
  1204. cnv->UCharErrorBufferLength=1;
  1205. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  1206. break;
  1207. }
  1208. }
  1209. }
  1210. endloop:
  1211. if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) {
  1212. /* set the converter state in UConverter to deal with the next character */
  1213. cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
  1214. cnv->mode=0;
  1215. } else {
  1216. /* set the converter state back into UConverter */
  1217. cnv->toUnicodeStatus=(uint32_t)prev;
  1218. cnv->mode=((uint32_t)diff<<2)|count;
  1219. }
  1220. cnv->toULength=byteIndex;
  1221. /* write back the updated pointers */
  1222. pArgs->source=(const char *)source;
  1223. pArgs->target=target;
  1224. return;
  1225. }
  1226. /* miscellaneous ------------------------------------------------------------ */
  1227. static const UConverterImpl _Bocu1Impl={
  1228. UCNV_BOCU1,
  1229. nullptr,
  1230. nullptr,
  1231. nullptr,
  1232. nullptr,
  1233. nullptr,
  1234. _Bocu1ToUnicode,
  1235. _Bocu1ToUnicodeWithOffsets,
  1236. _Bocu1FromUnicode,
  1237. _Bocu1FromUnicodeWithOffsets,
  1238. nullptr,
  1239. nullptr,
  1240. nullptr,
  1241. nullptr,
  1242. nullptr,
  1243. ucnv_getCompleteUnicodeSet,
  1244. nullptr,
  1245. nullptr
  1246. };
  1247. static const UConverterStaticData _Bocu1StaticData={
  1248. sizeof(UConverterStaticData),
  1249. "BOCU-1",
  1250. 1214, /* CCSID for BOCU-1 */
  1251. UCNV_IBM, UCNV_BOCU1,
  1252. 1, 4, /* one char16_t generates at least 1 byte and at most 4 bytes */
  1253. { 0x1a, 0, 0, 0 }, 1, /* BOCU-1 never needs to write a subchar */
  1254. false, false,
  1255. 0,
  1256. 0,
  1257. { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
  1258. };
  1259. const UConverterSharedData _Bocu1Data=
  1260. UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_Bocu1StaticData, &_Bocu1Impl);
  1261. #endif