ucnvscsu.cpp 74 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. ******************************************************************************
  5. *
  6. * Copyright (C) 2000-2016, International Business Machines
  7. * Corporation and others. All Rights Reserved.
  8. *
  9. ******************************************************************************
  10. * file name: ucnvscsu.c
  11. * encoding: UTF-8
  12. * tab size: 8 (not used)
  13. * indentation:4
  14. *
  15. * created on: 2000nov18
  16. * created by: Markus W. Scherer
  17. *
  18. * This is an implementation of the Standard Compression Scheme for Unicode
  19. * as defined in https://www.unicode.org/reports/tr6/ .
  20. * Reserved commands and window settings are treated as illegal sequences and
  21. * will result in callback calls.
  22. */
  23. #include "unicode/utypes.h"
  24. #if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
  25. #include "unicode/ucnv.h"
  26. #include "unicode/ucnv_cb.h"
  27. #include "unicode/utf16.h"
  28. #include "ucnv_bld.h"
  29. #include "ucnv_cnv.h"
  30. #include "cmemory.h"
  31. /* SCSU definitions --------------------------------------------------------- */
  32. /* SCSU command byte values */
  33. enum {
  34. SQ0=0x01, /* Quote from window pair 0 */
  35. SQ7=0x08, /* Quote from window pair 7 */
  36. SDX=0x0B, /* Define a window as extended */
  37. Srs=0x0C, /* reserved */
  38. SQU=0x0E, /* Quote a single Unicode character */
  39. SCU=0x0F, /* Change to Unicode mode */
  40. SC0=0x10, /* Select window 0 */
  41. SC7=0x17, /* Select window 7 */
  42. SD0=0x18, /* Define and select window 0 */
  43. SD7=0x1F, /* Define and select window 7 */
  44. UC0=0xE0, /* Select window 0 */
  45. UC7=0xE7, /* Select window 7 */
  46. UD0=0xE8, /* Define and select window 0 */
  47. UD7=0xEF, /* Define and select window 7 */
  48. UQU=0xF0, /* Quote a single Unicode character */
  49. UDX=0xF1, /* Define a Window as extended */
  50. Urs=0xF2 /* reserved */
  51. };
  52. enum {
  53. /*
  54. * Unicode code points from 3400 to E000 are not adressible by
  55. * dynamic window, since in these areas no short run alphabets are
  56. * found. Therefore add gapOffset to all values from gapThreshold.
  57. */
  58. gapThreshold=0x68,
  59. gapOffset=0xAC00,
  60. /* values between reservedStart and fixedThreshold are reserved */
  61. reservedStart=0xA8,
  62. /* use table of predefined fixed offsets for values from fixedThreshold */
  63. fixedThreshold=0xF9
  64. };
  65. /* constant offsets for the 8 static windows */
  66. static const uint32_t staticOffsets[8]={
  67. 0x0000, /* ASCII for quoted tags */
  68. 0x0080, /* Latin - 1 Supplement (for access to punctuation) */
  69. 0x0100, /* Latin Extended-A */
  70. 0x0300, /* Combining Diacritical Marks */
  71. 0x2000, /* General Punctuation */
  72. 0x2080, /* Currency Symbols */
  73. 0x2100, /* Letterlike Symbols and Number Forms */
  74. 0x3000 /* CJK Symbols and punctuation */
  75. };
  76. /* initial offsets for the 8 dynamic (sliding) windows */
  77. static const uint32_t initialDynamicOffsets[8]={
  78. 0x0080, /* Latin-1 */
  79. 0x00C0, /* Latin Extended A */
  80. 0x0400, /* Cyrillic */
  81. 0x0600, /* Arabic */
  82. 0x0900, /* Devanagari */
  83. 0x3040, /* Hiragana */
  84. 0x30A0, /* Katakana */
  85. 0xFF00 /* Fullwidth ASCII */
  86. };
  87. /* Table of fixed predefined Offsets */
  88. static const uint32_t fixedOffsets[]={
  89. /* 0xF9 */ 0x00C0, /* Latin-1 Letters + half of Latin Extended A */
  90. /* 0xFA */ 0x0250, /* IPA extensions */
  91. /* 0xFB */ 0x0370, /* Greek */
  92. /* 0xFC */ 0x0530, /* Armenian */
  93. /* 0xFD */ 0x3040, /* Hiragana */
  94. /* 0xFE */ 0x30A0, /* Katakana */
  95. /* 0xFF */ 0xFF60 /* Halfwidth Katakana */
  96. };
  97. /* state values */
  98. enum {
  99. readCommand,
  100. quotePairOne,
  101. quotePairTwo,
  102. quoteOne,
  103. definePairOne,
  104. definePairTwo,
  105. defineOne
  106. };
  107. typedef struct SCSUData {
  108. /* dynamic window offsets, initialize to default values from initialDynamicOffsets */
  109. uint32_t toUDynamicOffsets[8];
  110. uint32_t fromUDynamicOffsets[8];
  111. /* state machine state - toUnicode */
  112. UBool toUIsSingleByteMode;
  113. uint8_t toUState;
  114. int8_t toUQuoteWindow, toUDynamicWindow;
  115. uint8_t toUByteOne;
  116. uint8_t toUPadding[3];
  117. /* state machine state - fromUnicode */
  118. UBool fromUIsSingleByteMode;
  119. int8_t fromUDynamicWindow;
  120. /*
  121. * windowUse[] keeps track of the use of the dynamic windows:
  122. * At nextWindowUseIndex there is the least recently used window,
  123. * and the following windows (in a wrapping manner) are more and more
  124. * recently used.
  125. * At nextWindowUseIndex-1 there is the most recently used window.
  126. */
  127. uint8_t locale;
  128. int8_t nextWindowUseIndex;
  129. int8_t windowUse[8];
  130. } SCSUData;
  131. static const int8_t initialWindowUse[8]={ 7, 0, 3, 2, 4, 5, 6, 1 };
  132. static const int8_t initialWindowUse_ja[8]={ 3, 2, 4, 1, 0, 7, 5, 6 };
  133. enum {
  134. lGeneric, l_ja
  135. };
  136. /* SCSU setup functions ----------------------------------------------------- */
  137. U_CDECL_BEGIN
  138. static void U_CALLCONV
  139. _SCSUReset(UConverter *cnv, UConverterResetChoice choice) {
  140. SCSUData *scsu=(SCSUData *)cnv->extraInfo;
  141. if(choice<=UCNV_RESET_TO_UNICODE) {
  142. /* reset toUnicode */
  143. uprv_memcpy(scsu->toUDynamicOffsets, initialDynamicOffsets, 32);
  144. scsu->toUIsSingleByteMode=true;
  145. scsu->toUState=readCommand;
  146. scsu->toUQuoteWindow=scsu->toUDynamicWindow=0;
  147. scsu->toUByteOne=0;
  148. cnv->toULength=0;
  149. }
  150. if(choice!=UCNV_RESET_TO_UNICODE) {
  151. /* reset fromUnicode */
  152. uprv_memcpy(scsu->fromUDynamicOffsets, initialDynamicOffsets, 32);
  153. scsu->fromUIsSingleByteMode=true;
  154. scsu->fromUDynamicWindow=0;
  155. scsu->nextWindowUseIndex=0;
  156. switch(scsu->locale) {
  157. case l_ja:
  158. uprv_memcpy(scsu->windowUse, initialWindowUse_ja, 8);
  159. break;
  160. default:
  161. uprv_memcpy(scsu->windowUse, initialWindowUse, 8);
  162. break;
  163. }
  164. cnv->fromUChar32=0;
  165. }
  166. }
  167. static void U_CALLCONV
  168. _SCSUOpen(UConverter *cnv,
  169. UConverterLoadArgs *pArgs,
  170. UErrorCode *pErrorCode) {
  171. const char *locale=pArgs->locale;
  172. if(pArgs->onlyTestIsLoadable) {
  173. return;
  174. }
  175. cnv->extraInfo=uprv_malloc(sizeof(SCSUData));
  176. if(cnv->extraInfo!=nullptr) {
  177. if(locale!=nullptr && locale[0]=='j' && locale[1]=='a' && (locale[2]==0 || locale[2]=='_')) {
  178. ((SCSUData *)cnv->extraInfo)->locale=l_ja;
  179. } else {
  180. ((SCSUData *)cnv->extraInfo)->locale=lGeneric;
  181. }
  182. _SCSUReset(cnv, UCNV_RESET_BOTH);
  183. } else {
  184. *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
  185. }
  186. /* Set the substitution character U+fffd as a Unicode string. */
  187. cnv->subUChars[0]=0xfffd;
  188. cnv->subCharLen=-1;
  189. }
  190. static void U_CALLCONV
  191. _SCSUClose(UConverter *cnv) {
  192. if(cnv->extraInfo!=nullptr) {
  193. if(!cnv->isExtraLocal) {
  194. uprv_free(cnv->extraInfo);
  195. }
  196. cnv->extraInfo=nullptr;
  197. }
  198. }
  199. /* SCSU-to-Unicode conversion functions ------------------------------------- */
  200. static void U_CALLCONV
  201. _SCSUToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
  202. UErrorCode *pErrorCode) {
  203. UConverter *cnv;
  204. SCSUData *scsu;
  205. const uint8_t *source, *sourceLimit;
  206. char16_t *target;
  207. const char16_t *targetLimit;
  208. int32_t *offsets;
  209. UBool isSingleByteMode;
  210. uint8_t state, byteOne;
  211. int8_t quoteWindow, dynamicWindow;
  212. int32_t sourceIndex, nextSourceIndex;
  213. uint8_t b;
  214. /* set up the local pointers */
  215. cnv=pArgs->converter;
  216. scsu=(SCSUData *)cnv->extraInfo;
  217. source=(const uint8_t *)pArgs->source;
  218. sourceLimit=(const uint8_t *)pArgs->sourceLimit;
  219. target=pArgs->target;
  220. targetLimit=pArgs->targetLimit;
  221. offsets=pArgs->offsets;
  222. /* get the state machine state */
  223. isSingleByteMode=scsu->toUIsSingleByteMode;
  224. state=scsu->toUState;
  225. quoteWindow=scsu->toUQuoteWindow;
  226. dynamicWindow=scsu->toUDynamicWindow;
  227. byteOne=scsu->toUByteOne;
  228. /* sourceIndex=-1 if the current character began in the previous buffer */
  229. sourceIndex=state==readCommand ? 0 : -1;
  230. nextSourceIndex=0;
  231. /*
  232. * conversion "loop"
  233. *
  234. * For performance, this is not a normal C loop.
  235. * Instead, there are two code blocks for the two SCSU modes.
  236. * The function branches to either one, and a change of the mode is done with a goto to
  237. * the other branch.
  238. *
  239. * Each branch has two conventional loops:
  240. * - a fast-path loop for the most common codes in the mode
  241. * - a loop for all other codes in the mode
  242. * When the fast-path runs into a code that it cannot handle, its loop ends and it
  243. * runs into the following loop to handle the other codes.
  244. * The end of the input or output buffer is also handled by the slower loop.
  245. * The slow loop jumps (goto) to the fast-path loop again as soon as possible.
  246. *
  247. * The callback handling is done by returning with an error code.
  248. * The conversion framework actually calls the callback function.
  249. */
  250. if(isSingleByteMode) {
  251. /* fast path for single-byte mode */
  252. if(state==readCommand) {
  253. fastSingle:
  254. while(source<sourceLimit && target<targetLimit && (b=*source)>=0x20) {
  255. ++source;
  256. ++nextSourceIndex;
  257. if(b<=0x7f) {
  258. /* write US-ASCII graphic character or DEL */
  259. *target++=(char16_t)b;
  260. if(offsets!=nullptr) {
  261. *offsets++=sourceIndex;
  262. }
  263. } else {
  264. /* write from dynamic window */
  265. uint32_t c=scsu->toUDynamicOffsets[dynamicWindow]+(b&0x7f);
  266. if(c<=0xffff) {
  267. *target++=(char16_t)c;
  268. if(offsets!=nullptr) {
  269. *offsets++=sourceIndex;
  270. }
  271. } else {
  272. /* output surrogate pair */
  273. *target++=(char16_t)(0xd7c0+(c>>10));
  274. if(target<targetLimit) {
  275. *target++=(char16_t)(0xdc00|(c&0x3ff));
  276. if(offsets!=nullptr) {
  277. *offsets++=sourceIndex;
  278. *offsets++=sourceIndex;
  279. }
  280. } else {
  281. /* target overflow */
  282. if(offsets!=nullptr) {
  283. *offsets++=sourceIndex;
  284. }
  285. cnv->UCharErrorBuffer[0]=(char16_t)(0xdc00|(c&0x3ff));
  286. cnv->UCharErrorBufferLength=1;
  287. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  288. goto endloop;
  289. }
  290. }
  291. }
  292. sourceIndex=nextSourceIndex;
  293. }
  294. }
  295. /* normal state machine for single-byte mode, minus handling for what fastSingle covers */
  296. singleByteMode:
  297. while(source<sourceLimit) {
  298. if(target>=targetLimit) {
  299. /* target is full */
  300. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  301. break;
  302. }
  303. b=*source++;
  304. ++nextSourceIndex;
  305. switch(state) {
  306. case readCommand:
  307. /* redundant conditions are commented out */
  308. /* here: b<0x20 because otherwise we would be in fastSingle */
  309. if((1UL<<b)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
  310. /* CR/LF/TAB/NUL */
  311. *target++=(char16_t)b;
  312. if(offsets!=nullptr) {
  313. *offsets++=sourceIndex;
  314. }
  315. sourceIndex=nextSourceIndex;
  316. goto fastSingle;
  317. } else if(SC0<=b) {
  318. if(b<=SC7) {
  319. dynamicWindow=(int8_t)(b-SC0);
  320. sourceIndex=nextSourceIndex;
  321. goto fastSingle;
  322. } else /* if(SD0<=b && b<=SD7) */ {
  323. dynamicWindow=(int8_t)(b-SD0);
  324. state=defineOne;
  325. }
  326. } else if(/* SQ0<=b && */ b<=SQ7) {
  327. quoteWindow=(int8_t)(b-SQ0);
  328. state=quoteOne;
  329. } else if(b==SDX) {
  330. state=definePairOne;
  331. } else if(b==SQU) {
  332. state=quotePairOne;
  333. } else if(b==SCU) {
  334. sourceIndex=nextSourceIndex;
  335. isSingleByteMode=false;
  336. goto fastUnicode;
  337. } else /* Srs */ {
  338. /* callback(illegal) */
  339. *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  340. cnv->toUBytes[0]=b;
  341. cnv->toULength=1;
  342. goto endloop;
  343. }
  344. /* store the first byte of a multibyte sequence in toUBytes[] */
  345. cnv->toUBytes[0]=b;
  346. cnv->toULength=1;
  347. break;
  348. case quotePairOne:
  349. byteOne=b;
  350. cnv->toUBytes[1]=b;
  351. cnv->toULength=2;
  352. state=quotePairTwo;
  353. break;
  354. case quotePairTwo:
  355. *target++=(char16_t)((byteOne<<8)|b);
  356. if(offsets!=nullptr) {
  357. *offsets++=sourceIndex;
  358. }
  359. sourceIndex=nextSourceIndex;
  360. state=readCommand;
  361. goto fastSingle;
  362. case quoteOne:
  363. if(b<0x80) {
  364. /* all static offsets are in the BMP */
  365. *target++=(char16_t)(staticOffsets[quoteWindow]+b);
  366. if(offsets!=nullptr) {
  367. *offsets++=sourceIndex;
  368. }
  369. } else {
  370. /* write from dynamic window */
  371. uint32_t c=scsu->toUDynamicOffsets[quoteWindow]+(b&0x7f);
  372. if(c<=0xffff) {
  373. *target++=(char16_t)c;
  374. if(offsets!=nullptr) {
  375. *offsets++=sourceIndex;
  376. }
  377. } else {
  378. /* output surrogate pair */
  379. *target++=(char16_t)(0xd7c0+(c>>10));
  380. if(target<targetLimit) {
  381. *target++=(char16_t)(0xdc00|(c&0x3ff));
  382. if(offsets!=nullptr) {
  383. *offsets++=sourceIndex;
  384. *offsets++=sourceIndex;
  385. }
  386. } else {
  387. /* target overflow */
  388. if(offsets!=nullptr) {
  389. *offsets++=sourceIndex;
  390. }
  391. cnv->UCharErrorBuffer[0]=(char16_t)(0xdc00|(c&0x3ff));
  392. cnv->UCharErrorBufferLength=1;
  393. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  394. goto endloop;
  395. }
  396. }
  397. }
  398. sourceIndex=nextSourceIndex;
  399. state=readCommand;
  400. goto fastSingle;
  401. case definePairOne:
  402. dynamicWindow=(int8_t)((b>>5)&7);
  403. byteOne=(uint8_t)(b&0x1f);
  404. cnv->toUBytes[1]=b;
  405. cnv->toULength=2;
  406. state=definePairTwo;
  407. break;
  408. case definePairTwo:
  409. scsu->toUDynamicOffsets[dynamicWindow]=0x10000+(byteOne<<15UL | b<<7UL);
  410. sourceIndex=nextSourceIndex;
  411. state=readCommand;
  412. goto fastSingle;
  413. case defineOne:
  414. if(b==0) {
  415. /* callback(illegal): Reserved window offset value 0 */
  416. cnv->toUBytes[1]=b;
  417. cnv->toULength=2;
  418. goto endloop;
  419. } else if(b<gapThreshold) {
  420. scsu->toUDynamicOffsets[dynamicWindow]=b<<7UL;
  421. } else if((uint8_t)(b-gapThreshold)<(reservedStart-gapThreshold)) {
  422. scsu->toUDynamicOffsets[dynamicWindow]=(b<<7UL)+gapOffset;
  423. } else if(b>=fixedThreshold) {
  424. scsu->toUDynamicOffsets[dynamicWindow]=fixedOffsets[b-fixedThreshold];
  425. } else {
  426. /* callback(illegal): Reserved window offset value 0xa8..0xf8 */
  427. cnv->toUBytes[1]=b;
  428. cnv->toULength=2;
  429. goto endloop;
  430. }
  431. sourceIndex=nextSourceIndex;
  432. state=readCommand;
  433. goto fastSingle;
  434. }
  435. }
  436. } else {
  437. /* fast path for Unicode mode */
  438. if(state==readCommand) {
  439. fastUnicode:
  440. while(source+1<sourceLimit && target<targetLimit && (uint8_t)((b=*source)-UC0)>(Urs-UC0)) {
  441. *target++=(char16_t)((b<<8)|source[1]);
  442. if(offsets!=nullptr) {
  443. *offsets++=sourceIndex;
  444. }
  445. sourceIndex=nextSourceIndex;
  446. nextSourceIndex+=2;
  447. source+=2;
  448. }
  449. }
  450. /* normal state machine for Unicode mode */
  451. /* unicodeByteMode: */
  452. while(source<sourceLimit) {
  453. if(target>=targetLimit) {
  454. /* target is full */
  455. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  456. break;
  457. }
  458. b=*source++;
  459. ++nextSourceIndex;
  460. switch(state) {
  461. case readCommand:
  462. if((uint8_t)(b-UC0)>(Urs-UC0)) {
  463. byteOne=b;
  464. cnv->toUBytes[0]=b;
  465. cnv->toULength=1;
  466. state=quotePairTwo;
  467. } else if(/* UC0<=b && */ b<=UC7) {
  468. dynamicWindow=(int8_t)(b-UC0);
  469. sourceIndex=nextSourceIndex;
  470. isSingleByteMode=true;
  471. goto fastSingle;
  472. } else if(/* UD0<=b && */ b<=UD7) {
  473. dynamicWindow=(int8_t)(b-UD0);
  474. isSingleByteMode=true;
  475. cnv->toUBytes[0]=b;
  476. cnv->toULength=1;
  477. state=defineOne;
  478. goto singleByteMode;
  479. } else if(b==UDX) {
  480. isSingleByteMode=true;
  481. cnv->toUBytes[0]=b;
  482. cnv->toULength=1;
  483. state=definePairOne;
  484. goto singleByteMode;
  485. } else if(b==UQU) {
  486. cnv->toUBytes[0]=b;
  487. cnv->toULength=1;
  488. state=quotePairOne;
  489. } else /* Urs */ {
  490. /* callback(illegal) */
  491. *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  492. cnv->toUBytes[0]=b;
  493. cnv->toULength=1;
  494. goto endloop;
  495. }
  496. break;
  497. case quotePairOne:
  498. byteOne=b;
  499. cnv->toUBytes[1]=b;
  500. cnv->toULength=2;
  501. state=quotePairTwo;
  502. break;
  503. case quotePairTwo:
  504. *target++=(char16_t)((byteOne<<8)|b);
  505. if(offsets!=nullptr) {
  506. *offsets++=sourceIndex;
  507. }
  508. sourceIndex=nextSourceIndex;
  509. state=readCommand;
  510. goto fastUnicode;
  511. }
  512. }
  513. }
  514. endloop:
  515. /* set the converter state back into UConverter */
  516. if(U_FAILURE(*pErrorCode) && *pErrorCode!=U_BUFFER_OVERFLOW_ERROR) {
  517. /* reset to deal with the next character */
  518. state=readCommand;
  519. } else if(state==readCommand) {
  520. /* not in a multi-byte sequence, reset toULength */
  521. cnv->toULength=0;
  522. }
  523. scsu->toUIsSingleByteMode=isSingleByteMode;
  524. scsu->toUState=state;
  525. scsu->toUQuoteWindow=quoteWindow;
  526. scsu->toUDynamicWindow=dynamicWindow;
  527. scsu->toUByteOne=byteOne;
  528. /* write back the updated pointers */
  529. pArgs->source=(const char *)source;
  530. pArgs->target=target;
  531. pArgs->offsets=offsets;
  532. }
  533. /*
  534. * Identical to _SCSUToUnicodeWithOffsets but without offset handling.
  535. * If a change is made in the original function, then either
  536. * change this function the same way or
  537. * re-copy the original function and remove the variables
  538. * offsets, sourceIndex, and nextSourceIndex.
  539. */
  540. static void U_CALLCONV
  541. _SCSUToUnicode(UConverterToUnicodeArgs *pArgs,
  542. UErrorCode *pErrorCode) {
  543. UConverter *cnv;
  544. SCSUData *scsu;
  545. const uint8_t *source, *sourceLimit;
  546. char16_t *target;
  547. const char16_t *targetLimit;
  548. UBool isSingleByteMode;
  549. uint8_t state, byteOne;
  550. int8_t quoteWindow, dynamicWindow;
  551. uint8_t b;
  552. /* set up the local pointers */
  553. cnv=pArgs->converter;
  554. scsu=(SCSUData *)cnv->extraInfo;
  555. source=(const uint8_t *)pArgs->source;
  556. sourceLimit=(const uint8_t *)pArgs->sourceLimit;
  557. target=pArgs->target;
  558. targetLimit=pArgs->targetLimit;
  559. /* get the state machine state */
  560. isSingleByteMode=scsu->toUIsSingleByteMode;
  561. state=scsu->toUState;
  562. quoteWindow=scsu->toUQuoteWindow;
  563. dynamicWindow=scsu->toUDynamicWindow;
  564. byteOne=scsu->toUByteOne;
  565. /*
  566. * conversion "loop"
  567. *
  568. * For performance, this is not a normal C loop.
  569. * Instead, there are two code blocks for the two SCSU modes.
  570. * The function branches to either one, and a change of the mode is done with a goto to
  571. * the other branch.
  572. *
  573. * Each branch has two conventional loops:
  574. * - a fast-path loop for the most common codes in the mode
  575. * - a loop for all other codes in the mode
  576. * When the fast-path runs into a code that it cannot handle, its loop ends and it
  577. * runs into the following loop to handle the other codes.
  578. * The end of the input or output buffer is also handled by the slower loop.
  579. * The slow loop jumps (goto) to the fast-path loop again as soon as possible.
  580. *
  581. * The callback handling is done by returning with an error code.
  582. * The conversion framework actually calls the callback function.
  583. */
  584. if(isSingleByteMode) {
  585. /* fast path for single-byte mode */
  586. if(state==readCommand) {
  587. fastSingle:
  588. while(source<sourceLimit && target<targetLimit && (b=*source)>=0x20) {
  589. ++source;
  590. if(b<=0x7f) {
  591. /* write US-ASCII graphic character or DEL */
  592. *target++=(char16_t)b;
  593. } else {
  594. /* write from dynamic window */
  595. uint32_t c=scsu->toUDynamicOffsets[dynamicWindow]+(b&0x7f);
  596. if(c<=0xffff) {
  597. *target++=(char16_t)c;
  598. } else {
  599. /* output surrogate pair */
  600. *target++=(char16_t)(0xd7c0+(c>>10));
  601. if(target<targetLimit) {
  602. *target++=(char16_t)(0xdc00|(c&0x3ff));
  603. } else {
  604. /* target overflow */
  605. cnv->UCharErrorBuffer[0]=(char16_t)(0xdc00|(c&0x3ff));
  606. cnv->UCharErrorBufferLength=1;
  607. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  608. goto endloop;
  609. }
  610. }
  611. }
  612. }
  613. }
  614. /* normal state machine for single-byte mode, minus handling for what fastSingle covers */
  615. singleByteMode:
  616. while(source<sourceLimit) {
  617. if(target>=targetLimit) {
  618. /* target is full */
  619. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  620. break;
  621. }
  622. b=*source++;
  623. switch(state) {
  624. case readCommand:
  625. /* redundant conditions are commented out */
  626. /* here: b<0x20 because otherwise we would be in fastSingle */
  627. if((1UL<<b)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
  628. /* CR/LF/TAB/NUL */
  629. *target++=(char16_t)b;
  630. goto fastSingle;
  631. } else if(SC0<=b) {
  632. if(b<=SC7) {
  633. dynamicWindow=(int8_t)(b-SC0);
  634. goto fastSingle;
  635. } else /* if(SD0<=b && b<=SD7) */ {
  636. dynamicWindow=(int8_t)(b-SD0);
  637. state=defineOne;
  638. }
  639. } else if(/* SQ0<=b && */ b<=SQ7) {
  640. quoteWindow=(int8_t)(b-SQ0);
  641. state=quoteOne;
  642. } else if(b==SDX) {
  643. state=definePairOne;
  644. } else if(b==SQU) {
  645. state=quotePairOne;
  646. } else if(b==SCU) {
  647. isSingleByteMode=false;
  648. goto fastUnicode;
  649. } else /* Srs */ {
  650. /* callback(illegal) */
  651. *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  652. cnv->toUBytes[0]=b;
  653. cnv->toULength=1;
  654. goto endloop;
  655. }
  656. /* store the first byte of a multibyte sequence in toUBytes[] */
  657. cnv->toUBytes[0]=b;
  658. cnv->toULength=1;
  659. break;
  660. case quotePairOne:
  661. byteOne=b;
  662. cnv->toUBytes[1]=b;
  663. cnv->toULength=2;
  664. state=quotePairTwo;
  665. break;
  666. case quotePairTwo:
  667. *target++=(char16_t)((byteOne<<8)|b);
  668. state=readCommand;
  669. goto fastSingle;
  670. case quoteOne:
  671. if(b<0x80) {
  672. /* all static offsets are in the BMP */
  673. *target++=(char16_t)(staticOffsets[quoteWindow]+b);
  674. } else {
  675. /* write from dynamic window */
  676. uint32_t c=scsu->toUDynamicOffsets[quoteWindow]+(b&0x7f);
  677. if(c<=0xffff) {
  678. *target++=(char16_t)c;
  679. } else {
  680. /* output surrogate pair */
  681. *target++=(char16_t)(0xd7c0+(c>>10));
  682. if(target<targetLimit) {
  683. *target++=(char16_t)(0xdc00|(c&0x3ff));
  684. } else {
  685. /* target overflow */
  686. cnv->UCharErrorBuffer[0]=(char16_t)(0xdc00|(c&0x3ff));
  687. cnv->UCharErrorBufferLength=1;
  688. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  689. goto endloop;
  690. }
  691. }
  692. }
  693. state=readCommand;
  694. goto fastSingle;
  695. case definePairOne:
  696. dynamicWindow=(int8_t)((b>>5)&7);
  697. byteOne=(uint8_t)(b&0x1f);
  698. cnv->toUBytes[1]=b;
  699. cnv->toULength=2;
  700. state=definePairTwo;
  701. break;
  702. case definePairTwo:
  703. scsu->toUDynamicOffsets[dynamicWindow]=0x10000+(byteOne<<15UL | b<<7UL);
  704. state=readCommand;
  705. goto fastSingle;
  706. case defineOne:
  707. if(b==0) {
  708. /* callback(illegal): Reserved window offset value 0 */
  709. cnv->toUBytes[1]=b;
  710. cnv->toULength=2;
  711. goto endloop;
  712. } else if(b<gapThreshold) {
  713. scsu->toUDynamicOffsets[dynamicWindow]=b<<7UL;
  714. } else if((uint8_t)(b-gapThreshold)<(reservedStart-gapThreshold)) {
  715. scsu->toUDynamicOffsets[dynamicWindow]=(b<<7UL)+gapOffset;
  716. } else if(b>=fixedThreshold) {
  717. scsu->toUDynamicOffsets[dynamicWindow]=fixedOffsets[b-fixedThreshold];
  718. } else {
  719. /* callback(illegal): Reserved window offset value 0xa8..0xf8 */
  720. cnv->toUBytes[1]=b;
  721. cnv->toULength=2;
  722. goto endloop;
  723. }
  724. state=readCommand;
  725. goto fastSingle;
  726. }
  727. }
  728. } else {
  729. /* fast path for Unicode mode */
  730. if(state==readCommand) {
  731. fastUnicode:
  732. while(source+1<sourceLimit && target<targetLimit && (uint8_t)((b=*source)-UC0)>(Urs-UC0)) {
  733. *target++=(char16_t)((b<<8)|source[1]);
  734. source+=2;
  735. }
  736. }
  737. /* normal state machine for Unicode mode */
  738. /* unicodeByteMode: */
  739. while(source<sourceLimit) {
  740. if(target>=targetLimit) {
  741. /* target is full */
  742. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  743. break;
  744. }
  745. b=*source++;
  746. switch(state) {
  747. case readCommand:
  748. if((uint8_t)(b-UC0)>(Urs-UC0)) {
  749. byteOne=b;
  750. cnv->toUBytes[0]=b;
  751. cnv->toULength=1;
  752. state=quotePairTwo;
  753. } else if(/* UC0<=b && */ b<=UC7) {
  754. dynamicWindow=(int8_t)(b-UC0);
  755. isSingleByteMode=true;
  756. goto fastSingle;
  757. } else if(/* UD0<=b && */ b<=UD7) {
  758. dynamicWindow=(int8_t)(b-UD0);
  759. isSingleByteMode=true;
  760. cnv->toUBytes[0]=b;
  761. cnv->toULength=1;
  762. state=defineOne;
  763. goto singleByteMode;
  764. } else if(b==UDX) {
  765. isSingleByteMode=true;
  766. cnv->toUBytes[0]=b;
  767. cnv->toULength=1;
  768. state=definePairOne;
  769. goto singleByteMode;
  770. } else if(b==UQU) {
  771. cnv->toUBytes[0]=b;
  772. cnv->toULength=1;
  773. state=quotePairOne;
  774. } else /* Urs */ {
  775. /* callback(illegal) */
  776. *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  777. cnv->toUBytes[0]=b;
  778. cnv->toULength=1;
  779. goto endloop;
  780. }
  781. break;
  782. case quotePairOne:
  783. byteOne=b;
  784. cnv->toUBytes[1]=b;
  785. cnv->toULength=2;
  786. state=quotePairTwo;
  787. break;
  788. case quotePairTwo:
  789. *target++=(char16_t)((byteOne<<8)|b);
  790. state=readCommand;
  791. goto fastUnicode;
  792. }
  793. }
  794. }
  795. endloop:
  796. /* set the converter state back into UConverter */
  797. if(U_FAILURE(*pErrorCode) && *pErrorCode!=U_BUFFER_OVERFLOW_ERROR) {
  798. /* reset to deal with the next character */
  799. state=readCommand;
  800. } else if(state==readCommand) {
  801. /* not in a multi-byte sequence, reset toULength */
  802. cnv->toULength=0;
  803. }
  804. scsu->toUIsSingleByteMode=isSingleByteMode;
  805. scsu->toUState=state;
  806. scsu->toUQuoteWindow=quoteWindow;
  807. scsu->toUDynamicWindow=dynamicWindow;
  808. scsu->toUByteOne=byteOne;
  809. /* write back the updated pointers */
  810. pArgs->source=(const char *)source;
  811. pArgs->target=target;
  812. }
  813. U_CDECL_END
  814. /* SCSU-from-Unicode conversion functions ----------------------------------- */
  815. /*
  816. * This SCSU Encoder is fairly simple but uses all SCSU commands to achieve
  817. * reasonable results. The lookahead is minimal.
  818. * Many cases are simple:
  819. * A character fits directly into the current mode, a dynamic or static window,
  820. * or is not compressible. These cases are tested first.
  821. * Real compression heuristics are applied to the rest, in code branches for
  822. * single/Unicode mode and BMP/supplementary code points.
  823. * The heuristics used here are extremely simple.
  824. */
  825. /* get the number of the window that this character is in, or -1 */
  826. static int8_t
  827. getWindow(const uint32_t offsets[8], uint32_t c) {
  828. int i;
  829. for(i=0; i<8; ++i) {
  830. if (c - offsets[i] <= 0x7f) {
  831. return static_cast<int8_t>(i);
  832. }
  833. }
  834. return -1;
  835. }
  836. /* is the character in the dynamic window starting at the offset, or in the direct-encoded range? */
  837. static UBool
  838. isInOffsetWindowOrDirect(uint32_t offset, uint32_t c) {
  839. return c<=offset+0x7f &&
  840. (c>=offset || (c<=0x7f &&
  841. (c>=0x20 || (1UL<<c)&0x2601)));
  842. /* binary 0010 0110 0000 0001,
  843. check for b==0xd || b==0xa || b==9 || b==0 */
  844. }
  845. /*
  846. * getNextDynamicWindow returns the next dynamic window to be redefined
  847. */
  848. static int8_t
  849. getNextDynamicWindow(SCSUData *scsu) {
  850. int8_t window=scsu->windowUse[scsu->nextWindowUseIndex];
  851. if(++scsu->nextWindowUseIndex==8) {
  852. scsu->nextWindowUseIndex=0;
  853. }
  854. return window;
  855. }
  856. /*
  857. * useDynamicWindow() adjusts
  858. * windowUse[] and nextWindowUseIndex for the algorithm to choose
  859. * the next dynamic window to be defined;
  860. * a subclass may override it and provide its own algorithm.
  861. */
  862. static void
  863. useDynamicWindow(SCSUData *scsu, int8_t window) {
  864. /*
  865. * move the existing window, which just became the most recently used one,
  866. * up in windowUse[] to nextWindowUseIndex-1
  867. */
  868. /* first, find the index of the window - backwards to favor the more recently used windows */
  869. int i, j;
  870. i=scsu->nextWindowUseIndex;
  871. do {
  872. if(--i<0) {
  873. i=7;
  874. }
  875. } while(scsu->windowUse[i]!=window);
  876. /* now copy each windowUse[i+1] to [i] */
  877. j=i+1;
  878. if(j==8) {
  879. j=0;
  880. }
  881. while(j!=scsu->nextWindowUseIndex) {
  882. scsu->windowUse[i]=scsu->windowUse[j];
  883. i=j;
  884. if(++j==8) { j=0; }
  885. }
  886. /* finally, set the window into the most recently used index */
  887. scsu->windowUse[i]=window;
  888. }
  889. /*
  890. * calculate the offset and the code for a dynamic window that contains the character
  891. * takes fixed offsets into account
  892. * the offset of the window is stored in the offset variable,
  893. * the code is returned
  894. *
  895. * return offset code: -1 none <=0xff code for SDn/UDn else code for SDX/UDX, subtract 0x200 to get the true code
  896. */
  897. static int
  898. getDynamicOffset(uint32_t c, uint32_t *pOffset) {
  899. int i;
  900. for(i=0; i<7; ++i) {
  901. if (c - fixedOffsets[i] <= 0x7f) {
  902. *pOffset=fixedOffsets[i];
  903. return 0xf9+i;
  904. }
  905. }
  906. if(c<0x80) {
  907. /* No dynamic window for US-ASCII. */
  908. return -1;
  909. } else if(c<0x3400 ||
  910. c - 0x10000 < 0x14000 - 0x10000 ||
  911. c - 0x1d000 <= 0x1ffff - 0x1d000
  912. ) {
  913. /* This character is in a code range for a "small", i.e., reasonably windowable, script. */
  914. *pOffset=c&0x7fffff80;
  915. return static_cast<int>(c >> 7);
  916. } else if(0xe000<=c && c!=0xfeff && c<0xfff0) {
  917. /* For these characters we need to take the gapOffset into account. */
  918. *pOffset=c&0x7fffff80;
  919. return static_cast<int>((c - gapOffset) >> 7);
  920. } else {
  921. return -1;
  922. }
  923. }
  924. U_CDECL_BEGIN
  925. /*
  926. * Idea for compression:
  927. * - save SCSUData and other state before really starting work
  928. * - at endloop, see if compression could be better with just unicode mode
  929. * - don't do this if a callback has been called
  930. * - if unicode mode would be smaller, then override the results with it - may need SCU at the beginning
  931. * - different buffer handling!
  932. *
  933. * Drawback or need for corrective handling:
  934. * it is desirable to encode U+feff as SQU fe ff for the SCSU signature, and
  935. * it is desirable to start a document in US-ASCII/Latin-1 for as long as possible
  936. * not only for compression but also for HTML/XML documents with following charset/encoding announcers.
  937. *
  938. * How to achieve both?
  939. * - Only replace the result after an SDX or SCU?
  940. */
  941. static void U_CALLCONV
  942. _SCSUFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
  943. UErrorCode *pErrorCode) {
  944. UConverter *cnv;
  945. SCSUData *scsu;
  946. const char16_t *source, *sourceLimit;
  947. uint8_t *target;
  948. int32_t targetCapacity;
  949. int32_t *offsets;
  950. UBool isSingleByteMode;
  951. uint8_t dynamicWindow;
  952. uint32_t currentOffset;
  953. uint32_t c, delta;
  954. int32_t sourceIndex, nextSourceIndex;
  955. int32_t length;
  956. /* variables for compression heuristics */
  957. uint32_t offset;
  958. char16_t lead, trail;
  959. int code;
  960. int8_t window;
  961. /* set up the local pointers */
  962. cnv=pArgs->converter;
  963. scsu=(SCSUData *)cnv->extraInfo;
  964. /* set up the local pointers */
  965. source=pArgs->source;
  966. sourceLimit=pArgs->sourceLimit;
  967. target=(uint8_t *)pArgs->target;
  968. targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
  969. offsets=pArgs->offsets;
  970. /* get the state machine state */
  971. isSingleByteMode=scsu->fromUIsSingleByteMode;
  972. dynamicWindow=scsu->fromUDynamicWindow;
  973. currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
  974. c=cnv->fromUChar32;
  975. /* sourceIndex=-1 if the current character began in the previous buffer */
  976. sourceIndex= c==0 ? 0 : -1;
  977. nextSourceIndex=0;
  978. /* similar conversion "loop" as in toUnicode */
  979. loop:
  980. if(isSingleByteMode) {
  981. if(c!=0 && targetCapacity>0) {
  982. goto getTrailSingle;
  983. }
  984. /* state machine for single-byte mode */
  985. /* singleByteMode: */
  986. while(source<sourceLimit) {
  987. if(targetCapacity<=0) {
  988. /* target is full */
  989. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  990. break;
  991. }
  992. c=*source++;
  993. ++nextSourceIndex;
  994. if((c-0x20)<=0x5f) {
  995. /* pass US-ASCII graphic character through */
  996. *target++=(uint8_t)c;
  997. if(offsets!=nullptr) {
  998. *offsets++=sourceIndex;
  999. }
  1000. --targetCapacity;
  1001. } else if(c<0x20) {
  1002. if((1UL<<c)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
  1003. /* CR/LF/TAB/NUL */
  1004. *target++=(uint8_t)c;
  1005. if(offsets!=nullptr) {
  1006. *offsets++=sourceIndex;
  1007. }
  1008. --targetCapacity;
  1009. } else {
  1010. /* quote C0 control character */
  1011. c|=SQ0<<8;
  1012. length=2;
  1013. goto outputBytes;
  1014. }
  1015. } else if((delta=c-currentOffset)<=0x7f) {
  1016. /* use the current dynamic window */
  1017. *target++=(uint8_t)(delta|0x80);
  1018. if(offsets!=nullptr) {
  1019. *offsets++=sourceIndex;
  1020. }
  1021. --targetCapacity;
  1022. } else if(U16_IS_SURROGATE(c)) {
  1023. if(U16_IS_SURROGATE_LEAD(c)) {
  1024. getTrailSingle:
  1025. lead=(char16_t)c;
  1026. if(source<sourceLimit) {
  1027. /* test the following code unit */
  1028. trail=*source;
  1029. if(U16_IS_TRAIL(trail)) {
  1030. ++source;
  1031. ++nextSourceIndex;
  1032. c=U16_GET_SUPPLEMENTARY(c, trail);
  1033. /* convert this surrogate code point */
  1034. /* exit this condition tree */
  1035. } else {
  1036. /* this is an unmatched lead code unit (1st surrogate) */
  1037. /* callback(illegal) */
  1038. *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  1039. goto endloop;
  1040. }
  1041. } else {
  1042. /* no more input */
  1043. break;
  1044. }
  1045. } else {
  1046. /* this is an unmatched trail code unit (2nd surrogate) */
  1047. /* callback(illegal) */
  1048. *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  1049. goto endloop;
  1050. }
  1051. /* compress supplementary character U+10000..U+10ffff */
  1052. if((delta=c-currentOffset)<=0x7f) {
  1053. /* use the current dynamic window */
  1054. *target++=(uint8_t)(delta|0x80);
  1055. if(offsets!=nullptr) {
  1056. *offsets++=sourceIndex;
  1057. }
  1058. --targetCapacity;
  1059. } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
  1060. /* there is a dynamic window that contains this character, change to it */
  1061. dynamicWindow=window;
  1062. currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
  1063. useDynamicWindow(scsu, dynamicWindow);
  1064. c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
  1065. length=2;
  1066. goto outputBytes;
  1067. } else if((code=getDynamicOffset(c, &offset))>=0) {
  1068. /* might check if there are more characters in this window to come */
  1069. /* define an extended window with this character */
  1070. code-=0x200;
  1071. dynamicWindow=getNextDynamicWindow(scsu);
  1072. currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
  1073. useDynamicWindow(scsu, dynamicWindow);
  1074. c=((uint32_t)SDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
  1075. length=4;
  1076. goto outputBytes;
  1077. } else {
  1078. /* change to Unicode mode and output this (lead, trail) pair */
  1079. isSingleByteMode=false;
  1080. *target++=(uint8_t)SCU;
  1081. if(offsets!=nullptr) {
  1082. *offsets++=sourceIndex;
  1083. }
  1084. --targetCapacity;
  1085. c=((uint32_t)lead<<16)|trail;
  1086. length=4;
  1087. goto outputBytes;
  1088. }
  1089. } else if(c<0xa0) {
  1090. /* quote C1 control character */
  1091. c=(c&0x7f)|(SQ0+1)<<8; /* SQ0+1==SQ1 */
  1092. length=2;
  1093. goto outputBytes;
  1094. } else if(c==0xfeff || c>=0xfff0) {
  1095. /* quote signature character=byte order mark and specials */
  1096. c|=SQU<<16;
  1097. length=3;
  1098. goto outputBytes;
  1099. } else {
  1100. /* compress all other BMP characters */
  1101. if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
  1102. /* there is a window defined that contains this character - switch to it or quote from it? */
  1103. if(source>=sourceLimit || isInOffsetWindowOrDirect(scsu->fromUDynamicOffsets[window], *source)) {
  1104. /* change to dynamic window */
  1105. dynamicWindow=window;
  1106. currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
  1107. useDynamicWindow(scsu, dynamicWindow);
  1108. c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
  1109. length=2;
  1110. goto outputBytes;
  1111. } else {
  1112. /* quote from dynamic window */
  1113. c=((uint32_t)(SQ0+window)<<8)|(c-scsu->fromUDynamicOffsets[window])|0x80;
  1114. length=2;
  1115. goto outputBytes;
  1116. }
  1117. } else if((window=getWindow(staticOffsets, c))>=0) {
  1118. /* quote from static window */
  1119. c=((uint32_t)(SQ0+window)<<8)|(c-staticOffsets[window]);
  1120. length=2;
  1121. goto outputBytes;
  1122. } else if((code=getDynamicOffset(c, &offset))>=0) {
  1123. /* define a dynamic window with this character */
  1124. dynamicWindow=getNextDynamicWindow(scsu);
  1125. currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
  1126. useDynamicWindow(scsu, dynamicWindow);
  1127. c=((uint32_t)(SD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
  1128. length=3;
  1129. goto outputBytes;
  1130. } else if ((c - 0x3400) < (0xd800 - 0x3400) &&
  1131. (source >= sourceLimit || (uint32_t)(*source - 0x3400) < (0xd800 - 0x3400))
  1132. ) {
  1133. /*
  1134. * this character is not compressible (a BMP ideograph or similar);
  1135. * switch to Unicode mode if this is the last character in the block
  1136. * or there is at least one more ideograph following immediately
  1137. */
  1138. isSingleByteMode=false;
  1139. c|=SCU<<16;
  1140. length=3;
  1141. goto outputBytes;
  1142. } else {
  1143. /* quote Unicode */
  1144. c|=SQU<<16;
  1145. length=3;
  1146. goto outputBytes;
  1147. }
  1148. }
  1149. /* normal end of conversion: prepare for a new character */
  1150. c=0;
  1151. sourceIndex=nextSourceIndex;
  1152. }
  1153. } else {
  1154. if(c!=0 && targetCapacity>0) {
  1155. goto getTrailUnicode;
  1156. }
  1157. /* state machine for Unicode mode */
  1158. /* unicodeByteMode: */
  1159. while(source<sourceLimit) {
  1160. if(targetCapacity<=0) {
  1161. /* target is full */
  1162. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  1163. break;
  1164. }
  1165. c=*source++;
  1166. ++nextSourceIndex;
  1167. if ((c - 0x3400) < (0xd800 - 0x3400)) {
  1168. /* not compressible, write character directly */
  1169. if(targetCapacity>=2) {
  1170. *target++=(uint8_t)(c>>8);
  1171. *target++=(uint8_t)c;
  1172. if(offsets!=nullptr) {
  1173. *offsets++=sourceIndex;
  1174. *offsets++=sourceIndex;
  1175. }
  1176. targetCapacity-=2;
  1177. } else {
  1178. length=2;
  1179. goto outputBytes;
  1180. }
  1181. } else if (c - 0x3400 >= 0xf300 - 0x3400 /* c<0x3400 || c>=0xf300 */) {
  1182. /* compress BMP character if the following one is not an uncompressible ideograph */
  1183. if(!(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400))) {
  1184. if (c - 0x30 < 10 || c - 0x61 < 26 || c - 0x41 < 26) {
  1185. /* ASCII digit or letter */
  1186. isSingleByteMode=true;
  1187. c|=((uint32_t)(UC0+dynamicWindow)<<8)|c;
  1188. length=2;
  1189. goto outputBytes;
  1190. } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
  1191. /* there is a dynamic window that contains this character, change to it */
  1192. isSingleByteMode=true;
  1193. dynamicWindow=window;
  1194. currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
  1195. useDynamicWindow(scsu, dynamicWindow);
  1196. c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
  1197. length=2;
  1198. goto outputBytes;
  1199. } else if((code=getDynamicOffset(c, &offset))>=0) {
  1200. /* define a dynamic window with this character */
  1201. isSingleByteMode=true;
  1202. dynamicWindow=getNextDynamicWindow(scsu);
  1203. currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
  1204. useDynamicWindow(scsu, dynamicWindow);
  1205. c=((uint32_t)(UD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
  1206. length=3;
  1207. goto outputBytes;
  1208. }
  1209. }
  1210. /* don't know how to compress this character, just write it directly */
  1211. length=2;
  1212. goto outputBytes;
  1213. } else if(c<0xe000) {
  1214. /* c is a surrogate */
  1215. if(U16_IS_SURROGATE_LEAD(c)) {
  1216. getTrailUnicode:
  1217. lead=(char16_t)c;
  1218. if(source<sourceLimit) {
  1219. /* test the following code unit */
  1220. trail=*source;
  1221. if(U16_IS_TRAIL(trail)) {
  1222. ++source;
  1223. ++nextSourceIndex;
  1224. c=U16_GET_SUPPLEMENTARY(c, trail);
  1225. /* convert this surrogate code point */
  1226. /* exit this condition tree */
  1227. } else {
  1228. /* this is an unmatched lead code unit (1st surrogate) */
  1229. /* callback(illegal) */
  1230. *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  1231. goto endloop;
  1232. }
  1233. } else {
  1234. /* no more input */
  1235. break;
  1236. }
  1237. } else {
  1238. /* this is an unmatched trail code unit (2nd surrogate) */
  1239. /* callback(illegal) */
  1240. *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  1241. goto endloop;
  1242. }
  1243. /* compress supplementary character */
  1244. if( (window=getWindow(scsu->fromUDynamicOffsets, c))>=0 &&
  1245. !(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400))
  1246. ) {
  1247. /*
  1248. * there is a dynamic window that contains this character and
  1249. * the following character is not uncompressible,
  1250. * change to the window
  1251. */
  1252. isSingleByteMode=true;
  1253. dynamicWindow=window;
  1254. currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
  1255. useDynamicWindow(scsu, dynamicWindow);
  1256. c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
  1257. length=2;
  1258. goto outputBytes;
  1259. } else if(source<sourceLimit && lead==*source && /* too lazy to check trail in same window as source[1] */
  1260. (code=getDynamicOffset(c, &offset))>=0
  1261. ) {
  1262. /* two supplementary characters in (probably) the same window - define an extended one */
  1263. isSingleByteMode=true;
  1264. code-=0x200;
  1265. dynamicWindow=getNextDynamicWindow(scsu);
  1266. currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
  1267. useDynamicWindow(scsu, dynamicWindow);
  1268. c=((uint32_t)UDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
  1269. length=4;
  1270. goto outputBytes;
  1271. } else {
  1272. /* don't know how to compress this character, just write it directly */
  1273. c=((uint32_t)lead<<16)|trail;
  1274. length=4;
  1275. goto outputBytes;
  1276. }
  1277. } else /* 0xe000<=c<0xf300 */ {
  1278. /* quote to avoid SCSU tags */
  1279. c|=UQU<<16;
  1280. length=3;
  1281. goto outputBytes;
  1282. }
  1283. /* normal end of conversion: prepare for a new character */
  1284. c=0;
  1285. sourceIndex=nextSourceIndex;
  1286. }
  1287. }
  1288. endloop:
  1289. /* set the converter state back into UConverter */
  1290. scsu->fromUIsSingleByteMode=isSingleByteMode;
  1291. scsu->fromUDynamicWindow=dynamicWindow;
  1292. cnv->fromUChar32=c;
  1293. /* write back the updated pointers */
  1294. pArgs->source=source;
  1295. pArgs->target=(char *)target;
  1296. pArgs->offsets=offsets;
  1297. return;
  1298. outputBytes:
  1299. /* write the output character bytes from c and length [code copied from ucnvmbcs.c] */
  1300. /* from the first if in the loop we know that targetCapacity>0 */
  1301. if(length<=targetCapacity) {
  1302. if(offsets==nullptr) {
  1303. switch(length) {
  1304. /* each branch falls through to the next one */
  1305. case 4:
  1306. *target++=(uint8_t)(c>>24);
  1307. U_FALLTHROUGH;
  1308. case 3:
  1309. *target++=(uint8_t)(c>>16);
  1310. U_FALLTHROUGH;
  1311. case 2:
  1312. *target++=(uint8_t)(c>>8);
  1313. U_FALLTHROUGH;
  1314. case 1:
  1315. *target++=(uint8_t)c;
  1316. U_FALLTHROUGH;
  1317. default:
  1318. /* will never occur */
  1319. break;
  1320. }
  1321. } else {
  1322. switch(length) {
  1323. /* each branch falls through to the next one */
  1324. case 4:
  1325. *target++=(uint8_t)(c>>24);
  1326. *offsets++=sourceIndex;
  1327. U_FALLTHROUGH;
  1328. case 3:
  1329. *target++=(uint8_t)(c>>16);
  1330. *offsets++=sourceIndex;
  1331. U_FALLTHROUGH;
  1332. case 2:
  1333. *target++=(uint8_t)(c>>8);
  1334. *offsets++=sourceIndex;
  1335. U_FALLTHROUGH;
  1336. case 1:
  1337. *target++=(uint8_t)c;
  1338. *offsets++=sourceIndex;
  1339. U_FALLTHROUGH;
  1340. default:
  1341. /* will never occur */
  1342. break;
  1343. }
  1344. }
  1345. targetCapacity-=length;
  1346. /* normal end of conversion: prepare for a new character */
  1347. c=0;
  1348. sourceIndex=nextSourceIndex;
  1349. goto loop;
  1350. } else {
  1351. uint8_t *p;
  1352. /*
  1353. * We actually do this backwards here:
  1354. * In order to save an intermediate variable, we output
  1355. * first to the overflow buffer what does not fit into the
  1356. * regular target.
  1357. */
  1358. /* we know that 0<=targetCapacity<length<=4 */
  1359. /* targetCapacity==0 when SCU+supplementary where SCU used up targetCapacity==1 */
  1360. length-=targetCapacity;
  1361. p=(uint8_t *)cnv->charErrorBuffer;
  1362. switch(length) {
  1363. /* each branch falls through to the next one */
  1364. case 4:
  1365. *p++=(uint8_t)(c>>24);
  1366. U_FALLTHROUGH;
  1367. case 3:
  1368. *p++=(uint8_t)(c>>16);
  1369. U_FALLTHROUGH;
  1370. case 2:
  1371. *p++=(uint8_t)(c>>8);
  1372. U_FALLTHROUGH;
  1373. case 1:
  1374. *p=(uint8_t)c;
  1375. U_FALLTHROUGH;
  1376. default:
  1377. /* will never occur */
  1378. break;
  1379. }
  1380. cnv->charErrorBufferLength=(int8_t)length;
  1381. /* now output what fits into the regular target */
  1382. c>>=8*length; /* length was reduced by targetCapacity */
  1383. switch(targetCapacity) {
  1384. /* each branch falls through to the next one */
  1385. case 3:
  1386. *target++=(uint8_t)(c>>16);
  1387. if(offsets!=nullptr) {
  1388. *offsets++=sourceIndex;
  1389. }
  1390. U_FALLTHROUGH;
  1391. case 2:
  1392. *target++=(uint8_t)(c>>8);
  1393. if(offsets!=nullptr) {
  1394. *offsets++=sourceIndex;
  1395. }
  1396. U_FALLTHROUGH;
  1397. case 1:
  1398. *target++=(uint8_t)c;
  1399. if(offsets!=nullptr) {
  1400. *offsets++=sourceIndex;
  1401. }
  1402. U_FALLTHROUGH;
  1403. default:
  1404. break;
  1405. }
  1406. /* target overflow */
  1407. targetCapacity=0;
  1408. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  1409. c=0;
  1410. goto endloop;
  1411. }
  1412. }
  1413. /*
  1414. * Identical to _SCSUFromUnicodeWithOffsets but without offset handling.
  1415. * If a change is made in the original function, then either
  1416. * change this function the same way or
  1417. * re-copy the original function and remove the variables
  1418. * offsets, sourceIndex, and nextSourceIndex.
  1419. */
  1420. static void U_CALLCONV
  1421. _SCSUFromUnicode(UConverterFromUnicodeArgs *pArgs,
  1422. UErrorCode *pErrorCode) {
  1423. UConverter *cnv;
  1424. SCSUData *scsu;
  1425. const char16_t *source, *sourceLimit;
  1426. uint8_t *target;
  1427. int32_t targetCapacity;
  1428. UBool isSingleByteMode;
  1429. uint8_t dynamicWindow;
  1430. uint32_t currentOffset;
  1431. uint32_t c, delta;
  1432. int32_t length;
  1433. /* variables for compression heuristics */
  1434. uint32_t offset;
  1435. char16_t lead, trail;
  1436. int code;
  1437. int8_t window;
  1438. /* set up the local pointers */
  1439. cnv=pArgs->converter;
  1440. scsu=(SCSUData *)cnv->extraInfo;
  1441. /* set up the local pointers */
  1442. source=pArgs->source;
  1443. sourceLimit=pArgs->sourceLimit;
  1444. target=(uint8_t *)pArgs->target;
  1445. targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
  1446. /* get the state machine state */
  1447. isSingleByteMode=scsu->fromUIsSingleByteMode;
  1448. dynamicWindow=scsu->fromUDynamicWindow;
  1449. currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
  1450. c=cnv->fromUChar32;
  1451. /* similar conversion "loop" as in toUnicode */
  1452. loop:
  1453. if(isSingleByteMode) {
  1454. if(c!=0 && targetCapacity>0) {
  1455. goto getTrailSingle;
  1456. }
  1457. /* state machine for single-byte mode */
  1458. /* singleByteMode: */
  1459. while(source<sourceLimit) {
  1460. if(targetCapacity<=0) {
  1461. /* target is full */
  1462. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  1463. break;
  1464. }
  1465. c=*source++;
  1466. if((c-0x20)<=0x5f) {
  1467. /* pass US-ASCII graphic character through */
  1468. *target++=(uint8_t)c;
  1469. --targetCapacity;
  1470. } else if(c<0x20) {
  1471. if((1UL<<c)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
  1472. /* CR/LF/TAB/NUL */
  1473. *target++=(uint8_t)c;
  1474. --targetCapacity;
  1475. } else {
  1476. /* quote C0 control character */
  1477. c|=SQ0<<8;
  1478. length=2;
  1479. goto outputBytes;
  1480. }
  1481. } else if((delta=c-currentOffset)<=0x7f) {
  1482. /* use the current dynamic window */
  1483. *target++=(uint8_t)(delta|0x80);
  1484. --targetCapacity;
  1485. } else if(U16_IS_SURROGATE(c)) {
  1486. if(U16_IS_SURROGATE_LEAD(c)) {
  1487. getTrailSingle:
  1488. lead=(char16_t)c;
  1489. if(source<sourceLimit) {
  1490. /* test the following code unit */
  1491. trail=*source;
  1492. if(U16_IS_TRAIL(trail)) {
  1493. ++source;
  1494. c=U16_GET_SUPPLEMENTARY(c, trail);
  1495. /* convert this surrogate code point */
  1496. /* exit this condition tree */
  1497. } else {
  1498. /* this is an unmatched lead code unit (1st surrogate) */
  1499. /* callback(illegal) */
  1500. *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  1501. goto endloop;
  1502. }
  1503. } else {
  1504. /* no more input */
  1505. break;
  1506. }
  1507. } else {
  1508. /* this is an unmatched trail code unit (2nd surrogate) */
  1509. /* callback(illegal) */
  1510. *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  1511. goto endloop;
  1512. }
  1513. /* compress supplementary character U+10000..U+10ffff */
  1514. if((delta=c-currentOffset)<=0x7f) {
  1515. /* use the current dynamic window */
  1516. *target++=(uint8_t)(delta|0x80);
  1517. --targetCapacity;
  1518. } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
  1519. /* there is a dynamic window that contains this character, change to it */
  1520. dynamicWindow=window;
  1521. currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
  1522. useDynamicWindow(scsu, dynamicWindow);
  1523. c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
  1524. length=2;
  1525. goto outputBytes;
  1526. } else if((code=getDynamicOffset(c, &offset))>=0) {
  1527. /* might check if there are more characters in this window to come */
  1528. /* define an extended window with this character */
  1529. code-=0x200;
  1530. dynamicWindow=getNextDynamicWindow(scsu);
  1531. currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
  1532. useDynamicWindow(scsu, dynamicWindow);
  1533. c=((uint32_t)SDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
  1534. length=4;
  1535. goto outputBytes;
  1536. } else {
  1537. /* change to Unicode mode and output this (lead, trail) pair */
  1538. isSingleByteMode=false;
  1539. *target++=(uint8_t)SCU;
  1540. --targetCapacity;
  1541. c=((uint32_t)lead<<16)|trail;
  1542. length=4;
  1543. goto outputBytes;
  1544. }
  1545. } else if(c<0xa0) {
  1546. /* quote C1 control character */
  1547. c=(c&0x7f)|(SQ0+1)<<8; /* SQ0+1==SQ1 */
  1548. length=2;
  1549. goto outputBytes;
  1550. } else if(c==0xfeff || c>=0xfff0) {
  1551. /* quote signature character=byte order mark and specials */
  1552. c|=SQU<<16;
  1553. length=3;
  1554. goto outputBytes;
  1555. } else {
  1556. /* compress all other BMP characters */
  1557. if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
  1558. /* there is a window defined that contains this character - switch to it or quote from it? */
  1559. if(source>=sourceLimit || isInOffsetWindowOrDirect(scsu->fromUDynamicOffsets[window], *source)) {
  1560. /* change to dynamic window */
  1561. dynamicWindow=window;
  1562. currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
  1563. useDynamicWindow(scsu, dynamicWindow);
  1564. c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
  1565. length=2;
  1566. goto outputBytes;
  1567. } else {
  1568. /* quote from dynamic window */
  1569. c=((uint32_t)(SQ0+window)<<8)|(c-scsu->fromUDynamicOffsets[window])|0x80;
  1570. length=2;
  1571. goto outputBytes;
  1572. }
  1573. } else if((window=getWindow(staticOffsets, c))>=0) {
  1574. /* quote from static window */
  1575. c=((uint32_t)(SQ0+window)<<8)|(c-staticOffsets[window]);
  1576. length=2;
  1577. goto outputBytes;
  1578. } else if((code=getDynamicOffset(c, &offset))>=0) {
  1579. /* define a dynamic window with this character */
  1580. dynamicWindow=getNextDynamicWindow(scsu);
  1581. currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
  1582. useDynamicWindow(scsu, dynamicWindow);
  1583. c=((uint32_t)(SD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
  1584. length=3;
  1585. goto outputBytes;
  1586. } else if (c - 0x3400 < 0xd800 - 0x3400 &&
  1587. (source >= sourceLimit || static_cast<uint32_t>(*source - 0x3400) < 0xd800 - 0x3400)
  1588. ) {
  1589. /*
  1590. * this character is not compressible (a BMP ideograph or similar);
  1591. * switch to Unicode mode if this is the last character in the block
  1592. * or there is at least one more ideograph following immediately
  1593. */
  1594. isSingleByteMode=false;
  1595. c|=SCU<<16;
  1596. length=3;
  1597. goto outputBytes;
  1598. } else {
  1599. /* quote Unicode */
  1600. c|=SQU<<16;
  1601. length=3;
  1602. goto outputBytes;
  1603. }
  1604. }
  1605. /* normal end of conversion: prepare for a new character */
  1606. c=0;
  1607. }
  1608. } else {
  1609. if(c!=0 && targetCapacity>0) {
  1610. goto getTrailUnicode;
  1611. }
  1612. /* state machine for Unicode mode */
  1613. /* unicodeByteMode: */
  1614. while(source<sourceLimit) {
  1615. if(targetCapacity<=0) {
  1616. /* target is full */
  1617. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  1618. break;
  1619. }
  1620. c=*source++;
  1621. if (c - 0x3400 < 0xd800 - 0x3400) {
  1622. /* not compressible, write character directly */
  1623. if(targetCapacity>=2) {
  1624. *target++=(uint8_t)(c>>8);
  1625. *target++=(uint8_t)c;
  1626. targetCapacity-=2;
  1627. } else {
  1628. length=2;
  1629. goto outputBytes;
  1630. }
  1631. } else if (c - 0x3400 >= 0xf300 - 0x3400 /* c<0x3400 || c>=0xf300 */) {
  1632. /* compress BMP character if the following one is not an uncompressible ideograph */
  1633. if(!(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400))) {
  1634. if (c - 0x30 < 10 || c - 0x61 < 26 || c - 0x41 < 26) {
  1635. /* ASCII digit or letter */
  1636. isSingleByteMode=true;
  1637. c|=((uint32_t)(UC0+dynamicWindow)<<8)|c;
  1638. length=2;
  1639. goto outputBytes;
  1640. } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
  1641. /* there is a dynamic window that contains this character, change to it */
  1642. isSingleByteMode=true;
  1643. dynamicWindow=window;
  1644. currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
  1645. useDynamicWindow(scsu, dynamicWindow);
  1646. c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
  1647. length=2;
  1648. goto outputBytes;
  1649. } else if((code=getDynamicOffset(c, &offset))>=0) {
  1650. /* define a dynamic window with this character */
  1651. isSingleByteMode=true;
  1652. dynamicWindow=getNextDynamicWindow(scsu);
  1653. currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
  1654. useDynamicWindow(scsu, dynamicWindow);
  1655. c=((uint32_t)(UD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
  1656. length=3;
  1657. goto outputBytes;
  1658. }
  1659. }
  1660. /* don't know how to compress this character, just write it directly */
  1661. length=2;
  1662. goto outputBytes;
  1663. } else if(c<0xe000) {
  1664. /* c is a surrogate */
  1665. if(U16_IS_SURROGATE_LEAD(c)) {
  1666. getTrailUnicode:
  1667. lead=(char16_t)c;
  1668. if(source<sourceLimit) {
  1669. /* test the following code unit */
  1670. trail=*source;
  1671. if(U16_IS_TRAIL(trail)) {
  1672. ++source;
  1673. c=U16_GET_SUPPLEMENTARY(c, trail);
  1674. /* convert this surrogate code point */
  1675. /* exit this condition tree */
  1676. } else {
  1677. /* this is an unmatched lead code unit (1st surrogate) */
  1678. /* callback(illegal) */
  1679. *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  1680. goto endloop;
  1681. }
  1682. } else {
  1683. /* no more input */
  1684. break;
  1685. }
  1686. } else {
  1687. /* this is an unmatched trail code unit (2nd surrogate) */
  1688. /* callback(illegal) */
  1689. *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  1690. goto endloop;
  1691. }
  1692. /* compress supplementary character */
  1693. if( (window=getWindow(scsu->fromUDynamicOffsets, c))>=0 &&
  1694. !(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400))
  1695. ) {
  1696. /*
  1697. * there is a dynamic window that contains this character and
  1698. * the following character is not uncompressible,
  1699. * change to the window
  1700. */
  1701. isSingleByteMode=true;
  1702. dynamicWindow=window;
  1703. currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
  1704. useDynamicWindow(scsu, dynamicWindow);
  1705. c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
  1706. length=2;
  1707. goto outputBytes;
  1708. } else if(source<sourceLimit && lead==*source && /* too lazy to check trail in same window as source[1] */
  1709. (code=getDynamicOffset(c, &offset))>=0
  1710. ) {
  1711. /* two supplementary characters in (probably) the same window - define an extended one */
  1712. isSingleByteMode=true;
  1713. code-=0x200;
  1714. dynamicWindow=getNextDynamicWindow(scsu);
  1715. currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
  1716. useDynamicWindow(scsu, dynamicWindow);
  1717. c=((uint32_t)UDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
  1718. length=4;
  1719. goto outputBytes;
  1720. } else {
  1721. /* don't know how to compress this character, just write it directly */
  1722. c=((uint32_t)lead<<16)|trail;
  1723. length=4;
  1724. goto outputBytes;
  1725. }
  1726. } else /* 0xe000<=c<0xf300 */ {
  1727. /* quote to avoid SCSU tags */
  1728. c|=UQU<<16;
  1729. length=3;
  1730. goto outputBytes;
  1731. }
  1732. /* normal end of conversion: prepare for a new character */
  1733. c=0;
  1734. }
  1735. }
  1736. endloop:
  1737. /* set the converter state back into UConverter */
  1738. scsu->fromUIsSingleByteMode=isSingleByteMode;
  1739. scsu->fromUDynamicWindow=dynamicWindow;
  1740. cnv->fromUChar32=c;
  1741. /* write back the updated pointers */
  1742. pArgs->source=source;
  1743. pArgs->target=(char *)target;
  1744. return;
  1745. outputBytes:
  1746. /* write the output character bytes from c and length [code copied from ucnvmbcs.c] */
  1747. /* from the first if in the loop we know that targetCapacity>0 */
  1748. if(length<=targetCapacity) {
  1749. switch(length) {
  1750. /* each branch falls through to the next one */
  1751. case 4:
  1752. *target++=(uint8_t)(c>>24);
  1753. U_FALLTHROUGH;
  1754. case 3:
  1755. *target++=(uint8_t)(c>>16);
  1756. U_FALLTHROUGH;
  1757. case 2:
  1758. *target++=(uint8_t)(c>>8);
  1759. U_FALLTHROUGH;
  1760. case 1:
  1761. *target++=(uint8_t)c;
  1762. U_FALLTHROUGH;
  1763. default:
  1764. /* will never occur */
  1765. break;
  1766. }
  1767. targetCapacity-=length;
  1768. /* normal end of conversion: prepare for a new character */
  1769. c=0;
  1770. goto loop;
  1771. } else {
  1772. uint8_t *p;
  1773. /*
  1774. * We actually do this backwards here:
  1775. * In order to save an intermediate variable, we output
  1776. * first to the overflow buffer what does not fit into the
  1777. * regular target.
  1778. */
  1779. /* we know that 0<=targetCapacity<length<=4 */
  1780. /* targetCapacity==0 when SCU+supplementary where SCU used up targetCapacity==1 */
  1781. length-=targetCapacity;
  1782. p=(uint8_t *)cnv->charErrorBuffer;
  1783. switch(length) {
  1784. /* each branch falls through to the next one */
  1785. case 4:
  1786. *p++=(uint8_t)(c>>24);
  1787. U_FALLTHROUGH;
  1788. case 3:
  1789. *p++=(uint8_t)(c>>16);
  1790. U_FALLTHROUGH;
  1791. case 2:
  1792. *p++=(uint8_t)(c>>8);
  1793. U_FALLTHROUGH;
  1794. case 1:
  1795. *p=(uint8_t)c;
  1796. U_FALLTHROUGH;
  1797. default:
  1798. /* will never occur */
  1799. break;
  1800. }
  1801. cnv->charErrorBufferLength=(int8_t)length;
  1802. /* now output what fits into the regular target */
  1803. c = (length == 4) ? 0 : c >> 8*length; /* length was reduced by targetCapacity */
  1804. switch(targetCapacity) {
  1805. /* each branch falls through to the next one */
  1806. case 3:
  1807. *target++=(uint8_t)(c>>16);
  1808. U_FALLTHROUGH;
  1809. case 2:
  1810. *target++=(uint8_t)(c>>8);
  1811. U_FALLTHROUGH;
  1812. case 1:
  1813. *target++=(uint8_t)c;
  1814. U_FALLTHROUGH;
  1815. default:
  1816. break;
  1817. }
  1818. /* target overflow */
  1819. targetCapacity=0;
  1820. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  1821. c=0;
  1822. goto endloop;
  1823. }
  1824. }
  1825. /* miscellaneous ------------------------------------------------------------ */
  1826. static const char * U_CALLCONV
  1827. _SCSUGetName(const UConverter *cnv) {
  1828. SCSUData *scsu=(SCSUData *)cnv->extraInfo;
  1829. switch(scsu->locale) {
  1830. case l_ja:
  1831. return "SCSU,locale=ja";
  1832. default:
  1833. return "SCSU";
  1834. }
  1835. }
  1836. /* structure for SafeClone calculations */
  1837. struct cloneSCSUStruct
  1838. {
  1839. UConverter cnv;
  1840. SCSUData mydata;
  1841. };
  1842. static UConverter * U_CALLCONV
  1843. _SCSUSafeClone(const UConverter *cnv,
  1844. void *stackBuffer,
  1845. int32_t *pBufferSize,
  1846. UErrorCode *status)
  1847. {
  1848. struct cloneSCSUStruct * localClone;
  1849. int32_t bufferSizeNeeded = sizeof(struct cloneSCSUStruct);
  1850. if (U_FAILURE(*status)){
  1851. return nullptr;
  1852. }
  1853. if (*pBufferSize == 0){ /* 'preflighting' request - set needed size into *pBufferSize */
  1854. *pBufferSize = bufferSizeNeeded;
  1855. return nullptr;
  1856. }
  1857. localClone = (struct cloneSCSUStruct *)stackBuffer;
  1858. /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
  1859. uprv_memcpy(&localClone->mydata, cnv->extraInfo, sizeof(SCSUData));
  1860. localClone->cnv.extraInfo = &localClone->mydata;
  1861. localClone->cnv.isExtraLocal = true;
  1862. return &localClone->cnv;
  1863. }
  1864. U_CDECL_END
  1865. static const UConverterImpl _SCSUImpl={
  1866. UCNV_SCSU,
  1867. nullptr,
  1868. nullptr,
  1869. _SCSUOpen,
  1870. _SCSUClose,
  1871. _SCSUReset,
  1872. _SCSUToUnicode,
  1873. _SCSUToUnicodeWithOffsets,
  1874. _SCSUFromUnicode,
  1875. _SCSUFromUnicodeWithOffsets,
  1876. nullptr,
  1877. nullptr,
  1878. _SCSUGetName,
  1879. nullptr,
  1880. _SCSUSafeClone,
  1881. ucnv_getCompleteUnicodeSet,
  1882. nullptr,
  1883. nullptr
  1884. };
  1885. static const UConverterStaticData _SCSUStaticData={
  1886. sizeof(UConverterStaticData),
  1887. "SCSU",
  1888. 1212, /* CCSID for SCSU */
  1889. UCNV_IBM, UCNV_SCSU,
  1890. 1, 3, /* one char16_t generates at least 1 byte and at most 3 bytes */
  1891. /*
  1892. * The subchar here is ignored because _SCSUOpen() sets U+fffd as a Unicode
  1893. * substitution string.
  1894. */
  1895. { 0x0e, 0xff, 0xfd, 0 }, 3,
  1896. false, false,
  1897. 0,
  1898. 0,
  1899. { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
  1900. };
  1901. const UConverterSharedData _SCSUData=
  1902. UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_SCSUStaticData, &_SCSUImpl);
  1903. #endif