ucnv2022.cpp 154 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. **********************************************************************
  5. * Copyright (C) 2000-2016, International Business Machines
  6. * Corporation and others. All Rights Reserved.
  7. **********************************************************************
  8. * file name: ucnv2022.cpp
  9. * encoding: UTF-8
  10. * tab size: 8 (not used)
  11. * indentation:4
  12. *
  13. * created on: 2000feb03
  14. * created by: Markus W. Scherer
  15. *
  16. * Change history:
  17. *
  18. * 06/29/2000 helena Major rewrite of the callback APIs.
  19. * 08/08/2000 Ram Included support for ISO-2022-JP-2
  20. * Changed implementation of toUnicode
  21. * function
  22. * 08/21/2000 Ram Added support for ISO-2022-KR
  23. * 08/29/2000 Ram Separated implementation of EBCDIC to
  24. * ucnvebdc.c
  25. * 09/20/2000 Ram Added support for ISO-2022-CN
  26. * Added implementations for getNextUChar()
  27. * for specific 2022 country variants.
  28. * 10/31/2000 Ram Implemented offsets logic functions
  29. */
  30. #include "unicode/utypes.h"
  31. #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
  32. #include "unicode/ucnv.h"
  33. #include "unicode/uset.h"
  34. #include "unicode/ucnv_err.h"
  35. #include "unicode/ucnv_cb.h"
  36. #include "unicode/utf16.h"
  37. #include "ucnv_imp.h"
  38. #include "ucnv_bld.h"
  39. #include "ucnv_cnv.h"
  40. #include "ucnvmbcs.h"
  41. #include "cstring.h"
  42. #include "cmemory.h"
  43. #include "uassert.h"
  44. #ifdef U_ENABLE_GENERIC_ISO_2022
  45. /*
  46. * I am disabling the generic ISO-2022 converter after proposing to do so on
  47. * the icu mailing list two days ago.
  48. *
  49. * Reasons:
  50. * 1. It does not fully support the ISO-2022/ECMA-35 specification with all of
  51. * its designation sequences, single shifts with return to the previous state,
  52. * switch-with-no-return to UTF-16BE or similar, etc.
  53. * This is unlike the language-specific variants like ISO-2022-JP which
  54. * require a much smaller repertoire of ISO-2022 features.
  55. * These variants continue to be supported.
  56. * 2. I believe that no one is really using the generic ISO-2022 converter
  57. * but rather always one of the language-specific variants.
  58. * Note that ICU's generic ISO-2022 converter has always output one escape
  59. * sequence followed by UTF-8 for the whole stream.
  60. * 3. Switching between subcharsets is extremely slow, because each time
  61. * the previous converter is closed and a new one opened,
  62. * without any kind of caching, least-recently-used list, etc.
  63. * 4. The code is currently buggy, and given the above it does not seem
  64. * reasonable to spend the time on maintenance.
  65. * 5. ISO-2022 subcharsets should normally be used with 7-bit byte encodings.
  66. * This means, for example, that when ISO-8859-7 is designated, the following
  67. * ISO-2022 bytes 00..7f should be interpreted as ISO-8859-7 bytes 80..ff.
  68. * The ICU ISO-2022 converter does not handle this - and has no information
  69. * about which subconverter would have to be shifted vs. which is designed
  70. * for 7-bit ISO-2022.
  71. *
  72. * Markus Scherer 2003-dec-03
  73. */
  74. #endif
  /*
   * SHIFT_IN_STR is a NUL-terminated string holding the single byte 0x0F (SI).
   * It is only compiled when UCONFIG_ONLY_HTML_CONVERSION is off.
   */
  75. #if !UCONFIG_ONLY_HTML_CONVERSION
  76. static const char SHIFT_IN_STR[] = "\x0F";
  /* The 0x0E (SO) counterpart is intentionally left commented out. */
  77. // static const char SHIFT_OUT_STR[] = "\x0E";
  78. #endif
  /* ASCII byte values: carriage return, line feed, horizontal tab, vertical tab, space. */
  79. #define CR 0x0D
  80. #define LF 0x0A
  81. #define H_TAB 0x09
  82. #define V_TAB 0x0B
  83. #define SPACE 0x20
  /*
   * Code point bounds U+FF61 and U+FF9F.
   * NOTE(review): the names suggest the inclusive halfwidth Katakana range
   * (compare HWKANA_7BIT in StateEnum) -- confirm against the users of these constants.
   */
  84. enum {
  85. HWKANA_START=0xff61,
  86. HWKANA_END=0xff9f
  87. };
  88. /*
  89. * 94-character sets with native byte values A1..FE are encoded in ISO 2022
  90. * as bytes 21..7E. (Subtract 0x80.)
  91. * 96-character sets with native byte values A0..FF are encoded in ISO 2022
  92. * as bytes 20..7F. (Subtract 0x80.)
  93. * Do not encode C1 control codes with native bytes 80..9F
  94. * as bytes 00..1F (C0 control codes).
  95. */
  /* First and last native byte values of the 94- and 96-character set ranges described above. */
  96. enum {
  97. GR94_START=0xa1,
  98. GR94_END=0xfe,
  99. GR96_START=0xa0,
  100. GR96_END=0xff
  101. };
  /*
   * ISO 2022 control codes must not be converted from Unicode
   * because they would mess up the byte stream.
   * The bit mask 0x0800c000 has bits set at bit positions 0xe, 0xf, 0x1b
   * corresponding to SO, SI, and ESC.
   *
   * IS_2022_CONTROL(c) is true exactly when c is 0x0e, 0x0f or 0x1b.
   *
   * The range test is performed on the value converted to uint32_t, so a
   * negative argument fails the test instead of passing "<0x20" and then
   * being used as a negative shift count (undefined behavior).
   * The argument may be evaluated twice; do not pass expressions with
   * side effects.
   */
  #define IS_2022_CONTROL(c) (((uint32_t)(c)<0x20) && (((uint32_t)1<<(c))&0x0800c000)!=0)
  109. /* for ISO-2022-JP and -CN implementations */
  /*
   * Charset identifiers shared by the JP and CN code.
   * The JP and CN groups deliberately reuse the same small numbers
   * (ISO8859_1 and GB2312_1 are both 1, ISO8859_7 and ISO_IR_165 are both 2,
   * JISX201 and CNS_11643 are both 3), so a value is only meaningful in the
   * context of one variant.
   * SS2_STATE is 0x10 and SS3_STATE follows it implicitly as 0x11;
   * CNS_11643_0..CNS_11643_7 occupy 0x20..0x27.
   */
  110. typedef enum {
  111. /* shared values */
  112. INVALID_STATE=-1,
  113. ASCII = 0,
  114. SS2_STATE=0x10,
  115. SS3_STATE,
  116. /* JP */
  /* JP values 0..8 are also used as bit positions by CSM() in jpCharsetMasks[]. */
  117. ISO8859_1 = 1 ,
  118. ISO8859_7 = 2 ,
  119. JISX201 = 3,
  120. JISX208 = 4,
  121. JISX212 = 5,
  122. GB2312 =6,
  123. KSC5601 =7,
  124. HWKANA_7BIT=8, /* Halfwidth Katakana 7 bit */
  125. /* CN */
  126. /* the first few enum constants must keep their values because they correspond to myConverterArray[] */
  127. GB2312_1=1,
  128. ISO_IR_165=2,
  129. CNS_11643=3,
  130. /*
  131. * these are used in StateEnum and ISO2022State variables,
  132. * but CNS_11643 must be used to index into myConverterArray[]
  133. */
  134. CNS_11643_0=0x20,
  135. CNS_11643_1,
  136. CNS_11643_2,
  137. CNS_11643_3,
  138. CNS_11643_4,
  139. CNS_11643_5,
  140. CNS_11643_6,
  141. CNS_11643_7
  142. } StateEnum;
  /*
   * IS_JP_DBCS(cs): does the StateEnum charset value cs denote a DBCS charset?
   * In HTML-only builds only JISX208 qualifies; otherwise every value in the
   * inclusive range JISX208..KSC5601 does. The argument may be evaluated twice.
   */
  #if !UCONFIG_ONLY_HTML_CONVERSION
  #define IS_JP_DBCS(cs) ((cs)>=JISX208 && KSC5601>=(cs))
  #else
  #define IS_JP_DBCS(cs) ((cs)==JISX208)
  #endif
  /* CSM(cs): charset mask with only bit number cs set (1 shifted left by cs). */
  #define CSM(cs) (static_cast<uint16_t>(1)<<(cs))
  150. /*
  151. * Each of these charset masks (with index x) contains a bit for a charset in exact correspondence
  152. * to whether that charset is used in the corresponding version x of ISO_2022,locale=ja,version=x
  153. *
  154. * Note: The converter uses some leniency:
  155. * - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in
  156. * all versions, not just JIS7 and JIS8.
  157. * - ICU does not distinguish between different versions of JIS X 0208.
  158. */
  /* Highest table index: 0 in HTML-only builds (one entry), 4 otherwise (five entries). */
  159. #if UCONFIG_ONLY_HTML_CONVERSION
  160. enum { MAX_JA_VERSION=0 };
  161. #else
  162. enum { MAX_JA_VERSION=4 };
  163. #endif
  /*
   * One mask per version, built from CSM() bits of the JP StateEnum values.
   * Entry 0 is always present; entry 1 adds JISX212; entries 2, 3 and 4 are
   * identical to each other and additionally include GB2312, KSC5601,
   * ISO8859_1 and ISO8859_7.
   */
  164. static const uint16_t jpCharsetMasks[MAX_JA_VERSION+1]={
  165. CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT),
  166. #if !UCONFIG_ONLY_HTML_CONVERSION
  167. CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212),
  168. CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
  169. CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
  170. CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7)
  171. #endif
  172. };
  /*
   * Converter type tag with values 0 (ASCII1) through 5 (HWKANA); it is the
   * type of UConverterDataISO2022::currentType.
   * NOTE(review): presumably classifies the currently selected sub-charset by
   * byte width -- confirm against the code that assigns currentType.
   */
  173. typedef enum {
  174. ASCII1=0,
  175. LATIN1,
  176. SBCS,
  177. DBCS,
  178. MBCS,
  179. HWKANA
  180. }Cnv2022Type;
  /*
   * Shift/designation state for one conversion direction: which charset is
   * held in each of G0..G3, which G set is currently selected, and which one
   * was selected before a single shift.
   */
  181. typedef struct ISO2022State {
  182. int8_t cs[4]; /* charset number for SI (G0)/SO (G1)/SS2 (G2)/SS3 (G3) */
  183. int8_t g; /* 0..3 for G0..G3 (SI/SO/SS2/SS3) */
  184. int8_t prevG; /* g before single shift (SS2 or SS3) */
  185. } ISO2022State;
  /* NOTE(review): presumably masks the version number out of the converter options word -- confirm at the use site. */
  186. #define UCNV_OPTIONS_VERSION_MASK 0xf
  /* Dimension of UConverterDataISO2022::myConverterArray. */
  187. #define UCNV_2022_MAX_CONVERTERS 10
  /*
   * Per-converter private data for the ISO-2022 variants.
   * - myConverterArray: shared data of the sub-converters; per the StateEnum
   *   comments it is indexed by the small StateEnum values.
   * - currentConverter: a sub-converter object; the KR setup code resets its
   *   toUnicodeStatus/mode/toULength when version is 1.
   * - toU2022State / fromU2022State: separate ISO2022State for each direction.
   * - isFirstBuffer exists only when U_ENABLE_GENERIC_ISO_2022 is defined.
   * - name / locale: fixed-size character buffers of 30 and 3 bytes.
   * NOTE(review): the exact meaning of key, version and isEmptySegment is not
   * visible here -- confirm against the code that sets them.
   */
  188. typedef struct{
  189. UConverterSharedData *myConverterArray[UCNV_2022_MAX_CONVERTERS];
  190. UConverter *currentConverter;
  191. Cnv2022Type currentType;
  192. ISO2022State toU2022State, fromU2022State;
  193. uint32_t key;
  194. uint32_t version;
  195. #ifdef U_ENABLE_GENERIC_ISO_2022
  196. UBool isFirstBuffer;
  197. #endif
  198. UBool isEmptySegment;
  199. char name[30];
  200. char locale[3];
  201. }UConverterDataISO2022;
  202. /* Protos */
  203. /* ISO-2022 ----------------------------------------------------------------- */
  204. /*Forward declaration */
  /*
   * Declarations only: the two UTF-8 fromUnicode functions (without and with
   * offsets) are not defined in this part of the file.
   * NOTE(review): presumably needed by the generic ISO-2022 path, which is
   * described above as emitting one escape sequence followed by UTF-8 -- confirm.
   */
  205. U_CFUNC void U_CALLCONV
  206. ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs * args,
  207. UErrorCode * err);
  208. U_CFUNC void U_CALLCONV
  209. ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,
  210. UErrorCode * err);
  211. #define ESC_2022 0x1B /*ESC*/
  /*
   * Classification of a (partial) escape sequence; these are the values stored
   * in escSeqStateTable_Value_2022[].
   */
  212. typedef enum
  213. {
  214. INVALID_2022 = -1, /*Doesn't correspond to a valid iso 2022 escape sequence*/
  215. VALID_NON_TERMINAL_2022 = 0, /*so far corresponds to a valid iso 2022 escape sequence*/
  216. VALID_TERMINAL_2022 = 1, /*corresponds to a valid iso 2022 escape sequence*/
  217. VALID_MAYBE_TERMINAL_2022 = 2 /*so far matches one iso 2022 escape sequence, but by adding more characters might match another escape sequence*/
  218. } UCNV_TableStates_2022;
  219. /*
  220. * The way these state transition arrays work is:
  221. * ex : ESC$B is the sequence for JISX208
  222. * a) First Iteration: char is ESC
  223. * i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index
  224. * int x = normalize_esq_chars_2022[27] which is equal to 1
  225. * ii) Search for this value in escSeqStateTable_Key_2022[]
  226. * value of x is stored at escSeqStateTable_Key_2022[0]
  227. * iii) Save this index as offset
  228. * iv) Get state of this sequence from escSeqStateTable_Value_2022[]
  229. * escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
  230. * b) Switch on this state and continue to next char
  231. * i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index
  232. * which is normalize_esq_chars_2022[36] == 4
  233. * ii) x is currently 1(from above)
  234. * x<<=5 -- x is now 32
  235. * x+=normalize_esq_chars_2022[36]
  236. * now x is 36
  237. * iii) Search for this value in escSeqStateTable_Key_2022[]
  238. * value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2
  239. * iv) Get state of this sequence from escSeqStateTable_Value_2022[]
  240. * escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
  241. * c) Switch on this state and continue to next char
  242. * i) Get the value of B from normalize_esq_chars_2022[] with int value of B as index
  243. * ii) x is currently 36 (from above)
  244. * x<<=5 -- x is now 1152
  245. * x+=normalize_esq_chars_2022[66]
  246. * now x is 1161
  247. * iii) Search for this value in escSeqStateTable_Key_2022[]
  248. * value of x is stored at escSeqStateTable_Key_2022[24], so offset is 24
  249. * iv) Get state of this sequence from escSeqStateTable_Value_2022[24]
  250. * escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022
  251. * v) Get the converter name from escSeqStateTable_Result_2022[24] which is "JISX-208"
  252. */
  253. /*Below are the 3 arrays depicting a state transition table*/
  /*
   * Maps a byte value (array index 0..255) to a small code in 0..29.
   * As the worked example above describes, the codes of successive escape
   * sequence bytes are combined as x = (x<<5) + code to form the key looked up
   * in escSeqStateTable_Key_2022[].
   * Examples from the table: index 27 (ESC) -> 1, index 36 ('$') -> 4,
   * index 66 ('B') -> 9. All indexes from 91 upward map to 0.
   * NOTE(review): a code of 0 presumably marks a byte that cannot appear in a
   * recognized escape sequence -- confirm against the lookup code.
   */
  254. static const int8_t normalize_esq_chars_2022[256] = {
  255. /* 0 1 2 3 4 5 6 7 8 9 */
  256. 0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
  257. ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
  258. ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,1 ,0 ,0
  259. ,0 ,0 ,0 ,0 ,0 ,0 ,4 ,7 ,29 ,0
  260. ,2 ,24 ,26 ,27 ,0 ,3 ,23 ,6 ,0 ,0
  261. ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
  262. ,0 ,0 ,0 ,0 ,5 ,8 ,9 ,10 ,11 ,12
  263. ,13 ,14 ,15 ,16 ,17 ,18 ,19 ,20 ,25 ,28
  264. ,0 ,0 ,21 ,0 ,0 ,0 ,0 ,0 ,0 ,0
  265. ,22 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
  266. ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
  267. ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
  268. ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
  269. ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
  270. ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
  271. ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
  272. ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
  273. ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
  274. ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
  275. ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
  276. ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
  277. ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
  278. ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
  279. ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
  280. ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0 ,0
  281. ,0 ,0 ,0 ,0 ,0 ,0
  282. };
  283. #ifdef U_ENABLE_GENERIC_ISO_2022
  284. /*
  285. * When the generic ISO-2022 converter is completely removed, not just disabled
  286. * per #ifdef, then the following state table and the associated tables that are
  287. * dimensioned with MAX_STATES_2022 should be trimmed.
  288. *
  289. * Especially, VALID_MAYBE_TERMINAL_2022 will not be used any more, and all of
  290. * the associated escape sequences starting with ESC ( B should be removed.
  291. * This includes the ones with key values 1097 and all of the ones above 1000000.
  292. *
  293. * For the latter, the tables can simply be truncated.
  294. * For the former, since the tables must be kept parallel, it is probably best
  295. * to simply duplicate an adjacent table cell, parallel in all tables.
  296. *
  297. * It may make sense to restructure the tables, especially by using small search
  298. * tables for the variants instead of indexing them parallel to the table here.
  299. */
  300. #endif
  /* Number of entries in each of the parallel escSeqStateTable_*_2022 arrays. */
  301. #define MAX_STATES_2022 74
  /*
   * Keys of all recognized (partial) escape sequences, built from the
   * normalize_esq_chars_2022[] codes as x = (x<<5) + code per byte.
   * The entries are listed in ascending order. The index at which a key is
   * found is the index used into the parallel Value (and Result) tables, so
   * the tables must be edited together.
   */
  302. static const int32_t escSeqStateTable_Key_2022[MAX_STATES_2022] = {
  303. /* 0 1 2 3 4 5 6 7 8 9 */
  304. 1 ,34 ,36 ,39 ,55 ,57 ,60 ,61 ,1093 ,1096
  305. ,1097 ,1098 ,1099 ,1100 ,1101 ,1102 ,1103 ,1104 ,1105 ,1106
  306. ,1109 ,1154 ,1157 ,1160 ,1161 ,1176 ,1178 ,1179 ,1254 ,1257
  307. ,1768 ,1773 ,1957 ,35105 ,36933 ,36936 ,36937 ,36938 ,36939 ,36940
  308. ,36942 ,36943 ,36944 ,36945 ,36946 ,36947 ,36948 ,37640 ,37642 ,37644
  309. ,37646 ,37711 ,37744 ,37745 ,37746 ,37747 ,37748 ,40133 ,40136 ,40138
  310. ,40139 ,40140 ,40141 ,1123363 ,35947624 ,35947625 ,35947626 ,35947627 ,35947629 ,35947630
  311. ,35947631 ,35947635 ,35947636 ,35947638
  312. };
  /*
   * Converter names parallel to escSeqStateTable_Key_2022[]; compiled only
   * when the generic ISO-2022 converter is enabled. Entries are nullptr where
   * the Value table marks the key as VALID_NON_TERMINAL_2022 (no complete
   * sequence yet).
   */
  313. #ifdef U_ENABLE_GENERIC_ISO_2022
  314. static const char* const escSeqStateTable_Result_2022[MAX_STATES_2022] = {
  315. /* 0 1 2 3 4 5 6 7 8 9 */
  316. nullptr ,nullptr ,nullptr ,nullptr ,nullptr ,nullptr ,nullptr ,nullptr ,"latin1" ,"latin1"
  317. ,"latin1" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"ibm-865" ,"JISX0201" ,"JISX0201" ,"latin1"
  318. ,"latin1" ,nullptr ,"JISX-208" ,"ibm-5478" ,"JISX-208" ,nullptr ,nullptr ,nullptr ,nullptr ,"UTF8"
  319. ,"ISO-8859-1" ,"ISO-8859-7" ,"JIS-X-208" ,nullptr ,"ibm-955" ,"ibm-367" ,"ibm-952" ,"ibm-949" ,"JISX-212" ,"ibm-1383"
  320. ,"ibm-952" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-964" ,"ibm-5478" ,"ibm-949" ,"ISO-IR-165"
  321. ,"CNS-11643-1992,1" ,"CNS-11643-1992,2" ,"CNS-11643-1992,3" ,"CNS-11643-1992,4" ,"CNS-11643-1992,5" ,"CNS-11643-1992,6" ,"CNS-11643-1992,7" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian"
  322. ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,"UTF16_PlatformEndian" ,nullptr ,"latin1" ,"ibm-912" ,"ibm-913" ,"ibm-914" ,"ibm-813" ,"ibm-1089"
  323. ,"ibm-920" ,"ibm-915" ,"ibm-915" ,"latin1"
  324. };
  325. #endif
  326. static const int8_t escSeqStateTable_Value_2022[MAX_STATES_2022] = {
  327. /* 0 1 2 3 4 5 6 7 8 9 */
  328. VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
  329. ,VALID_MAYBE_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
  330. ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022
  331. ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
  332. ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
  333. ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
  334. ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
  335. ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
  336. };
  337. /* Type def for refactoring changeState_2022 code*/
  /*
   * Identifies the ISO-2022 variant. Values are explicit: ISO_2022 (0) exists
   * only when U_ENABLE_GENERIC_ISO_2022 is defined, ISO_2022_JP is always 1,
   * and ISO_2022_KR (2) / ISO_2022_CN (3) are omitted in HTML-only builds.
   */
  338. typedef enum{
  339. #ifdef U_ENABLE_GENERIC_ISO_2022
  340. ISO_2022=0,
  341. #endif
  342. ISO_2022_JP=1,
  343. #if !UCONFIG_ONLY_HTML_CONVERSION
  344. ISO_2022_KR=2,
  345. ISO_2022_CN=3
  346. #endif
  347. } Variant2022;
/*********** ISO 2022 Converter Protos ***********/
/*
 * Forward declarations of file-local (static) functions so that they can be
 * referenced before their definitions. _ISO2022getName and
 * _ISO_2022_SafeClone are declared inside U_CDECL_BEGIN/U_CDECL_END.
 */

/* Allocates cnv->extraInfo and selects the ja/ko/zh variant from pArgs. */
static void U_CALLCONV
_ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode);

/* Releases what _ISO2022Open() stored in converter->extraInfo. */
static void U_CALLCONV
_ISO2022Close(UConverter *converter);

/* Resets the to-Unicode and/or from-Unicode state depending on choice. */
static void U_CALLCONV
_ISO2022Reset(UConverter *converter, UConverterResetChoice choice);

U_CDECL_BEGIN
/* Returns the name stored in the converter's extraInfo, or nullptr. */
static const char * U_CALLCONV
_ISO2022getName(const UConverter* cnv);
U_CDECL_END

/* Defined outside this section of the file. */
static void U_CALLCONV
_ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err);

U_CDECL_BEGIN
/* Defined outside this section of the file. */
static UConverter * U_CALLCONV
_ISO_2022_SafeClone(const UConverter *cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status);
U_CDECL_END

#ifdef U_ENABLE_GENERIC_ISO_2022
/* Only declared when the generic ISO-2022 converter is compiled in. */
static void U_CALLCONV
T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, UErrorCode* err);
#endif
namespace {

/*
 * Shared-data objects for the locale-specific variants; _ISO2022Open() stores
 * the address of one of them in cnv->sharedData. Their definitions are not
 * part of this section of the file.
 *
 * NOTE(review): the declaration of the generic _ISO2022Data is commented out
 * here, although _ISO2022Open() references it when U_ENABLE_GENERIC_ISO_2022
 * is defined.
 */
/*const UConverterSharedData _ISO2022Data;*/
extern const UConverterSharedData _ISO2022JPData;

#if !UCONFIG_ONLY_HTML_CONVERSION
/* The KR and CN variants are compiled out for HTML-only configurations. */
extern const UConverterSharedData _ISO2022KRData;
extern const UConverterSharedData _ISO2022CNData;
#endif

}  // namespace
  377. /*************** Converter implementations ******************/
  378. /* The purpose of this function is to get around gcc compiler warnings. */
  379. static inline void
  380. fromUWriteUInt8(UConverter *cnv,
  381. const char *bytes, int32_t length,
  382. uint8_t **target, const char *targetLimit,
  383. int32_t **offsets,
  384. int32_t sourceIndex,
  385. UErrorCode *pErrorCode)
  386. {
  387. char *targetChars = (char *)*target;
  388. ucnv_fromUWriteBytes(cnv, bytes, length, &targetChars, targetLimit,
  389. offsets, sourceIndex, pErrorCode);
  390. *target = (uint8_t*)targetChars;
  391. }
  392. static inline void
  393. setInitialStateToUnicodeKR(UConverter* /*converter*/, UConverterDataISO2022 *myConverterData){
  394. if(myConverterData->version == 1) {
  395. UConverter *cnv = myConverterData->currentConverter;
  396. cnv->toUnicodeStatus=0; /* offset */
  397. cnv->mode=0; /* state */
  398. cnv->toULength=0; /* byteIndex */
  399. }
  400. }
  401. static inline void
  402. setInitialStateFromUnicodeKR(UConverter* converter,UConverterDataISO2022 *myConverterData){
  403. /* in ISO-2022-KR the designator sequence appears only once
  404. * in a file so we append it only once
  405. */
  406. if( converter->charErrorBufferLength==0){
  407. converter->charErrorBufferLength = 4;
  408. converter->charErrorBuffer[0] = 0x1b;
  409. converter->charErrorBuffer[1] = 0x24;
  410. converter->charErrorBuffer[2] = 0x29;
  411. converter->charErrorBuffer[3] = 0x43;
  412. }
  413. if(myConverterData->version == 1) {
  414. UConverter *cnv = myConverterData->currentConverter;
  415. cnv->fromUChar32=0;
  416. cnv->fromUnicodeStatus=1; /* prevLength */
  417. }
  418. }
  419. static void U_CALLCONV
  420. _ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode){
  421. char myLocale[7]={' ',' ',' ',' ',' ',' ', '\0'};
  422. cnv->extraInfo = uprv_malloc (sizeof (UConverterDataISO2022));
  423. if(cnv->extraInfo != nullptr) {
  424. UConverterNamePieces stackPieces;
  425. UConverterLoadArgs stackArgs=UCNV_LOAD_ARGS_INITIALIZER;
  426. UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
  427. uint32_t version;
  428. stackArgs.onlyTestIsLoadable = pArgs->onlyTestIsLoadable;
  429. uprv_memset(myConverterData, 0, sizeof(UConverterDataISO2022));
  430. myConverterData->currentType = ASCII1;
  431. cnv->fromUnicodeStatus =false;
  432. if(pArgs->locale){
  433. uprv_strncpy(myLocale, pArgs->locale, sizeof(myLocale)-1);
  434. }
  435. version = pArgs->options & UCNV_OPTIONS_VERSION_MASK;
  436. myConverterData->version = version;
  437. if(myLocale[0]=='j' && (myLocale[1]=='a'|| myLocale[1]=='p') &&
  438. (myLocale[2]=='_' || myLocale[2]=='\0'))
  439. {
  440. /* open the required converters and cache them */
  441. if(version>MAX_JA_VERSION) {
  442. // ICU 55 fails to open a converter for an unsupported version.
  443. // Previously, it fell back to version 0, but that would yield
  444. // unexpected behavior.
  445. *errorCode = U_MISSING_RESOURCE_ERROR;
  446. return;
  447. }
  448. if(jpCharsetMasks[version]&CSM(ISO8859_7)) {
  449. myConverterData->myConverterArray[ISO8859_7] =
  450. ucnv_loadSharedData("ISO8859_7", &stackPieces, &stackArgs, errorCode);
  451. }
  452. myConverterData->myConverterArray[JISX208] =
  453. ucnv_loadSharedData("Shift-JIS", &stackPieces, &stackArgs, errorCode);
  454. if(jpCharsetMasks[version]&CSM(JISX212)) {
  455. myConverterData->myConverterArray[JISX212] =
  456. ucnv_loadSharedData("jisx-212", &stackPieces, &stackArgs, errorCode);
  457. }
  458. if(jpCharsetMasks[version]&CSM(GB2312)) {
  459. myConverterData->myConverterArray[GB2312] =
  460. ucnv_loadSharedData("ibm-5478", &stackPieces, &stackArgs, errorCode); /* gb_2312_80-1 */
  461. }
  462. if(jpCharsetMasks[version]&CSM(KSC5601)) {
  463. myConverterData->myConverterArray[KSC5601] =
  464. ucnv_loadSharedData("ksc_5601", &stackPieces, &stackArgs, errorCode);
  465. }
  466. /* set the function pointers to appropriate functions */
  467. cnv->sharedData=(UConverterSharedData*)(&_ISO2022JPData);
  468. uprv_strcpy(myConverterData->locale,"ja");
  469. (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ja,version=");
  470. size_t len = uprv_strlen(myConverterData->name);
  471. myConverterData->name[len]=(char)(myConverterData->version+(int)'0');
  472. myConverterData->name[len+1]='\0';
  473. }
  474. #if !UCONFIG_ONLY_HTML_CONVERSION
  475. else if(myLocale[0]=='k' && (myLocale[1]=='o'|| myLocale[1]=='r') &&
  476. (myLocale[2]=='_' || myLocale[2]=='\0'))
  477. {
  478. if(version>1) {
  479. // ICU 55 fails to open a converter for an unsupported version.
  480. // Previously, it fell back to version 0, but that would yield
  481. // unexpected behavior.
  482. *errorCode = U_MISSING_RESOURCE_ERROR;
  483. return;
  484. }
  485. const char *cnvName;
  486. if(version==1) {
  487. cnvName="icu-internal-25546";
  488. } else {
  489. cnvName="ibm-949";
  490. myConverterData->version=version=0;
  491. }
  492. if(pArgs->onlyTestIsLoadable) {
  493. ucnv_canCreateConverter(cnvName, errorCode); /* errorCode carries result */
  494. uprv_free(cnv->extraInfo);
  495. cnv->extraInfo=nullptr;
  496. return;
  497. } else {
  498. myConverterData->currentConverter=ucnv_open(cnvName, errorCode);
  499. if (U_FAILURE(*errorCode)) {
  500. _ISO2022Close(cnv);
  501. return;
  502. }
  503. if(version==1) {
  504. (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=1");
  505. uprv_memcpy(cnv->subChars, myConverterData->currentConverter->subChars, 4);
  506. cnv->subCharLen = myConverterData->currentConverter->subCharLen;
  507. }else{
  508. (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=0");
  509. }
  510. /* initialize the state variables */
  511. setInitialStateToUnicodeKR(cnv, myConverterData);
  512. setInitialStateFromUnicodeKR(cnv, myConverterData);
  513. /* set the function pointers to appropriate functions */
  514. cnv->sharedData=(UConverterSharedData*)&_ISO2022KRData;
  515. uprv_strcpy(myConverterData->locale,"ko");
  516. }
  517. }
  518. else if(((myLocale[0]=='z' && myLocale[1]=='h') || (myLocale[0]=='c'&& myLocale[1]=='n'))&&
  519. (myLocale[2]=='_' || myLocale[2]=='\0'))
  520. {
  521. if(version>2) {
  522. // ICU 55 fails to open a converter for an unsupported version.
  523. // Previously, it fell back to version 0, but that would yield
  524. // unexpected behavior.
  525. *errorCode = U_MISSING_RESOURCE_ERROR;
  526. return;
  527. }
  528. /* open the required converters and cache them */
  529. myConverterData->myConverterArray[GB2312_1] =
  530. ucnv_loadSharedData("ibm-5478", &stackPieces, &stackArgs, errorCode);
  531. if(version==1) {
  532. myConverterData->myConverterArray[ISO_IR_165] =
  533. ucnv_loadSharedData("iso-ir-165", &stackPieces, &stackArgs, errorCode);
  534. }
  535. myConverterData->myConverterArray[CNS_11643] =
  536. ucnv_loadSharedData("cns-11643-1992", &stackPieces, &stackArgs, errorCode);
  537. /* set the function pointers to appropriate functions */
  538. cnv->sharedData=(UConverterSharedData*)&_ISO2022CNData;
  539. uprv_strcpy(myConverterData->locale,"cn");
  540. if (version==0){
  541. myConverterData->version = 0;
  542. (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=0");
  543. }else if (version==1){
  544. myConverterData->version = 1;
  545. (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=1");
  546. }else {
  547. myConverterData->version = 2;
  548. (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=2");
  549. }
  550. }
  551. #endif // !UCONFIG_ONLY_HTML_CONVERSION
  552. else{
  553. #ifdef U_ENABLE_GENERIC_ISO_2022
  554. myConverterData->isFirstBuffer = true;
  555. /* append the UTF-8 escape sequence */
  556. cnv->charErrorBufferLength = 3;
  557. cnv->charErrorBuffer[0] = 0x1b;
  558. cnv->charErrorBuffer[1] = 0x25;
  559. cnv->charErrorBuffer[2] = 0x42;
  560. cnv->sharedData=(UConverterSharedData*)&_ISO2022Data;
  561. /* initialize the state variables */
  562. uprv_strcpy(myConverterData->name,"ISO_2022");
  563. #else
  564. *errorCode = U_MISSING_RESOURCE_ERROR;
  565. // Was U_UNSUPPORTED_ERROR but changed in ICU 55 to a more standard
  566. // data loading error code.
  567. return;
  568. #endif
  569. }
  570. cnv->maxBytesPerUChar=cnv->sharedData->staticData->maxBytesPerChar;
  571. if(U_FAILURE(*errorCode) || pArgs->onlyTestIsLoadable) {
  572. _ISO2022Close(cnv);
  573. }
  574. } else {
  575. *errorCode = U_MEMORY_ALLOCATION_ERROR;
  576. }
  577. }
  578. static void U_CALLCONV
  579. _ISO2022Close(UConverter *converter) {
  580. UConverterDataISO2022* myData =(UConverterDataISO2022 *) (converter->extraInfo);
  581. UConverterSharedData **array = myData->myConverterArray;
  582. int32_t i;
  583. if (converter->extraInfo != nullptr) {
  584. /*close the array of converter pointers and free the memory*/
  585. for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
  586. if(array[i]!=nullptr) {
  587. ucnv_unloadSharedDataIfReady(array[i]);
  588. }
  589. }
  590. ucnv_close(myData->currentConverter);
  591. if(!converter->isExtraLocal){
  592. uprv_free (converter->extraInfo);
  593. converter->extraInfo = nullptr;
  594. }
  595. }
  596. }
  597. static void U_CALLCONV
  598. _ISO2022Reset(UConverter *converter, UConverterResetChoice choice) {
  599. UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) (converter->extraInfo);
  600. if(choice<=UCNV_RESET_TO_UNICODE) {
  601. uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State));
  602. myConverterData->key = 0;
  603. myConverterData->isEmptySegment = false;
  604. }
  605. if(choice!=UCNV_RESET_TO_UNICODE) {
  606. uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State));
  607. }
  608. #ifdef U_ENABLE_GENERIC_ISO_2022
  609. if(myConverterData->locale[0] == 0){
  610. if(choice<=UCNV_RESET_TO_UNICODE) {
  611. myConverterData->isFirstBuffer = true;
  612. myConverterData->key = 0;
  613. if (converter->mode == UCNV_SO){
  614. ucnv_close (myConverterData->currentConverter);
  615. myConverterData->currentConverter=nullptr;
  616. }
  617. converter->mode = UCNV_SI;
  618. }
  619. if(choice!=UCNV_RESET_TO_UNICODE) {
  620. /* re-append UTF-8 escape sequence */
  621. converter->charErrorBufferLength = 3;
  622. converter->charErrorBuffer[0] = 0x1b;
  623. converter->charErrorBuffer[1] = 0x28;
  624. converter->charErrorBuffer[2] = 0x42;
  625. }
  626. }
  627. else
  628. #endif
  629. {
  630. /* reset the state variables */
  631. if(myConverterData->locale[0] == 'k'){
  632. if(choice<=UCNV_RESET_TO_UNICODE) {
  633. setInitialStateToUnicodeKR(converter, myConverterData);
  634. }
  635. if(choice!=UCNV_RESET_TO_UNICODE) {
  636. setInitialStateFromUnicodeKR(converter, myConverterData);
  637. }
  638. }
  639. }
  640. }
  641. U_CDECL_BEGIN
  642. static const char * U_CALLCONV
  643. _ISO2022getName(const UConverter* cnv){
  644. if(cnv->extraInfo){
  645. UConverterDataISO2022* myData= (UConverterDataISO2022*)cnv->extraInfo;
  646. return myData->name;
  647. }
  648. return nullptr;
  649. }
  650. U_CDECL_END
  651. /*************** to unicode *******************/
  652. /****************************************************************************
  653. * Recognized escape sequences are
  654. * <ESC>(B ASCII
  655. * <ESC>.A ISO-8859-1
  656. * <ESC>.F ISO-8859-7
  657. * <ESC>(J JISX-201
  658. * <ESC>(I JISX-201
  659. * <ESC>$B JISX-208
  660. * <ESC>$@ JISX-208
  661. * <ESC>$(D JISX-212
  662. * <ESC>$A GB2312
  663. * <ESC>$(C KSC5601
  664. */
/*
 * ISO-2022-JP: maps the table offset reported by getKey_2022() for a complete
 * escape sequence to the charset/state it selects. changeState_2022() indexes
 * this table with that offset; INVALID_STATE entries are reported there as
 * U_UNSUPPORTED_ESCAPE_SEQUENCE. Ten entries per row.
 */
static const int8_t nextStateToUnicodeJP[MAX_STATES_2022]= {
/*      0                1               2               3               4               5               6               7               8               9    */
    INVALID_STATE   ,INVALID_STATE   ,INVALID_STATE   ,INVALID_STATE   ,INVALID_STATE   ,SS2_STATE       ,INVALID_STATE   ,INVALID_STATE   ,INVALID_STATE   ,INVALID_STATE
    ,ASCII          ,INVALID_STATE   ,INVALID_STATE   ,INVALID_STATE   ,INVALID_STATE   ,INVALID_STATE   ,JISX201         ,HWKANA_7BIT     ,JISX201         ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE   ,JISX208         ,GB2312          ,JISX208         ,INVALID_STATE   ,INVALID_STATE   ,INVALID_STATE   ,INVALID_STATE   ,INVALID_STATE
    ,ISO8859_1      ,ISO8859_7       ,JISX208         ,INVALID_STATE   ,INVALID_STATE   ,INVALID_STATE   ,INVALID_STATE   ,KSC5601         ,JISX212         ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE   ,INVALID_STATE   ,INVALID_STATE   ,INVALID_STATE   ,INVALID_STATE   ,INVALID_STATE   ,INVALID_STATE   ,INVALID_STATE   ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE   ,INVALID_STATE   ,INVALID_STATE   ,INVALID_STATE   ,INVALID_STATE   ,INVALID_STATE   ,INVALID_STATE   ,INVALID_STATE   ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE   ,INVALID_STATE   ,INVALID_STATE   ,INVALID_STATE   ,INVALID_STATE   ,INVALID_STATE   ,INVALID_STATE   ,INVALID_STATE   ,INVALID_STATE
    ,INVALID_STATE  ,INVALID_STATE   ,INVALID_STATE   ,INVALID_STATE
};
#if !UCONFIG_ONLY_HTML_CONVERSION
/*************** to unicode *******************/
/*
 * ISO-2022-CN: maps the table offset reported by getKey_2022() for a complete
 * escape sequence to the charset/state it selects. changeState_2022() indexes
 * this table with that offset; INVALID_STATE entries are reported there as
 * U_UNSUPPORTED_ESCAPE_SEQUENCE. Ten entries per row.
 */
static const int8_t nextStateToUnicodeCN[MAX_STATES_2022]= {
/*      0                1               2               3               4               5               6               7               8               9    */
     INVALID_STATE   ,INVALID_STATE   ,INVALID_STATE   ,INVALID_STATE   ,INVALID_STATE   ,SS2_STATE       ,SS3_STATE       ,INVALID_STATE   ,INVALID_STATE   ,INVALID_STATE
    ,INVALID_STATE   ,INVALID_STATE   ,INVALID_STATE   ,INVALID_STATE   ,INVALID_STATE   ,INVALID_STATE   ,INVALID_STATE   ,INVALID_STATE   ,INVALID_STATE   ,INVALID_STATE
    ,INVALID_STATE   ,INVALID_STATE   ,INVALID_STATE   ,INVALID_STATE   ,INVALID_STATE   ,INVALID_STATE   ,INVALID_STATE   ,INVALID_STATE   ,INVALID_STATE   ,INVALID_STATE
    ,INVALID_STATE   ,INVALID_STATE   ,INVALID_STATE   ,INVALID_STATE   ,INVALID_STATE   ,INVALID_STATE   ,INVALID_STATE   ,INVALID_STATE   ,INVALID_STATE   ,INVALID_STATE
    ,INVALID_STATE   ,INVALID_STATE   ,INVALID_STATE   ,INVALID_STATE   ,INVALID_STATE   ,INVALID_STATE   ,INVALID_STATE   ,GB2312_1        ,INVALID_STATE   ,ISO_IR_165
    ,CNS_11643_1     ,CNS_11643_2     ,CNS_11643_3     ,CNS_11643_4     ,CNS_11643_5     ,CNS_11643_6     ,CNS_11643_7     ,INVALID_STATE   ,INVALID_STATE   ,INVALID_STATE
    ,INVALID_STATE   ,INVALID_STATE   ,INVALID_STATE   ,INVALID_STATE   ,INVALID_STATE   ,INVALID_STATE   ,INVALID_STATE   ,INVALID_STATE   ,INVALID_STATE   ,INVALID_STATE
    ,INVALID_STATE   ,INVALID_STATE   ,INVALID_STATE   ,INVALID_STATE
};
#endif
  690. static UCNV_TableStates_2022
  691. getKey_2022(char c,int32_t* key,int32_t* offset){
  692. int32_t togo;
  693. int32_t low = 0;
  694. int32_t hi = MAX_STATES_2022;
  695. int32_t oldmid=0;
  696. togo = normalize_esq_chars_2022[(uint8_t)c];
  697. if(togo == 0) {
  698. /* not a valid character anywhere in an escape sequence */
  699. *key = 0;
  700. *offset = 0;
  701. return INVALID_2022;
  702. }
  703. togo = (*key << 5) + togo;
  704. while (hi != low) /*binary search*/{
  705. int32_t mid = (hi+low) >> 1; /*Finds median*/
  706. if (mid == oldmid)
  707. break;
  708. if (escSeqStateTable_Key_2022[mid] > togo){
  709. hi = mid;
  710. }
  711. else if (escSeqStateTable_Key_2022[mid] < togo){
  712. low = mid;
  713. }
  714. else /*we found it*/{
  715. *key = togo;
  716. *offset = mid;
  717. return (UCNV_TableStates_2022)escSeqStateTable_Value_2022[mid];
  718. }
  719. oldmid = mid;
  720. }
  721. *key = 0;
  722. *offset = 0;
  723. return INVALID_2022;
  724. }
  725. /*runs through a state machine to determine the escape sequence - codepage correspondence
  726. */
  727. static void
  728. changeState_2022(UConverter* _this,
  729. const char** source,
  730. const char* sourceLimit,
  731. Variant2022 var,
  732. UErrorCode* err){
  733. UCNV_TableStates_2022 value;
  734. UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraInfo);
  735. uint32_t key = myData2022->key;
  736. int32_t offset = 0;
  737. int8_t initialToULength = _this->toULength;
  738. char c;
  739. value = VALID_NON_TERMINAL_2022;
  740. while (*source < sourceLimit) {
  741. c = *(*source)++;
  742. _this->toUBytes[_this->toULength++]=(uint8_t)c;
  743. value = getKey_2022(c,(int32_t *) &key, &offset);
  744. switch (value){
  745. case VALID_NON_TERMINAL_2022 :
  746. /* continue with the loop */
  747. break;
  748. case VALID_TERMINAL_2022:
  749. key = 0;
  750. goto DONE;
  751. case INVALID_2022:
  752. goto DONE;
  753. case VALID_MAYBE_TERMINAL_2022:
  754. #ifdef U_ENABLE_GENERIC_ISO_2022
  755. /* ESC ( B is ambiguous only for ISO_2022 itself */
  756. if(var == ISO_2022) {
  757. /* discard toUBytes[] for ESC ( B because this sequence is correct and complete */
  758. _this->toULength = 0;
  759. /* TODO need to indicate that ESC ( B was seen; if failure, then need to replay from source or from MBCS-style replay */
  760. /* continue with the loop */
  761. value = VALID_NON_TERMINAL_2022;
  762. break;
  763. } else
  764. #endif
  765. {
  766. /* not ISO_2022 itself, finish here */
  767. value = VALID_TERMINAL_2022;
  768. key = 0;
  769. goto DONE;
  770. }
  771. }
  772. }
  773. DONE:
  774. myData2022->key = key;
  775. if (value == VALID_NON_TERMINAL_2022) {
  776. /* indicate that the escape sequence is incomplete: key!=0 */
  777. return;
  778. } else if (value == INVALID_2022 ) {
  779. *err = U_ILLEGAL_ESCAPE_SEQUENCE;
  780. } else /* value == VALID_TERMINAL_2022 */ {
  781. switch(var){
  782. #ifdef U_ENABLE_GENERIC_ISO_2022
  783. case ISO_2022:
  784. {
  785. const char *chosenConverterName = escSeqStateTable_Result_2022[offset];
  786. if(chosenConverterName == nullptr) {
  787. /* SS2 or SS3 */
  788. *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
  789. _this->toUCallbackReason = UCNV_UNASSIGNED;
  790. return;
  791. }
  792. _this->mode = UCNV_SI;
  793. ucnv_close(myData2022->currentConverter);
  794. myData2022->currentConverter = myUConverter = ucnv_open(chosenConverterName, err);
  795. if(U_SUCCESS(*err)) {
  796. myUConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
  797. _this->mode = UCNV_SO;
  798. }
  799. break;
  800. }
  801. #endif
  802. case ISO_2022_JP:
  803. {
  804. StateEnum tempState=(StateEnum)nextStateToUnicodeJP[offset];
  805. switch(tempState) {
  806. case INVALID_STATE:
  807. *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
  808. break;
  809. case SS2_STATE:
  810. if(myData2022->toU2022State.cs[2]!=0) {
  811. if(myData2022->toU2022State.g<2) {
  812. myData2022->toU2022State.prevG=myData2022->toU2022State.g;
  813. }
  814. myData2022->toU2022State.g=2;
  815. } else {
  816. /* illegal to have SS2 before a matching designator */
  817. *err = U_ILLEGAL_ESCAPE_SEQUENCE;
  818. }
  819. break;
  820. /* case SS3_STATE: not used in ISO-2022-JP-x */
  821. case ISO8859_1:
  822. case ISO8859_7:
  823. if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
  824. *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
  825. } else {
  826. /* G2 charset for SS2 */
  827. myData2022->toU2022State.cs[2]=(int8_t)tempState;
  828. }
  829. break;
  830. default:
  831. if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
  832. *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
  833. } else {
  834. /* G0 charset */
  835. myData2022->toU2022State.cs[0]=(int8_t)tempState;
  836. }
  837. break;
  838. }
  839. }
  840. break;
  841. #if !UCONFIG_ONLY_HTML_CONVERSION
  842. case ISO_2022_CN:
  843. {
  844. StateEnum tempState=(StateEnum)nextStateToUnicodeCN[offset];
  845. switch(tempState) {
  846. case INVALID_STATE:
  847. *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
  848. break;
  849. case SS2_STATE:
  850. if(myData2022->toU2022State.cs[2]!=0) {
  851. if(myData2022->toU2022State.g<2) {
  852. myData2022->toU2022State.prevG=myData2022->toU2022State.g;
  853. }
  854. myData2022->toU2022State.g=2;
  855. } else {
  856. /* illegal to have SS2 before a matching designator */
  857. *err = U_ILLEGAL_ESCAPE_SEQUENCE;
  858. }
  859. break;
  860. case SS3_STATE:
  861. if(myData2022->toU2022State.cs[3]!=0) {
  862. if(myData2022->toU2022State.g<2) {
  863. myData2022->toU2022State.prevG=myData2022->toU2022State.g;
  864. }
  865. myData2022->toU2022State.g=3;
  866. } else {
  867. /* illegal to have SS3 before a matching designator */
  868. *err = U_ILLEGAL_ESCAPE_SEQUENCE;
  869. }
  870. break;
  871. case ISO_IR_165:
  872. if(myData2022->version==0) {
  873. *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
  874. break;
  875. }
  876. U_FALLTHROUGH;
  877. case GB2312_1:
  878. U_FALLTHROUGH;
  879. case CNS_11643_1:
  880. myData2022->toU2022State.cs[1]=(int8_t)tempState;
  881. break;
  882. case CNS_11643_2:
  883. myData2022->toU2022State.cs[2]=(int8_t)tempState;
  884. break;
  885. default:
  886. /* other CNS 11643 planes */
  887. if(myData2022->version==0) {
  888. *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
  889. } else {
  890. myData2022->toU2022State.cs[3]=(int8_t)tempState;
  891. }
  892. break;
  893. }
  894. }
  895. break;
  896. case ISO_2022_KR:
  897. if(offset==0x30){
  898. /* nothing to be done, just accept this one escape sequence */
  899. } else {
  900. *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
  901. }
  902. break;
  903. #endif // !UCONFIG_ONLY_HTML_CONVERSION
  904. default:
  905. *err = U_ILLEGAL_ESCAPE_SEQUENCE;
  906. break;
  907. }
  908. }
  909. if(U_SUCCESS(*err)) {
  910. _this->toULength = 0;
  911. } else if(*err==U_ILLEGAL_ESCAPE_SEQUENCE) {
  912. if(_this->toULength>1) {
  913. /*
  914. * Ticket 5691: consistent illegal sequences:
  915. * - We include at least the first byte (ESC) in the illegal sequence.
  916. * - If any of the non-initial bytes could be the start of a character,
  917. * we stop the illegal sequence before the first one of those.
  918. * In escape sequences, all following bytes are "printable", that is,
  919. * unless they are completely illegal (>7f in SBCS, outside 21..7e in DBCS),
  920. * they are valid single/lead bytes.
  921. * For simplicity, we always only report the initial ESC byte as the
  922. * illegal sequence and back out all other bytes we looked at.
  923. */
  924. /* Back out some bytes. */
  925. int8_t backOutDistance=_this->toULength-1;
  926. int8_t bytesFromThisBuffer=_this->toULength-initialToULength;
  927. if(backOutDistance<=bytesFromThisBuffer) {
  928. /* same as initialToULength<=1 */
  929. *source-=backOutDistance;
  930. } else {
  931. /* Back out bytes from the previous buffer: Need to replay them. */
  932. _this->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance);
  933. /* same as -(initialToULength-1) */
  934. /* preToULength is negative! */
  935. uprv_memcpy(_this->preToU, _this->toUBytes+1, -_this->preToULength);
  936. *source-=bytesFromThisBuffer;
  937. }
  938. _this->toULength=1;
  939. }
  940. } else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) {
  941. _this->toUCallbackReason = UCNV_UNASSIGNED;
  942. }
  943. }
  944. #if !UCONFIG_ONLY_HTML_CONVERSION
  945. /*Checks the characters of the buffer against valid 2022 escape sequences
  946. *if the match we return a pointer to the initial start of the sequence otherwise
  947. *we return sourceLimit
  948. */
  949. /*for 2022 looks ahead in the stream
  950. *to determine the longest possible convertible
  951. *data stream
  952. */
  953. static inline const char*
  954. getEndOfBuffer_2022(const char** source,
  955. const char* sourceLimit,
  956. UBool /*flush*/){
  957. const char* mySource = *source;
  958. #ifdef U_ENABLE_GENERIC_ISO_2022
  959. if (*source >= sourceLimit)
  960. return sourceLimit;
  961. do{
  962. if (*mySource == ESC_2022){
  963. int8_t i;
  964. int32_t key = 0;
  965. int32_t offset;
  966. UCNV_TableStates_2022 value = VALID_NON_TERMINAL_2022;
  967. /* Kludge: I could not
  968. * figure out the reason for validating an escape sequence
  969. * twice - once here and once in changeState_2022().
  970. * is it possible to have an ESC character in a ISO2022
  971. * byte stream which is valid in a code page? Is it legal?
  972. */
  973. for (i=0;
  974. (mySource+i < sourceLimit)&&(value == VALID_NON_TERMINAL_2022);
  975. i++) {
  976. value = getKey_2022(*(mySource+i), &key, &offset);
  977. }
  978. if (value > 0 || *mySource==ESC_2022)
  979. return mySource;
  980. if ((value == VALID_NON_TERMINAL_2022)&&(!flush) )
  981. return sourceLimit;
  982. }
  983. }while (++mySource < sourceLimit);
  984. return sourceLimit;
  985. #else
  986. while(mySource < sourceLimit && *mySource != ESC_2022) {
  987. ++mySource;
  988. }
  989. return mySource;
  990. #endif
  991. }
  992. #endif
  993. /* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c
  994. * any future change in _MBCSFromUChar32() function should be reflected here.
  995. * @return number of bytes in *value; negative number if fallback; 0 if no mapping
  996. */
  997. static inline int32_t
  998. MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData,
  999. UChar32 c,
  1000. uint32_t* value,
  1001. UBool useFallback,
  1002. int outputType)
  1003. {
  1004. const int32_t *cx;
  1005. const uint16_t *table;
  1006. uint32_t stage2Entry;
  1007. uint32_t myValue;
  1008. int32_t length;
  1009. const uint8_t *p;
  1010. /*
  1011. * TODO(markus): Use and require new, faster MBCS conversion table structures.
  1012. * Use internal version of ucnv_open() that verifies that the new structures are available,
  1013. * else U_INTERNAL_PROGRAM_ERROR.
  1014. */
  1015. /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
  1016. if(c<0x10000 || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
  1017. table=sharedData->mbcs.fromUnicodeTable;
  1018. stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
  1019. /* get the bytes and the length for the output */
  1020. if(outputType==MBCS_OUTPUT_2){
  1021. myValue=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
  1022. if(myValue<=0xff) {
  1023. length=1;
  1024. } else {
  1025. length=2;
  1026. }
  1027. } else /* outputType==MBCS_OUTPUT_3 */ {
  1028. p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
  1029. myValue=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
  1030. if(myValue<=0xff) {
  1031. length=1;
  1032. } else if(myValue<=0xffff) {
  1033. length=2;
  1034. } else {
  1035. length=3;
  1036. }
  1037. }
  1038. /* is this code point assigned, or do we use fallbacks? */
  1039. if((stage2Entry&(1<<(16+(c&0xf))))!=0) {
  1040. /* assigned */
  1041. *value=myValue;
  1042. return length;
  1043. } else if(FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0) {
  1044. /*
  1045. * We allow a 0 byte output if the "assigned" bit is set for this entry.
  1046. * There is no way with this data structure for fallback output
  1047. * to be a zero byte.
  1048. */
  1049. *value=myValue;
  1050. return -length;
  1051. }
  1052. }
  1053. cx=sharedData->mbcs.extIndexes;
  1054. if(cx!=nullptr) {
  1055. return ucnv_extSimpleMatchFromU(cx, c, value, useFallback);
  1056. }
  1057. /* unassigned */
  1058. return 0;
  1059. }
  1060. /* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c
  1061. * any future change in _MBCSSingleFromUChar32() function should be reflected here.
  1062. * @param retval pointer to output byte
  1063. * @return 1 roundtrip byte 0 no mapping -1 fallback byte
  1064. */
  1065. static inline int32_t
  1066. MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData,
  1067. UChar32 c,
  1068. uint32_t* retval,
  1069. UBool useFallback)
  1070. {
  1071. const uint16_t *table;
  1072. int32_t value;
  1073. /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
  1074. if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
  1075. return 0;
  1076. }
  1077. /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
  1078. table=sharedData->mbcs.fromUnicodeTable;
  1079. /* get the byte for the output */
  1080. value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);
  1081. /* is this code point assigned, or do we use fallbacks? */
  1082. *retval=(uint32_t)(value&0xff);
  1083. if(value>=0xf00) {
  1084. return 1; /* roundtrip */
  1085. } else if(useFallback ? value>=0x800 : value>=0xc00) {
  1086. return -1; /* fallback taken */
  1087. } else {
  1088. return 0; /* no mapping */
  1089. }
  1090. }
  1091. /*
  1092. * Check that the result is a 2-byte value with each byte in the range A1..FE
  1093. * (strict EUC DBCS) before accepting it and subtracting 0x80 from each byte
  1094. * to move it to the ISO 2022 range 21..7E.
  1095. * Return 0 if out of range.
  1096. */
  1097. static inline uint32_t
  1098. _2022FromGR94DBCS(uint32_t value) {
  1099. if( (uint16_t)(value - 0xa1a1) <= (0xfefe - 0xa1a1) &&
  1100. (uint8_t)(value - 0xa1) <= (0xfe - 0xa1)
  1101. ) {
  1102. return value - 0x8080; /* shift down to 21..7e byte range */
  1103. } else {
  1104. return 0; /* not valid for ISO 2022 */
  1105. }
  1106. }
#if 0 /* 5691: Call sites now check for validity. They can just += 0x8080 after that. */
/*
 * Disabled code (inside #if 0), kept for reference only.
 *
 * This method does the reverse of _2022FromGR94DBCS(). Given the 2022 code point, it returns the
 * 2 byte value that is in the range A1..FE for each byte. Otherwise it returns the 2022 code point
 * unchanged.
 */
static inline uint32_t
_2022ToGR94DBCS(uint32_t value) {
    /* add 0x80 to each byte, then apply the same range checks as _2022FromGR94DBCS() */
    uint32_t returnValue = value + 0x8080;
    if( (uint16_t)(returnValue - 0xa1a1) <= (0xfefe - 0xa1a1) &&
        (uint8_t)(returnValue - 0xa1) <= (0xfe - 0xa1)) {
        return returnValue;
    } else {
        return value;
    }
}
#endif
  1124. #ifdef U_ENABLE_GENERIC_ISO_2022
  1125. /**********************************************************************************
  1126. * ISO-2022 Converter
  1127. *
  1128. *
  1129. */
  1130. static void U_CALLCONV
  1131. T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args,
  1132. UErrorCode* err){
  1133. const char* mySourceLimit, *realSourceLimit;
  1134. const char* sourceStart;
  1135. const char16_t* myTargetStart;
  1136. UConverter* saveThis;
  1137. UConverterDataISO2022* myData;
  1138. int8_t length;
  1139. saveThis = args->converter;
  1140. myData=((UConverterDataISO2022*)(saveThis->extraInfo));
  1141. realSourceLimit = args->sourceLimit;
  1142. while (args->source < realSourceLimit) {
  1143. if(myData->key == 0) { /* are we in the middle of an escape sequence? */
  1144. /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
  1145. mySourceLimit = getEndOfBuffer_2022(&(args->source), realSourceLimit, args->flush);
  1146. if(args->source < mySourceLimit) {
  1147. if(myData->currentConverter==nullptr) {
  1148. myData->currentConverter = ucnv_open("ASCII",err);
  1149. if(U_FAILURE(*err)){
  1150. return;
  1151. }
  1152. myData->currentConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
  1153. saveThis->mode = UCNV_SO;
  1154. }
  1155. /* convert to before the ESC or until the end of the buffer */
  1156. myData->isFirstBuffer=false;
  1157. sourceStart = args->source;
  1158. myTargetStart = args->target;
  1159. args->converter = myData->currentConverter;
  1160. ucnv_toUnicode(args->converter,
  1161. &args->target,
  1162. args->targetLimit,
  1163. &args->source,
  1164. mySourceLimit,
  1165. args->offsets,
  1166. (UBool)(args->flush && mySourceLimit == realSourceLimit),
  1167. err);
  1168. args->converter = saveThis;
  1169. if (*err == U_BUFFER_OVERFLOW_ERROR) {
  1170. /* move the overflow buffer */
  1171. length = saveThis->UCharErrorBufferLength = myData->currentConverter->UCharErrorBufferLength;
  1172. myData->currentConverter->UCharErrorBufferLength = 0;
  1173. if(length > 0) {
  1174. uprv_memcpy(saveThis->UCharErrorBuffer,
  1175. myData->currentConverter->UCharErrorBuffer,
  1176. length*U_SIZEOF_UCHAR);
  1177. }
  1178. return;
  1179. }
  1180. /*
  1181. * At least one of:
  1182. * -Error while converting
  1183. * -Done with entire buffer
  1184. * -Need to write offsets or update the current offset
  1185. * (leave that up to the code in ucnv.c)
  1186. *
  1187. * or else we just stopped at an ESC byte and continue with changeState_2022()
  1188. */
  1189. if (U_FAILURE(*err) ||
  1190. (args->source == realSourceLimit) ||
  1191. (args->offsets != nullptr && (args->target != myTargetStart || args->source != sourceStart) ||
  1192. (mySourceLimit < realSourceLimit && myData->currentConverter->toULength > 0))
  1193. ) {
  1194. /* copy partial or error input for truncated detection and error handling */
  1195. if(U_FAILURE(*err)) {
  1196. length = saveThis->invalidCharLength = myData->currentConverter->invalidCharLength;
  1197. if(length > 0) {
  1198. uprv_memcpy(saveThis->invalidCharBuffer, myData->currentConverter->invalidCharBuffer, length);
  1199. }
  1200. } else {
  1201. length = saveThis->toULength = myData->currentConverter->toULength;
  1202. if(length > 0) {
  1203. uprv_memcpy(saveThis->toUBytes, myData->currentConverter->toUBytes, length);
  1204. if(args->source < mySourceLimit) {
  1205. *err = U_TRUNCATED_CHAR_FOUND; /* truncated input before ESC */
  1206. }
  1207. }
  1208. }
  1209. return;
  1210. }
  1211. }
  1212. }
  1213. sourceStart = args->source;
  1214. changeState_2022(args->converter,
  1215. &(args->source),
  1216. realSourceLimit,
  1217. ISO_2022,
  1218. err);
  1219. if (U_FAILURE(*err) || (args->source != sourceStart && args->offsets != nullptr)) {
  1220. /* let the ucnv.c code update its current offset */
  1221. return;
  1222. }
  1223. }
  1224. }
  1225. #endif
  1226. /*
  1227. * To Unicode Callback helper function
  1228. */
  1229. static void
  1230. toUnicodeCallback(UConverter *cnv,
  1231. const uint32_t sourceChar, const uint32_t targetUniChar,
  1232. UErrorCode* err){
  1233. if(sourceChar>0xff){
  1234. cnv->toUBytes[0] = (uint8_t)(sourceChar>>8);
  1235. cnv->toUBytes[1] = (uint8_t)sourceChar;
  1236. cnv->toULength = 2;
  1237. }
  1238. else{
  1239. cnv->toUBytes[0] =(char) sourceChar;
  1240. cnv->toULength = 1;
  1241. }
  1242. if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){
  1243. *err = U_INVALID_CHAR_FOUND;
  1244. }
  1245. else{
  1246. *err = U_ILLEGAL_CHAR_FOUND;
  1247. }
  1248. }
  1249. /**************************************ISO-2022-JP*************************************************/
  1250. /************************************** IMPORTANT **************************************************
  1251. * The UConverter_fromUnicode_ISO2022_JP converter does not use ucnv_fromUnicode() functions for SBCS,DBCS and
  1252. * MBCS; instead, the values are obtained directly by calling _MBCSFromUChar32().
  1253. * The converter iterates over each Unicode codepoint
  1254. * to obtain the equivalent codepoints from the codepages supported. Since the source buffer is
  1255. * processed one char at a time it would make sense to reduce the extra processing a canned converter
  1256. * would do as far as possible.
  1257. *
  1258. * If the implementation of these macros or structure of sharedData struct change in the future, make
  1259. * sure that ISO-2022 is also changed.
  1260. ***************************************************************************************************
  1261. */
  1262. /***************************************************************************************************
  1263. * Rules for ISO-2022-jp encoding
  1264. * (i) Escape sequences must be fully contained within a line; they should not
  1265. * span new lines or CRs
  1266. * (ii) If the last character on a line is represented by two bytes then an ASCII or
  1267. * JIS-Roman character escape sequence should follow before the line terminates
  1268. * (iii) If the first character on the line is represented by two bytes then a two
  1269. * byte character escape sequence should precede it
  1270. * (iv) If no escape sequence is encountered then the characters are ASCII
  1271. * (v) Latin(ISO-8859-1) and Greek(ISO-8859-7) characters must be designated to G2,
  1272. * and invoked with SS2 (ESC N).
  1273. * (vi) If there is any G0 designation in text, there must be a switch to
  1274. * ASCII or to JIS X 0201-Roman before a space character (but not
  1275. * necessarily before "ESC 4/14 2/0" or "ESC N ' '") or control
  1276. * characters such as tab or CRLF.
  1277. * (vii) Supported encodings:
  1278. * ASCII, JISX201, JISX208, JISX212, GB2312, KSC5601, ISO-8859-1,ISO-8859-7
  1279. *
  1280. * source : RFC-1554
  1281. *
  1282. * JISX201, JISX208,JISX212 : new .cnv data files created
  1283. * KSC5601 : alias to ibm-949 mapping table
  1284. * GB2312 : alias to ibm-1386 mapping table
  1285. * ISO-8859-1 : Algorithmic implemented as LATIN1 case
  1286. * ISO-8859-7 : alias to ibm-9409 mapping table
  1287. */
  1288. /* preference order of JP charsets */
  /*
   * The from-Unicode function walks this list in order to append every charset
   * that is still allowed for the converter version, and not already chosen,
   * to its candidate list, after the currently designated G0/G2 charsets.
   */
  1289. static const StateEnum jpCharsetPref[]={
  1290. ASCII,
  1291. JISX201,
  1292. ISO8859_1,
  1293. JISX208,
  1294. ISO8859_7,
  1295. JISX212,
  1296. GB2312,
  1297. KSC5601,
  1298. HWKANA_7BIT
  1299. };
  1300. /*
  1301. * The escape sequences must be in order of the enum constants like JISX201 = 3,
  1302. * not in order of jpCharsetPref[]!
  1303. */
  /*
   * Indexed by charset constant. The from-Unicode function copies
   * escSeqCharsLen[cs] bytes of escSeqChars[cs] to the output whenever it has to
   * switch to charset cs. Each row is 6 bytes wide, which leaves room for the
   * longest (4-byte) sequence plus the string terminator.
   */
  1304. static const char escSeqChars[][6] ={
  1305. "\x1B\x28\x42", /* <ESC>(B ASCII */
  1306. "\x1B\x2E\x41", /* <ESC>.A ISO-8859-1 */
  1307. "\x1B\x2E\x46", /* <ESC>.F ISO-8859-7 */
  1308. "\x1B\x28\x4A", /* <ESC>(J JISX-201 */
  1309. "\x1B\x24\x42", /* <ESC>$B JISX-208 */
  1310. "\x1B\x24\x28\x44", /* <ESC>$(D JISX-212 */
  1311. "\x1B\x24\x41", /* <ESC>$A GB2312 */
  1312. "\x1B\x24\x28\x43", /* <ESC>$(C KSC5601 */
  1313. "\x1B\x28\x49" /* <ESC>(I HWKANA_7BIT */
  1314. };
  /*
   * Byte length of each escape sequence in escSeqChars[], in the same
   * (charset constant) order. The two tables must be kept in sync.
   */
  1315. static const int8_t escSeqCharsLen[] ={
  1316. 3, /* length of <ESC>(B ASCII */
  1317. 3, /* length of <ESC>.A ISO-8859-1 */
  1318. 3, /* length of <ESC>.F ISO-8859-7 */
  1319. 3, /* length of <ESC>(J JISX-201 */
  1320. 3, /* length of <ESC>$B JISX-208 */
  1321. 4, /* length of <ESC>$(D JISX-212 */
  1322. 3, /* length of <ESC>$A GB2312 */
  1323. 4, /* length of <ESC>$(C KSC5601 */
  1324. 3 /* length of <ESC>(I HWKANA_7BIT */
  1325. };
  1326. /*
  1327. * The iteration over various code pages works this way:
  1328. * i) Get the currentState from myConverterData->currentState
  1329. * ii) Check if the character is mapped to a valid character in the currentState
  1330. * Yes -> a) set the initIterState to currentState
  1331. * b) remain in this state until an invalid character is found
  1332. * No -> a) go to the next code page and find the character
  1333. * iii) Before changing the state, increment the current state and check whether it
  1334. * is equal to initIterState
  1335. * Yes -> A character that cannot be represented in any of the supported encodings
  1336. * break and return a U_INVALID_CHAR_FOUND error
  1337. * No -> Continue and find the character in next code page
  1338. *
  1339. *
  1340. * TODO: Implement a priority technique where the users are allowed to set the priority of code pages
  1341. */
  /*
   * JIS X 0201 (7-bit half) to Unicode for bytes 00..7F.
   * Only two positions differ from the identity mapping:
   * 5C becomes U+00A5 and 7E becomes U+203E. Every other value is returned as is.
   */
  static inline uint32_t
  jisx201ToU(uint32_t value) {
      switch(value) {
      case 0x5c:
          return 0xa5;
      case 0x7e:
          return 0x203e;
      default:
          return value;
      }
  }
  /*
   * Unicode to JIS X 0201 (7-bit half), result in 00..7F.
   * U+00A5 maps to 5C and U+203E maps to 7E; U+0000..U+007F map to themselves
   * except for U+005C and U+007E, whose byte positions are taken by the two
   * characters above. Everything unmappable yields U+FFFE.
   */
  static inline uint32_t
  jisx201FromU(uint32_t value) {
      switch(value) {
      case 0xa5:
          return 0x5c;
      case 0x203e:
          return 0x7e;
      case 0x5c:
      case 0x7e:
          return 0xfffe;
      default:
          return (value <= 0x7f) ? value : 0xfffe;
      }
  }
  /*
   * Shift-JIS to JIS X 0208.
   * Expects a valid Shift-JIS byte pair (lead byte in bits 15..8, trail byte in
   * bits 7..0) and converts it to the corresponding pair of 21..7E bytes.
   * Returns 0 for pairs above 0xEFFC, which lie beyond JIS X 0208.
   */
  static inline uint32_t
  _2022FromSJIS(uint32_t value) {
      if(value > 0xEFFC) {
          return 0; /* beyond JIS X 0208 */
      }

      const uint8_t trailByte = (uint8_t)value;

      /* lead byte, kept in bits 15..8: remove the Shift-JIS bias, then double it */
      uint32_t lead = value & 0xff00;
      lead -= (lead <= 0x9f00) ? 0x7000 : 0xb000; /* 81..9F vs. E0..EF */
      lead <<= 1;

      if(trailByte > 0x9e) {
          /* trail 9F..FC: even JIS lead byte */
          return lead | (uint32_t)(trailByte - 0x7e);
      }

      /* trail up to 9E: odd JIS lead byte; Shift-JIS skips 7F in the trail range */
      lead -= 0x100;
      const int trailBias = (trailByte <= 0x7e) ? 0x1f : 0x20;
      return lead | (uint32_t)(trailByte - trailBias);
  }
  /*
   * JIS X 0208 to Shift-JIS.
   * c1/c2 are the two JIS bytes, expected in 21..7E; the Shift-JIS pair is
   * written to bytes[0..1].
   * A byte that is out of range is turned into 0 where necessary so that the
   * result is not valid Shift-JIS and the converter catches it. Some invalid
   * inputs already produce equally invalid Shift-JIS bytes and are not tested
   * explicitly.
   */
  static inline void
  _2022ToSJIS(uint8_t c1, uint8_t c2, char bytes[2]) {
      const bool oddLead = (c1 & 1) != 0;

      /* trail byte: 0 (invalid) unless c2 falls into an accepted range */
      uint8_t sjisTrail = 0;
      if(oddLead) {
          if(c2 <= 0x5f) {
              sjisTrail = (uint8_t)(c2 + 0x1f);
          } else if(c2 <= 0x7e) {
              sjisTrail = (uint8_t)(c2 + 0x20);
          }
      } else if((uint8_t)(c2 - 0x21) <= ((0x7e) - 0x21)) {
          sjisTrail = (uint8_t)(c2 + 0x7e);
      }

      /* lead byte: round odd values up, halve, then add the Shift-JIS bias */
      const uint8_t roundedLead = oddLead ? (uint8_t)(c1 + 1) : c1;
      const uint8_t halfLead = (uint8_t)(roundedLead >> 1);
      uint8_t sjisLead = 0; /* invalid unless set below */
      if(halfLead <= 0x2f) {
          sjisLead = (uint8_t)(halfLead + 0x70);
      } else if(halfLead <= 0x3f) {
          sjisLead = (uint8_t)(halfLead + 0xb0);
      }

      bytes[0] = (char)sjisLead;
      bytes[1] = (char)sjisTrail;
  }
  1436. /*
  1437. * JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width (DBCS)
  1438. * Katakana.
  1439. * Now that we use a Shift-JIS table for JIS X 0208 we need to hardcode these fallbacks
  1440. * because Shift-JIS roundtrips half-width Katakana to single bytes.
  1441. * These were the only fallbacks in ICU's jisx-208.ucm file.
  1442. */
  /*
   * Indexed by (code point - HWKANA_START); each entry is the two-byte
   * JIS X 0208 code used as the fallback for that half-width code point.
   */
  1443. static const uint16_t hwkana_fb[HWKANA_END - HWKANA_START + 1] = {
  1444. 0x2123, /* U+FF61 */
  1445. 0x2156, /* U+FF62 */
  1446. 0x2157, /* U+FF63 */
  1447. 0x2122, /* U+FF64 */
  1448. 0x2126, /* U+FF65 */
  1449. 0x2572, /* U+FF66 */
  1450. 0x2521, /* U+FF67 */
  1451. 0x2523, /* U+FF68 */
  1452. 0x2525, /* U+FF69 */
  1453. 0x2527, /* U+FF6A */
  1454. 0x2529, /* U+FF6B */
  1455. 0x2563, /* U+FF6C */
  1456. 0x2565, /* U+FF6D */
  1457. 0x2567, /* U+FF6E */
  1458. 0x2543, /* U+FF6F */
  1459. 0x213C, /* U+FF70 */
  1460. 0x2522, /* U+FF71 */
  1461. 0x2524, /* U+FF72 */
  1462. 0x2526, /* U+FF73 */
  1463. 0x2528, /* U+FF74 */
  1464. 0x252A, /* U+FF75 */
  1465. 0x252B, /* U+FF76 */
  1466. 0x252D, /* U+FF77 */
  1467. 0x252F, /* U+FF78 */
  1468. 0x2531, /* U+FF79 */
  1469. 0x2533, /* U+FF7A */
  1470. 0x2535, /* U+FF7B */
  1471. 0x2537, /* U+FF7C */
  1472. 0x2539, /* U+FF7D */
  1473. 0x253B, /* U+FF7E */
  1474. 0x253D, /* U+FF7F */
  1475. 0x253F, /* U+FF80 */
  1476. 0x2541, /* U+FF81 */
  1477. 0x2544, /* U+FF82 */
  1478. 0x2546, /* U+FF83 */
  1479. 0x2548, /* U+FF84 */
  1480. 0x254A, /* U+FF85 */
  1481. 0x254B, /* U+FF86 */
  1482. 0x254C, /* U+FF87 */
  1483. 0x254D, /* U+FF88 */
  1484. 0x254E, /* U+FF89 */
  1485. 0x254F, /* U+FF8A */
  1486. 0x2552, /* U+FF8B */
  1487. 0x2555, /* U+FF8C */
  1488. 0x2558, /* U+FF8D */
  1489. 0x255B, /* U+FF8E */
  1490. 0x255E, /* U+FF8F */
  1491. 0x255F, /* U+FF90 */
  1492. 0x2560, /* U+FF91 */
  1493. 0x2561, /* U+FF92 */
  1494. 0x2562, /* U+FF93 */
  1495. 0x2564, /* U+FF94 */
  1496. 0x2566, /* U+FF95 */
  1497. 0x2568, /* U+FF96 */
  1498. 0x2569, /* U+FF97 */
  1499. 0x256A, /* U+FF98 */
  1500. 0x256B, /* U+FF99 */
  1501. 0x256C, /* U+FF9A */
  1502. 0x256D, /* U+FF9B */
  1503. 0x256F, /* U+FF9C */
  1504. 0x2573, /* U+FF9D */
  1505. 0x212B, /* U+FF9E */
  1506. 0x212C /* U+FF9F */
  1507. };
  /*
   * from-Unicode conversion for the ISO-2022-JP family, with optional offsets.
   *
   * Reads UTF-16 code units from args->source and writes bytes to args->target.
   * For every code point the charsets in choices[] are tried in order
   * (half-width Katakana first for versions 3 and 4, then the current G0 charset,
   * the current G2 charset if any, then the remaining allowed charsets in
   * jpCharsetPref[] order). A roundtrip mapping ends the search; a fallback
   * mapping is remembered but the search continues for a roundtrip.
   * Before the data bytes, any needed SI, designation escape sequence, SO or
   * SS2 (ESC N) is written, and the from-Unicode state is updated.
   *
   * On return args->source and args->target reflect what was consumed/written.
   * *err is set to
   *   U_ILLEGAL_CHAR_FOUND    for an unpaired surrogate or an SO/SI/ESC in the input,
   *   U_INVALID_CHAR_FOUND    for a code point that no candidate charset can encode,
   *   U_BUFFER_OVERFLOW_ERROR when the target is full while input remains;
   * in the first two cases, and for a lead surrogate at the very end of the
   * input, the code point is stored in cnv->fromUChar32.
   *
   * When flushing at the end of the input with no pending code point, the
   * output is switched back to ASCII in G0 (see the block after the main loop).
   */
  1508. static void U_CALLCONV
  1509. UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err) {
  1510. UConverter *cnv = args->converter;
  1511. UConverterDataISO2022 *converterData;
  1512. ISO2022State *pFromU2022State;
  1513. uint8_t *target = (uint8_t *) args->target;
  1514. const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
  1515. const char16_t* source = args->source;
  1516. const char16_t* sourceLimit = args->sourceLimit;
  1517. int32_t* offsets = args->offsets;
  1518. UChar32 sourceChar;
  /* staging area for all bytes produced for one code point (shift, escape sequence, data bytes) */
  1519. char buffer[8];
  1520. int32_t len, outLen;
  /* candidate charsets for the current code point, in the order in which they are tried */
  1521. int8_t choices[10];
  1522. int32_t choiceCount;
  1523. uint32_t targetValue = 0;
  1524. UBool useFallback;
  1525. int32_t i;
  /* cs: charset chosen for the current code point; g: the G0..G2 set it is output through */
  1526. int8_t cs, g;
  1527. /* set up the state */
  1528. converterData = (UConverterDataISO2022*)cnv->extraInfo;
  1529. pFromU2022State = &converterData->fromU2022State;
  /* choiceCount==0 means that choices[] must be (re)built before its next use */
  1530. choiceCount = 0;
  1531. /* check if the last codepoint of previous buffer was a lead surrogate*/
  /* if so, jump into the loop body to look for its trail surrogate (needs room in the target) */
  1532. if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
  1533. goto getTrail;
  1534. }
  1535. while(source < sourceLimit) {
  1536. if(target < targetLimit) {
  1537. sourceChar = *(source++);
  1538. /*check if the char is a First surrogate*/
  1539. if(U16_IS_SURROGATE(sourceChar)) {
  1540. if(U16_IS_SURROGATE_LEAD(sourceChar)) {
  1541. getTrail:
  1542. /*look ahead to find the trail surrogate*/
  1543. if(source < sourceLimit) {
  1544. /* test the following code unit */
  1545. char16_t trail=(char16_t) *source;
  1546. if(U16_IS_TRAIL(trail)) {
  1547. source++;
  1548. sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
  1549. cnv->fromUChar32=0x00;
  1550. /* convert this supplementary code point */
  1551. /* exit this condition tree */
  1552. } else {
  1553. /* this is an unmatched lead code unit (1st surrogate) */
  1554. /* callback(illegal) */
  1555. *err=U_ILLEGAL_CHAR_FOUND;
  1556. cnv->fromUChar32=sourceChar;
  1557. break;
  1558. }
  1559. } else {
  1560. /* no more input */
  /* keep the lead surrogate so that the next call can resume at getTrail */
  1561. cnv->fromUChar32=sourceChar;
  1562. break;
  1563. }
  1564. } else {
  1565. /* this is an unmatched trail code unit (2nd surrogate) */
  1566. /* callback(illegal) */
  1567. *err=U_ILLEGAL_CHAR_FOUND;
  1568. cnv->fromUChar32=sourceChar;
  1569. break;
  1570. }
  1571. }
  1572. /* do not convert SO/SI/ESC */
  1573. if(IS_2022_CONTROL(sourceChar)) {
  1574. /* callback(illegal) */
  1575. *err=U_ILLEGAL_CHAR_FOUND;
  1576. cnv->fromUChar32=sourceChar;
  1577. break;
  1578. }
  1579. /* do the conversion */
  /* build choices[] only if it was invalidated (initially, after a designation change, after CR/LF) */
  1580. if(choiceCount == 0) {
  1581. uint16_t csm;
  1582. /*
  1583. * The csm variable keeps track of which charsets are allowed
  1584. * and not used yet while building the choices[].
  1585. */
  1586. csm = jpCharsetMasks[converterData->version];
  1587. choiceCount = 0;
  1588. /* JIS7/8: try single-byte half-width Katakana before JISX208 */
  1589. if(converterData->version == 3 || converterData->version == 4) {
  1590. choices[choiceCount++] = (int8_t)HWKANA_7BIT;
  1591. }
  1592. /* Do not try single-byte half-width Katakana for other versions. */
  1593. csm &= ~CSM(HWKANA_7BIT);
  1594. /* try the current G0 charset */
  1595. choices[choiceCount++] = cs = pFromU2022State->cs[0];
  1596. csm &= ~CSM(cs);
  1597. /* try the current G2 charset */
  1598. if((cs = pFromU2022State->cs[2]) != 0) {
  1599. choices[choiceCount++] = cs;
  1600. csm &= ~CSM(cs);
  1601. }
  1602. /* try all the other possible charsets */
  1603. for(i = 0; i < UPRV_LENGTHOF(jpCharsetPref); ++i) {
  1604. cs = (int8_t)jpCharsetPref[i];
  1605. if(CSM(cs) & csm) {
  1606. choices[choiceCount++] = cs;
  1607. csm &= ~CSM(cs);
  1608. }
  1609. }
  1610. }
  1611. cs = g = 0;
  1612. /*
  1613. * len==0: no mapping found yet
  1614. * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
  1615. * len>0: found a roundtrip result, done
  1616. */
  1617. len = 0;
  1618. /*
  1619. * We will turn off useFallback after finding a fallback,
  1620. * but we still get fallbacks from PUA code points as usual.
  1621. * Therefore, we will also need to check that we don't overwrite
  1622. * an early fallback with a later one.
  1623. */
  1624. useFallback = cnv->useFallback;
  /* each case that finds a mapping sets targetValue, len, cs and g */
  1625. for(i = 0; i < choiceCount && len <= 0; ++i) {
  1626. uint32_t value;
  1627. int32_t len2;
  1628. int8_t cs0 = choices[i];
  1629. switch(cs0) {
  1630. case ASCII:
  1631. if(sourceChar <= 0x7f) {
  1632. targetValue = (uint32_t)sourceChar;
  1633. len = 1;
  1634. cs = cs0;
  1635. g = 0;
  1636. }
  1637. break;
  1638. case ISO8859_1:
  /* algorithmic: subtract 0x80 and output through G2 */
  1639. if(GR96_START <= sourceChar && sourceChar <= GR96_END) {
  1640. targetValue = (uint32_t)sourceChar - 0x80;
  1641. len = 1;
  1642. cs = cs0;
  1643. g = 2;
  1644. }
  1645. break;
  1646. case HWKANA_7BIT:
  1647. if((uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
  1648. if(converterData->version==3) {
  1649. /* JIS7: use G1 (SO) */
  1650. /* Shift U+FF61..U+FF9F to bytes 21..5F. */
  1651. targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0x21));
  1652. len = 1;
  1653. pFromU2022State->cs[1] = cs = cs0; /* do not output an escape sequence */
  1654. g = 1;
  1655. } else if(converterData->version==4) {
  1656. /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */
  1657. /* Shift U+FF61..U+FF9F to bytes A1..DF. */
  1658. targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0xa1));
  1659. len = 1;
  1660. cs = pFromU2022State->cs[0];
  1661. if(IS_JP_DBCS(cs)) {
  1662. /* switch from a DBCS charset to JISX201 */
  1663. cs = (int8_t)JISX201;
  1664. }
  1665. /* else stay in the current G0 charset */
  1666. g = 0;
  1667. }
  1668. /* else do not use HWKANA_7BIT with other versions */
  1669. }
  1670. break;
  1671. case JISX201:
  1672. /* G0 SBCS */
  1673. value = jisx201FromU(sourceChar);
  1674. if(value <= 0x7f) {
  1675. targetValue = value;
  1676. len = 1;
  1677. cs = cs0;
  1678. g = 0;
  1679. useFallback = false;
  1680. }
  1681. break;
  1682. case JISX208:
  1683. /* G0 DBCS from Shift-JIS table */
  1684. len2 = MBCS_FROM_UCHAR32_ISO2022(
  1685. converterData->myConverterArray[cs0],
  1686. sourceChar, &value,
  1687. useFallback, MBCS_OUTPUT_2);
  1688. if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */
  /* the table yields Shift-JIS; 0 means the pair is outside JIS X 0208 */
  1689. value = _2022FromSJIS(value);
  1690. if(value != 0) {
  1691. targetValue = value;
  1692. len = len2;
  1693. cs = cs0;
  1694. g = 0;
  1695. useFallback = false;
  1696. }
  1697. } else if(len == 0 && useFallback &&
  1698. (uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
  /* hardcoded fallback from half-width Katakana to JIS X 0208 */
  1699. targetValue = hwkana_fb[sourceChar - HWKANA_START];
  1700. len = -2;
  1701. cs = cs0;
  1702. g = 0;
  1703. useFallback = false;
  1704. }
  1705. break;
  1706. case ISO8859_7:
  1707. /* G0 SBCS forced to 7-bit output */
  1708. len2 = MBCS_SINGLE_FROM_UCHAR32(
  1709. converterData->myConverterArray[cs0],
  1710. sourceChar, &value,
  1711. useFallback);
  /* accept a mapping into the GR96 range, but never replace an earlier fallback with another fallback */
  1712. if(len2 != 0 && !(len2 < 0 && len != 0) && GR96_START <= value && value <= GR96_END) {
  1713. targetValue = value - 0x80;
  1714. len = len2;
  1715. cs = cs0;
  1716. g = 2;
  1717. useFallback = false;
  1718. }
  1719. break;
  1720. default:
  1721. /* G0 DBCS */
  1722. len2 = MBCS_FROM_UCHAR32_ISO2022(
  1723. converterData->myConverterArray[cs0],
  1724. sourceChar, &value,
  1725. useFallback, MBCS_OUTPUT_2);
  1726. if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */
  1727. if(cs0 == KSC5601) {
  1728. /*
  1729. * Check for valid bytes for the encoding scheme.
  1730. * This is necessary because the sub-converter (windows-949)
  1731. * has a broader encoding scheme than is valid for 2022.
  1732. */
  1733. value = _2022FromGR94DBCS(value);
  1734. if(value == 0) {
  1735. break;
  1736. }
  1737. }
  1738. targetValue = value;
  1739. len = len2;
  1740. cs = cs0;
  1741. g = 0;
  1742. useFallback = false;
  1743. }
  1744. break;
  1745. }
  1746. }
  1747. if(len != 0) {
  1748. if(len < 0) {
  1749. len = -len; /* fallback */
  1750. }
  1751. outLen = 0; /* count output bytes */
  1752. /* write SI if necessary (only for JIS7) */
  1753. if(pFromU2022State->g == 1 && g == 0) {
  1754. buffer[outLen++] = UCNV_SI;
  1755. pFromU2022State->g = 0;
  1756. }
  1757. /* write the designation sequence if necessary */
  1758. if(cs != pFromU2022State->cs[g]) {
  1759. int32_t escLen = escSeqCharsLen[cs];
  1760. uprv_memcpy(buffer + outLen, escSeqChars[cs], escLen);
  1761. outLen += escLen;
  1762. pFromU2022State->cs[g] = cs;
  1763. /* invalidate the choices[] */
  1764. choiceCount = 0;
  1765. }
  1766. /* write the shift sequence if necessary */
  1767. if(g != pFromU2022State->g) {
  1768. switch(g) {
  1769. /* case 0 handled before writing escapes */
  1770. case 1:
  1771. buffer[outLen++] = UCNV_SO;
  1772. pFromU2022State->g = 1;
  1773. break;
  1774. default: /* case 2 */
  /* SS2 (ESC N); pFromU2022State->g is left unchanged here */
  1775. buffer[outLen++] = 0x1b;
  1776. buffer[outLen++] = 0x4e;
  1777. break;
  1778. /* no case 3: no SS3 in ISO-2022-JP-x */
  1779. }
  1780. }
  1781. /* write the output bytes */
  1782. if(len == 1) {
  1783. buffer[outLen++] = (char)targetValue;
  1784. } else /* len == 2 */ {
  1785. buffer[outLen++] = (char)(targetValue >> 8);
  1786. buffer[outLen++] = (char)targetValue;
  1787. }
  1788. } else {
  1789. /*
  1790. * if we cannot find the character after checking all codepages
  1791. * then this is an error
  1792. */
  1793. *err = U_INVALID_CHAR_FOUND;
  1794. cnv->fromUChar32=sourceChar;
  1795. break;
  1796. }
  1797. if(sourceChar == CR || sourceChar == LF) {
  1798. /* reset the G2 state at the end of a line (conversion got us into ASCII or JISX201 already) */
  1799. pFromU2022State->cs[2] = 0;
  1800. choiceCount = 0;
  1801. }
  1802. /* output outLen>0 bytes in buffer[] */
  /* fast paths for 1 and 2 bytes; a single byte always fits because target<targetLimit was checked above */
  1803. if(outLen == 1) {
  1804. *target++ = buffer[0];
  1805. if(offsets) {
  1806. *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
  1807. }
  1808. } else if(outLen == 2 && (target + 2) <= targetLimit) {
  1809. *target++ = buffer[0];
  1810. *target++ = buffer[1];
  1811. if(offsets) {
  1812. int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
  1813. *offsets++ = sourceIndex;
  1814. *offsets++ = sourceIndex;
  1815. }
  1816. } else {
  /* general case; presumably this also deals with output that does not fit into the target -- confirm in fromUWriteUInt8() */
  1817. fromUWriteUInt8(
  1818. cnv,
  1819. buffer, outLen,
  1820. &target, (const char *)targetLimit,
  1821. &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
  1822. err);
  1823. if(U_FAILURE(*err)) {
  1824. break;
  1825. }
  1826. }
  1827. } /* end if(target < targetLimit) */
  1828. else{
  1829. *err =U_BUFFER_OVERFLOW_ERROR;
  1830. break;
  1831. }
  1832. }/* end while(source < sourceLimit) */
  1833. /*
  1834. * the end of the input stream and detection of truncated input
  1835. * are handled by the framework, but for ISO-2022-JP conversion
  1836. * we need to be in ASCII mode at the very end
  1837. *
  1838. * conditions:
  1839. * successful
  1840. * in SO mode or not in ASCII mode
  1841. * end of input and no truncated input
  1842. */
  1843. if( U_SUCCESS(*err) &&
  1844. (pFromU2022State->g!=0 || pFromU2022State->cs[0]!=ASCII) &&
  1845. args->flush && source>=sourceLimit && cnv->fromUChar32==0
  1846. ) {
  1847. int32_t sourceIndex;
  1848. outLen = 0;
  1849. if(pFromU2022State->g != 0) {
  1850. buffer[outLen++] = UCNV_SI;
  1851. pFromU2022State->g = 0;
  1852. }
  1853. if(pFromU2022State->cs[0] != ASCII) {
  1854. int32_t escLen = escSeqCharsLen[ASCII];
  1855. uprv_memcpy(buffer + outLen, escSeqChars[ASCII], escLen);
  1856. outLen += escLen;
  1857. pFromU2022State->cs[0] = (int8_t)ASCII;
  1858. }
  1859. /* get the source index of the last input character */
  1860. /*
  1861. * TODO this would be simpler and more reliable if we used a pair
  1862. * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
  1863. * so that we could simply use the prevSourceIndex here;
  1864. * this code gives an incorrect result for the rare case of an unmatched
  1865. * trail surrogate that is alone in the last buffer of the text stream
  1866. */
  1867. sourceIndex=(int32_t)(source-args->source);
  1868. if(sourceIndex>0) {
  1869. --sourceIndex;
  /* step back once more if the last code unit is a trail surrogate (see the TODO above for the sourceIndex==0 case) */
  1870. if( U16_IS_TRAIL(args->source[sourceIndex]) &&
  1871. (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
  1872. ) {
  1873. --sourceIndex;
  1874. }
  1875. } else {
  /* nothing was consumed in this call: there is no source index to report */
  1876. sourceIndex=-1;
  1877. }
  1878. fromUWriteUInt8(
  1879. cnv,
  1880. buffer, outLen,
  1881. &target, (const char *)targetLimit,
  1882. &offsets, sourceIndex,
  1883. err);
  1884. }
  1885. /*save the state and return */
  1886. args->source = source;
  1887. args->target = (char*)target;
  1888. }
  1889. /*************** to unicode *******************/
  1890. static void U_CALLCONV
  1891. UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
  1892. UErrorCode* err){
  1893. char tempBuf[2];
  1894. const char *mySource = (char *) args->source;
  1895. char16_t *myTarget = args->target;
  1896. const char *mySourceLimit = args->sourceLimit;
  1897. uint32_t targetUniChar = 0x0000;
  1898. uint32_t mySourceChar = 0x0000;
  1899. uint32_t tmpSourceChar = 0x0000;
  1900. UConverterDataISO2022* myData;
  1901. ISO2022State *pToU2022State;
  1902. StateEnum cs;
  1903. myData=(UConverterDataISO2022*)(args->converter->extraInfo);
  1904. pToU2022State = &myData->toU2022State;
  1905. if(myData->key != 0) {
  1906. /* continue with a partial escape sequence */
  1907. goto escape;
  1908. } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
  1909. /* continue with a partial double-byte character */
  1910. mySourceChar = args->converter->toUBytes[0];
  1911. args->converter->toULength = 0;
  1912. cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
  1913. targetUniChar = missingCharMarker;
  1914. goto getTrailByte;
  1915. }
  1916. while(mySource < mySourceLimit){
  1917. targetUniChar =missingCharMarker;
  1918. if(myTarget < args->targetLimit){
  1919. mySourceChar= (unsigned char) *mySource++;
  1920. switch(mySourceChar) {
  1921. case UCNV_SI:
  1922. if(myData->version==3) {
  1923. pToU2022State->g=0;
  1924. continue;
  1925. } else {
  1926. /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
  1927. myData->isEmptySegment = false; /* reset this, we have a different error */
  1928. break;
  1929. }
  1930. case UCNV_SO:
  1931. if(myData->version==3) {
  1932. /* JIS7: switch to G1 half-width Katakana */
  1933. pToU2022State->cs[1] = (int8_t)HWKANA_7BIT;
  1934. pToU2022State->g=1;
  1935. continue;
  1936. } else {
  1937. /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
  1938. myData->isEmptySegment = false; /* reset this, we have a different error */
  1939. break;
  1940. }
  1941. case ESC_2022:
  1942. mySource--;
  1943. escape:
  1944. {
  1945. const char * mySourceBefore = mySource;
  1946. int8_t toULengthBefore = args->converter->toULength;
  1947. changeState_2022(args->converter,&(mySource),
  1948. mySourceLimit, ISO_2022_JP,err);
  1949. /* If in ISO-2022-JP only and we successfully completed an escape sequence, but previous segment was empty, create an error */
  1950. if(myData->version==0 && myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
  1951. *err = U_ILLEGAL_ESCAPE_SEQUENCE;
  1952. args->converter->toUCallbackReason = UCNV_IRREGULAR;
  1953. args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore));
  1954. }
  1955. }
  1956. /* invalid or illegal escape sequence */
  1957. if(U_FAILURE(*err)){
  1958. args->target = myTarget;
  1959. args->source = mySource;
  1960. myData->isEmptySegment = false; /* Reset to avoid future spurious errors */
  1961. return;
  1962. }
  1963. /* If we successfully completed an escape sequence, we begin a new segment, empty so far */
  1964. if(myData->key==0) {
  1965. myData->isEmptySegment = true;
  1966. }
  1967. continue;
  1968. /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */
  1969. case CR:
  1970. case LF:
  1971. /* automatically reset to single-byte mode */
  1972. if((StateEnum)pToU2022State->cs[0] != ASCII && (StateEnum)pToU2022State->cs[0] != JISX201) {
  1973. pToU2022State->cs[0] = (int8_t)ASCII;
  1974. }
  1975. pToU2022State->cs[2] = 0;
  1976. pToU2022State->g = 0;
  1977. U_FALLTHROUGH;
  1978. default:
  1979. /* convert one or two bytes */
  1980. myData->isEmptySegment = false;
  1981. cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
  1982. if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4 &&
  1983. !IS_JP_DBCS(cs)
  1984. ) {
  1985. /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */
  1986. targetUniChar = mySourceChar + (HWKANA_START - 0xa1);
  1987. /* return from a single-shift state to the previous one */
  1988. if(pToU2022State->g >= 2) {
  1989. pToU2022State->g=pToU2022State->prevG;
  1990. }
  1991. } else switch(cs) {
  1992. case ASCII:
  1993. if(mySourceChar <= 0x7f) {
  1994. targetUniChar = mySourceChar;
  1995. }
  1996. break;
  1997. case ISO8859_1:
  1998. if(mySourceChar <= 0x7f) {
  1999. targetUniChar = mySourceChar + 0x80;
  2000. }
  2001. /* return from a single-shift state to the previous one */
  2002. pToU2022State->g=pToU2022State->prevG;
  2003. break;
  2004. case ISO8859_7:
  2005. if(mySourceChar <= 0x7f) {
  2006. /* convert mySourceChar+0x80 to use a normal 8-bit table */
  2007. targetUniChar =
  2008. _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
  2009. myData->myConverterArray[cs],
  2010. mySourceChar + 0x80);
  2011. }
  2012. /* return from a single-shift state to the previous one */
  2013. pToU2022State->g=pToU2022State->prevG;
  2014. break;
  2015. case JISX201:
  2016. if(mySourceChar <= 0x7f) {
  2017. targetUniChar = jisx201ToU(mySourceChar);
  2018. }
  2019. break;
  2020. case HWKANA_7BIT:
  2021. if((uint8_t)(mySourceChar - 0x21) <= (0x5f - 0x21)) {
  2022. /* 7-bit halfwidth Katakana */
  2023. targetUniChar = mySourceChar + (HWKANA_START - 0x21);
  2024. }
  2025. break;
  2026. default:
  2027. /* G0 DBCS */
  2028. if(mySource < mySourceLimit) {
  2029. int leadIsOk, trailIsOk;
  2030. uint8_t trailByte;
  2031. getTrailByte:
  2032. trailByte = (uint8_t)*mySource;
  2033. /*
  2034. * Ticket 5691: consistent illegal sequences:
  2035. * - We include at least the first byte in the illegal sequence.
  2036. * - If any of the non-initial bytes could be the start of a character,
  2037. * we stop the illegal sequence before the first one of those.
  2038. *
  2039. * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
  2040. * an ESC/SO/SI, we report only the first byte as the illegal sequence.
  2041. * Otherwise we convert or report the pair of bytes.
  2042. */
  2043. leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
  2044. trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
  2045. if (leadIsOk && trailIsOk) {
  2046. ++mySource;
  2047. tmpSourceChar = (mySourceChar << 8) | trailByte;
  2048. if(cs == JISX208) {
  2049. _2022ToSJIS((uint8_t)mySourceChar, trailByte, tempBuf);
  2050. mySourceChar = tmpSourceChar;
  2051. } else {
  2052. /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. */
  2053. mySourceChar = tmpSourceChar;
  2054. if (cs == KSC5601) {
  2055. tmpSourceChar += 0x8080; /* = _2022ToGR94DBCS(tmpSourceChar) */
  2056. }
  2057. tempBuf[0] = (char)(tmpSourceChar >> 8);
  2058. tempBuf[1] = (char)(tmpSourceChar);
  2059. }
  2060. targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, false);
  2061. } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
  2062. /* report a pair of illegal bytes if the second byte is not a DBCS starter */
  2063. ++mySource;
  2064. /* add another bit so that the code below writes 2 bytes in case of error */
  2065. mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
  2066. }
  2067. } else {
  2068. args->converter->toUBytes[0] = (uint8_t)mySourceChar;
  2069. args->converter->toULength = 1;
  2070. goto endloop;
  2071. }
  2072. } /* End of inner switch */
  2073. break;
  2074. } /* End of outer switch */
  2075. if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
  2076. if(args->offsets){
  2077. args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
  2078. }
  2079. *(myTarget++)=(char16_t)targetUniChar;
  2080. }
  2081. else if(targetUniChar > missingCharMarker){
  2082. /* disassemble the surrogate pair and write to output*/
  2083. targetUniChar-=0x0010000;
  2084. *myTarget = (char16_t)(0xd800+(char16_t)(targetUniChar>>10));
  2085. if(args->offsets){
  2086. args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
  2087. }
  2088. ++myTarget;
  2089. if(myTarget< args->targetLimit){
  2090. *myTarget = (char16_t)(0xdc00+(char16_t)(targetUniChar&0x3ff));
  2091. if(args->offsets){
  2092. args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
  2093. }
  2094. ++myTarget;
  2095. }else{
  2096. args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
  2097. (char16_t)(0xdc00+(char16_t)(targetUniChar&0x3ff));
  2098. }
  2099. }
  2100. else{
  2101. /* Call the callback function*/
  2102. toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
  2103. break;
  2104. }
  2105. }
  2106. else{ /* goes with "if(myTarget < args->targetLimit)" way up near top of function */
  2107. *err =U_BUFFER_OVERFLOW_ERROR;
  2108. break;
  2109. }
  2110. }
  2111. endloop:
  2112. args->target = myTarget;
  2113. args->source = mySource;
  2114. }
  2115. #if !UCONFIG_ONLY_HTML_CONVERSION
  2116. /***************************************************************
  2117. * Rules for ISO-2022-KR encoding
  2118. * i) The KSC5601 designator sequence should appear only once in a file,
  2119. * at the beginning of a line before any KSC5601 characters. This usually
  2120. * means that it appears by itself on the first line of the file
  2121. * ii) There are only 2 shifting sequences: SO to shift into double-byte mode
  2122. * and SI to shift into single-byte mode.
  2123. */
  2124. static void U_CALLCONV
  2125. UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs* args, UErrorCode* err){
  2126. UConverter* saveConv = args->converter;
  2127. UConverterDataISO2022 *myConverterData=(UConverterDataISO2022*)saveConv->extraInfo;
  2128. args->converter=myConverterData->currentConverter;
  2129. myConverterData->currentConverter->fromUChar32 = saveConv->fromUChar32;
  2130. ucnv_MBCSFromUnicodeWithOffsets(args,err);
  2131. saveConv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
  2132. if(*err == U_BUFFER_OVERFLOW_ERROR) {
  2133. if(myConverterData->currentConverter->charErrorBufferLength > 0) {
  2134. uprv_memcpy(
  2135. saveConv->charErrorBuffer,
  2136. myConverterData->currentConverter->charErrorBuffer,
  2137. myConverterData->currentConverter->charErrorBufferLength);
  2138. }
  2139. saveConv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
  2140. myConverterData->currentConverter->charErrorBufferLength = 0;
  2141. }
  2142. args->converter=saveConv;
  2143. }
  2144. static void U_CALLCONV
  2145. UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
  2146. const char16_t *source = args->source;
  2147. const char16_t *sourceLimit = args->sourceLimit;
  2148. unsigned char *target = (unsigned char *) args->target;
  2149. unsigned char *targetLimit = (unsigned char *) args->targetLimit;
  2150. int32_t* offsets = args->offsets;
  2151. uint32_t targetByteUnit = 0x0000;
  2152. UChar32 sourceChar = 0x0000;
  2153. UBool isTargetByteDBCS;
  2154. UBool oldIsTargetByteDBCS;
  2155. UConverterDataISO2022 *converterData;
  2156. UConverterSharedData* sharedData;
  2157. UBool useFallback;
  2158. int32_t length =0;
  2159. converterData=(UConverterDataISO2022*)args->converter->extraInfo;
  2160. /* if the version is 1 then the user is requesting
  2161. * conversion with ibm-25546 pass the arguments to
  2162. * MBCS converter and return
  2163. */
  2164. if(converterData->version==1){
  2165. UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
  2166. return;
  2167. }
  2168. /* initialize data */
  2169. sharedData = converterData->currentConverter->sharedData;
  2170. useFallback = args->converter->useFallback;
  2171. isTargetByteDBCS=(UBool)args->converter->fromUnicodeStatus;
  2172. oldIsTargetByteDBCS = isTargetByteDBCS;
  2173. isTargetByteDBCS = (UBool) args->converter->fromUnicodeStatus;
  2174. if((sourceChar = args->converter->fromUChar32)!=0 && target <targetLimit) {
  2175. goto getTrail;
  2176. }
  2177. while(source < sourceLimit){
  2178. targetByteUnit = missingCharMarker;
  2179. if(target < (unsigned char*) args->targetLimit){
  2180. sourceChar = *source++;
  2181. /* do not convert SO/SI/ESC */
  2182. if(IS_2022_CONTROL(sourceChar)) {
  2183. /* callback(illegal) */
  2184. *err=U_ILLEGAL_CHAR_FOUND;
  2185. args->converter->fromUChar32=sourceChar;
  2186. break;
  2187. }
  2188. length = MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,MBCS_OUTPUT_2);
  2189. if(length < 0) {
  2190. length = -length; /* fallback */
  2191. }
  2192. /* only DBCS or SBCS characters are expected*/
  2193. /* DB characters with high bit set to 1 are expected */
  2194. if( length > 2 || length==0 ||
  2195. (length == 1 && targetByteUnit > 0x7f) ||
  2196. (length == 2 &&
  2197. ((uint16_t)(targetByteUnit - 0xa1a1) > (0xfefe - 0xa1a1) ||
  2198. (uint8_t)(targetByteUnit - 0xa1) > (0xfe - 0xa1)))
  2199. ) {
  2200. targetByteUnit=missingCharMarker;
  2201. }
  2202. if (targetByteUnit != missingCharMarker){
  2203. oldIsTargetByteDBCS = isTargetByteDBCS;
  2204. isTargetByteDBCS = (UBool)(targetByteUnit>0x00FF);
  2205. /* append the shift sequence */
  2206. if (oldIsTargetByteDBCS != isTargetByteDBCS ){
  2207. if (isTargetByteDBCS)
  2208. *target++ = UCNV_SO;
  2209. else
  2210. *target++ = UCNV_SI;
  2211. if(offsets)
  2212. *(offsets++) = (int32_t)(source - args->source-1);
  2213. }
  2214. /* write the targetUniChar to target */
  2215. if(targetByteUnit <= 0x00FF){
  2216. if( target < targetLimit){
  2217. *(target++) = (unsigned char) targetByteUnit;
  2218. if(offsets){
  2219. *(offsets++) = (int32_t)(source - args->source-1);
  2220. }
  2221. }else{
  2222. args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit);
  2223. *err = U_BUFFER_OVERFLOW_ERROR;
  2224. }
  2225. }else{
  2226. if(target < targetLimit){
  2227. *(target++) =(unsigned char) ((targetByteUnit>>8) -0x80);
  2228. if(offsets){
  2229. *(offsets++) = (int32_t)(source - args->source-1);
  2230. }
  2231. if(target < targetLimit){
  2232. *(target++) =(unsigned char) (targetByteUnit -0x80);
  2233. if(offsets){
  2234. *(offsets++) = (int32_t)(source - args->source-1);
  2235. }
  2236. }else{
  2237. args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit -0x80);
  2238. *err = U_BUFFER_OVERFLOW_ERROR;
  2239. }
  2240. }else{
  2241. args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) ((targetByteUnit>>8) -0x80);
  2242. args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit-0x80);
  2243. *err = U_BUFFER_OVERFLOW_ERROR;
  2244. }
  2245. }
  2246. }
  2247. else{
  2248. /* oops.. the code point is unassingned
  2249. * set the error and reason
  2250. */
  2251. /*check if the char is a First surrogate*/
  2252. if(U16_IS_SURROGATE(sourceChar)) {
  2253. if(U16_IS_SURROGATE_LEAD(sourceChar)) {
  2254. getTrail:
  2255. /*look ahead to find the trail surrogate*/
  2256. if(source < sourceLimit) {
  2257. /* test the following code unit */
  2258. char16_t trail=(char16_t) *source;
  2259. if(U16_IS_TRAIL(trail)) {
  2260. source++;
  2261. sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
  2262. *err = U_INVALID_CHAR_FOUND;
  2263. /* convert this surrogate code point */
  2264. /* exit this condition tree */
  2265. } else {
  2266. /* this is an unmatched lead code unit (1st surrogate) */
  2267. /* callback(illegal) */
  2268. *err=U_ILLEGAL_CHAR_FOUND;
  2269. }
  2270. } else {
  2271. /* no more input */
  2272. *err = U_ZERO_ERROR;
  2273. }
  2274. } else {
  2275. /* this is an unmatched trail code unit (2nd surrogate) */
  2276. /* callback(illegal) */
  2277. *err=U_ILLEGAL_CHAR_FOUND;
  2278. }
  2279. } else {
  2280. /* callback(unassigned) for a BMP code point */
  2281. *err = U_INVALID_CHAR_FOUND;
  2282. }
  2283. args->converter->fromUChar32=sourceChar;
  2284. break;
  2285. }
  2286. } /* end if(myTargetIndex<myTargetLength) */
  2287. else{
  2288. *err =U_BUFFER_OVERFLOW_ERROR;
  2289. break;
  2290. }
  2291. }/* end while(mySourceIndex<mySourceLength) */
  2292. /*
  2293. * the end of the input stream and detection of truncated input
  2294. * are handled by the framework, but for ISO-2022-KR conversion
  2295. * we need to be in ASCII mode at the very end
  2296. *
  2297. * conditions:
  2298. * successful
  2299. * not in ASCII mode
  2300. * end of input and no truncated input
  2301. */
  2302. if( U_SUCCESS(*err) &&
  2303. isTargetByteDBCS &&
  2304. args->flush && source>=sourceLimit && args->converter->fromUChar32==0
  2305. ) {
  2306. int32_t sourceIndex;
  2307. /* we are switching to ASCII */
  2308. isTargetByteDBCS=false;
  2309. /* get the source index of the last input character */
  2310. /*
  2311. * TODO this would be simpler and more reliable if we used a pair
  2312. * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
  2313. * so that we could simply use the prevSourceIndex here;
  2314. * this code gives an incorrect result for the rare case of an unmatched
  2315. * trail surrogate that is alone in the last buffer of the text stream
  2316. */
  2317. sourceIndex=(int32_t)(source-args->source);
  2318. if(sourceIndex>0) {
  2319. --sourceIndex;
  2320. if( U16_IS_TRAIL(args->source[sourceIndex]) &&
  2321. (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
  2322. ) {
  2323. --sourceIndex;
  2324. }
  2325. } else {
  2326. sourceIndex=-1;
  2327. }
  2328. fromUWriteUInt8(
  2329. args->converter,
  2330. SHIFT_IN_STR, 1,
  2331. &target, (const char *)targetLimit,
  2332. &offsets, sourceIndex,
  2333. err);
  2334. }
  2335. /*save the state and return */
  2336. args->source = source;
  2337. args->target = (char*)target;
  2338. args->converter->fromUnicodeStatus = (uint32_t)isTargetByteDBCS;
  2339. }
  2340. /************************ To Unicode ***************************************/
  2341. static void U_CALLCONV
  2342. UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs *args,
  2343. UErrorCode* err){
  2344. char const* sourceStart;
  2345. UConverterDataISO2022* myData=(UConverterDataISO2022*)(args->converter->extraInfo);
  2346. UConverterToUnicodeArgs subArgs;
  2347. int32_t minArgsSize;
  2348. /* set up the subconverter arguments */
  2349. if(args->size<sizeof(UConverterToUnicodeArgs)) {
  2350. minArgsSize = args->size;
  2351. } else {
  2352. minArgsSize = (int32_t)sizeof(UConverterToUnicodeArgs);
  2353. }
  2354. uprv_memcpy(&subArgs, args, minArgsSize);
  2355. subArgs.size = (uint16_t)minArgsSize;
  2356. subArgs.converter = myData->currentConverter;
  2357. /* remember the original start of the input for offsets */
  2358. sourceStart = args->source;
  2359. if(myData->key != 0) {
  2360. /* continue with a partial escape sequence */
  2361. goto escape;
  2362. }
  2363. while(U_SUCCESS(*err) && args->source < args->sourceLimit) {
  2364. /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
  2365. subArgs.source = args->source;
  2366. subArgs.sourceLimit = getEndOfBuffer_2022(&(args->source), args->sourceLimit, args->flush);
  2367. if(subArgs.source != subArgs.sourceLimit) {
  2368. /*
  2369. * get the current partial byte sequence
  2370. *
  2371. * it needs to be moved between the public and the subconverter
  2372. * so that the conversion framework, which only sees the public
  2373. * converter, can handle truncated and illegal input etc.
  2374. */
  2375. if(args->converter->toULength > 0) {
  2376. uprv_memcpy(subArgs.converter->toUBytes, args->converter->toUBytes, args->converter->toULength);
  2377. }
  2378. subArgs.converter->toULength = args->converter->toULength;
  2379. /*
  2380. * Convert up to the end of the input, or to before the next escape character.
  2381. * Does not handle conversion extensions because the preToU[] state etc.
  2382. * is not copied.
  2383. */
  2384. ucnv_MBCSToUnicodeWithOffsets(&subArgs, err);
  2385. if(args->offsets != nullptr && sourceStart != args->source) {
  2386. /* update offsets to base them on the actual start of the input */
  2387. int32_t *offsets = args->offsets;
  2388. char16_t *target = args->target;
  2389. int32_t delta = (int32_t)(args->source - sourceStart);
  2390. while(target < subArgs.target) {
  2391. if(*offsets >= 0) {
  2392. *offsets += delta;
  2393. }
  2394. ++offsets;
  2395. ++target;
  2396. }
  2397. }
  2398. args->source = subArgs.source;
  2399. args->target = subArgs.target;
  2400. args->offsets = subArgs.offsets;
  2401. /* copy input/error/overflow buffers */
  2402. if(subArgs.converter->toULength > 0) {
  2403. uprv_memcpy(args->converter->toUBytes, subArgs.converter->toUBytes, subArgs.converter->toULength);
  2404. }
  2405. args->converter->toULength = subArgs.converter->toULength;
  2406. if(*err == U_BUFFER_OVERFLOW_ERROR) {
  2407. if(subArgs.converter->UCharErrorBufferLength > 0) {
  2408. uprv_memcpy(args->converter->UCharErrorBuffer, subArgs.converter->UCharErrorBuffer,
  2409. subArgs.converter->UCharErrorBufferLength);
  2410. }
  2411. args->converter->UCharErrorBufferLength=subArgs.converter->UCharErrorBufferLength;
  2412. subArgs.converter->UCharErrorBufferLength = 0;
  2413. }
  2414. }
  2415. if (U_FAILURE(*err) || (args->source == args->sourceLimit)) {
  2416. return;
  2417. }
  2418. escape:
  2419. changeState_2022(args->converter,
  2420. &(args->source),
  2421. args->sourceLimit,
  2422. ISO_2022_KR,
  2423. err);
  2424. }
  2425. }
  2426. static void U_CALLCONV
  2427. UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
  2428. UErrorCode* err){
  2429. char tempBuf[2];
  2430. const char *mySource = ( char *) args->source;
  2431. char16_t *myTarget = args->target;
  2432. const char *mySourceLimit = args->sourceLimit;
  2433. UChar32 targetUniChar = 0x0000;
  2434. char16_t mySourceChar = 0x0000;
  2435. UConverterDataISO2022* myData;
  2436. UConverterSharedData* sharedData ;
  2437. UBool useFallback;
  2438. myData=(UConverterDataISO2022*)(args->converter->extraInfo);
  2439. if(myData->version==1){
  2440. UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
  2441. return;
  2442. }
  2443. /* initialize state */
  2444. sharedData = myData->currentConverter->sharedData;
  2445. useFallback = args->converter->useFallback;
  2446. if(myData->key != 0) {
  2447. /* continue with a partial escape sequence */
  2448. goto escape;
  2449. } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
  2450. /* continue with a partial double-byte character */
  2451. mySourceChar = args->converter->toUBytes[0];
  2452. args->converter->toULength = 0;
  2453. goto getTrailByte;
  2454. }
  2455. while(mySource< mySourceLimit){
  2456. if(myTarget < args->targetLimit){
  2457. mySourceChar= (unsigned char) *mySource++;
  2458. if(mySourceChar==UCNV_SI){
  2459. myData->toU2022State.g = 0;
  2460. if (myData->isEmptySegment) {
  2461. myData->isEmptySegment = false; /* we are handling it, reset to avoid future spurious errors */
  2462. *err = U_ILLEGAL_ESCAPE_SEQUENCE;
  2463. args->converter->toUCallbackReason = UCNV_IRREGULAR;
  2464. args->converter->toUBytes[0] = (uint8_t)mySourceChar;
  2465. args->converter->toULength = 1;
  2466. args->target = myTarget;
  2467. args->source = mySource;
  2468. return;
  2469. }
  2470. /*consume the source */
  2471. continue;
  2472. }else if(mySourceChar==UCNV_SO){
  2473. myData->toU2022State.g = 1;
  2474. myData->isEmptySegment = true; /* Begin a new segment, empty so far */
  2475. /*consume the source */
  2476. continue;
  2477. }else if(mySourceChar==ESC_2022){
  2478. mySource--;
  2479. escape:
  2480. myData->isEmptySegment = false; /* Any invalid ESC sequences will be detected separately, so just reset this */
  2481. changeState_2022(args->converter,&(mySource),
  2482. mySourceLimit, ISO_2022_KR, err);
  2483. if(U_FAILURE(*err)){
  2484. args->target = myTarget;
  2485. args->source = mySource;
  2486. return;
  2487. }
  2488. continue;
  2489. }
  2490. myData->isEmptySegment = false; /* Any invalid char errors will be detected separately, so just reset this */
  2491. if(myData->toU2022State.g == 1) {
  2492. if(mySource < mySourceLimit) {
  2493. int leadIsOk, trailIsOk;
  2494. uint8_t trailByte;
  2495. getTrailByte:
  2496. targetUniChar = missingCharMarker;
  2497. trailByte = (uint8_t)*mySource;
  2498. /*
  2499. * Ticket 5691: consistent illegal sequences:
  2500. * - We include at least the first byte in the illegal sequence.
  2501. * - If any of the non-initial bytes could be the start of a character,
  2502. * we stop the illegal sequence before the first one of those.
  2503. *
  2504. * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
  2505. * an ESC/SO/SI, we report only the first byte as the illegal sequence.
  2506. * Otherwise we convert or report the pair of bytes.
  2507. */
  2508. leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
  2509. trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
  2510. if (leadIsOk && trailIsOk) {
  2511. ++mySource;
  2512. tempBuf[0] = (char)(mySourceChar + 0x80);
  2513. tempBuf[1] = (char)(trailByte + 0x80);
  2514. targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback);
  2515. mySourceChar = (mySourceChar << 8) | trailByte;
  2516. } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
  2517. /* report a pair of illegal bytes if the second byte is not a DBCS starter */
  2518. ++mySource;
  2519. /* add another bit so that the code below writes 2 bytes in case of error */
  2520. mySourceChar = static_cast<char16_t>(0x10000 | (mySourceChar << 8) | trailByte);
  2521. }
  2522. } else {
  2523. args->converter->toUBytes[0] = (uint8_t)mySourceChar;
  2524. args->converter->toULength = 1;
  2525. break;
  2526. }
  2527. }
  2528. else if(mySourceChar <= 0x7f) {
  2529. targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySource - 1, 1, useFallback);
  2530. } else {
  2531. targetUniChar = 0xffff;
  2532. }
  2533. if(targetUniChar < 0xfffe){
  2534. if(args->offsets) {
  2535. args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
  2536. }
  2537. *(myTarget++)=(char16_t)targetUniChar;
  2538. }
  2539. else {
  2540. /* Call the callback function*/
  2541. toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
  2542. break;
  2543. }
  2544. }
  2545. else{
  2546. *err =U_BUFFER_OVERFLOW_ERROR;
  2547. break;
  2548. }
  2549. }
  2550. args->target = myTarget;
  2551. args->source = mySource;
  2552. }
  2553. /*************************** END ISO2022-KR *********************************/
  2554. /*************************** ISO-2022-CN *********************************
  2555. *
  2556. * Rules for ISO-2022-CN Encoding:
  2557. * i) The designator sequence must appear once on a line before any instance
  2558. * of character set it designates.
  2559. * ii) If two lines contain characters from the same character set, both lines
  2560. * must include the designator sequence.
  2561. * iii) Once the designator sequence is known, a shifting sequence has to be found
  2562. * to invoke the shifting
  2563. * iv) All lines start in ASCII and end in ASCII.
  2564. * v) Four shifting sequences are employed for this purpose:
  2565. *
  2566. * Sequence ASCII Eq Charsets
  2567. * ---------- ------- ---------
  2568. * SI <SI> US-ASCII
  2569. * SO <SO> CNS-11643-1992 Plane 1, GB2312, ISO-IR-165
  2570. * SS2 <ESC>N CNS-11643-1992 Plane 2
  2571. * SS3 <ESC>O CNS-11643-1992 Planes 3-7
  2572. *
  2573. * vi)
  2574. * SOdesignator : ESC "$" ")" finalchar_for_SO
  2575. * SS2designator : ESC "$" "*" finalchar_for_SS2
  2576. * SS3designator : ESC "$" "+" finalchar_for_SS3
  2577. *
  2578. * ESC $ ) A Indicates the bytes following SO are Chinese
  2579. * characters as defined in GB 2312-80, until
  2580. * another SOdesignation appears
  2581. *
  2582. *
  2583. * ESC $ ) E Indicates the bytes following SO are as defined
  2584. * in ISO-IR-165 (for details, see section 2.1),
  2585. * until another SOdesignation appears
  2586. *
  2587. * ESC $ ) G Indicates the bytes following SO are as defined
  2588. * in CNS 11643-plane-1, until another
  2589. * SOdesignation appears
  2590. *
  2591. * ESC $ * H Indicates the two bytes immediately following
  2592. * SS2 is a Chinese character as defined in CNS
  2593. * 11643-plane-2, until another SS2designation
  2594. * appears
  2595. * (Meaning <ESC>N must precede every 2 byte
  2596. * sequence.)
  2597. *
  2598. * ESC $ + I Indicates the immediate two bytes following SS3
  2599. * is a Chinese character as defined in CNS
  2600. * 11643-plane-3, until another SS3designation
  2601. * appears
  2602. * (Meaning <ESC>O must precede every 2 byte
  2603. * sequence.)
  2604. *
  2605. * ESC $ + J Indicates the immediate two bytes following SS3
  2606. * is a Chinese character as defined in CNS
  2607. * 11643-plane-4, until another SS3designation
  2608. * appears
  2609. * (In English: <ESC>O must precede every 2 byte
  2610. * sequence.)
  2611. *
  2612. * ESC $ + K Indicates the immediate two bytes following SS3
  2613. * is a Chinese character as defined in CNS
  2614. * 11643-plane-5, until another SS3designation
  2615. * appears
  2616. *
  2617. * ESC $ + L Indicates the immediate two bytes following SS3
  2618. * is a Chinese character as defined in CNS
  2619. * 11643-plane-6, until another SS3designation
  2620. * appears
  2621. *
  2622. * ESC $ + M Indicates the immediate two bytes following SS3
  2623. * is a Chinese character as defined in CNS
  2624. * 11643-plane-7, until another SS3designation
  2625. * appears
  2626. *
  2627. * As in ISO-2022-CN, each line starts in ASCII, and ends in ASCII, and
  2628. * has its own designation information before any Chinese characters
  2629. * appear
  2630. *
  2631. */
  /*
   * Designator escape sequences for ISO-2022-CN.
   * They are defined as const char arrays to make the strings truly readonly.
   */
  static const char GB_2312_80_STR[] = "\x1B\x24\x29\x41";             /* ESC $ ) A */
  static const char ISO_IR_165_STR[] = "\x1B\x24\x29\x45";             /* ESC $ ) E */
  static const char CNS_11643_1992_Plane_1_STR[] = "\x1B\x24\x29\x47"; /* ESC $ ) G */
  static const char CNS_11643_1992_Plane_2_STR[] = "\x1B\x24\x2A\x48"; /* ESC $ * H */
  static const char CNS_11643_1992_Plane_3_STR[] = "\x1B\x24\x2B\x49"; /* ESC $ + I */
  static const char CNS_11643_1992_Plane_4_STR[] = "\x1B\x24\x2B\x4A"; /* ESC $ + J */
  static const char CNS_11643_1992_Plane_5_STR[] = "\x1B\x24\x2B\x4B"; /* ESC $ + K */
  static const char CNS_11643_1992_Plane_6_STR[] = "\x1B\x24\x2B\x4C"; /* ESC $ + L */
  static const char CNS_11643_1992_Plane_7_STR[] = "\x1B\x24\x2B\x4D"; /* ESC $ + M */
  2642. /********************** ISO2022-CN Data **************************/
  2643. static const char* const escSeqCharsCN[10] ={
  2644. SHIFT_IN_STR, /* 0 ASCII */
  2645. GB_2312_80_STR, /* 1 GB2312_1 */
  2646. ISO_IR_165_STR, /* 2 ISO_IR_165 */
  2647. CNS_11643_1992_Plane_1_STR,
  2648. CNS_11643_1992_Plane_2_STR,
  2649. CNS_11643_1992_Plane_3_STR,
  2650. CNS_11643_1992_Plane_4_STR,
  2651. CNS_11643_1992_Plane_5_STR,
  2652. CNS_11643_1992_Plane_6_STR,
  2653. CNS_11643_1992_Plane_7_STR
  2654. };
  2655. static void U_CALLCONV
  2656. UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
  2657. UConverter *cnv = args->converter;
  2658. UConverterDataISO2022 *converterData;
  2659. ISO2022State *pFromU2022State;
  2660. uint8_t *target = (uint8_t *) args->target;
  2661. const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
  2662. const char16_t* source = args->source;
  2663. const char16_t* sourceLimit = args->sourceLimit;
  2664. int32_t* offsets = args->offsets;
  2665. UChar32 sourceChar;
  2666. char buffer[8];
  2667. int32_t len;
  2668. int8_t choices[3];
  2669. int32_t choiceCount;
  2670. uint32_t targetValue = 0;
  2671. UBool useFallback;
  2672. /* set up the state */
  2673. converterData = (UConverterDataISO2022*)cnv->extraInfo;
  2674. pFromU2022State = &converterData->fromU2022State;
  2675. choiceCount = 0;
  2676. /* check if the last codepoint of previous buffer was a lead surrogate*/
  2677. if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
  2678. goto getTrail;
  2679. }
  2680. while( source < sourceLimit){
  2681. if(target < targetLimit){
  2682. sourceChar = *(source++);
  2683. /*check if the char is a First surrogate*/
  2684. if(U16_IS_SURROGATE(sourceChar)) {
  2685. if(U16_IS_SURROGATE_LEAD(sourceChar)) {
  2686. getTrail:
  2687. /*look ahead to find the trail surrogate*/
  2688. if(source < sourceLimit) {
  2689. /* test the following code unit */
  2690. char16_t trail=(char16_t) *source;
  2691. if(U16_IS_TRAIL(trail)) {
  2692. source++;
  2693. sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
  2694. cnv->fromUChar32=0x00;
  2695. /* convert this supplementary code point */
  2696. /* exit this condition tree */
  2697. } else {
  2698. /* this is an unmatched lead code unit (1st surrogate) */
  2699. /* callback(illegal) */
  2700. *err=U_ILLEGAL_CHAR_FOUND;
  2701. cnv->fromUChar32=sourceChar;
  2702. break;
  2703. }
  2704. } else {
  2705. /* no more input */
  2706. cnv->fromUChar32=sourceChar;
  2707. break;
  2708. }
  2709. } else {
  2710. /* this is an unmatched trail code unit (2nd surrogate) */
  2711. /* callback(illegal) */
  2712. *err=U_ILLEGAL_CHAR_FOUND;
  2713. cnv->fromUChar32=sourceChar;
  2714. break;
  2715. }
  2716. }
  2717. /* do the conversion */
  2718. if(sourceChar <= 0x007f ){
  2719. /* do not convert SO/SI/ESC */
  2720. if(IS_2022_CONTROL(sourceChar)) {
  2721. /* callback(illegal) */
  2722. *err=U_ILLEGAL_CHAR_FOUND;
  2723. cnv->fromUChar32=sourceChar;
  2724. break;
  2725. }
  2726. /* US-ASCII */
  2727. if(pFromU2022State->g == 0) {
  2728. buffer[0] = (char)sourceChar;
  2729. len = 1;
  2730. } else {
  2731. buffer[0] = UCNV_SI;
  2732. buffer[1] = (char)sourceChar;
  2733. len = 2;
  2734. pFromU2022State->g = 0;
  2735. choiceCount = 0;
  2736. }
  2737. if(sourceChar == CR || sourceChar == LF) {
  2738. /* reset the state at the end of a line */
  2739. uprv_memset(pFromU2022State, 0, sizeof(ISO2022State));
  2740. choiceCount = 0;
  2741. }
  2742. }
  2743. else{
  2744. /* convert U+0080..U+10ffff */
  2745. int32_t i;
  2746. int8_t cs, g;
  2747. if(choiceCount == 0) {
  2748. /* try the current SO/G1 converter first */
  2749. choices[0] = pFromU2022State->cs[1];
  2750. /* default to GB2312_1 if none is designated yet */
  2751. if(choices[0] == 0) {
  2752. choices[0] = GB2312_1;
  2753. }
  2754. if(converterData->version == 0) {
  2755. /* ISO-2022-CN */
  2756. /* try the other SO/G1 converter; a CNS_11643_1 lookup may result in any plane */
  2757. if(choices[0] == GB2312_1) {
  2758. choices[1] = (int8_t)CNS_11643_1;
  2759. } else {
  2760. choices[1] = (int8_t)GB2312_1;
  2761. }
  2762. choiceCount = 2;
  2763. } else if (converterData->version == 1) {
  2764. /* ISO-2022-CN-EXT */
  2765. /* try one of the other converters */
  2766. switch(choices[0]) {
  2767. case GB2312_1:
  2768. choices[1] = (int8_t)CNS_11643_1;
  2769. choices[2] = (int8_t)ISO_IR_165;
  2770. break;
  2771. case ISO_IR_165:
  2772. choices[1] = (int8_t)GB2312_1;
  2773. choices[2] = (int8_t)CNS_11643_1;
  2774. break;
  2775. default: /* CNS_11643_x */
  2776. choices[1] = (int8_t)GB2312_1;
  2777. choices[2] = (int8_t)ISO_IR_165;
  2778. break;
  2779. }
  2780. choiceCount = 3;
  2781. } else {
  2782. choices[0] = (int8_t)CNS_11643_1;
  2783. choices[1] = (int8_t)GB2312_1;
  2784. }
  2785. }
  2786. cs = g = 0;
  2787. /*
  2788. * len==0: no mapping found yet
  2789. * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
  2790. * len>0: found a roundtrip result, done
  2791. */
  2792. len = 0;
  2793. /*
  2794. * We will turn off useFallback after finding a fallback,
  2795. * but we still get fallbacks from PUA code points as usual.
  2796. * Therefore, we will also need to check that we don't overwrite
  2797. * an early fallback with a later one.
  2798. */
  2799. useFallback = cnv->useFallback;
  2800. for(i = 0; i < choiceCount && len <= 0; ++i) {
  2801. int8_t cs0 = choices[i];
  2802. if(cs0 > 0) {
  2803. uint32_t value;
  2804. int32_t len2;
  2805. if(cs0 >= CNS_11643_0) {
  2806. len2 = MBCS_FROM_UCHAR32_ISO2022(
  2807. converterData->myConverterArray[CNS_11643],
  2808. sourceChar,
  2809. &value,
  2810. useFallback,
  2811. MBCS_OUTPUT_3);
  2812. if(len2 == 3 || (len2 == -3 && len == 0)) {
  2813. targetValue = value;
  2814. cs = (int8_t)(CNS_11643_0 + (value >> 16) - 0x80);
  2815. if(len2 >= 0) {
  2816. len = 2;
  2817. } else {
  2818. len = -2;
  2819. useFallback = false;
  2820. }
  2821. if(cs == CNS_11643_1) {
  2822. g = 1;
  2823. } else if(cs == CNS_11643_2) {
  2824. g = 2;
  2825. } else /* plane 3..7 */ if(converterData->version == 1) {
  2826. g = 3;
  2827. } else {
  2828. /* ISO-2022-CN (without -EXT) does not support plane 3..7 */
  2829. len = 0;
  2830. }
  2831. }
  2832. } else {
  2833. /* GB2312_1 or ISO-IR-165 */
  2834. U_ASSERT(cs0<UCNV_2022_MAX_CONVERTERS);
  2835. len2 = MBCS_FROM_UCHAR32_ISO2022(
  2836. converterData->myConverterArray[cs0],
  2837. sourceChar,
  2838. &value,
  2839. useFallback,
  2840. MBCS_OUTPUT_2);
  2841. if(len2 == 2 || (len2 == -2 && len == 0)) {
  2842. targetValue = value;
  2843. len = len2;
  2844. cs = cs0;
  2845. g = 1;
  2846. useFallback = false;
  2847. }
  2848. }
  2849. }
  2850. }
  2851. if(len != 0) {
  2852. len = 0; /* count output bytes; it must have been abs(len) == 2 */
  2853. /* write the designation sequence if necessary */
  2854. if(cs != pFromU2022State->cs[g]) {
  2855. if(cs < CNS_11643) {
  2856. uprv_memcpy(buffer, escSeqCharsCN[cs], 4);
  2857. } else {
  2858. U_ASSERT(cs >= CNS_11643_1);
  2859. uprv_memcpy(buffer, escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)], 4);
  2860. }
  2861. len = 4;
  2862. pFromU2022State->cs[g] = cs;
  2863. if(g == 1) {
  2864. /* changing the SO/G1 charset invalidates the choices[] */
  2865. choiceCount = 0;
  2866. }
  2867. }
  2868. /* write the shift sequence if necessary */
  2869. if(g != pFromU2022State->g) {
  2870. switch(g) {
  2871. case 1:
  2872. buffer[len++] = UCNV_SO;
  2873. /* set the new state only if it is the locking shift SO/G1, not for SS2 or SS3 */
  2874. pFromU2022State->g = 1;
  2875. break;
  2876. case 2:
  2877. buffer[len++] = 0x1b;
  2878. buffer[len++] = 0x4e;
  2879. break;
  2880. default: /* case 3 */
  2881. buffer[len++] = 0x1b;
  2882. buffer[len++] = 0x4f;
  2883. break;
  2884. }
  2885. }
  2886. /* write the two output bytes */
  2887. buffer[len++] = (char)(targetValue >> 8);
  2888. buffer[len++] = (char)targetValue;
  2889. } else {
  2890. /* if we cannot find the character after checking all codepages
  2891. * then this is an error
  2892. */
  2893. *err = U_INVALID_CHAR_FOUND;
  2894. cnv->fromUChar32=sourceChar;
  2895. break;
  2896. }
  2897. }
  2898. /* output len>0 bytes in buffer[] */
  2899. if(len == 1) {
  2900. *target++ = buffer[0];
  2901. if(offsets) {
  2902. *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
  2903. }
  2904. } else if(len == 2 && (target + 2) <= targetLimit) {
  2905. *target++ = buffer[0];
  2906. *target++ = buffer[1];
  2907. if(offsets) {
  2908. int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
  2909. *offsets++ = sourceIndex;
  2910. *offsets++ = sourceIndex;
  2911. }
  2912. } else {
  2913. fromUWriteUInt8(
  2914. cnv,
  2915. buffer, len,
  2916. &target, (const char *)targetLimit,
  2917. &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
  2918. err);
  2919. if(U_FAILURE(*err)) {
  2920. break;
  2921. }
  2922. }
  2923. } /* end if(myTargetIndex<myTargetLength) */
  2924. else{
  2925. *err =U_BUFFER_OVERFLOW_ERROR;
  2926. break;
  2927. }
  2928. }/* end while(mySourceIndex<mySourceLength) */
  2929. /*
  2930. * the end of the input stream and detection of truncated input
  2931. * are handled by the framework, but for ISO-2022-CN conversion
  2932. * we need to be in ASCII mode at the very end
  2933. *
  2934. * conditions:
  2935. * successful
  2936. * not in ASCII mode
  2937. * end of input and no truncated input
  2938. */
  2939. if( U_SUCCESS(*err) &&
  2940. pFromU2022State->g!=0 &&
  2941. args->flush && source>=sourceLimit && cnv->fromUChar32==0
  2942. ) {
  2943. int32_t sourceIndex;
  2944. /* we are switching to ASCII */
  2945. pFromU2022State->g=0;
  2946. /* get the source index of the last input character */
  2947. /*
  2948. * TODO this would be simpler and more reliable if we used a pair
  2949. * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
  2950. * so that we could simply use the prevSourceIndex here;
  2951. * this code gives an incorrect result for the rare case of an unmatched
  2952. * trail surrogate that is alone in the last buffer of the text stream
  2953. */
  2954. sourceIndex=(int32_t)(source-args->source);
  2955. if(sourceIndex>0) {
  2956. --sourceIndex;
  2957. if( U16_IS_TRAIL(args->source[sourceIndex]) &&
  2958. (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
  2959. ) {
  2960. --sourceIndex;
  2961. }
  2962. } else {
  2963. sourceIndex=-1;
  2964. }
  2965. fromUWriteUInt8(
  2966. cnv,
  2967. SHIFT_IN_STR, 1,
  2968. &target, (const char *)targetLimit,
  2969. &offsets, sourceIndex,
  2970. err);
  2971. }
  2972. /*save the state and return */
  2973. args->source = source;
  2974. args->target = (char*)target;
  2975. }
  /*
   * ISO-2022-CN to Unicode conversion, with optional offsets.
   *
   * Reads bytes from args->source up to args->sourceLimit and writes UTF-16
   * code units to args->target up to args->targetLimit. When args->offsets is
   * non-null, each written code unit also gets the source index of the
   * byte(s) it was converted from.
   *
   * State that survives between calls:
   * - myData->key != 0: an escape sequence was only partially read.
   * - converter->toUBytes[0] / toULength == 1: a double-byte lead byte was
   *   read but its trail byte was not yet available.
   * - myData->toU2022State: g (active shift state), cs[] (charset designated
   *   for each g), prevG (state to return to after a single shift).
   * - myData->isEmptySegment: set by SO, cleared by any converted byte; used
   *   to flag an SO that is followed directly by SI or by an escape sequence.
   *
   * The loop ends when the input is used up, the target is full
   * (U_BUFFER_OVERFLOW_ERROR), an escape-sequence error is found, or a
   * byte sequence has to be handed to toUnicodeCallback(). In every case
   * args->source and args->target are updated before returning.
   */
  2976. static void U_CALLCONV
  2977. UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
  2978. UErrorCode* err){
  2979. char tempBuf[3];
  2980. const char *mySource = (char *) args->source;
  2981. char16_t *myTarget = args->target;
  2982. const char *mySourceLimit = args->sourceLimit;
  2983. uint32_t targetUniChar = 0x0000;
  2984. uint32_t mySourceChar = 0x0000;
  2985. UConverterDataISO2022* myData;
  2986. ISO2022State *pToU2022State;
  2987. myData=(UConverterDataISO2022*)(args->converter->extraInfo);
  2988. pToU2022State = &myData->toU2022State;
  /* resume work left unfinished by the previous call, if any */
  2989. if(myData->key != 0) {
  2990. /* continue with a partial escape sequence */
  2991. goto escape;
  2992. } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
  2993. /* continue with a partial double-byte character */
  2994. mySourceChar = args->converter->toUBytes[0];
  2995. args->converter->toULength = 0;
  2996. targetUniChar = missingCharMarker;
  2997. goto getTrailByte;
  2998. }
  2999. while(mySource < mySourceLimit){
  3000. targetUniChar =missingCharMarker;
  3001. if(myTarget < args->targetLimit){
  3002. mySourceChar= (unsigned char) *mySource++;
  3003. switch(mySourceChar){
  /* SI: back to the g==0 state; an SO directly followed by SI is reported as an error */
  3004. case UCNV_SI:
  3005. pToU2022State->g=0;
  3006. if (myData->isEmptySegment) {
  3007. myData->isEmptySegment = false; /* we are handling it, reset to avoid future spurious errors */
  3008. *err = U_ILLEGAL_ESCAPE_SEQUENCE;
  3009. args->converter->toUCallbackReason = UCNV_IRREGULAR;
  3010. args->converter->toUBytes[0] = static_cast<uint8_t>(mySourceChar);
  3011. args->converter->toULength = 1;
  3012. args->target = myTarget;
  3013. args->source = mySource;
  3014. return;
  3015. }
  3016. continue;
  /* SO: switch to g==1, but only if a charset has been designated into cs[1] */
  3017. case UCNV_SO:
  3018. if(pToU2022State->cs[1] != 0) {
  3019. pToU2022State->g=1;
  3020. myData->isEmptySegment = true; /* Begin a new segment, empty so far */
  3021. continue;
  3022. } else {
  3023. /* illegal to have SO before a matching designator */
  3024. myData->isEmptySegment = false; /* Handling a different error, reset this to avoid future spurious errs */
  /* leaves the switch with targetUniChar still missingCharMarker, so the callback branch below runs */
  3025. break;
  3026. }
  3027. case ESC_2022:
  /* step back so that changeState_2022() starts at the ESC byte itself */
  3028. mySource--;
  3029. escape:
  3030. {
  3031. const char * mySourceBefore = mySource;
  3032. int8_t toULengthBefore = args->converter->toULength;
  3033. changeState_2022(args->converter,&(mySource),
  3034. mySourceLimit, ISO_2022_CN,err);
  3035. /* After SO there must be at least one character before a designator (designator error handled separately) */
  3036. if(myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
  3037. *err = U_ILLEGAL_ESCAPE_SEQUENCE;
  3038. args->converter->toUCallbackReason = UCNV_IRREGULAR;
  3039. args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore));
  3040. }
  3041. }
  3042. /* invalid or illegal escape sequence */
  3043. if(U_FAILURE(*err)){
  3044. args->target = myTarget;
  3045. args->source = mySource;
  3046. myData->isEmptySegment = false; /* Reset to avoid future spurious errors */
  3047. return;
  3048. }
  3049. continue;
  3050. /* ISO-2022-CN does not use single-byte (C1) SS2 and SS3 */
  /* CR/LF: clear the whole toUnicode state (designations and shifts), then convert the byte like any other */
  3051. case CR:
  3052. case LF:
  3053. uprv_memset(pToU2022State, 0, sizeof(ISO2022State));
  3054. U_FALLTHROUGH;
  3055. default:
  3056. /* convert one or two bytes */
  3057. myData->isEmptySegment = false;
  /* g != 0: a double-byte charset is active, so a trail byte is required */
  3058. if(pToU2022State->g != 0) {
  3059. if(mySource < mySourceLimit) {
  3060. UConverterSharedData *cnv;
  3061. StateEnum tempState;
  3062. int32_t tempBufLen;
  3063. int leadIsOk, trailIsOk;
  3064. uint8_t trailByte;
  3065. getTrailByte:
  3066. trailByte = (uint8_t)*mySource;
  3067. /*
  3068. * Ticket 5691: consistent illegal sequences:
  3069. * - We include at least the first byte in the illegal sequence.
  3070. * - If any of the non-initial bytes could be the start of a character,
  3071. * we stop the illegal sequence before the first one of those.
  3072. *
  3073. * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
  3074. * an ESC/SO/SI, we report only the first byte as the illegal sequence.
  3075. * Otherwise we convert or report the pair of bytes.
  3076. */
  3077. leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
  3078. trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
  3079. if (leadIsOk && trailIsOk) {
  3080. ++mySource;
  3081. tempState = (StateEnum)pToU2022State->cs[pToU2022State->g];
  /* CNS planes share one table: the lookup key is a plane byte (0x80 + plane offset) followed by the two input bytes */
  3082. if(tempState >= CNS_11643_0) {
  3083. cnv = myData->myConverterArray[CNS_11643];
  3084. tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0));
  3085. tempBuf[1] = (char) (mySourceChar);
  3086. tempBuf[2] = (char) trailByte;
  3087. tempBufLen = 3;
  3088. }else{
  3089. U_ASSERT(tempState<UCNV_2022_MAX_CONVERTERS);
  3090. cnv = myData->myConverterArray[tempState];
  3091. tempBuf[0] = (char) (mySourceChar);
  3092. tempBuf[1] = (char) trailByte;
  3093. tempBufLen = 2;
  3094. }
  3095. targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, false);
  /* from here on mySourceChar holds both bytes; the offset and callback code below rely on that */
  3096. mySourceChar = (mySourceChar << 8) | trailByte;
  3097. } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
  3098. /* report a pair of illegal bytes if the second byte is not a DBCS starter */
  3099. ++mySource;
  3100. /* add another bit so that the code below writes 2 bytes in case of error */
  3101. mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
  3102. }
  3103. if(pToU2022State->g>=2) {
  3104. /* return from a single-shift state to the previous one */
  3105. pToU2022State->g=pToU2022State->prevG;
  3106. }
  3107. } else {
  /* the lead byte is the last byte of this buffer: save it so that the next call resumes at getTrailByte */
  3108. args->converter->toUBytes[0] = (uint8_t)mySourceChar;
  3109. args->converter->toULength = 1;
  3110. goto endloop;
  3111. }
  3112. }
  3113. else{
  /* g == 0: only bytes up to 0x7f map to themselves; anything else stays at missingCharMarker */
  3114. if(mySourceChar <= 0x7f) {
  3115. targetUniChar = (char16_t) mySourceChar;
  3116. }
  3117. }
  3118. break;
  3119. }
  /*
   * Dispatch on the lookup result: a value below 0xfffe is written as one
   * code unit, a value above missingCharMarker is written as a surrogate
   * pair, and everything else goes to the callback.
   */
  3120. if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
  3121. if(args->offsets){
  /* step back by the number of source bytes held in mySourceChar (1 or 2) */
  3122. args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
  3123. }
  3124. *(myTarget++)=(char16_t)targetUniChar;
  3125. }
  3126. else if(targetUniChar > missingCharMarker){
  3127. /* disassemble the surrogate pair and write to output*/
  3128. targetUniChar-=0x0010000;
  3129. *myTarget = (char16_t)(0xd800+(char16_t)(targetUniChar>>10));
  3130. if(args->offsets){
  3131. args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
  3132. }
  3133. ++myTarget;
  3134. if(myTarget< args->targetLimit){
  3135. *myTarget = (char16_t)(0xdc00+(char16_t)(targetUniChar&0x3ff));
  3136. if(args->offsets){
  3137. args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
  3138. }
  3139. ++myTarget;
  3140. }else{
  /* no room left for the trail surrogate: queue it in the converter's UCharErrorBuffer */
  3141. args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
  3142. (char16_t)(0xdc00+(char16_t)(targetUniChar&0x3ff));
  3143. }
  3144. }
  3145. else{
  3146. /* Call the callback function*/
  3147. toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
  3148. break;
  3149. }
  3150. }
  3151. else{
  3152. *err =U_BUFFER_OVERFLOW_ERROR;
  3153. break;
  3154. }
  3155. }
  3156. endloop:
  /* publish how far the input was consumed and the output was filled */
  3157. args->target = myTarget;
  3158. args->source = mySource;
  3159. }
  3160. #endif /* #if !UCONFIG_ONLY_HTML_CONVERSION */
  3161. static void U_CALLCONV
  3162. _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err) {
  3163. UConverter *cnv = args->converter;
  3164. UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
  3165. ISO2022State *pFromU2022State=&myConverterData->fromU2022State;
  3166. char *p, *subchar;
  3167. char buffer[8];
  3168. int32_t length;
  3169. subchar=(char *)cnv->subChars;
  3170. length=cnv->subCharLen; /* assume length==1 for most variants */
  3171. p = buffer;
  3172. switch(myConverterData->locale[0]){
  3173. case 'j':
  3174. {
  3175. int8_t cs;
  3176. if(pFromU2022State->g == 1) {
  3177. /* JIS7: switch from G1 to G0 */
  3178. pFromU2022State->g = 0;
  3179. *p++ = UCNV_SI;
  3180. }
  3181. cs = pFromU2022State->cs[0];
  3182. if(cs != ASCII && cs != JISX201) {
  3183. /* not in ASCII or JIS X 0201: switch to ASCII */
  3184. pFromU2022State->cs[0] = (int8_t)ASCII;
  3185. *p++ = '\x1b';
  3186. *p++ = '\x28';
  3187. *p++ = '\x42';
  3188. }
  3189. *p++ = subchar[0];
  3190. break;
  3191. }
  3192. case 'c':
  3193. if(pFromU2022State->g != 0) {
  3194. /* not in ASCII mode: switch to ASCII */
  3195. pFromU2022State->g = 0;
  3196. *p++ = UCNV_SI;
  3197. }
  3198. *p++ = subchar[0];
  3199. break;
  3200. case 'k':
  3201. if(myConverterData->version == 0) {
  3202. if(length == 1) {
  3203. if(args->converter->fromUnicodeStatus) {
  3204. /* in DBCS mode: switch to SBCS */
  3205. args->converter->fromUnicodeStatus = 0;
  3206. *p++ = UCNV_SI;
  3207. }
  3208. *p++ = subchar[0];
  3209. } else /* length == 2*/ {
  3210. if(!args->converter->fromUnicodeStatus) {
  3211. /* in SBCS mode: switch to DBCS */
  3212. args->converter->fromUnicodeStatus = 1;
  3213. *p++ = UCNV_SO;
  3214. }
  3215. *p++ = subchar[0];
  3216. *p++ = subchar[1];
  3217. }
  3218. break;
  3219. } else {
  3220. /* save the subconverter's substitution string */
  3221. uint8_t *currentSubChars = myConverterData->currentConverter->subChars;
  3222. int8_t currentSubCharLen = myConverterData->currentConverter->subCharLen;
  3223. /* set our substitution string into the subconverter */
  3224. myConverterData->currentConverter->subChars = (uint8_t *)subchar;
  3225. myConverterData->currentConverter->subCharLen = (int8_t)length;
  3226. /* let the subconverter write the subchar, set/retrieve fromUChar32 state */
  3227. args->converter = myConverterData->currentConverter;
  3228. myConverterData->currentConverter->fromUChar32 = cnv->fromUChar32;
  3229. ucnv_cbFromUWriteSub(args, 0, err);
  3230. cnv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
  3231. args->converter = cnv;
  3232. /* restore the subconverter's substitution string */
  3233. myConverterData->currentConverter->subChars = currentSubChars;
  3234. myConverterData->currentConverter->subCharLen = currentSubCharLen;
  3235. if(*err == U_BUFFER_OVERFLOW_ERROR) {
  3236. if(myConverterData->currentConverter->charErrorBufferLength > 0) {
  3237. uprv_memcpy(
  3238. cnv->charErrorBuffer,
  3239. myConverterData->currentConverter->charErrorBuffer,
  3240. myConverterData->currentConverter->charErrorBufferLength);
  3241. }
  3242. cnv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
  3243. myConverterData->currentConverter->charErrorBufferLength = 0;
  3244. }
  3245. return;
  3246. }
  3247. default:
  3248. /* not expected */
  3249. break;
  3250. }
  3251. ucnv_cbFromUWriteBytes(args,
  3252. buffer, (int32_t)(p - buffer),
  3253. offsetIndex, err);
  3254. }
  3255. /*
  3256. * Structure for cloning an ISO 2022 converter into a single memory block.
  3257. */
  3258. struct cloneStruct
  3259. {
  3260. UConverter cnv;
  3261. UConverter currentConverter;
  3262. UConverterDataISO2022 mydata;
  3263. };
  3264. U_CDECL_BEGIN
  3265. static UConverter * U_CALLCONV
  3266. _ISO_2022_SafeClone(
  3267. const UConverter *cnv,
  3268. void *stackBuffer,
  3269. int32_t *pBufferSize,
  3270. UErrorCode *status)
  3271. {
  3272. struct cloneStruct * localClone;
  3273. UConverterDataISO2022 *cnvData;
  3274. int32_t i, size;
  3275. if (U_FAILURE(*status)){
  3276. return nullptr;
  3277. }
  3278. if (*pBufferSize == 0) { /* 'preflighting' request - set needed size into *pBufferSize */
  3279. *pBufferSize = (int32_t)sizeof(struct cloneStruct);
  3280. return nullptr;
  3281. }
  3282. cnvData = (UConverterDataISO2022 *)cnv->extraInfo;
  3283. localClone = (struct cloneStruct *)stackBuffer;
  3284. /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
  3285. uprv_memcpy(&localClone->mydata, cnvData, sizeof(UConverterDataISO2022));
  3286. localClone->cnv.extraInfo = &localClone->mydata; /* set pointer to extra data */
  3287. localClone->cnv.isExtraLocal = true;
  3288. /* share the subconverters */
  3289. if(cnvData->currentConverter != nullptr) {
  3290. size = (int32_t)sizeof(UConverter);
  3291. localClone->mydata.currentConverter =
  3292. ucnv_safeClone(cnvData->currentConverter,
  3293. &localClone->currentConverter,
  3294. &size, status);
  3295. if(U_FAILURE(*status)) {
  3296. return nullptr;
  3297. }
  3298. }
  3299. for(i=0; i<UCNV_2022_MAX_CONVERTERS; ++i) {
  3300. if(cnvData->myConverterArray[i] != nullptr) {
  3301. ucnv_incrementRefCount(cnvData->myConverterArray[i]);
  3302. }
  3303. }
  3304. return &localClone->cnv;
  3305. }
  3306. U_CDECL_END
  3307. static void U_CALLCONV
  3308. _ISO_2022_GetUnicodeSet(const UConverter *cnv,
  3309. const USetAdder *sa,
  3310. UConverterUnicodeSet which,
  3311. UErrorCode *pErrorCode)
  3312. {
  3313. int32_t i;
  3314. UConverterDataISO2022* cnvData;
  3315. if (U_FAILURE(*pErrorCode)) {
  3316. return;
  3317. }
  3318. #ifdef U_ENABLE_GENERIC_ISO_2022
  3319. if (cnv->sharedData == &_ISO2022Data) {
  3320. /* We use UTF-8 in this case */
  3321. sa->addRange(sa->set, 0, 0xd7FF);
  3322. sa->addRange(sa->set, 0xE000, 0x10FFFF);
  3323. return;
  3324. }
  3325. #endif
  3326. cnvData = (UConverterDataISO2022*)cnv->extraInfo;
  3327. /* open a set and initialize it with code points that are algorithmically round-tripped */
  3328. switch(cnvData->locale[0]){
  3329. case 'j':
  3330. /* include JIS X 0201 which is hardcoded */
  3331. sa->add(sa->set, 0xa5);
  3332. sa->add(sa->set, 0x203e);
  3333. if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) {
  3334. /* include Latin-1 for some variants of JP */
  3335. sa->addRange(sa->set, 0, 0xff);
  3336. } else {
  3337. /* include ASCII for JP */
  3338. sa->addRange(sa->set, 0, 0x7f);
  3339. }
  3340. if(cnvData->version==3 || cnvData->version==4 || which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) {
  3341. /*
  3342. * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0
  3343. * because the bit is on for all JP versions although only versions 3 & 4 (JIS7 & JIS8)
  3344. * use half-width Katakana.
  3345. * This is because all ISO-2022-JP variants are lenient in that they accept (in toUnicode)
  3346. * half-width Katakana via the ESC ( I sequence.
  3347. * However, we only emit (fromUnicode) half-width Katakana according to the
  3348. * definition of each variant.
  3349. *
  3350. * When including fallbacks,
  3351. * we need to include half-width Katakana Unicode code points for all JP variants because
  3352. * JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana).
  3353. */
  3354. /* include half-width Katakana for JP */
  3355. sa->addRange(sa->set, HWKANA_START, HWKANA_END);
  3356. }
  3357. break;
  3358. #if !UCONFIG_ONLY_HTML_CONVERSION
  3359. case 'c':
  3360. case 'z':
  3361. /* include ASCII for CN */
  3362. sa->addRange(sa->set, 0, 0x7f);
  3363. break;
  3364. case 'k':
  3365. /* there is only one converter for KR, and it is not in the myConverterArray[] */
  3366. cnvData->currentConverter->sharedData->impl->getUnicodeSet(
  3367. cnvData->currentConverter, sa, which, pErrorCode);
  3368. /* the loop over myConverterArray[] will simply not find another converter */
  3369. break;
  3370. #endif
  3371. default:
  3372. break;
  3373. }
  3374. #if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */
  3375. if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
  3376. cnvData->version==0 && i==CNS_11643
  3377. ) {
  3378. /* special handling for non-EXT ISO-2022-CN: add only code points for CNS planes 1 and 2 */
  3379. ucnv_MBCSGetUnicodeSetForBytes(
  3380. cnvData->myConverterArray[i],
  3381. sa, UCNV_ROUNDTRIP_SET,
  3382. 0, 0x81, 0x82,
  3383. pErrorCode);
  3384. }
  3385. #endif
  3386. for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
  3387. UConverterSetFilter filter;
  3388. if(cnvData->myConverterArray[i]!=nullptr) {
  3389. if(cnvData->locale[0]=='j' && i==JISX208) {
  3390. /*
  3391. * Only add code points that map to Shift-JIS codes
  3392. * corresponding to JIS X 0208.
  3393. */
  3394. filter=UCNV_SET_FILTER_SJIS;
  3395. #if !UCONFIG_ONLY_HTML_CONVERSION
  3396. } else if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
  3397. cnvData->version==0 && i==CNS_11643) {
  3398. /*
  3399. * Version-specific for CN:
  3400. * CN version 0 does not map CNS planes 3..7 although
  3401. * they are all available in the CNS conversion table;
  3402. * CN version 1 (-EXT) does map them all.
  3403. * The two versions create different Unicode sets.
  3404. */
  3405. filter=UCNV_SET_FILTER_2022_CN;
  3406. } else if(i==KSC5601) {
  3407. /*
  3408. * Some of the KSC 5601 tables (convrtrs.txt has this aliases on multiple tables)
  3409. * are broader than GR94.
  3410. */
  3411. filter=UCNV_SET_FILTER_GR94DBCS;
  3412. #endif
  3413. } else {
  3414. filter=UCNV_SET_FILTER_NONE;
  3415. }
  3416. ucnv_MBCSGetFilteredUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, filter, pErrorCode);
  3417. }
  3418. }
  3419. /*
  3420. * ISO 2022 converters must not convert SO/SI/ESC despite what
  3421. * sub-converters do by themselves.
  3422. * Remove these characters from the set.
  3423. */
  3424. sa->remove(sa->set, 0x0e);
  3425. sa->remove(sa->set, 0x0f);
  3426. sa->remove(sa->set, 0x1b);
  3427. /* ISO 2022 converters do not convert C1 controls either */
  3428. sa->removeRange(sa->set, 0x80, 0x9f);
  3429. }
  3430. static const UConverterImpl _ISO2022Impl={
  3431. UCNV_ISO_2022,
  3432. nullptr,
  3433. nullptr,
  3434. _ISO2022Open,
  3435. _ISO2022Close,
  3436. _ISO2022Reset,
  3437. #ifdef U_ENABLE_GENERIC_ISO_2022
  3438. T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
  3439. T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC,
  3440. ucnv_fromUnicode_UTF8,
  3441. ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
  3442. #else
  3443. nullptr,
  3444. nullptr,
  3445. nullptr,
  3446. nullptr,
  3447. #endif
  3448. nullptr,
  3449. nullptr,
  3450. _ISO2022getName,
  3451. _ISO_2022_WriteSub,
  3452. _ISO_2022_SafeClone,
  3453. _ISO_2022_GetUnicodeSet,
  3454. nullptr,
  3455. nullptr
  3456. };
  3457. static const UConverterStaticData _ISO2022StaticData={
  3458. sizeof(UConverterStaticData),
  3459. "ISO_2022",
  3460. 2022,
  3461. UCNV_IBM,
  3462. UCNV_ISO_2022,
  3463. 1,
  3464. 3, /* max 3 bytes per char16_t from UTF-8 (4 bytes from surrogate _pair_) */
  3465. { 0x1a, 0, 0, 0 },
  3466. 1,
  3467. false,
  3468. false,
  3469. 0,
  3470. 0,
  3471. { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
  3472. };
  3473. const UConverterSharedData _ISO2022Data=
  3474. UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022StaticData, &_ISO2022Impl);
  3475. /*************JP****************/
  3476. static const UConverterImpl _ISO2022JPImpl={
  3477. UCNV_ISO_2022,
  3478. nullptr,
  3479. nullptr,
  3480. _ISO2022Open,
  3481. _ISO2022Close,
  3482. _ISO2022Reset,
  3483. UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
  3484. UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
  3485. UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
  3486. UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
  3487. nullptr,
  3488. nullptr,
  3489. _ISO2022getName,
  3490. _ISO_2022_WriteSub,
  3491. _ISO_2022_SafeClone,
  3492. _ISO_2022_GetUnicodeSet,
  3493. nullptr,
  3494. nullptr
  3495. };
  3496. static const UConverterStaticData _ISO2022JPStaticData={
  3497. sizeof(UConverterStaticData),
  3498. "ISO_2022_JP",
  3499. 0,
  3500. UCNV_IBM,
  3501. UCNV_ISO_2022,
  3502. 1,
  3503. 6, /* max 6 bytes per char16_t: 4-byte escape sequence + DBCS */
  3504. { 0x1a, 0, 0, 0 },
  3505. 1,
  3506. false,
  3507. false,
  3508. 0,
  3509. 0,
  3510. { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
  3511. };
  3512. namespace {
  3513. const UConverterSharedData _ISO2022JPData=
  3514. UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022JPStaticData, &_ISO2022JPImpl);
  3515. } // namespace
  3516. #if !UCONFIG_ONLY_HTML_CONVERSION
  3517. /************* KR ***************/
  3518. static const UConverterImpl _ISO2022KRImpl={
  3519. UCNV_ISO_2022,
  3520. nullptr,
  3521. nullptr,
  3522. _ISO2022Open,
  3523. _ISO2022Close,
  3524. _ISO2022Reset,
  3525. UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
  3526. UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
  3527. UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
  3528. UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
  3529. nullptr,
  3530. nullptr,
  3531. _ISO2022getName,
  3532. _ISO_2022_WriteSub,
  3533. _ISO_2022_SafeClone,
  3534. _ISO_2022_GetUnicodeSet,
  3535. nullptr,
  3536. nullptr
  3537. };
  3538. static const UConverterStaticData _ISO2022KRStaticData={
  3539. sizeof(UConverterStaticData),
  3540. "ISO_2022_KR",
  3541. 0,
  3542. UCNV_IBM,
  3543. UCNV_ISO_2022,
  3544. 1,
  3545. 8, /* max 8 bytes per char16_t */
  3546. { 0x1a, 0, 0, 0 },
  3547. 1,
  3548. false,
  3549. false,
  3550. 0,
  3551. 0,
  3552. { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
  3553. };
  3554. namespace {
  3555. const UConverterSharedData _ISO2022KRData=
  3556. UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022KRStaticData, &_ISO2022KRImpl);
  3557. } // namespace
  3558. /*************** CN ***************/
  3559. static const UConverterImpl _ISO2022CNImpl={
  3560. UCNV_ISO_2022,
  3561. nullptr,
  3562. nullptr,
  3563. _ISO2022Open,
  3564. _ISO2022Close,
  3565. _ISO2022Reset,
  3566. UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
  3567. UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
  3568. UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
  3569. UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
  3570. nullptr,
  3571. nullptr,
  3572. _ISO2022getName,
  3573. _ISO_2022_WriteSub,
  3574. _ISO_2022_SafeClone,
  3575. _ISO_2022_GetUnicodeSet,
  3576. nullptr,
  3577. nullptr
  3578. };
  3579. static const UConverterStaticData _ISO2022CNStaticData={
  3580. sizeof(UConverterStaticData),
  3581. "ISO_2022_CN",
  3582. 0,
  3583. UCNV_IBM,
  3584. UCNV_ISO_2022,
  3585. 1,
  3586. 8, /* max 8 bytes per char16_t: 4-byte CNS designator + 2 bytes for SS2/SS3 + DBCS */
  3587. { 0x1a, 0, 0, 0 },
  3588. 1,
  3589. false,
  3590. false,
  3591. 0,
  3592. 0,
  3593. { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
  3594. };
  3595. namespace {
  3596. const UConverterSharedData _ISO2022CNData=
  3597. UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022CNStaticData, &_ISO2022CNImpl);
  3598. } // namespace
  3599. #endif /* #if !UCONFIG_ONLY_HTML_CONVERSION */
  3600. #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */