12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973 |
- // © 2016 and later: Unicode, Inc. and others.
- // License & terms of use: http://www.unicode.org/copyright.html
- /*
- **********************************************************************
- * Copyright (C) 2000-2016, International Business Machines
- * Corporation and others. All Rights Reserved.
- **********************************************************************
- * file name: ucnv2022.cpp
- * encoding: UTF-8
- * tab size: 8 (not used)
- * indentation:4
- *
- * created on: 2000feb03
- * created by: Markus W. Scherer
- *
- * Change history:
- *
- * 06/29/2000 helena Major rewrite of the callback APIs.
- * 08/08/2000 Ram Included support for ISO-2022-JP-2
- * Changed implementation of toUnicode
- * function
- * 08/21/2000 Ram Added support for ISO-2022-KR
- * 08/29/2000 Ram Separated implementation of EBCDIC to
- * ucnvebdc.c
- * 09/20/2000 Ram Added support for ISO-2022-CN
- * Added implementations for getNextUChar()
- * for specific 2022 country variants.
- * 10/31/2000 Ram Implemented offsets logic functions
- */
- #include "unicode/utypes.h"
- #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
- #include "unicode/ucnv.h"
- #include "unicode/uset.h"
- #include "unicode/ucnv_err.h"
- #include "unicode/ucnv_cb.h"
- #include "unicode/utf16.h"
- #include "ucnv_imp.h"
- #include "ucnv_bld.h"
- #include "ucnv_cnv.h"
- #include "ucnvmbcs.h"
- #include "cstring.h"
- #include "cmemory.h"
- #include "uassert.h"
- #ifdef U_ENABLE_GENERIC_ISO_2022
- /*
- * I am disabling the generic ISO-2022 converter after proposing to do so on
- * the icu mailing list two days ago.
- *
- * Reasons:
- * 1. It does not fully support the ISO-2022/ECMA-35 specification with all of
- * its designation sequences, single shifts with return to the previous state,
- * switch-with-no-return to UTF-16BE or similar, etc.
- * This is unlike the language-specific variants like ISO-2022-JP which
- * require a much smaller repertoire of ISO-2022 features.
- * These variants continue to be supported.
- * 2. I believe that no one is really using the generic ISO-2022 converter
- * but rather always one of the language-specific variants.
- * Note that ICU's generic ISO-2022 converter has always output one escape
- * sequence followed by UTF-8 for the whole stream.
- * 3. Switching between subcharsets is extremely slow, because each time
- * the previous converter is closed and a new one opened,
- * without any kind of caching, least-recently-used list, etc.
- * 4. The code is currently buggy, and given the above it does not seem
- * reasonable to spend the time on maintenance.
- * 5. ISO-2022 subcharsets should normally be used with 7-bit byte encodings.
- * This means, for example, that when ISO-8859-7 is designated, the following
- * ISO-2022 bytes 00..7f should be interpreted as ISO-8859-7 bytes 80..ff.
- * The ICU ISO-2022 converter does not handle this - and has no information
- * about which subconverter would have to be shifted vs. which is designed
- * for 7-bit ISO-2022.
- *
- * Markus Scherer 2003-dec-03
- */
- #endif
#if !UCONFIG_ONLY_HTML_CONVERSION
/* The single byte 0x0F (SI, Shift In) as a NUL-terminated string. */
static const char SHIFT_IN_STR[] = "\x0F";
// static const char SHIFT_OUT_STR[] = "\x0E";
#endif
/* ASCII control and whitespace byte values. */
#define CR 0x0D     /* carriage return */
#define LF 0x0A     /* line feed */
#define H_TAB 0x09  /* horizontal tab */
#define V_TAB 0x0B  /* vertical tab */
#define SPACE 0x20  /* space */
/*
 * First and last code point of a Unicode range.
 * NOTE(review): going by the names, presumably the halfwidth Katakana range
 * U+FF61..U+FF9F that HWKANA_7BIT covers -- confirm against the users.
 */
enum {
    HWKANA_START=0xff61,
    HWKANA_END=0xff9f
};
/*
 * Native (high-bit-set) byte ranges of the graphic character sets.
 *
 * A 94-character set uses native bytes A1..FE and a 96-character set uses
 * A0..FF. ISO 2022 carries them as 21..7E and 20..7F respectively, that is,
 * the native byte value minus 0x80.
 *
 * C1 control codes (native bytes 80..9F) must not be encoded as 00..1F,
 * because those byte values are the C0 control codes.
 */
enum {
    GR94_START = 0xa1,
    GR94_END   = 0xfe,
    GR96_START = 0xa0,
    GR96_END   = 0xff
};
/*
 * True if c is one of the ISO 2022 control codes SO (0x0e), SI (0x0f) or
 * ESC (0x1b).
 *
 * These must not be converted from Unicode because they would mess up the
 * byte stream. The test is a bit-set lookup: the three bit positions below
 * combine to the mask 0x0800c000.
 *
 * Note: the argument is evaluated more than once.
 */
#define IS_2022_CONTROL(c) \
    (((c)<0x20) && \
     (((uint32_t)1<<(c)) & \
      (((uint32_t)1<<0x0e) | ((uint32_t)1<<0x0f) | ((uint32_t)1<<0x1b))) != 0)
/*
 * Charset/state identifiers for the ISO-2022-JP and ISO-2022-CN
 * implementations. The JP and CN groups reuse the same small integers
 * (1, 2, 3), so several enumerators share a value.
 */
typedef enum {
    /* shared values */
    INVALID_STATE = -1,
    ASCII         = 0,
    SS2_STATE     = 0x10,
    SS3_STATE     = 0x11,

    /* JP */
    ISO8859_1   = 1,
    ISO8859_7   = 2,
    JISX201     = 3,
    JISX208     = 4,
    JISX212     = 5,
    GB2312      = 6,
    KSC5601     = 7,
    HWKANA_7BIT = 8,    /* Halfwidth Katakana 7 bit */

    /* CN */
    /* the first few enum constants must keep their values because they correspond to myConverterArray[] */
    GB2312_1   = 1,
    ISO_IR_165 = 2,
    CNS_11643  = 3,

    /*
     * these are used in StateEnum and ISO2022State variables,
     * but CNS_11643 must be used to index into myConverterArray[]
     */
    CNS_11643_0 = 0x20,
    CNS_11643_1 = 0x21,
    CNS_11643_2 = 0x22,
    CNS_11643_3 = 0x23,
    CNS_11643_4 = 0x24,
    CNS_11643_5 = 0x25,
    CNS_11643_6 = 0x26,
    CNS_11643_7 = 0x27
} StateEnum;
/*
 * IS_JP_DBCS(cs): is the StateEnum charset value cs a DBCS charset?
 * With UCONFIG_ONLY_HTML_CONVERSION only JISX208 qualifies; otherwise the
 * whole range JISX208..KSC5601 does.
 */
#if UCONFIG_ONLY_HTML_CONVERSION
#   define IS_JP_DBCS(cs) ((cs)==JISX208)
#else
#   define IS_JP_DBCS(cs) ((cs)>=JISX208 && (cs)<=KSC5601)
#endif
/* CSM(cs): charset mask with the single bit at position cs set. */
#define CSM(cs) ((uint16_t)1 << (cs))
- /*
- * Each of these charset masks (with index x) contains a bit for a charset in exact correspondence
- * to whether that charset is used in the corresponding version x of ISO_2022,locale=ja,version=x
- *
- * Note: The converter uses some leniency:
- * - The escape sequence ESC ( I for half-width 7-bit Katakana is recognized in
- * all versions, not just JIS7 and JIS8.
- * - ICU does not distinguish between different versions of JIS X 0208.
- */
- #if UCONFIG_ONLY_HTML_CONVERSION
- enum { MAX_JA_VERSION=0 };
- #else
- enum { MAX_JA_VERSION=4 };
- #endif
- static const uint16_t jpCharsetMasks[MAX_JA_VERSION+1]={
- CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT),
- #if !UCONFIG_ONLY_HTML_CONVERSION
- CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212),
- CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
- CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
- CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7)
- #endif
- };
/* Converter type tag; this is the type of UConverterDataISO2022::currentType. */
typedef enum {
    ASCII1 = 0,
    LATIN1 = 1,
    SBCS   = 2,
    DBCS   = 3,
    MBCS   = 4,
    HWKANA = 5
} Cnv2022Type;
/*
 * Designation/shift state of an ISO 2022 stream. UConverterDataISO2022 keeps
 * one of these for the toUnicode and one for the fromUnicode direction.
 */
typedef struct ISO2022State {
    /* charset number for SI (G0)/SO (G1)/SS2 (G2)/SS3 (G3) */
    int8_t cs[4];
    /* 0..3 for G0..G3 (SI/SO/SS2/SS3) */
    int8_t g;
    /* g before single shift (SS2 or SS3) */
    int8_t prevG;
} ISO2022State;
/* Bits of the converter open options that hold the version number (see _ISO2022Open). */
#define UCNV_OPTIONS_VERSION_MASK 0xf
/* Capacity of UConverterDataISO2022::myConverterArray. */
#define UCNV_2022_MAX_CONVERTERS 10
/*
 * Per-converter state of an ISO-2022 converter.
 * _ISO2022Open() allocates one instance, zero-fills it and stores it in
 * cnv->extraInfo.
 */
typedef struct{
    /* shared data of the sub-converters; per the StateEnum comments, indexed by the low StateEnum values */
    UConverterSharedData *myConverterArray[UCNV_2022_MAX_CONVERTERS];
    /* sub-converter whose state the KR version 1 set-up helpers reset */
    UConverter *currentConverter;
    /* initialized to ASCII1 by _ISO2022Open() */
    Cnv2022Type currentType;
    /* designation/shift state, one per conversion direction */
    ISO2022State toU2022State, fromU2022State;
    /* NOTE(review): presumably the escape-sequence key accumulated so far (see escSeqStateTable_Key_2022) -- confirm */
    uint32_t key;
    /* pArgs->options & UCNV_OPTIONS_VERSION_MASK, set by _ISO2022Open() */
    uint32_t version;
#ifdef U_ENABLE_GENERIC_ISO_2022
    UBool isFirstBuffer;
#endif
    /* NOTE(review): meaning is not visible in this part of the file -- confirm at the users */
    UBool isEmptySegment;
    /* NOTE(review): presumably the converter name reported by _ISO2022getName() -- confirm */
    char name[30];
    /* NOTE(review): presumably a short locale tag; 3 bytes of storage -- confirm */
    char locale[3];
}UConverterDataISO2022;
/* Protos */
/* ISO-2022 ----------------------------------------------------------------- */
/*
 * Forward declarations of the UTF-8 fromUnicode functions (plain and
 * offsets-tracking variant). They are only declared here, with U_CFUNC
 * linkage; their definitions are not in the visible part of this file.
 * NOTE(review): presumably needed by the generic ISO-2022 converter, which
 * outputs UTF-8 -- confirm whether they are still used.
 */
U_CFUNC void U_CALLCONV
ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs * args,
                      UErrorCode * err);
U_CFUNC void U_CALLCONV
ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs * args,
                                    UErrorCode * err);
#define ESC_2022 0x1B /*ESC*/

/* Classification of an escape-sequence prefix; element type of escSeqStateTable_Value_2022[]. */
typedef enum
{
    /* Doesn't correspond to a valid iso 2022 escape sequence */
    INVALID_2022 = -1,
    /* so far corresponds to a valid iso 2022 escape sequence */
    VALID_NON_TERMINAL_2022 = 0,
    /* corresponds to a valid iso 2022 escape sequence */
    VALID_TERMINAL_2022 = 1,
    /* so far matches one iso 2022 escape sequence, but by adding more characters might match another escape sequence */
    VALID_MAYBE_TERMINAL_2022 = 2
} UCNV_TableStates_2022;
- /*
- * The way these state transition arrays work is:
- * ex : ESC$B is the sequence for JISX208
- * a) First Iteration: char is ESC
- * i) Get the value of ESC from normalize_esq_chars_2022[] with int value of ESC as index
- * int x = normalize_esq_chars_2022[27] which is equal to 1
- * ii) Search for this value in escSeqStateTable_Key_2022[]
- * value of x is stored at escSeqStateTable_Key_2022[0]
- * iii) Save this index as offset
- * iv) Get state of this sequence from escSeqStateTable_Value_2022[]
- * escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
- * b) Switch on this state and continue to next char
- * i) Get the value of $ from normalize_esq_chars_2022[] with int value of $ as index
- * which is normalize_esq_chars_2022[36] == 4
- * ii) x is currently 1(from above)
- * x<<=5 -- x is now 32
- * x+=normalize_esq_chars_2022[36]
- * now x is 36
- * iii) Search for this value in escSeqStateTable_Key_2022[]
- * value of x is stored at escSeqStateTable_Key_2022[2], so offset is 2
- * iv) Get state of this sequence from escSeqStateTable_Value_2022[]
- * escSeqStateTable_Value_2022[offset], which is VALID_NON_TERMINAL_2022
- * c) Switch on this state and continue to next char
- * i) Get the value of B from normalize_esq_chars_2022[] with int value of B as index
- * ii) x is currently 36 (from above)
- * x<<=5 -- x is now 1152
- * x+=normalize_esq_chars_2022[66]
- * now x is 1161
- * iii) Search for this value in escSeqStateTable_Key_2022[]
- * value of x is stored at escSeqStateTable_Key_2022[24], so offset is 24
- * iv) Get state of this sequence from escSeqStateTable_Value_2022[24]
- * escSeqStateTable_Value_2022[offset], which is VALID_TERMINAL_2022
- * v) Get the converter name from escSeqStateTable_Result_2022[24] which is "JISX-208"
- */
- /*Below are the 3 arrays depicting a state transition table*/
/*
 * Maps a byte value to the small code (1..29) that is folded into the
 * escape-sequence key as x = (x<<5) + normalize_esq_chars_2022[byte].
 * Every byte without a code maps to 0.
 * Laid out 16 entries per row; the row label is the index of the first entry.
 */
static const int8_t normalize_esq_chars_2022[256] = {
/*           0   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F */
/* 0x00 */   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 0x10 */   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  /* 0x1B ESC */
/* 0x20 */   0,  0,  0,  0,  4,  7, 29,  0,  2, 24, 26, 27,  0,  3, 23,  6,  /* $ % & ( ) * + - . / */
/* 0x30 */   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 0x40 */   5,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 25, 28,  /* @ A..O */
/* 0x50 */   0,  0, 21,  0,  0,  0,  0,  0,  0,  0, 22,  0,  0,  0,  0,  0,  /* R Z */
/* 0x60 */   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 0x70 */   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 0x80 */   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 0x90 */   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 0xA0 */   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 0xB0 */   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 0xC0 */   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 0xD0 */   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 0xE0 */   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/* 0xF0 */   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
};
- #ifdef U_ENABLE_GENERIC_ISO_2022
- /*
- * When the generic ISO-2022 converter is completely removed, not just disabled
- * per #ifdef, then the following state table and the associated tables that are
- * dimensioned with MAX_STATES_2022 should be trimmed.
- *
- * Especially, VALID_MAYBE_TERMINAL_2022 will not be used any more, and all of
- * the associated escape sequences starting with ESC ( B should be removed.
- * This includes the ones with key values 1097 and all of the ones above 1000000.
- *
- * For the latter, the tables can simply be truncated.
- * For the former, since the tables must be kept parallel, it is probably best
- * to simply duplicate an adjacent table cell, parallel in all tables.
- *
- * It may make sense to restructure the tables, especially by using small search
- * tables for the variants instead of indexing them parallel to the table here.
- */
- #endif
- #define MAX_STATES_2022 74
- static const int32_t escSeqStateTable_Key_2022[MAX_STATES_2022] = {
- /* 0 1 2 3 4 5 6 7 8 9 */
- 1 ,34 ,36 ,39 ,55 ,57 ,60 ,61 ,1093 ,1096
- ,1097 ,1098 ,1099 ,1100 ,1101 ,1102 ,1103 ,1104 ,1105 ,1106
- ,1109 ,1154 ,1157 ,1160 ,1161 ,1176 ,1178 ,1179 ,1254 ,1257
- ,1768 ,1773 ,1957 ,35105 ,36933 ,36936 ,36937 ,36938 ,36939 ,36940
- ,36942 ,36943 ,36944 ,36945 ,36946 ,36947 ,36948 ,37640 ,37642 ,37644
- ,37646 ,37711 ,37744 ,37745 ,37746 ,37747 ,37748 ,40133 ,40136 ,40138
- ,40139 ,40140 ,40141 ,1123363 ,35947624 ,35947625 ,35947626 ,35947627 ,35947629 ,35947630
- ,35947631 ,35947635 ,35947636 ,35947638
- };
#ifdef U_ENABLE_GENERIC_ISO_2022
/*
 * Converter names, parallel to escSeqStateTable_Key_2022[]; nullptr where no
 * name is associated with the key. Only compiled for the generic ISO-2022
 * converter. Ten entries per row; the row label is the index of the first entry.
 */
static const char* const escSeqStateTable_Result_2022[MAX_STATES_2022] = {
    /*  0 */ nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, "latin1", "latin1",
    /* 10 */ "latin1", "ibm-865", "ibm-865", "ibm-865", "ibm-865", "ibm-865", "ibm-865", "JISX0201", "JISX0201", "latin1",
    /* 20 */ "latin1", nullptr, "JISX-208", "ibm-5478", "JISX-208", nullptr, nullptr, nullptr, nullptr, "UTF8",
    /* 30 */ "ISO-8859-1", "ISO-8859-7", "JIS-X-208", nullptr, "ibm-955", "ibm-367", "ibm-952", "ibm-949", "JISX-212", "ibm-1383",
    /* 40 */ "ibm-952", "ibm-964", "ibm-964", "ibm-964", "ibm-964", "ibm-964", "ibm-964", "ibm-5478", "ibm-949", "ISO-IR-165",
    /* 50 */ "CNS-11643-1992,1", "CNS-11643-1992,2", "CNS-11643-1992,3", "CNS-11643-1992,4", "CNS-11643-1992,5", "CNS-11643-1992,6", "CNS-11643-1992,7", "UTF16_PlatformEndian", "UTF16_PlatformEndian", "UTF16_PlatformEndian",
    /* 60 */ "UTF16_PlatformEndian", "UTF16_PlatformEndian", "UTF16_PlatformEndian", nullptr, "latin1", "ibm-912", "ibm-913", "ibm-914", "ibm-813", "ibm-1089",
    /* 70 */ "ibm-920", "ibm-915", "ibm-915", "latin1"
};
#endif
- static const int8_t escSeqStateTable_Value_2022[MAX_STATES_2022] = {
- /* 0 1 2 3 4 5 6 7 8 9 */
- VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
- ,VALID_MAYBE_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
- ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022
- ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
- ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
- ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
- ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_NON_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
- ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022 ,VALID_TERMINAL_2022
- };
/*
 * ISO-2022 variant selector (type def for refactoring changeState_2022 code).
 * Which enumerators exist depends on the build configuration.
 */
typedef enum {
#ifdef U_ENABLE_GENERIC_ISO_2022
    ISO_2022    = 0,
#endif
    ISO_2022_JP = 1,
#if !UCONFIG_ONLY_HTML_CONVERSION
    ISO_2022_KR = 2,
    ISO_2022_CN = 3
#endif
} Variant2022;
/*********** ISO 2022 Converter Protos ***********/
/*
 * Prototypes of the file-local converter functions. They are static, so
 * their definitions live in this translation unit.
 */
/* Open: allocates the UConverterDataISO2022 and stores it in cnv->extraInfo. */
static void U_CALLCONV
_ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode);
static void U_CALLCONV
_ISO2022Close(UConverter *converter);
static void U_CALLCONV
_ISO2022Reset(UConverter *converter, UConverterResetChoice choice);
U_CDECL_BEGIN
static const char * U_CALLCONV
_ISO2022getName(const UConverter* cnv);
U_CDECL_END
static void U_CALLCONV
_ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err);
U_CDECL_BEGIN
static UConverter * U_CALLCONV
_ISO_2022_SafeClone(const UConverter *cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status);
U_CDECL_END
#ifdef U_ENABLE_GENERIC_ISO_2022
/* Only declared when the generic ISO-2022 converter is enabled. */
static void U_CALLCONV
T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, UErrorCode* err);
#endif
namespace {
/* Shared-data objects of the language-specific variants; declared here, defined elsewhere. */
/*const UConverterSharedData _ISO2022Data;*/
extern const UConverterSharedData _ISO2022JPData;
#if !UCONFIG_ONLY_HTML_CONVERSION
extern const UConverterSharedData _ISO2022KRData;
extern const UConverterSharedData _ISO2022CNData;
#endif
} // namespace
- /*************** Converter implementations ******************/
- /* The purpose of this function is to get around gcc compiler warnings. */
- static inline void
- fromUWriteUInt8(UConverter *cnv,
- const char *bytes, int32_t length,
- uint8_t **target, const char *targetLimit,
- int32_t **offsets,
- int32_t sourceIndex,
- UErrorCode *pErrorCode)
- {
- char *targetChars = (char *)*target;
- ucnv_fromUWriteBytes(cnv, bytes, length, &targetChars, targetLimit,
- offsets, sourceIndex, pErrorCode);
- *target = (uint8_t*)targetChars;
- }
- static inline void
- setInitialStateToUnicodeKR(UConverter* /*converter*/, UConverterDataISO2022 *myConverterData){
- if(myConverterData->version == 1) {
- UConverter *cnv = myConverterData->currentConverter;
- cnv->toUnicodeStatus=0; /* offset */
- cnv->mode=0; /* state */
- cnv->toULength=0; /* byteIndex */
- }
- }
- static inline void
- setInitialStateFromUnicodeKR(UConverter* converter,UConverterDataISO2022 *myConverterData){
- /* in ISO-2022-KR the designator sequence appears only once
- * in a file so we append it only once
- */
- if( converter->charErrorBufferLength==0){
- converter->charErrorBufferLength = 4;
- converter->charErrorBuffer[0] = 0x1b;
- converter->charErrorBuffer[1] = 0x24;
- converter->charErrorBuffer[2] = 0x29;
- converter->charErrorBuffer[3] = 0x43;
- }
- if(myConverterData->version == 1) {
- UConverter *cnv = myConverterData->currentConverter;
- cnv->fromUChar32=0;
- cnv->fromUnicodeStatus=1; /* prevLength */
- }
- }
- static void U_CALLCONV
- _ISO2022Open(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode){ /*
-  * Open hook for the ISO-2022 converter family.
-  *
-  * Allocates the per-converter UConverterDataISO2022 in cnv->extraInfo, then
-  * uses the first characters of pArgs->locale ("ja"/"jp", "ko"/"kr",
-  * "zh"/"cn") together with the version bits of pArgs->options to pick the
-  * variant, load the sub-converter data that variant needs, and set
-  * cnv->sharedData and the name reported by _ISO2022getName().
-  *
-  * Errors reported through *errorCode:
-  *   U_MEMORY_ALLOCATION_ERROR  the extraInfo allocation failed
-  *   U_MISSING_RESOURCE_ERROR   unsupported version, or (when built without
-  *                              U_ENABLE_GENERIC_ISO_2022) no recognized locale
-  * Errors from loading/opening the sub-converters are passed through.
-  */
- char myLocale[7]={' ',' ',' ',' ',' ',' ', '\0'}; /* blank-filled: a null pArgs->locale matches none of the branches below */
- cnv->extraInfo = uprv_malloc (sizeof (UConverterDataISO2022));
- if(cnv->extraInfo != nullptr) {
- UConverterNamePieces stackPieces;
- UConverterLoadArgs stackArgs=UCNV_LOAD_ARGS_INITIALIZER;
- UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
- uint32_t version;
- stackArgs.onlyTestIsLoadable = pArgs->onlyTestIsLoadable;
- uprv_memset(myConverterData, 0, sizeof(UConverterDataISO2022));
- myConverterData->currentType = ASCII1;
- cnv->fromUnicodeStatus =false;
- if(pArgs->locale){
- uprv_strncpy(myLocale, pArgs->locale, sizeof(myLocale)-1); /* copies at most 6 chars; myLocale[6] stays '\0' */
- }
- version = pArgs->options & UCNV_OPTIONS_VERSION_MASK;
- myConverterData->version = version;
- if(myLocale[0]=='j' && (myLocale[1]=='a'|| myLocale[1]=='p') &&
- (myLocale[2]=='_' || myLocale[2]=='\0'))
- {
- /* open the required converters and cache them */
- if(version>MAX_JA_VERSION) {
- // ICU 55 fails to open a converter for an unsupported version.
- // Previously, it fell back to version 0, but that would yield
- // unexpected behavior.
- *errorCode = U_MISSING_RESOURCE_ERROR;
- return; /* NOTE(review): cnv->extraInfo is still allocated here; confirm the caller releases it on failure */
- }
- if(jpCharsetMasks[version]&CSM(ISO8859_7)) {
- myConverterData->myConverterArray[ISO8859_7] =
- ucnv_loadSharedData("ISO8859_7", &stackPieces, &stackArgs, errorCode);
- }
- myConverterData->myConverterArray[JISX208] =
- ucnv_loadSharedData("Shift-JIS", &stackPieces, &stackArgs, errorCode); /* the JISX208 slot is always loaded, from the Shift-JIS table */
- if(jpCharsetMasks[version]&CSM(JISX212)) {
- myConverterData->myConverterArray[JISX212] =
- ucnv_loadSharedData("jisx-212", &stackPieces, &stackArgs, errorCode);
- }
- if(jpCharsetMasks[version]&CSM(GB2312)) {
- myConverterData->myConverterArray[GB2312] =
- ucnv_loadSharedData("ibm-5478", &stackPieces, &stackArgs, errorCode); /* gb_2312_80-1 */
- }
- if(jpCharsetMasks[version]&CSM(KSC5601)) {
- myConverterData->myConverterArray[KSC5601] =
- ucnv_loadSharedData("ksc_5601", &stackPieces, &stackArgs, errorCode);
- }
- /* set the function pointers to appropriate functions */
- cnv->sharedData=(UConverterSharedData*)(&_ISO2022JPData);
- uprv_strcpy(myConverterData->locale,"ja");
- (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ja,version=");
- size_t len = uprv_strlen(myConverterData->name);
- myConverterData->name[len]=(char)(myConverterData->version+(int)'0'); /* append the version as one digit; version<=MAX_JA_VERSION was checked above */
- myConverterData->name[len+1]='\0';
- }
- #if !UCONFIG_ONLY_HTML_CONVERSION
- else if(myLocale[0]=='k' && (myLocale[1]=='o'|| myLocale[1]=='r') &&
- (myLocale[2]=='_' || myLocale[2]=='\0'))
- {
- if(version>1) {
- // ICU 55 fails to open a converter for an unsupported version.
- // Previously, it fell back to version 0, but that would yield
- // unexpected behavior.
- *errorCode = U_MISSING_RESOURCE_ERROR;
- return; /* NOTE(review): cnv->extraInfo is still allocated here; confirm the caller releases it on failure */
- }
- const char *cnvName;
- if(version==1) {
- cnvName="icu-internal-25546";
- } else {
- cnvName="ibm-949";
- myConverterData->version=version=0;
- }
- if(pArgs->onlyTestIsLoadable) {
- ucnv_canCreateConverter(cnvName, errorCode); /* errorCode carries result */
- uprv_free(cnv->extraInfo);
- cnv->extraInfo=nullptr;
- return;
- } else {
- myConverterData->currentConverter=ucnv_open(cnvName, errorCode);
- if (U_FAILURE(*errorCode)) {
- _ISO2022Close(cnv);
- return;
- }
- if(version==1) {
- (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=1");
- uprv_memcpy(cnv->subChars, myConverterData->currentConverter->subChars, 4); /* version 1 takes over the sub-converter's substitution bytes */
- cnv->subCharLen = myConverterData->currentConverter->subCharLen;
- }else{
- (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=ko,version=0");
- }
- /* initialize the state variables */
- setInitialStateToUnicodeKR(cnv, myConverterData);
- setInitialStateFromUnicodeKR(cnv, myConverterData);
- /* set the function pointers to appropriate functions */
- cnv->sharedData=(UConverterSharedData*)&_ISO2022KRData;
- uprv_strcpy(myConverterData->locale,"ko");
- }
- }
- else if(((myLocale[0]=='z' && myLocale[1]=='h') || (myLocale[0]=='c'&& myLocale[1]=='n'))&&
- (myLocale[2]=='_' || myLocale[2]=='\0'))
- {
- if(version>2) {
- // ICU 55 fails to open a converter for an unsupported version.
- // Previously, it fell back to version 0, but that would yield
- // unexpected behavior.
- *errorCode = U_MISSING_RESOURCE_ERROR;
- return; /* NOTE(review): cnv->extraInfo is still allocated here; confirm the caller releases it on failure */
- }
- /* open the required converters and cache them */
- myConverterData->myConverterArray[GB2312_1] =
- ucnv_loadSharedData("ibm-5478", &stackPieces, &stackArgs, errorCode);
- if(version==1) {
- myConverterData->myConverterArray[ISO_IR_165] =
- ucnv_loadSharedData("iso-ir-165", &stackPieces, &stackArgs, errorCode);
- }
- myConverterData->myConverterArray[CNS_11643] =
- ucnv_loadSharedData("cns-11643-1992", &stackPieces, &stackArgs, errorCode);
- /* set the function pointers to appropriate functions */
- cnv->sharedData=(UConverterSharedData*)&_ISO2022CNData;
- uprv_strcpy(myConverterData->locale,"cn");
- if (version==0){
- myConverterData->version = 0;
- (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=0");
- }else if (version==1){
- myConverterData->version = 1;
- (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=1");
- }else {
- myConverterData->version = 2;
- (void)uprv_strcpy(myConverterData->name,"ISO_2022,locale=zh,version=2");
- }
- }
- #endif // !UCONFIG_ONLY_HTML_CONVERSION
- else{
- #ifdef U_ENABLE_GENERIC_ISO_2022
- myConverterData->isFirstBuffer = true;
- /* append the UTF-8 escape sequence */
- cnv->charErrorBufferLength = 3;
- cnv->charErrorBuffer[0] = 0x1b;
- cnv->charErrorBuffer[1] = 0x25;
- cnv->charErrorBuffer[2] = 0x42;
- cnv->sharedData=(UConverterSharedData*)&_ISO2022Data;
- /* initialize the state variables */
- uprv_strcpy(myConverterData->name,"ISO_2022");
- #else
- *errorCode = U_MISSING_RESOURCE_ERROR;
- // Was U_UNSUPPORTED_ERROR but changed in ICU 55 to a more standard
- // data loading error code.
- return; /* NOTE(review): cnv->extraInfo is still allocated here; confirm the caller releases it on failure */
- #endif
- }
- cnv->maxBytesPerUChar=cnv->sharedData->staticData->maxBytesPerChar;
- if(U_FAILURE(*errorCode) || pArgs->onlyTestIsLoadable) { /* a load failed, or this was only a loadability test: release everything again */
- _ISO2022Close(cnv);
- }
- } else {
- *errorCode = U_MEMORY_ALLOCATION_ERROR;
- }
- }
- static void U_CALLCONV
- _ISO2022Close(UConverter *converter) {
- UConverterDataISO2022* myData =(UConverterDataISO2022 *) (converter->extraInfo);
- UConverterSharedData **array = myData->myConverterArray;
- int32_t i;
- if (converter->extraInfo != nullptr) {
- /*close the array of converter pointers and free the memory*/
- for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
- if(array[i]!=nullptr) {
- ucnv_unloadSharedDataIfReady(array[i]);
- }
- }
- ucnv_close(myData->currentConverter);
- if(!converter->isExtraLocal){
- uprv_free (converter->extraInfo);
- converter->extraInfo = nullptr;
- }
- }
- }
- static void U_CALLCONV
- _ISO2022Reset(UConverter *converter, UConverterResetChoice choice) {
- UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) (converter->extraInfo);
- if(choice<=UCNV_RESET_TO_UNICODE) {
- uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State));
- myConverterData->key = 0;
- myConverterData->isEmptySegment = false;
- }
- if(choice!=UCNV_RESET_TO_UNICODE) {
- uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State));
- }
- #ifdef U_ENABLE_GENERIC_ISO_2022
- if(myConverterData->locale[0] == 0){
- if(choice<=UCNV_RESET_TO_UNICODE) {
- myConverterData->isFirstBuffer = true;
- myConverterData->key = 0;
- if (converter->mode == UCNV_SO){
- ucnv_close (myConverterData->currentConverter);
- myConverterData->currentConverter=nullptr;
- }
- converter->mode = UCNV_SI;
- }
- if(choice!=UCNV_RESET_TO_UNICODE) {
- /* re-append UTF-8 escape sequence */
- converter->charErrorBufferLength = 3;
- converter->charErrorBuffer[0] = 0x1b;
- converter->charErrorBuffer[1] = 0x28;
- converter->charErrorBuffer[2] = 0x42;
- }
- }
- else
- #endif
- {
- /* reset the state variables */
- if(myConverterData->locale[0] == 'k'){
- if(choice<=UCNV_RESET_TO_UNICODE) {
- setInitialStateToUnicodeKR(converter, myConverterData);
- }
- if(choice!=UCNV_RESET_TO_UNICODE) {
- setInitialStateFromUnicodeKR(converter, myConverterData);
- }
- }
- }
- }
- U_CDECL_BEGIN
- static const char * U_CALLCONV
- _ISO2022getName(const UConverter* cnv){
- if(cnv->extraInfo){
- UConverterDataISO2022* myData= (UConverterDataISO2022*)cnv->extraInfo;
- return myData->name;
- }
- return nullptr;
- }
- U_CDECL_END
- /*************** to unicode *******************/
- /****************************************************************************
- * Recognized escape sequences are
- * <ESC>(B ASCII
- * <ESC>.A ISO-8859-1
- * <ESC>.F ISO-8859-7
- * <ESC>(J JISX-201
- * <ESC>(I JISX-201
- * <ESC>$B JISX-208
- * <ESC>$@ JISX-208
- * <ESC>$(D JISX-212
- * <ESC>$A GB2312
- * <ESC>$(C KSC5601
- *
- * nextStateToUnicodeJP[] is indexed by the table offset that getKey_2022()
- * reports for a completed escape sequence (see changeState_2022()) and gives
- * the state that the ISO-2022-JP converter switches to. INVALID_STATE marks
- * sequences that this variant does not support; changeState_2022() turns
- * those into U_UNSUPPORTED_ESCAPE_SEQUENCE.
- */
- static const int8_t nextStateToUnicodeJP[MAX_STATES_2022]= {
- /* 0 1 2 3 4 5 6 7 8 9 */
- INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
- ,ASCII ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,JISX201 ,HWKANA_7BIT ,JISX201 ,INVALID_STATE
- ,INVALID_STATE ,INVALID_STATE ,JISX208 ,GB2312 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
- ,ISO8859_1 ,ISO8859_7 ,JISX208 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,KSC5601 ,JISX212 ,INVALID_STATE
- ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
- ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
- ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
- ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
- };
- #if !UCONFIG_ONLY_HTML_CONVERSION
- /*************** to unicode *******************
-  * ISO-2022-CN counterpart of nextStateToUnicodeJP[]: indexed by the table
-  * offset that getKey_2022() reports for a completed escape sequence (see
-  * changeState_2022()), it gives the state the converter switches to.
-  * INVALID_STATE marks sequences that this variant does not support.
-  * Each row below holds ten entries; the column header counts 0..9.
-  */
- static const int8_t nextStateToUnicodeCN[MAX_STATES_2022]= {
- /* 0 1 2 3 4 5 6 7 8 9 */
- INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,SS2_STATE ,SS3_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
- ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
- ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
- ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
- ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,GB2312_1 ,INVALID_STATE ,ISO_IR_165
- ,CNS_11643_1 ,CNS_11643_2 ,CNS_11643_3 ,CNS_11643_4 ,CNS_11643_5 ,CNS_11643_6 ,CNS_11643_7 ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
- ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
- ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE ,INVALID_STATE
- };
- #endif
- static UCNV_TableStates_2022
- getKey_2022(char c,int32_t* key,int32_t* offset){
- int32_t togo;
- int32_t low = 0;
- int32_t hi = MAX_STATES_2022;
- int32_t oldmid=0;
- togo = normalize_esq_chars_2022[(uint8_t)c];
- if(togo == 0) {
- /* not a valid character anywhere in an escape sequence */
- *key = 0;
- *offset = 0;
- return INVALID_2022;
- }
- togo = (*key << 5) + togo;
- while (hi != low) /*binary search*/{
- int32_t mid = (hi+low) >> 1; /*Finds median*/
- if (mid == oldmid)
- break;
- if (escSeqStateTable_Key_2022[mid] > togo){
- hi = mid;
- }
- else if (escSeqStateTable_Key_2022[mid] < togo){
- low = mid;
- }
- else /*we found it*/{
- *key = togo;
- *offset = mid;
- return (UCNV_TableStates_2022)escSeqStateTable_Value_2022[mid];
- }
- oldmid = mid;
- }
- *key = 0;
- *offset = 0;
- return INVALID_2022;
- }
- /*runs through a state machine to determine the escape sequence - codepage correspondence
-  *
-  * Consumes bytes from *source (up to sourceLimit), appending each one to
-  * _this->toUBytes and feeding it to getKey_2022(), until the escape sequence
-  * is complete, is found to be invalid, or the input runs out.
-  *
-  * - Input ran out in mid-sequence: the partial key is saved in
-  *   myData2022->key (non-zero) and the function returns without an error;
-  *   the next call starts again from that saved key.
-  * - Complete sequence: the variant selected by var decides what it means
-  *   and updates the to-Unicode state (toU2022State.cs[], .g, .prevG) or,
-  *   for the generic converter, opens the named sub-converter.
-  * - Errors: *err is set to U_ILLEGAL_ESCAPE_SEQUENCE or
-  *   U_UNSUPPORTED_ESCAPE_SEQUENCE; for the latter toUCallbackReason is set
-  *   to UCNV_UNASSIGNED.
-  *
-  * On success toULength is reset to 0. For an illegal sequence only the
-  * first byte is left in toUBytes; the other bytes are backed out of
-  * *source or, if they came from an earlier buffer, queued in preToU for
-  * replay.
- */
- static void
- changeState_2022(UConverter* _this,
- const char** source,
- const char* sourceLimit,
- Variant2022 var,
- UErrorCode* err){
- UCNV_TableStates_2022 value;
- UConverterDataISO2022* myData2022 = ((UConverterDataISO2022*)_this->extraInfo);
- uint32_t key = myData2022->key; /* non-zero when an earlier call stopped in mid-sequence */
- int32_t offset = 0;
- int8_t initialToULength = _this->toULength; /* bytes already held from a previous buffer */
- char c;
- value = VALID_NON_TERMINAL_2022;
- while (*source < sourceLimit) {
- c = *(*source)++;
- _this->toUBytes[_this->toULength++]=(uint8_t)c;
- value = getKey_2022(c,(int32_t *) &key, &offset);
- switch (value){
- case VALID_NON_TERMINAL_2022 :
- /* continue with the loop */
- break;
- case VALID_TERMINAL_2022:
- key = 0;
- goto DONE;
- case INVALID_2022:
- goto DONE;
- case VALID_MAYBE_TERMINAL_2022:
- #ifdef U_ENABLE_GENERIC_ISO_2022
- /* ESC ( B is ambiguous only for ISO_2022 itself */
- if(var == ISO_2022) {
- /* discard toUBytes[] for ESC ( B because this sequence is correct and complete */
- _this->toULength = 0;
- /* TODO need to indicate that ESC ( B was seen; if failure, then need to replay from source or from MBCS-style replay */
- /* continue with the loop */
- value = VALID_NON_TERMINAL_2022;
- break;
- } else
- #endif
- {
- /* not ISO_2022 itself, finish here */
- value = VALID_TERMINAL_2022;
- key = 0;
- goto DONE;
- }
- }
- }
- DONE:
- myData2022->key = key;
- if (value == VALID_NON_TERMINAL_2022) {
- /* indicate that the escape sequence is incomplete: key!=0 */
- return;
- } else if (value == INVALID_2022 ) {
- *err = U_ILLEGAL_ESCAPE_SEQUENCE;
- } else /* value == VALID_TERMINAL_2022 */ {
- switch(var){
- #ifdef U_ENABLE_GENERIC_ISO_2022
- case ISO_2022:
- {
- const char *chosenConverterName = escSeqStateTable_Result_2022[offset];
- if(chosenConverterName == nullptr) {
- /* SS2 or SS3 */
- *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
- _this->toUCallbackReason = UCNV_UNASSIGNED;
- return;
- }
- _this->mode = UCNV_SI;
- ucnv_close(myData2022->currentConverter);
- myData2022->currentConverter = myUConverter = ucnv_open(chosenConverterName, err); /* NOTE(review): myUConverter is not declared in this function -- confirm this branch still compiles */
- if(U_SUCCESS(*err)) {
- myUConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
- _this->mode = UCNV_SO;
- }
- break;
- }
- #endif
- case ISO_2022_JP:
- {
- StateEnum tempState=(StateEnum)nextStateToUnicodeJP[offset];
- switch(tempState) {
- case INVALID_STATE:
- *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
- break;
- case SS2_STATE:
- if(myData2022->toU2022State.cs[2]!=0) {
- if(myData2022->toU2022State.g<2) {
- myData2022->toU2022State.prevG=myData2022->toU2022State.g; /* remember the current g so it can be restored after the single shift */
- }
- myData2022->toU2022State.g=2;
- } else {
- /* illegal to have SS2 before a matching designator */
- *err = U_ILLEGAL_ESCAPE_SEQUENCE;
- }
- break;
- /* case SS3_STATE: not used in ISO-2022-JP-x */
- case ISO8859_1:
- case ISO8859_7:
- if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
- *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
- } else {
- /* G2 charset for SS2 */
- myData2022->toU2022State.cs[2]=(int8_t)tempState;
- }
- break;
- default:
- if((jpCharsetMasks[myData2022->version] & CSM(tempState)) == 0) {
- *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
- } else {
- /* G0 charset */
- myData2022->toU2022State.cs[0]=(int8_t)tempState;
- }
- break;
- }
- }
- break;
- #if !UCONFIG_ONLY_HTML_CONVERSION
- case ISO_2022_CN:
- {
- StateEnum tempState=(StateEnum)nextStateToUnicodeCN[offset];
- switch(tempState) {
- case INVALID_STATE:
- *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
- break;
- case SS2_STATE:
- if(myData2022->toU2022State.cs[2]!=0) {
- if(myData2022->toU2022State.g<2) {
- myData2022->toU2022State.prevG=myData2022->toU2022State.g;
- }
- myData2022->toU2022State.g=2;
- } else {
- /* illegal to have SS2 before a matching designator */
- *err = U_ILLEGAL_ESCAPE_SEQUENCE;
- }
- break;
- case SS3_STATE:
- if(myData2022->toU2022State.cs[3]!=0) {
- if(myData2022->toU2022State.g<2) {
- myData2022->toU2022State.prevG=myData2022->toU2022State.g;
- }
- myData2022->toU2022State.g=3;
- } else {
- /* illegal to have SS3 before a matching designator */
- *err = U_ILLEGAL_ESCAPE_SEQUENCE;
- }
- break;
- case ISO_IR_165:
- if(myData2022->version==0) {
- *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
- break;
- }
- U_FALLTHROUGH;
- case GB2312_1:
- U_FALLTHROUGH;
- case CNS_11643_1:
- myData2022->toU2022State.cs[1]=(int8_t)tempState; /* these three charsets are designated to cs[1] */
- break;
- case CNS_11643_2:
- myData2022->toU2022State.cs[2]=(int8_t)tempState;
- break;
- default:
- /* other CNS 11643 planes */
- if(myData2022->version==0) {
- *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
- } else {
- myData2022->toU2022State.cs[3]=(int8_t)tempState;
- }
- break;
- }
- }
- break;
- case ISO_2022_KR:
- if(offset==0x30){
- /* nothing to be done, just accept this one escape sequence */
- } else {
- *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
- }
- break;
- #endif // !UCONFIG_ONLY_HTML_CONVERSION
- default:
- *err = U_ILLEGAL_ESCAPE_SEQUENCE;
- break;
- }
- }
- if(U_SUCCESS(*err)) {
- _this->toULength = 0;
- } else if(*err==U_ILLEGAL_ESCAPE_SEQUENCE) {
- if(_this->toULength>1) {
- /*
- * Ticket 5691: consistent illegal sequences:
- * - We include at least the first byte (ESC) in the illegal sequence.
- * - If any of the non-initial bytes could be the start of a character,
- * we stop the illegal sequence before the first one of those.
- * In escape sequences, all following bytes are "printable", that is,
- * unless they are completely illegal (>7f in SBCS, outside 21..7e in DBCS),
- * they are valid single/lead bytes.
- * For simplicity, we always only report the initial ESC byte as the
- * illegal sequence and back out all other bytes we looked at.
- */
- /* Back out some bytes. */
- int8_t backOutDistance=_this->toULength-1;
- int8_t bytesFromThisBuffer=_this->toULength-initialToULength;
- if(backOutDistance<=bytesFromThisBuffer) {
- /* same as initialToULength<=1 */
- *source-=backOutDistance;
- } else {
- /* Back out bytes from the previous buffer: Need to replay them. */
- _this->preToULength=(int8_t)(bytesFromThisBuffer-backOutDistance);
- /* same as -(initialToULength-1) */
- /* preToULength is negative! */
- uprv_memcpy(_this->preToU, _this->toUBytes+1, -_this->preToULength);
- *source-=bytesFromThisBuffer;
- }
- _this->toULength=1;
- }
- } else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) {
- _this->toUCallbackReason = UCNV_UNASSIGNED;
- }
- }
- #if !UCONFIG_ONLY_HTML_CONVERSION
- /*Checks the characters of the buffer against valid 2022 escape sequences
- *if the match we return a pointer to the initial start of the sequence otherwise
- *we return sourceLimit
- */
- /*for 2022 looks ahead in the stream
- *to determine the longest possible convertible
- *data stream
- */
- static inline const char*
- getEndOfBuffer_2022(const char** source,
- const char* sourceLimit,
- UBool /*flush*/){
- const char* mySource = *source;
- #ifdef U_ENABLE_GENERIC_ISO_2022
- if (*source >= sourceLimit)
- return sourceLimit;
- do{
- if (*mySource == ESC_2022){
- int8_t i;
- int32_t key = 0;
- int32_t offset;
- UCNV_TableStates_2022 value = VALID_NON_TERMINAL_2022;
- /* Kludge: I could not
- * figure out the reason for validating an escape sequence
- * twice - once here and once in changeState_2022().
- * is it possible to have an ESC character in a ISO2022
- * byte stream which is valid in a code page? Is it legal?
- */
- for (i=0;
- (mySource+i < sourceLimit)&&(value == VALID_NON_TERMINAL_2022);
- i++) {
- value = getKey_2022(*(mySource+i), &key, &offset);
- }
- if (value > 0 || *mySource==ESC_2022)
- return mySource;
- if ((value == VALID_NON_TERMINAL_2022)&&(!flush) )
- return sourceLimit;
- }
- }while (++mySource < sourceLimit);
- return sourceLimit;
- #else
- while(mySource < sourceLimit && *mySource != ESC_2022) {
- ++mySource;
- }
- return mySource;
- #endif
- }
- #endif
- /* This inline function replicates code in _MBCSFromUChar32() function in ucnvmbcs.c
- * any future change in _MBCSFromUChar32() function should be reflected here.
- * @return number of bytes in *value; negative number if fallback; 0 if no mapping
- */
- static inline int32_t
- MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData,
- UChar32 c,
- uint32_t* value,
- UBool useFallback,
- int outputType)
- {
- const int32_t *cx;
- const uint16_t *table;
- uint32_t stage2Entry;
- uint32_t myValue;
- int32_t length;
- const uint8_t *p;
- /*
- * TODO(markus): Use and require new, faster MBCS conversion table structures.
- * Use internal version of ucnv_open() that verifies that the new structures are available,
- * else U_INTERNAL_PROGRAM_ERROR.
- */
- /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
- if(c<0x10000 || (sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
- table=sharedData->mbcs.fromUnicodeTable;
- stage2Entry=MBCS_STAGE_2_FROM_U(table, c);
- /* get the bytes and the length for the output */
- if(outputType==MBCS_OUTPUT_2){
- myValue=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
- if(myValue<=0xff) {
- length=1;
- } else {
- length=2;
- }
- } else /* outputType==MBCS_OUTPUT_3 */ {
- p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c);
- myValue=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];
- if(myValue<=0xff) {
- length=1;
- } else if(myValue<=0xffff) {
- length=2;
- } else {
- length=3;
- }
- }
- /* is this code point assigned, or do we use fallbacks? */
- if((stage2Entry&(1<<(16+(c&0xf))))!=0) {
- /* assigned */
- *value=myValue;
- return length;
- } else if(FROM_U_USE_FALLBACK(useFallback, c) && myValue!=0) {
- /*
- * We allow a 0 byte output if the "assigned" bit is set for this entry.
- * There is no way with this data structure for fallback output
- * to be a zero byte.
- */
- *value=myValue;
- return -length;
- }
- }
- cx=sharedData->mbcs.extIndexes;
- if(cx!=nullptr) {
- return ucnv_extSimpleMatchFromU(cx, c, value, useFallback);
- }
- /* unassigned */
- return 0;
- }
- /* This inline function replicates code in _MBCSSingleFromUChar32() function in ucnvmbcs.c
- * any future change in _MBCSSingleFromUChar32() function should be reflected here.
- * @param retval pointer to output byte
- * @return 1 roundtrip byte 0 no mapping -1 fallback byte
- */
- static inline int32_t
- MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData,
- UChar32 c,
- uint32_t* retval,
- UBool useFallback)
- {
- const uint16_t *table;
- int32_t value;
- /* BMP-only codepages are stored without stage 1 entries for supplementary code points */
- if(c>=0x10000 && !(sharedData->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
- return 0;
- }
- /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */
- table=sharedData->mbcs.fromUnicodeTable;
- /* get the byte for the output */
- value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)sharedData->mbcs.fromUnicodeBytes, c);
- /* is this code point assigned, or do we use fallbacks? */
- *retval=(uint32_t)(value&0xff);
- if(value>=0xf00) {
- return 1; /* roundtrip */
- } else if(useFallback ? value>=0x800 : value>=0xc00) {
- return -1; /* fallback taken */
- } else {
- return 0; /* no mapping */
- }
- }
/*
 * Converts a strict EUC DBCS value (two bytes, each in A1..FE) to the
 * ISO 2022 range 21..7E by subtracting 0x80 from each byte.
 *
 * Only the low 16 bits take part in the range check: the low 16 bits as a
 * pair must lie in A1A1..FEFE and the low byte alone in A1..FE.
 *
 * Returns value - 0x8080 when the check passes, 0 if out of range.
 */
static inline uint32_t
_2022FromGR94DBCS(uint32_t value) {
    const uint16_t pairOffset = (uint16_t)(value - 0xa1a1);
    const uint8_t trailOffset = (uint8_t)(value - 0xa1);

    if(pairOffset > (0xfefe - 0xa1a1) || trailOffset > (0xfe - 0xa1)) {
        return 0; /* not valid for ISO 2022 */
    }
    return value - 0x8080; /* shift down to 21..7e byte range */
}
- #if 0 /* 5691: Call sites now check for validity. They can just += 0x8080 after that. */
- /*
- * This method does the reverse of _2022FromGR94DBCS(). Given the 2022 code point, it returns the
- * 2 byte value that is in the range A1..FE for each byte. Otherwise it returns the 2022 code point
- * unchanged.
- *
- * This function is compiled out by the surrounding #if 0 and is kept for reference only.
- * Note the asymmetry with _2022FromGR94DBCS(): an out-of-range input is returned as is here,
- * whereas _2022FromGR94DBCS() returns 0 for one.
- */
- static inline uint32_t
- _2022ToGR94DBCS(uint32_t value) {
- uint32_t returnValue = value + 0x8080; /* add 0x80 to each of the two bytes */
- if( (uint16_t)(returnValue - 0xa1a1) <= (0xfefe - 0xa1a1) &&
- (uint8_t)(returnValue - 0xa1) <= (0xfe - 0xa1)) {
- return returnValue;
- } else {
- return value;
- }
- }
- #endif
- #ifdef U_ENABLE_GENERIC_ISO_2022
- /**********************************************************************************
- * ISO-2022 Converter
- *
- * To-Unicode conversion for the generic ISO-2022 converter (only built when
- * U_ENABLE_GENERIC_ISO_2022 is defined).
- *
- * The loop alternates between two steps until the source is used up:
- *  1. If no escape sequence is pending (myData->key == 0), find the next ESC
- *     with getEndOfBuffer_2022() and convert everything before it with
- *     myData->currentConverter (opened as "ASCII" if none is set yet).
- *  2. Let changeState_2022() consume the escape sequence.
- *
- * The function returns early on any error, on buffer overflow (after moving
- * the sub-converter's overflow buffer into this converter), and whenever
- * offsets are requested and something was consumed or produced, so that the
- * calling code can update its offsets.
- */
- static void U_CALLCONV
- T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC(UConverterToUnicodeArgs* args,
- UErrorCode* err){
- const char* mySourceLimit, *realSourceLimit;
- const char* sourceStart;
- const char16_t* myTargetStart;
- UConverter* saveThis;
- UConverterDataISO2022* myData;
- int8_t length;
- saveThis = args->converter;
- myData=((UConverterDataISO2022*)(saveThis->extraInfo));
- realSourceLimit = args->sourceLimit;
- while (args->source < realSourceLimit) {
- if(myData->key == 0) { /* are we in the middle of an escape sequence? */
- /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
- mySourceLimit = getEndOfBuffer_2022(&(args->source), realSourceLimit, args->flush);
- if(args->source < mySourceLimit) {
- if(myData->currentConverter==nullptr) {
- myData->currentConverter = ucnv_open("ASCII",err);
- if(U_FAILURE(*err)){
- return;
- }
- myData->currentConverter->fromCharErrorBehaviour = UCNV_TO_U_CALLBACK_STOP;
- saveThis->mode = UCNV_SO;
- }
- /* convert to before the ESC or until the end of the buffer */
- myData->isFirstBuffer=false;
- sourceStart = args->source;
- myTargetStart = args->target;
- args->converter = myData->currentConverter; /* temporarily point args at the sub-converter; restored right after the call */
- ucnv_toUnicode(args->converter,
- &args->target,
- args->targetLimit,
- &args->source,
- mySourceLimit,
- args->offsets,
- (UBool)(args->flush && mySourceLimit == realSourceLimit),
- err);
- args->converter = saveThis;
- if (*err == U_BUFFER_OVERFLOW_ERROR) {
- /* move the overflow buffer */
- length = saveThis->UCharErrorBufferLength = myData->currentConverter->UCharErrorBufferLength;
- myData->currentConverter->UCharErrorBufferLength = 0;
- if(length > 0) {
- uprv_memcpy(saveThis->UCharErrorBuffer,
- myData->currentConverter->UCharErrorBuffer,
- length*U_SIZEOF_UCHAR);
- }
- return;
- }
- /*
- * At least one of:
- * -Error while converting
- * -Done with entire buffer
- * -Need to write offsets or update the current offset
- * (leave that up to the code in ucnv.c)
- *
- * or else we just stopped at an ESC byte and continue with changeState_2022()
- */
- if (U_FAILURE(*err) ||
- (args->source == realSourceLimit) ||
- (args->offsets != nullptr && (args->target != myTargetStart || args->source != sourceStart) ||
- (mySourceLimit < realSourceLimit && myData->currentConverter->toULength > 0))
- ) {
- /* copy partial or error input for truncated detection and error handling */
- if(U_FAILURE(*err)) {
- length = saveThis->invalidCharLength = myData->currentConverter->invalidCharLength;
- if(length > 0) {
- uprv_memcpy(saveThis->invalidCharBuffer, myData->currentConverter->invalidCharBuffer, length);
- }
- } else {
- length = saveThis->toULength = myData->currentConverter->toULength;
- if(length > 0) {
- uprv_memcpy(saveThis->toUBytes, myData->currentConverter->toUBytes, length);
- if(args->source < mySourceLimit) {
- *err = U_TRUNCATED_CHAR_FOUND; /* truncated input before ESC */
- }
- }
- }
- return;
- }
- }
- }
- sourceStart = args->source;
- changeState_2022(args->converter,
- &(args->source),
- realSourceLimit,
- ISO_2022,
- err);
- if (U_FAILURE(*err) || (args->source != sourceStart && args->offsets != nullptr)) {
- /* let the ucnv.c code update its current offset */
- return;
- }
- }
- }
- #endif
- /*
- * To Unicode Callback helper function
- */
- static void
- toUnicodeCallback(UConverter *cnv,
- const uint32_t sourceChar, const uint32_t targetUniChar,
- UErrorCode* err){
- if(sourceChar>0xff){
- cnv->toUBytes[0] = (uint8_t)(sourceChar>>8);
- cnv->toUBytes[1] = (uint8_t)sourceChar;
- cnv->toULength = 2;
- }
- else{
- cnv->toUBytes[0] =(char) sourceChar;
- cnv->toULength = 1;
- }
- if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){
- *err = U_INVALID_CHAR_FOUND;
- }
- else{
- *err = U_ILLEGAL_CHAR_FOUND;
- }
- }
- /**************************************ISO-2022-JP*************************************************/
- /************************************** IMPORTANT **************************************************
- * The UConverter_fromUnicode_ISO2022_JP converter does not use ucnv_fromUnicode() functions for SBCS,DBCS and
- * MBCS; instead, the values are obtained directly by calling _MBCSFromUChar32().
- * The converter iterates over each Unicode codepoint
- * to obtain the equivalent codepoints from the codepages supported. Since the source buffer is
- * processed one char at a time it would make sense to reduce the extra processing a canned converter
- * would do as far as possible.
- *
- * If the implementation of these macros or structure of sharedData struct change in the future, make
- * sure that ISO-2022 is also changed.
- ***************************************************************************************************
- */
- /***************************************************************************************************
- * Rules for ISO-2022-jp encoding
- * (i) Escape sequences must be fully contained within a line they should not
- * span new lines or CRs
- * (ii) If the last character on a line is represented by two bytes then an ASCII or
- * JIS-Roman character escape sequence should follow before the line terminates
- * (iii) If the first character on the line is represented by two bytes then a two
- * byte character escape sequence should precede it
- * (iv) If no escape sequence is encountered then the characters are ASCII
- * (v) Latin(ISO-8859-1) and Greek(ISO-8859-7) characters must be designated to G2,
- * and invoked with SS2 (ESC N).
- * (vi) If there is any G0 designation in text, there must be a switch to
- * ASCII or to JIS X 0201-Roman before a space character (but not
- * necessarily before "ESC 4/14 2/0" or "ESC N ' '") or control
- * characters such as tab or CRLF.
- * (vii) Supported encodings:
- * ASCII, JISX201, JISX208, JISX212, GB2312, KSC5601, ISO-8859-1,ISO-8859-7
- *
- * source : RFC-1554
- *
- * JISX201, JISX208,JISX212 : new .cnv data files created
- * KSC5601 : alias to ibm-949 mapping table
- * GB2312 : alias to ibm-1386 mapping table
- * ISO-8859-1 : Algorithmic implemented as LATIN1 case
- * ISO-8859-7 : alias to ibm-9409 mapping table
- */
- /* preference order of JP charsets
-  *
-  * The entries are StateEnum values listed from most to least preferred.
-  * Note that this order differs from the numeric order of the enum
-  * constants, which is the order escSeqChars[] must follow.
-  * NOTE(review): presumably this is the order in which the from-Unicode code
-  * tries the charsets for a character -- the consumer is outside this part
-  * of the file; confirm there.
-  */
- static const StateEnum jpCharsetPref[]={
- ASCII,
- JISX201,
- ISO8859_1,
- JISX208,
- ISO8859_7,
- JISX212,
- GB2312,
- KSC5601,
- HWKANA_7BIT
- };
- /*
- * The escape sequences must be in order of the enum constants like JISX201 = 3,
- * not in order of jpCharsetPref[]!
- *
- * Each entry is the designator escape sequence for one charset, stored as a
- * NUL-terminated string in a 6-byte slot; the matching lengths are kept in
- * escSeqCharsLen[].
- * The bytes are spelled as hex escapes rather than as character literals.
- * NOTE(review): presumably so that the values do not depend on the
- * compiler's execution character set -- keep them as hex unless confirmed
- * otherwise.
- */
- static const char escSeqChars[][6] ={
- "\x1B\x28\x42", /* <ESC>(B ASCII */
- "\x1B\x2E\x41", /* <ESC>.A ISO-8859-1 */
- "\x1B\x2E\x46", /* <ESC>.F ISO-8859-7 */
- "\x1B\x28\x4A", /* <ESC>(J JISX-201 */
- "\x1B\x24\x42", /* <ESC>$B JISX-208 */
- "\x1B\x24\x28\x44", /* <ESC>$(D JISX-212 */
- "\x1B\x24\x41", /* <ESC>$A GB2312 */
- "\x1B\x24\x28\x43", /* <ESC>$(C KSC5601 */
- "\x1B\x28\x49" /* <ESC>(I HWKANA_7BIT */
- };
/*
 * Byte lengths of the designation escape sequences in escSeqChars[],
 * in the same (enum constant) order.
 */
static const int8_t escSeqCharsLen[] ={
    3, /* length of <ESC>(B  ASCII       */
    3, /* length of <ESC>.A  ISO-8859-1  */
    3, /* length of <ESC>.F  ISO-8859-7  */
    3, /* length of <ESC>(J  JISX-201    */
    3, /* length of <ESC>$B  JISX-208    */
    4, /* length of <ESC>$(D JISX-212    */
    3, /* length of <ESC>$A  GB2312      */
    4, /* length of <ESC>$(C KSC5601     */
    3  /* length of <ESC>(I  HWKANA_7BIT */
};
/*
 * The iteration over various code pages works this way:
 * i)   Get the currentState from myConverterData->currentState
 * ii)  Check if the character is mapped to a valid character in the currentState
 *      Yes ->  a) set the initIterState to currentState
 *              b) remain in this state until an invalid character is found
 *      No  ->  a) go to the next code page and find the character
 * iii) Before changing the state, increment the current state and check if the
 *      current state is equal to the initIterState
 *      Yes ->  A character that cannot be represented in any of the supported encodings
 *              break and return a U_INVALID_CHARACTER error
 *      No  ->  Continue and find the character in next code page
 *
 *
 * TODO: Implement a priority technique where the users are allowed to set the priority of code pages
 */
/*
 * Map 00..7F to Unicode according to JIS X 0201.
 * Only two bytes differ from the identity mapping:
 * 5C maps to U+00A5 and 7E maps to U+203E.
 * Any other value, including one above 7F, is returned unchanged.
 */
static inline uint32_t
jisx201ToU(uint32_t value) {
    if(value == 0x5c) {
        return 0xa5;
    }
    if(value == 0x7e) {
        return 0x203e;
    }
    return value;
}
/*
 * Map Unicode to 00..7F according to JIS X 0201.
 * Return U+FFFE if unmappable.
 * U+005C and U+007E are unmappable because their byte values are taken by
 * U+00A5 and U+203E respectively.
 */
static inline uint32_t
jisx201FromU(uint32_t value) {
    if(value<=0x7f) {
        if(value!=0x5c && value!=0x7e) {
            return value;
        }
        /* 5C and 7E fall through to the unmappable result */
    } else if(value==0xa5) {
        return 0x5c;
    } else if(value==0x203e) {
        return 0x7e;
    }
    return 0xfffe;
}
/*
 * Take a valid Shift-JIS byte pair, check that it is in the range corresponding
 * to JIS X 0208, and convert it to a pair of 21..7E bytes.
 * Return 0 if the byte pair is out of range (above 0xEFFC).
 *
 * value holds the lead byte in bits 15..8 and the trail byte in bits 7..0;
 * the result uses the same layout.
 */
static inline uint32_t
_2022FromSJIS(uint32_t value) {
    uint8_t trail;

    if(value > 0xEFFC) {
        return 0;  /* beyond JIS X 0208 */
    }

    trail = (uint8_t)value;

    value &= 0xff00;  /* lead byte */
    if(value <= 0x9f00) {
        value -= 0x7000;
    } else /* 0xe000 <= value <= 0xef00 */ {
        value -= 0xb000;
    }
    /* each Shift-JIS lead byte covers two JIS X 0208 rows */
    value <<= 1;

    if(trail <= 0x9e) {
        /* first (odd) row of the pair */
        value -= 0x100;
        if(trail <= 0x7e) {
            value |= trail - 0x1f;
        } else {
            value |= trail - 0x20;
        }
    } else /* trail <= 0xfc */ {
        /* second (even) row of the pair */
        value |= trail - 0x7e;
    }
    return value;
}
/*
 * Convert a pair of JIS X 0208 21..7E bytes to Shift-JIS.
 * If either byte is outside 21..7E make sure that the result is not valid
 * for Shift-JIS so that the converter catches it.
 * Some invalid byte values already turn into equally invalid Shift-JIS
 * byte values and need not be tested explicitly.
 *
 * c1, c2: the two input bytes.
 * bytes:  receives the Shift-JIS lead byte in bytes[0] and the trail byte in
 *         bytes[1]; a byte detected as invalid is written as 0.
 */
static inline void
_2022ToSJIS(uint8_t c1, uint8_t c2, char bytes[2]) {
    if(c1&1) {
        /* odd row: first half of the Shift-JIS trail byte range */
        ++c1;
        if(c2 <= 0x5f) {
            c2 += 0x1f;
        } else if(c2 <= 0x7e) {
            c2 += 0x20;
        } else {
            c2 = 0;  /* invalid */
        }
    } else {
        /* even row: second half of the Shift-JIS trail byte range */
        if((uint8_t)(c2-0x21) <= ((0x7e)-0x21)) {
            c2 += 0x7e;
        } else {
            c2 = 0;  /* invalid */
        }
    }
    /* two rows share one Shift-JIS lead byte */
    c1 >>= 1;
    if(c1 <= 0x2f) {
        c1 += 0x70;
    } else if(c1 <= 0x3f) {
        c1 += 0xb0;
    } else {
        c1 = 0;  /* invalid */
    }
    bytes[0] = (char)c1;
    bytes[1] = (char)c2;
}
- /*
- * JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width (DBCS)
- * Katakana.
- * Now that we use a Shift-JIS table for JIS X 0208 we need to hardcode these fallbacks
- * because Shift-JIS roundtrips half-width Katakana to single bytes.
- * These were the only fallbacks in ICU's jisx-208.ucm file.
- */
- static const uint16_t hwkana_fb[HWKANA_END - HWKANA_START + 1] = {
- 0x2123, /* U+FF61 */
- 0x2156,
- 0x2157,
- 0x2122,
- 0x2126,
- 0x2572,
- 0x2521,
- 0x2523,
- 0x2525,
- 0x2527,
- 0x2529,
- 0x2563,
- 0x2565,
- 0x2567,
- 0x2543,
- 0x213C, /* U+FF70 */
- 0x2522,
- 0x2524,
- 0x2526,
- 0x2528,
- 0x252A,
- 0x252B,
- 0x252D,
- 0x252F,
- 0x2531,
- 0x2533,
- 0x2535,
- 0x2537,
- 0x2539,
- 0x253B,
- 0x253D,
- 0x253F, /* U+FF80 */
- 0x2541,
- 0x2544,
- 0x2546,
- 0x2548,
- 0x254A,
- 0x254B,
- 0x254C,
- 0x254D,
- 0x254E,
- 0x254F,
- 0x2552,
- 0x2555,
- 0x2558,
- 0x255B,
- 0x255E,
- 0x255F, /* U+FF90 */
- 0x2560,
- 0x2561,
- 0x2562,
- 0x2564,
- 0x2566,
- 0x2568,
- 0x2569,
- 0x256A,
- 0x256B,
- 0x256C,
- 0x256D,
- 0x256F,
- 0x2573,
- 0x212B,
- 0x212C /* U+FF9F */
- };
- static void U_CALLCONV
- UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err) {
- UConverter *cnv = args->converter;
- UConverterDataISO2022 *converterData;
- ISO2022State *pFromU2022State;
- uint8_t *target = (uint8_t *) args->target;
- const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
- const char16_t* source = args->source;
- const char16_t* sourceLimit = args->sourceLimit;
- int32_t* offsets = args->offsets;
- UChar32 sourceChar;
- char buffer[8];
- int32_t len, outLen;
- int8_t choices[10];
- int32_t choiceCount;
- uint32_t targetValue = 0;
- UBool useFallback;
- int32_t i;
- int8_t cs, g;
- /* set up the state */
- converterData = (UConverterDataISO2022*)cnv->extraInfo;
- pFromU2022State = &converterData->fromU2022State;
- choiceCount = 0;
- /* check if the last codepoint of previous buffer was a lead surrogate*/
- if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
- goto getTrail;
- }
- while(source < sourceLimit) {
- if(target < targetLimit) {
- sourceChar = *(source++);
- /*check if the char is a First surrogate*/
- if(U16_IS_SURROGATE(sourceChar)) {
- if(U16_IS_SURROGATE_LEAD(sourceChar)) {
- getTrail:
- /*look ahead to find the trail surrogate*/
- if(source < sourceLimit) {
- /* test the following code unit */
- char16_t trail=(char16_t) *source;
- if(U16_IS_TRAIL(trail)) {
- source++;
- sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
- cnv->fromUChar32=0x00;
- /* convert this supplementary code point */
- /* exit this condition tree */
- } else {
- /* this is an unmatched lead code unit (1st surrogate) */
- /* callback(illegal) */
- *err=U_ILLEGAL_CHAR_FOUND;
- cnv->fromUChar32=sourceChar;
- break;
- }
- } else {
- /* no more input */
- cnv->fromUChar32=sourceChar;
- break;
- }
- } else {
- /* this is an unmatched trail code unit (2nd surrogate) */
- /* callback(illegal) */
- *err=U_ILLEGAL_CHAR_FOUND;
- cnv->fromUChar32=sourceChar;
- break;
- }
- }
- /* do not convert SO/SI/ESC */
- if(IS_2022_CONTROL(sourceChar)) {
- /* callback(illegal) */
- *err=U_ILLEGAL_CHAR_FOUND;
- cnv->fromUChar32=sourceChar;
- break;
- }
- /* do the conversion */
- if(choiceCount == 0) {
- uint16_t csm;
- /*
- * The csm variable keeps track of which charsets are allowed
- * and not used yet while building the choices[].
- */
- csm = jpCharsetMasks[converterData->version];
- choiceCount = 0;
- /* JIS7/8: try single-byte half-width Katakana before JISX208 */
- if(converterData->version == 3 || converterData->version == 4) {
- choices[choiceCount++] = (int8_t)HWKANA_7BIT;
- }
- /* Do not try single-byte half-width Katakana for other versions. */
- csm &= ~CSM(HWKANA_7BIT);
- /* try the current G0 charset */
- choices[choiceCount++] = cs = pFromU2022State->cs[0];
- csm &= ~CSM(cs);
- /* try the current G2 charset */
- if((cs = pFromU2022State->cs[2]) != 0) {
- choices[choiceCount++] = cs;
- csm &= ~CSM(cs);
- }
- /* try all the other possible charsets */
- for(i = 0; i < UPRV_LENGTHOF(jpCharsetPref); ++i) {
- cs = (int8_t)jpCharsetPref[i];
- if(CSM(cs) & csm) {
- choices[choiceCount++] = cs;
- csm &= ~CSM(cs);
- }
- }
- }
- cs = g = 0;
- /*
- * len==0: no mapping found yet
- * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
- * len>0: found a roundtrip result, done
- */
- len = 0;
- /*
- * We will turn off useFallback after finding a fallback,
- * but we still get fallbacks from PUA code points as usual.
- * Therefore, we will also need to check that we don't overwrite
- * an early fallback with a later one.
- */
- useFallback = cnv->useFallback;
- for(i = 0; i < choiceCount && len <= 0; ++i) {
- uint32_t value;
- int32_t len2;
- int8_t cs0 = choices[i];
- switch(cs0) {
- case ASCII:
- if(sourceChar <= 0x7f) {
- targetValue = (uint32_t)sourceChar;
- len = 1;
- cs = cs0;
- g = 0;
- }
- break;
- case ISO8859_1:
- if(GR96_START <= sourceChar && sourceChar <= GR96_END) {
- targetValue = (uint32_t)sourceChar - 0x80;
- len = 1;
- cs = cs0;
- g = 2;
- }
- break;
- case HWKANA_7BIT:
- if((uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
- if(converterData->version==3) {
- /* JIS7: use G1 (SO) */
- /* Shift U+FF61..U+FF9F to bytes 21..5F. */
- targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0x21));
- len = 1;
- pFromU2022State->cs[1] = cs = cs0; /* do not output an escape sequence */
- g = 1;
- } else if(converterData->version==4) {
- /* JIS8: use 8-bit bytes with any single-byte charset, see escape sequence output below */
- /* Shift U+FF61..U+FF9F to bytes A1..DF. */
- targetValue = (uint32_t)(sourceChar - (HWKANA_START - 0xa1));
- len = 1;
- cs = pFromU2022State->cs[0];
- if(IS_JP_DBCS(cs)) {
- /* switch from a DBCS charset to JISX201 */
- cs = (int8_t)JISX201;
- }
- /* else stay in the current G0 charset */
- g = 0;
- }
- /* else do not use HWKANA_7BIT with other versions */
- }
- break;
- case JISX201:
- /* G0 SBCS */
- value = jisx201FromU(sourceChar);
- if(value <= 0x7f) {
- targetValue = value;
- len = 1;
- cs = cs0;
- g = 0;
- useFallback = false;
- }
- break;
- case JISX208:
- /* G0 DBCS from Shift-JIS table */
- len2 = MBCS_FROM_UCHAR32_ISO2022(
- converterData->myConverterArray[cs0],
- sourceChar, &value,
- useFallback, MBCS_OUTPUT_2);
- if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */
- value = _2022FromSJIS(value);
- if(value != 0) {
- targetValue = value;
- len = len2;
- cs = cs0;
- g = 0;
- useFallback = false;
- }
- } else if(len == 0 && useFallback &&
- (uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
- targetValue = hwkana_fb[sourceChar - HWKANA_START];
- len = -2;
- cs = cs0;
- g = 0;
- useFallback = false;
- }
- break;
- case ISO8859_7:
- /* G0 SBCS forced to 7-bit output */
- len2 = MBCS_SINGLE_FROM_UCHAR32(
- converterData->myConverterArray[cs0],
- sourceChar, &value,
- useFallback);
- if(len2 != 0 && !(len2 < 0 && len != 0) && GR96_START <= value && value <= GR96_END) {
- targetValue = value - 0x80;
- len = len2;
- cs = cs0;
- g = 2;
- useFallback = false;
- }
- break;
- default:
- /* G0 DBCS */
- len2 = MBCS_FROM_UCHAR32_ISO2022(
- converterData->myConverterArray[cs0],
- sourceChar, &value,
- useFallback, MBCS_OUTPUT_2);
- if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */
- if(cs0 == KSC5601) {
- /*
- * Check for valid bytes for the encoding scheme.
- * This is necessary because the sub-converter (windows-949)
- * has a broader encoding scheme than is valid for 2022.
- */
- value = _2022FromGR94DBCS(value);
- if(value == 0) {
- break;
- }
- }
- targetValue = value;
- len = len2;
- cs = cs0;
- g = 0;
- useFallback = false;
- }
- break;
- }
- }
- if(len != 0) {
- if(len < 0) {
- len = -len; /* fallback */
- }
- outLen = 0; /* count output bytes */
- /* write SI if necessary (only for JIS7) */
- if(pFromU2022State->g == 1 && g == 0) {
- buffer[outLen++] = UCNV_SI;
- pFromU2022State->g = 0;
- }
- /* write the designation sequence if necessary */
- if(cs != pFromU2022State->cs[g]) {
- int32_t escLen = escSeqCharsLen[cs];
- uprv_memcpy(buffer + outLen, escSeqChars[cs], escLen);
- outLen += escLen;
- pFromU2022State->cs[g] = cs;
- /* invalidate the choices[] */
- choiceCount = 0;
- }
- /* write the shift sequence if necessary */
- if(g != pFromU2022State->g) {
- switch(g) {
- /* case 0 handled before writing escapes */
- case 1:
- buffer[outLen++] = UCNV_SO;
- pFromU2022State->g = 1;
- break;
- default: /* case 2 */
- buffer[outLen++] = 0x1b;
- buffer[outLen++] = 0x4e;
- break;
- /* no case 3: no SS3 in ISO-2022-JP-x */
- }
- }
- /* write the output bytes */
- if(len == 1) {
- buffer[outLen++] = (char)targetValue;
- } else /* len == 2 */ {
- buffer[outLen++] = (char)(targetValue >> 8);
- buffer[outLen++] = (char)targetValue;
- }
- } else {
- /*
- * if we cannot find the character after checking all codepages
- * then this is an error
- */
- *err = U_INVALID_CHAR_FOUND;
- cnv->fromUChar32=sourceChar;
- break;
- }
- if(sourceChar == CR || sourceChar == LF) {
- /* reset the G2 state at the end of a line (conversion got us into ASCII or JISX201 already) */
- pFromU2022State->cs[2] = 0;
- choiceCount = 0;
- }
- /* output outLen>0 bytes in buffer[] */
- if(outLen == 1) {
- *target++ = buffer[0];
- if(offsets) {
- *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
- }
- } else if(outLen == 2 && (target + 2) <= targetLimit) {
- *target++ = buffer[0];
- *target++ = buffer[1];
- if(offsets) {
- int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
- *offsets++ = sourceIndex;
- *offsets++ = sourceIndex;
- }
- } else {
- fromUWriteUInt8(
- cnv,
- buffer, outLen,
- &target, (const char *)targetLimit,
- &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
- err);
- if(U_FAILURE(*err)) {
- break;
- }
- }
- } /* end if(myTargetIndex<myTargetLength) */
- else{
- *err =U_BUFFER_OVERFLOW_ERROR;
- break;
- }
- }/* end while(mySourceIndex<mySourceLength) */
- /*
- * the end of the input stream and detection of truncated input
- * are handled by the framework, but for ISO-2022-JP conversion
- * we need to be in ASCII mode at the very end
- *
- * conditions:
- * successful
- * in SO mode or not in ASCII mode
- * end of input and no truncated input
- */
- if( U_SUCCESS(*err) &&
- (pFromU2022State->g!=0 || pFromU2022State->cs[0]!=ASCII) &&
- args->flush && source>=sourceLimit && cnv->fromUChar32==0
- ) {
- int32_t sourceIndex;
- outLen = 0;
- if(pFromU2022State->g != 0) {
- buffer[outLen++] = UCNV_SI;
- pFromU2022State->g = 0;
- }
- if(pFromU2022State->cs[0] != ASCII) {
- int32_t escLen = escSeqCharsLen[ASCII];
- uprv_memcpy(buffer + outLen, escSeqChars[ASCII], escLen);
- outLen += escLen;
- pFromU2022State->cs[0] = (int8_t)ASCII;
- }
- /* get the source index of the last input character */
- /*
- * TODO this would be simpler and more reliable if we used a pair
- * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
- * so that we could simply use the prevSourceIndex here;
- * this code gives an incorrect result for the rare case of an unmatched
- * trail surrogate that is alone in the last buffer of the text stream
- */
- sourceIndex=(int32_t)(source-args->source);
- if(sourceIndex>0) {
- --sourceIndex;
- if( U16_IS_TRAIL(args->source[sourceIndex]) &&
- (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
- ) {
- --sourceIndex;
- }
- } else {
- sourceIndex=-1;
- }
- fromUWriteUInt8(
- cnv,
- buffer, outLen,
- &target, (const char *)targetLimit,
- &offsets, sourceIndex,
- err);
- }
- /*save the state and return */
- args->source = source;
- args->target = (char*)target;
- }
- /*************** to unicode *******************/
- static void U_CALLCONV
- UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
- UErrorCode* err){
- char tempBuf[2];
- const char *mySource = (char *) args->source;
- char16_t *myTarget = args->target;
- const char *mySourceLimit = args->sourceLimit;
- uint32_t targetUniChar = 0x0000;
- uint32_t mySourceChar = 0x0000;
- uint32_t tmpSourceChar = 0x0000;
- UConverterDataISO2022* myData;
- ISO2022State *pToU2022State;
- StateEnum cs;
- myData=(UConverterDataISO2022*)(args->converter->extraInfo);
- pToU2022State = &myData->toU2022State;
- if(myData->key != 0) {
- /* continue with a partial escape sequence */
- goto escape;
- } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
- /* continue with a partial double-byte character */
- mySourceChar = args->converter->toUBytes[0];
- args->converter->toULength = 0;
- cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
- targetUniChar = missingCharMarker;
- goto getTrailByte;
- }
- while(mySource < mySourceLimit){
- targetUniChar =missingCharMarker;
- if(myTarget < args->targetLimit){
- mySourceChar= (unsigned char) *mySource++;
- switch(mySourceChar) {
- case UCNV_SI:
- if(myData->version==3) {
- pToU2022State->g=0;
- continue;
- } else {
- /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
- myData->isEmptySegment = false; /* reset this, we have a different error */
- break;
- }
- case UCNV_SO:
- if(myData->version==3) {
- /* JIS7: switch to G1 half-width Katakana */
- pToU2022State->cs[1] = (int8_t)HWKANA_7BIT;
- pToU2022State->g=1;
- continue;
- } else {
- /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
- myData->isEmptySegment = false; /* reset this, we have a different error */
- break;
- }
- case ESC_2022:
- mySource--;
- escape:
- {
- const char * mySourceBefore = mySource;
- int8_t toULengthBefore = args->converter->toULength;
- changeState_2022(args->converter,&(mySource),
- mySourceLimit, ISO_2022_JP,err);
- /* If in ISO-2022-JP only and we successfully completed an escape sequence, but previous segment was empty, create an error */
- if(myData->version==0 && myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
- *err = U_ILLEGAL_ESCAPE_SEQUENCE;
- args->converter->toUCallbackReason = UCNV_IRREGULAR;
- args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore));
- }
- }
- /* invalid or illegal escape sequence */
- if(U_FAILURE(*err)){
- args->target = myTarget;
- args->source = mySource;
- myData->isEmptySegment = false; /* Reset to avoid future spurious errors */
- return;
- }
- /* If we successfully completed an escape sequence, we begin a new segment, empty so far */
- if(myData->key==0) {
- myData->isEmptySegment = true;
- }
- continue;
- /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */
- case CR:
- case LF:
- /* automatically reset to single-byte mode */
- if((StateEnum)pToU2022State->cs[0] != ASCII && (StateEnum)pToU2022State->cs[0] != JISX201) {
- pToU2022State->cs[0] = (int8_t)ASCII;
- }
- pToU2022State->cs[2] = 0;
- pToU2022State->g = 0;
- U_FALLTHROUGH;
- default:
- /* convert one or two bytes */
- myData->isEmptySegment = false;
- cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
- if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4 &&
- !IS_JP_DBCS(cs)
- ) {
- /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */
- targetUniChar = mySourceChar + (HWKANA_START - 0xa1);
- /* return from a single-shift state to the previous one */
- if(pToU2022State->g >= 2) {
- pToU2022State->g=pToU2022State->prevG;
- }
- } else switch(cs) {
- case ASCII:
- if(mySourceChar <= 0x7f) {
- targetUniChar = mySourceChar;
- }
- break;
- case ISO8859_1:
- if(mySourceChar <= 0x7f) {
- targetUniChar = mySourceChar + 0x80;
- }
- /* return from a single-shift state to the previous one */
- pToU2022State->g=pToU2022State->prevG;
- break;
- case ISO8859_7:
- if(mySourceChar <= 0x7f) {
- /* convert mySourceChar+0x80 to use a normal 8-bit table */
- targetUniChar =
- _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
- myData->myConverterArray[cs],
- mySourceChar + 0x80);
- }
- /* return from a single-shift state to the previous one */
- pToU2022State->g=pToU2022State->prevG;
- break;
- case JISX201:
- if(mySourceChar <= 0x7f) {
- targetUniChar = jisx201ToU(mySourceChar);
- }
- break;
- case HWKANA_7BIT:
- if((uint8_t)(mySourceChar - 0x21) <= (0x5f - 0x21)) {
- /* 7-bit halfwidth Katakana */
- targetUniChar = mySourceChar + (HWKANA_START - 0x21);
- }
- break;
- default:
- /* G0 DBCS */
- if(mySource < mySourceLimit) {
- int leadIsOk, trailIsOk;
- uint8_t trailByte;
- getTrailByte:
- trailByte = (uint8_t)*mySource;
- /*
- * Ticket 5691: consistent illegal sequences:
- * - We include at least the first byte in the illegal sequence.
- * - If any of the non-initial bytes could be the start of a character,
- * we stop the illegal sequence before the first one of those.
- *
- * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
- * an ESC/SO/SI, we report only the first byte as the illegal sequence.
- * Otherwise we convert or report the pair of bytes.
- */
- leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
- trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
- if (leadIsOk && trailIsOk) {
- ++mySource;
- tmpSourceChar = (mySourceChar << 8) | trailByte;
- if(cs == JISX208) {
- _2022ToSJIS((uint8_t)mySourceChar, trailByte, tempBuf);
- mySourceChar = tmpSourceChar;
- } else {
- /* Copy before we modify tmpSourceChar so toUnicodeCallback() sees the correct bytes. */
- mySourceChar = tmpSourceChar;
- if (cs == KSC5601) {
- tmpSourceChar += 0x8080; /* = _2022ToGR94DBCS(tmpSourceChar) */
- }
- tempBuf[0] = (char)(tmpSourceChar >> 8);
- tempBuf[1] = (char)(tmpSourceChar);
- }
- targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, false);
- } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
- /* report a pair of illegal bytes if the second byte is not a DBCS starter */
- ++mySource;
- /* add another bit so that the code below writes 2 bytes in case of error */
- mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
- }
- } else {
- args->converter->toUBytes[0] = (uint8_t)mySourceChar;
- args->converter->toULength = 1;
- goto endloop;
- }
- } /* End of inner switch */
- break;
- } /* End of outer switch */
- if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
- if(args->offsets){
- args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
- }
- *(myTarget++)=(char16_t)targetUniChar;
- }
- else if(targetUniChar > missingCharMarker){
- /* disassemble the surrogate pair and write to output*/
- targetUniChar-=0x0010000;
- *myTarget = (char16_t)(0xd800+(char16_t)(targetUniChar>>10));
- if(args->offsets){
- args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
- }
- ++myTarget;
- if(myTarget< args->targetLimit){
- *myTarget = (char16_t)(0xdc00+(char16_t)(targetUniChar&0x3ff));
- if(args->offsets){
- args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
- }
- ++myTarget;
- }else{
- args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
- (char16_t)(0xdc00+(char16_t)(targetUniChar&0x3ff));
- }
- }
- else{
- /* Call the callback function*/
- toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
- break;
- }
- }
- else{ /* goes with "if(myTarget < args->targetLimit)" way up near top of function */
- *err =U_BUFFER_OVERFLOW_ERROR;
- break;
- }
- }
- endloop:
- args->target = myTarget;
- args->source = mySource;
- }
- #if !UCONFIG_ONLY_HTML_CONVERSION
- /***************************************************************
- * Rules for ISO-2022-KR encoding
- * i) The KSC5601 designator sequence should appear only once in a file,
- * at the beginning of a line before any KSC5601 characters. This usually
- * means that it appears by itself on the first line of the file
- * ii) There are only 2 shifting sequences SO to shift into double byte mode
- * and SI to shift into single byte mode
- */
- static void U_CALLCONV
- UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterFromUnicodeArgs* args, UErrorCode* err){
- UConverter* saveConv = args->converter;
- UConverterDataISO2022 *myConverterData=(UConverterDataISO2022*)saveConv->extraInfo;
- args->converter=myConverterData->currentConverter;
- myConverterData->currentConverter->fromUChar32 = saveConv->fromUChar32;
- ucnv_MBCSFromUnicodeWithOffsets(args,err);
- saveConv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
- if(*err == U_BUFFER_OVERFLOW_ERROR) {
- if(myConverterData->currentConverter->charErrorBufferLength > 0) {
- uprv_memcpy(
- saveConv->charErrorBuffer,
- myConverterData->currentConverter->charErrorBuffer,
- myConverterData->currentConverter->charErrorBufferLength);
- }
- saveConv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
- myConverterData->currentConverter->charErrorBufferLength = 0;
- }
- args->converter=saveConv;
- }
- static void U_CALLCONV
- UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
- const char16_t *source = args->source;
- const char16_t *sourceLimit = args->sourceLimit;
- unsigned char *target = (unsigned char *) args->target;
- unsigned char *targetLimit = (unsigned char *) args->targetLimit;
- int32_t* offsets = args->offsets;
- uint32_t targetByteUnit = 0x0000;
- UChar32 sourceChar = 0x0000;
- UBool isTargetByteDBCS;
- UBool oldIsTargetByteDBCS;
- UConverterDataISO2022 *converterData;
- UConverterSharedData* sharedData;
- UBool useFallback;
- int32_t length =0;
- converterData=(UConverterDataISO2022*)args->converter->extraInfo;
- /* if the version is 1 then the user is requesting
- * conversion with ibm-25546 pass the arguments to
- * MBCS converter and return
- */
- if(converterData->version==1){
- UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
- return;
- }
- /* initialize data */
- sharedData = converterData->currentConverter->sharedData;
- useFallback = args->converter->useFallback;
- isTargetByteDBCS=(UBool)args->converter->fromUnicodeStatus;
- oldIsTargetByteDBCS = isTargetByteDBCS;
- isTargetByteDBCS = (UBool) args->converter->fromUnicodeStatus;
- if((sourceChar = args->converter->fromUChar32)!=0 && target <targetLimit) {
- goto getTrail;
- }
- while(source < sourceLimit){
- targetByteUnit = missingCharMarker;
- if(target < (unsigned char*) args->targetLimit){
- sourceChar = *source++;
- /* do not convert SO/SI/ESC */
- if(IS_2022_CONTROL(sourceChar)) {
- /* callback(illegal) */
- *err=U_ILLEGAL_CHAR_FOUND;
- args->converter->fromUChar32=sourceChar;
- break;
- }
- length = MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,&targetByteUnit,useFallback,MBCS_OUTPUT_2);
- if(length < 0) {
- length = -length; /* fallback */
- }
- /* only DBCS or SBCS characters are expected*/
- /* DB characters with high bit set to 1 are expected */
- if( length > 2 || length==0 ||
- (length == 1 && targetByteUnit > 0x7f) ||
- (length == 2 &&
- ((uint16_t)(targetByteUnit - 0xa1a1) > (0xfefe - 0xa1a1) ||
- (uint8_t)(targetByteUnit - 0xa1) > (0xfe - 0xa1)))
- ) {
- targetByteUnit=missingCharMarker;
- }
- if (targetByteUnit != missingCharMarker){
- oldIsTargetByteDBCS = isTargetByteDBCS;
- isTargetByteDBCS = (UBool)(targetByteUnit>0x00FF);
- /* append the shift sequence */
- if (oldIsTargetByteDBCS != isTargetByteDBCS ){
- if (isTargetByteDBCS)
- *target++ = UCNV_SO;
- else
- *target++ = UCNV_SI;
- if(offsets)
- *(offsets++) = (int32_t)(source - args->source-1);
- }
- /* write the targetUniChar to target */
- if(targetByteUnit <= 0x00FF){
- if( target < targetLimit){
- *(target++) = (unsigned char) targetByteUnit;
- if(offsets){
- *(offsets++) = (int32_t)(source - args->source-1);
- }
- }else{
- args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit);
- *err = U_BUFFER_OVERFLOW_ERROR;
- }
- }else{
- if(target < targetLimit){
- *(target++) =(unsigned char) ((targetByteUnit>>8) -0x80);
- if(offsets){
- *(offsets++) = (int32_t)(source - args->source-1);
- }
- if(target < targetLimit){
- *(target++) =(unsigned char) (targetByteUnit -0x80);
- if(offsets){
- *(offsets++) = (int32_t)(source - args->source-1);
- }
- }else{
- args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit -0x80);
- *err = U_BUFFER_OVERFLOW_ERROR;
- }
- }else{
- args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) ((targetByteUnit>>8) -0x80);
- args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit-0x80);
- *err = U_BUFFER_OVERFLOW_ERROR;
- }
- }
- }
- else{
- /* oops.. the code point is unassingned
- * set the error and reason
- */
- /*check if the char is a First surrogate*/
- if(U16_IS_SURROGATE(sourceChar)) {
- if(U16_IS_SURROGATE_LEAD(sourceChar)) {
- getTrail:
- /*look ahead to find the trail surrogate*/
- if(source < sourceLimit) {
- /* test the following code unit */
- char16_t trail=(char16_t) *source;
- if(U16_IS_TRAIL(trail)) {
- source++;
- sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
- *err = U_INVALID_CHAR_FOUND;
- /* convert this surrogate code point */
- /* exit this condition tree */
- } else {
- /* this is an unmatched lead code unit (1st surrogate) */
- /* callback(illegal) */
- *err=U_ILLEGAL_CHAR_FOUND;
- }
- } else {
- /* no more input */
- *err = U_ZERO_ERROR;
- }
- } else {
- /* this is an unmatched trail code unit (2nd surrogate) */
- /* callback(illegal) */
- *err=U_ILLEGAL_CHAR_FOUND;
- }
- } else {
- /* callback(unassigned) for a BMP code point */
- *err = U_INVALID_CHAR_FOUND;
- }
- args->converter->fromUChar32=sourceChar;
- break;
- }
- } /* end if(myTargetIndex<myTargetLength) */
- else{
- *err =U_BUFFER_OVERFLOW_ERROR;
- break;
- }
- }/* end while(mySourceIndex<mySourceLength) */
- /*
- * the end of the input stream and detection of truncated input
- * are handled by the framework, but for ISO-2022-KR conversion
- * we need to be in ASCII mode at the very end
- *
- * conditions:
- * successful
- * not in ASCII mode
- * end of input and no truncated input
- */
- if( U_SUCCESS(*err) &&
- isTargetByteDBCS &&
- args->flush && source>=sourceLimit && args->converter->fromUChar32==0
- ) {
- int32_t sourceIndex;
- /* we are switching to ASCII */
- isTargetByteDBCS=false;
- /* get the source index of the last input character */
- /*
- * TODO this would be simpler and more reliable if we used a pair
- * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
- * so that we could simply use the prevSourceIndex here;
- * this code gives an incorrect result for the rare case of an unmatched
- * trail surrogate that is alone in the last buffer of the text stream
- */
- sourceIndex=(int32_t)(source-args->source);
- if(sourceIndex>0) {
- --sourceIndex;
- if( U16_IS_TRAIL(args->source[sourceIndex]) &&
- (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
- ) {
- --sourceIndex;
- }
- } else {
- sourceIndex=-1;
- }
- fromUWriteUInt8(
- args->converter,
- SHIFT_IN_STR, 1,
- &target, (const char *)targetLimit,
- &offsets, sourceIndex,
- err);
- }
- /*save the state and return */
- args->source = source;
- args->target = (char*)target;
- args->converter->fromUnicodeStatus = (uint32_t)isTargetByteDBCS;
- }
- /************************ To Unicode ***************************************/
- static void U_CALLCONV
- UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(UConverterToUnicodeArgs *args,
- UErrorCode* err){
- char const* sourceStart;
- UConverterDataISO2022* myData=(UConverterDataISO2022*)(args->converter->extraInfo);
- UConverterToUnicodeArgs subArgs;
- int32_t minArgsSize;
- /* set up the subconverter arguments */
- if(args->size<sizeof(UConverterToUnicodeArgs)) {
- minArgsSize = args->size;
- } else {
- minArgsSize = (int32_t)sizeof(UConverterToUnicodeArgs);
- }
- uprv_memcpy(&subArgs, args, minArgsSize);
- subArgs.size = (uint16_t)minArgsSize;
- subArgs.converter = myData->currentConverter;
- /* remember the original start of the input for offsets */
- sourceStart = args->source;
- if(myData->key != 0) {
- /* continue with a partial escape sequence */
- goto escape;
- }
- while(U_SUCCESS(*err) && args->source < args->sourceLimit) {
- /*Find the end of the buffer e.g : Next Escape Seq | end of Buffer*/
- subArgs.source = args->source;
- subArgs.sourceLimit = getEndOfBuffer_2022(&(args->source), args->sourceLimit, args->flush);
- if(subArgs.source != subArgs.sourceLimit) {
- /*
- * get the current partial byte sequence
- *
- * it needs to be moved between the public and the subconverter
- * so that the conversion framework, which only sees the public
- * converter, can handle truncated and illegal input etc.
- */
- if(args->converter->toULength > 0) {
- uprv_memcpy(subArgs.converter->toUBytes, args->converter->toUBytes, args->converter->toULength);
- }
- subArgs.converter->toULength = args->converter->toULength;
- /*
- * Convert up to the end of the input, or to before the next escape character.
- * Does not handle conversion extensions because the preToU[] state etc.
- * is not copied.
- */
- ucnv_MBCSToUnicodeWithOffsets(&subArgs, err);
- if(args->offsets != nullptr && sourceStart != args->source) {
- /* update offsets to base them on the actual start of the input */
- int32_t *offsets = args->offsets;
- char16_t *target = args->target;
- int32_t delta = (int32_t)(args->source - sourceStart);
- while(target < subArgs.target) {
- if(*offsets >= 0) {
- *offsets += delta;
- }
- ++offsets;
- ++target;
- }
- }
- args->source = subArgs.source;
- args->target = subArgs.target;
- args->offsets = subArgs.offsets;
- /* copy input/error/overflow buffers */
- if(subArgs.converter->toULength > 0) {
- uprv_memcpy(args->converter->toUBytes, subArgs.converter->toUBytes, subArgs.converter->toULength);
- }
- args->converter->toULength = subArgs.converter->toULength;
- if(*err == U_BUFFER_OVERFLOW_ERROR) {
- if(subArgs.converter->UCharErrorBufferLength > 0) {
- uprv_memcpy(args->converter->UCharErrorBuffer, subArgs.converter->UCharErrorBuffer,
- subArgs.converter->UCharErrorBufferLength);
- }
- args->converter->UCharErrorBufferLength=subArgs.converter->UCharErrorBufferLength;
- subArgs.converter->UCharErrorBufferLength = 0;
- }
- }
- if (U_FAILURE(*err) || (args->source == args->sourceLimit)) {
- return;
- }
- escape:
- changeState_2022(args->converter,
- &(args->source),
- args->sourceLimit,
- ISO_2022_KR,
- err);
- }
- }
- static void U_CALLCONV
- UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
- UErrorCode* err){
- char tempBuf[2];
- const char *mySource = ( char *) args->source;
- char16_t *myTarget = args->target;
- const char *mySourceLimit = args->sourceLimit;
- UChar32 targetUniChar = 0x0000;
- char16_t mySourceChar = 0x0000;
- UConverterDataISO2022* myData;
- UConverterSharedData* sharedData ;
- UBool useFallback;
- myData=(UConverterDataISO2022*)(args->converter->extraInfo);
- if(myData->version==1){
- UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC_IBM(args,err);
- return;
- }
- /* initialize state */
- sharedData = myData->currentConverter->sharedData;
- useFallback = args->converter->useFallback;
- if(myData->key != 0) {
- /* continue with a partial escape sequence */
- goto escape;
- } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
- /* continue with a partial double-byte character */
- mySourceChar = args->converter->toUBytes[0];
- args->converter->toULength = 0;
- goto getTrailByte;
- }
- while(mySource< mySourceLimit){
- if(myTarget < args->targetLimit){
- mySourceChar= (unsigned char) *mySource++;
- if(mySourceChar==UCNV_SI){
- myData->toU2022State.g = 0;
- if (myData->isEmptySegment) {
- myData->isEmptySegment = false; /* we are handling it, reset to avoid future spurious errors */
- *err = U_ILLEGAL_ESCAPE_SEQUENCE;
- args->converter->toUCallbackReason = UCNV_IRREGULAR;
- args->converter->toUBytes[0] = (uint8_t)mySourceChar;
- args->converter->toULength = 1;
- args->target = myTarget;
- args->source = mySource;
- return;
- }
- /*consume the source */
- continue;
- }else if(mySourceChar==UCNV_SO){
- myData->toU2022State.g = 1;
- myData->isEmptySegment = true; /* Begin a new segment, empty so far */
- /*consume the source */
- continue;
- }else if(mySourceChar==ESC_2022){
- mySource--;
- escape:
- myData->isEmptySegment = false; /* Any invalid ESC sequences will be detected separately, so just reset this */
- changeState_2022(args->converter,&(mySource),
- mySourceLimit, ISO_2022_KR, err);
- if(U_FAILURE(*err)){
- args->target = myTarget;
- args->source = mySource;
- return;
- }
- continue;
- }
- myData->isEmptySegment = false; /* Any invalid char errors will be detected separately, so just reset this */
- if(myData->toU2022State.g == 1) {
- if(mySource < mySourceLimit) {
- int leadIsOk, trailIsOk;
- uint8_t trailByte;
- getTrailByte:
- targetUniChar = missingCharMarker;
- trailByte = (uint8_t)*mySource;
- /*
- * Ticket 5691: consistent illegal sequences:
- * - We include at least the first byte in the illegal sequence.
- * - If any of the non-initial bytes could be the start of a character,
- * we stop the illegal sequence before the first one of those.
- *
- * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
- * an ESC/SO/SI, we report only the first byte as the illegal sequence.
- * Otherwise we convert or report the pair of bytes.
- */
- leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
- trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
- if (leadIsOk && trailIsOk) {
- ++mySource;
- tempBuf[0] = (char)(mySourceChar + 0x80);
- tempBuf[1] = (char)(trailByte + 0x80);
- targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, tempBuf, 2, useFallback);
- mySourceChar = (mySourceChar << 8) | trailByte;
- } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
- /* report a pair of illegal bytes if the second byte is not a DBCS starter */
- ++mySource;
- /* add another bit so that the code below writes 2 bytes in case of error */
- mySourceChar = static_cast<char16_t>(0x10000 | (mySourceChar << 8) | trailByte);
- }
- } else {
- args->converter->toUBytes[0] = (uint8_t)mySourceChar;
- args->converter->toULength = 1;
- break;
- }
- }
- else if(mySourceChar <= 0x7f) {
- targetUniChar = ucnv_MBCSSimpleGetNextUChar(sharedData, mySource - 1, 1, useFallback);
- } else {
- targetUniChar = 0xffff;
- }
- if(targetUniChar < 0xfffe){
- if(args->offsets) {
- args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
- }
- *(myTarget++)=(char16_t)targetUniChar;
- }
- else {
- /* Call the callback function*/
- toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
- break;
- }
- }
- else{
- *err =U_BUFFER_OVERFLOW_ERROR;
- break;
- }
- }
- args->target = myTarget;
- args->source = mySource;
- }
- /*************************** END ISO2022-KR *********************************/
- /*************************** ISO-2022-CN *********************************
- *
- * Rules for ISO-2022-CN Encoding:
- * i) The designator sequence must appear once on a line before any instance
- * of character set it designates.
- * ii) If two lines contain characters from the same character set, both lines
- * must include the designator sequence.
- * iii) Once the designator sequence is known, a shifting sequence has to be found
- * to invoke the shifting
- * iv) All lines start in ASCII and end in ASCII.
- * v) Four shifting sequences are employed for this purpose:
- *
- * Sequence ASCII Eq Charsets
- * ---------- ------- ---------
- * SI <SI> US-ASCII
- * SO <SO> CNS-11643-1992 Plane 1, GB2312, ISO-IR-165
- * SS2 <ESC>N CNS-11643-1992 Plane 2
- * SS3 <ESC>O CNS-11643-1992 Planes 3-7
- *
- * vi)
- * SOdesignator : ESC "$" ")" finalchar_for_SO
- * SS2designator : ESC "$" "*" finalchar_for_SS2
- * SS3designator : ESC "$" "+" finalchar_for_SS3
- *
- * ESC $ ) A Indicates the bytes following SO are Chinese
- * characters as defined in GB 2312-80, until
- * another SOdesignation appears
- *
- *
- * ESC $ ) E Indicates the bytes following SO are as defined
- * in ISO-IR-165 (for details, see section 2.1),
- * until another SOdesignation appears
- *
- * ESC $ ) G Indicates the bytes following SO are as defined
- * in CNS 11643-plane-1, until another
- * SOdesignation appears
- *
- * ESC $ * H Indicates the two bytes immediately following
- * SS2 is a Chinese character as defined in CNS
- * 11643-plane-2, until another SS2designation
- * appears
- * (Meaning <ESC>N must precede every 2 byte
- * sequence.)
- *
- * ESC $ + I Indicates the immediate two bytes following SS3
- * is a Chinese character as defined in CNS
- * 11643-plane-3, until another SS3designation
- * appears
- * (Meaning <ESC>O must precede every 2 byte
- * sequence.)
- *
- * ESC $ + J Indicates the immediate two bytes following SS3
- * is a Chinese character as defined in CNS
- * 11643-plane-4, until another SS3designation
- * appears
- * (In English: <ESC>O must precede every 2 byte
- * sequence.)
- *
- * ESC $ + K Indicates the immediate two bytes following SS3
- * is a Chinese character as defined in CNS
- * 11643-plane-5, until another SS3designation
- * appears
- *
- * ESC $ + L Indicates the immediate two bytes following SS3
- * is a Chinese character as defined in CNS
- * 11643-plane-6, until another SS3designation
- * appears
- *
- * ESC $ + M Indicates the immediate two bytes following SS3
- * is a Chinese character as defined in CNS
- * 11643-plane-7, until another SS3designation
- * appears
- *
- * As in ISO-2022-CN, each line starts in ASCII, and ends in ASCII, and
- * has its own designation information before any Chinese characters
- * appear
- *
- */
- /* The following are defined this way to make the strings truly readonly */
- static const char GB_2312_80_STR[] = "\x1B\x24\x29\x41"; /* ESC $ ) A */
- static const char ISO_IR_165_STR[] = "\x1B\x24\x29\x45"; /* ESC $ ) E */
- static const char CNS_11643_1992_Plane_1_STR[] = "\x1B\x24\x29\x47"; /* ESC $ ) G */
- static const char CNS_11643_1992_Plane_2_STR[] = "\x1B\x24\x2A\x48"; /* ESC $ * H */
- static const char CNS_11643_1992_Plane_3_STR[] = "\x1B\x24\x2B\x49"; /* ESC $ + I */
- static const char CNS_11643_1992_Plane_4_STR[] = "\x1B\x24\x2B\x4A"; /* ESC $ + J */
- static const char CNS_11643_1992_Plane_5_STR[] = "\x1B\x24\x2B\x4B"; /* ESC $ + K */
- static const char CNS_11643_1992_Plane_6_STR[] = "\x1B\x24\x2B\x4C"; /* ESC $ + L */
- static const char CNS_11643_1992_Plane_7_STR[] = "\x1B\x24\x2B\x4D"; /* ESC $ + M */
- /********************** ISO2022-CN Data **************************
- *
- * Designator escape sequences used by the ISO-2022-CN fromUnicode code.
- * Charsets below CNS_11643 index this table directly with their state
- * value; CNS planes are addressed as CNS_11643 + (cs - CNS_11643_1),
- * so the plane 1..7 entries must stay contiguous and in plane order.
- * The fromUnicode code copies 4 bytes from the selected entry.
- */
- static const char* const escSeqCharsCN[10] ={
- SHIFT_IN_STR, /* 0 ASCII */
- GB_2312_80_STR, /* 1 GB2312_1 */
- ISO_IR_165_STR, /* 2 ISO_IR_165 */
- CNS_11643_1992_Plane_1_STR, /* 3 CNS 11643-1992 plane 1 */
- CNS_11643_1992_Plane_2_STR, /* 4 CNS 11643-1992 plane 2 */
- CNS_11643_1992_Plane_3_STR, /* 5 CNS 11643-1992 plane 3 */
- CNS_11643_1992_Plane_4_STR, /* 6 CNS 11643-1992 plane 4 */
- CNS_11643_1992_Plane_5_STR, /* 7 CNS 11643-1992 plane 5 */
- CNS_11643_1992_Plane_6_STR, /* 8 CNS 11643-1992 plane 6 */
- CNS_11643_1992_Plane_7_STR /* 9 CNS 11643-1992 plane 7 */
- };
- static void U_CALLCONV
- UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err){
- UConverter *cnv = args->converter;
- UConverterDataISO2022 *converterData;
- ISO2022State *pFromU2022State;
- uint8_t *target = (uint8_t *) args->target;
- const uint8_t *targetLimit = (const uint8_t *) args->targetLimit;
- const char16_t* source = args->source;
- const char16_t* sourceLimit = args->sourceLimit;
- int32_t* offsets = args->offsets;
- UChar32 sourceChar;
- char buffer[8];
- int32_t len;
- int8_t choices[3];
- int32_t choiceCount;
- uint32_t targetValue = 0;
- UBool useFallback;
- /* set up the state */
- converterData = (UConverterDataISO2022*)cnv->extraInfo;
- pFromU2022State = &converterData->fromU2022State;
- choiceCount = 0;
- /* check if the last codepoint of previous buffer was a lead surrogate*/
- if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) {
- goto getTrail;
- }
- while( source < sourceLimit){
- if(target < targetLimit){
- sourceChar = *(source++);
- /*check if the char is a First surrogate*/
- if(U16_IS_SURROGATE(sourceChar)) {
- if(U16_IS_SURROGATE_LEAD(sourceChar)) {
- getTrail:
- /*look ahead to find the trail surrogate*/
- if(source < sourceLimit) {
- /* test the following code unit */
- char16_t trail=(char16_t) *source;
- if(U16_IS_TRAIL(trail)) {
- source++;
- sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
- cnv->fromUChar32=0x00;
- /* convert this supplementary code point */
- /* exit this condition tree */
- } else {
- /* this is an unmatched lead code unit (1st surrogate) */
- /* callback(illegal) */
- *err=U_ILLEGAL_CHAR_FOUND;
- cnv->fromUChar32=sourceChar;
- break;
- }
- } else {
- /* no more input */
- cnv->fromUChar32=sourceChar;
- break;
- }
- } else {
- /* this is an unmatched trail code unit (2nd surrogate) */
- /* callback(illegal) */
- *err=U_ILLEGAL_CHAR_FOUND;
- cnv->fromUChar32=sourceChar;
- break;
- }
- }
- /* do the conversion */
- if(sourceChar <= 0x007f ){
- /* do not convert SO/SI/ESC */
- if(IS_2022_CONTROL(sourceChar)) {
- /* callback(illegal) */
- *err=U_ILLEGAL_CHAR_FOUND;
- cnv->fromUChar32=sourceChar;
- break;
- }
- /* US-ASCII */
- if(pFromU2022State->g == 0) {
- buffer[0] = (char)sourceChar;
- len = 1;
- } else {
- buffer[0] = UCNV_SI;
- buffer[1] = (char)sourceChar;
- len = 2;
- pFromU2022State->g = 0;
- choiceCount = 0;
- }
- if(sourceChar == CR || sourceChar == LF) {
- /* reset the state at the end of a line */
- uprv_memset(pFromU2022State, 0, sizeof(ISO2022State));
- choiceCount = 0;
- }
- }
- else{
- /* convert U+0080..U+10ffff */
- int32_t i;
- int8_t cs, g;
- if(choiceCount == 0) {
- /* try the current SO/G1 converter first */
- choices[0] = pFromU2022State->cs[1];
- /* default to GB2312_1 if none is designated yet */
- if(choices[0] == 0) {
- choices[0] = GB2312_1;
- }
- if(converterData->version == 0) {
- /* ISO-2022-CN */
- /* try the other SO/G1 converter; a CNS_11643_1 lookup may result in any plane */
- if(choices[0] == GB2312_1) {
- choices[1] = (int8_t)CNS_11643_1;
- } else {
- choices[1] = (int8_t)GB2312_1;
- }
- choiceCount = 2;
- } else if (converterData->version == 1) {
- /* ISO-2022-CN-EXT */
- /* try one of the other converters */
- switch(choices[0]) {
- case GB2312_1:
- choices[1] = (int8_t)CNS_11643_1;
- choices[2] = (int8_t)ISO_IR_165;
- break;
- case ISO_IR_165:
- choices[1] = (int8_t)GB2312_1;
- choices[2] = (int8_t)CNS_11643_1;
- break;
- default: /* CNS_11643_x */
- choices[1] = (int8_t)GB2312_1;
- choices[2] = (int8_t)ISO_IR_165;
- break;
- }
- choiceCount = 3;
- } else {
- choices[0] = (int8_t)CNS_11643_1;
- choices[1] = (int8_t)GB2312_1;
- }
- }
- cs = g = 0;
- /*
- * len==0: no mapping found yet
- * len<0: found a fallback result: continue looking for a roundtrip but no further fallbacks
- * len>0: found a roundtrip result, done
- */
- len = 0;
- /*
- * We will turn off useFallback after finding a fallback,
- * but we still get fallbacks from PUA code points as usual.
- * Therefore, we will also need to check that we don't overwrite
- * an early fallback with a later one.
- */
- useFallback = cnv->useFallback;
- for(i = 0; i < choiceCount && len <= 0; ++i) {
- int8_t cs0 = choices[i];
- if(cs0 > 0) {
- uint32_t value;
- int32_t len2;
- if(cs0 >= CNS_11643_0) {
- len2 = MBCS_FROM_UCHAR32_ISO2022(
- converterData->myConverterArray[CNS_11643],
- sourceChar,
- &value,
- useFallback,
- MBCS_OUTPUT_3);
- if(len2 == 3 || (len2 == -3 && len == 0)) {
- targetValue = value;
- cs = (int8_t)(CNS_11643_0 + (value >> 16) - 0x80);
- if(len2 >= 0) {
- len = 2;
- } else {
- len = -2;
- useFallback = false;
- }
- if(cs == CNS_11643_1) {
- g = 1;
- } else if(cs == CNS_11643_2) {
- g = 2;
- } else /* plane 3..7 */ if(converterData->version == 1) {
- g = 3;
- } else {
- /* ISO-2022-CN (without -EXT) does not support plane 3..7 */
- len = 0;
- }
- }
- } else {
- /* GB2312_1 or ISO-IR-165 */
- U_ASSERT(cs0<UCNV_2022_MAX_CONVERTERS);
- len2 = MBCS_FROM_UCHAR32_ISO2022(
- converterData->myConverterArray[cs0],
- sourceChar,
- &value,
- useFallback,
- MBCS_OUTPUT_2);
- if(len2 == 2 || (len2 == -2 && len == 0)) {
- targetValue = value;
- len = len2;
- cs = cs0;
- g = 1;
- useFallback = false;
- }
- }
- }
- }
- if(len != 0) {
- len = 0; /* count output bytes; it must have been abs(len) == 2 */
- /* write the designation sequence if necessary */
- if(cs != pFromU2022State->cs[g]) {
- if(cs < CNS_11643) {
- uprv_memcpy(buffer, escSeqCharsCN[cs], 4);
- } else {
- U_ASSERT(cs >= CNS_11643_1);
- uprv_memcpy(buffer, escSeqCharsCN[CNS_11643 + (cs - CNS_11643_1)], 4);
- }
- len = 4;
- pFromU2022State->cs[g] = cs;
- if(g == 1) {
- /* changing the SO/G1 charset invalidates the choices[] */
- choiceCount = 0;
- }
- }
- /* write the shift sequence if necessary */
- if(g != pFromU2022State->g) {
- switch(g) {
- case 1:
- buffer[len++] = UCNV_SO;
- /* set the new state only if it is the locking shift SO/G1, not for SS2 or SS3 */
- pFromU2022State->g = 1;
- break;
- case 2:
- buffer[len++] = 0x1b;
- buffer[len++] = 0x4e;
- break;
- default: /* case 3 */
- buffer[len++] = 0x1b;
- buffer[len++] = 0x4f;
- break;
- }
- }
- /* write the two output bytes */
- buffer[len++] = (char)(targetValue >> 8);
- buffer[len++] = (char)targetValue;
- } else {
- /* if we cannot find the character after checking all codepages
- * then this is an error
- */
- *err = U_INVALID_CHAR_FOUND;
- cnv->fromUChar32=sourceChar;
- break;
- }
- }
- /* output len>0 bytes in buffer[] */
- if(len == 1) {
- *target++ = buffer[0];
- if(offsets) {
- *offsets++ = (int32_t)(source - args->source - 1); /* -1: known to be ASCII */
- }
- } else if(len == 2 && (target + 2) <= targetLimit) {
- *target++ = buffer[0];
- *target++ = buffer[1];
- if(offsets) {
- int32_t sourceIndex = (int32_t)(source - args->source - U16_LENGTH(sourceChar));
- *offsets++ = sourceIndex;
- *offsets++ = sourceIndex;
- }
- } else {
- fromUWriteUInt8(
- cnv,
- buffer, len,
- &target, (const char *)targetLimit,
- &offsets, (int32_t)(source - args->source - U16_LENGTH(sourceChar)),
- err);
- if(U_FAILURE(*err)) {
- break;
- }
- }
- } /* end if(myTargetIndex<myTargetLength) */
- else{
- *err =U_BUFFER_OVERFLOW_ERROR;
- break;
- }
- }/* end while(mySourceIndex<mySourceLength) */
- /*
- * the end of the input stream and detection of truncated input
- * are handled by the framework, but for ISO-2022-CN conversion
- * we need to be in ASCII mode at the very end
- *
- * conditions:
- * successful
- * not in ASCII mode
- * end of input and no truncated input
- */
- if( U_SUCCESS(*err) &&
- pFromU2022State->g!=0 &&
- args->flush && source>=sourceLimit && cnv->fromUChar32==0
- ) {
- int32_t sourceIndex;
- /* we are switching to ASCII */
- pFromU2022State->g=0;
- /* get the source index of the last input character */
- /*
- * TODO this would be simpler and more reliable if we used a pair
- * of sourceIndex/prevSourceIndex like in ucnvmbcs.c
- * so that we could simply use the prevSourceIndex here;
- * this code gives an incorrect result for the rare case of an unmatched
- * trail surrogate that is alone in the last buffer of the text stream
- */
- sourceIndex=(int32_t)(source-args->source);
- if(sourceIndex>0) {
- --sourceIndex;
- if( U16_IS_TRAIL(args->source[sourceIndex]) &&
- (sourceIndex==0 || U16_IS_LEAD(args->source[sourceIndex-1]))
- ) {
- --sourceIndex;
- }
- } else {
- sourceIndex=-1;
- }
- fromUWriteUInt8(
- cnv,
- SHIFT_IN_STR, 1,
- &target, (const char *)targetLimit,
- &offsets, sourceIndex,
- err);
- }
- /*save the state and return */
- args->source = source;
- args->target = (char*)target;
- }
- static void U_CALLCONV
- UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
- UErrorCode* err){
- char tempBuf[3];
- const char *mySource = (char *) args->source;
- char16_t *myTarget = args->target;
- const char *mySourceLimit = args->sourceLimit;
- uint32_t targetUniChar = 0x0000;
- uint32_t mySourceChar = 0x0000;
- UConverterDataISO2022* myData;
- ISO2022State *pToU2022State;
- myData=(UConverterDataISO2022*)(args->converter->extraInfo);
- pToU2022State = &myData->toU2022State;
- if(myData->key != 0) {
- /* continue with a partial escape sequence */
- goto escape;
- } else if(args->converter->toULength == 1 && mySource < mySourceLimit && myTarget < args->targetLimit) {
- /* continue with a partial double-byte character */
- mySourceChar = args->converter->toUBytes[0];
- args->converter->toULength = 0;
- targetUniChar = missingCharMarker;
- goto getTrailByte;
- }
- while(mySource < mySourceLimit){
- targetUniChar =missingCharMarker;
- if(myTarget < args->targetLimit){
- mySourceChar= (unsigned char) *mySource++;
- switch(mySourceChar){
- case UCNV_SI:
- pToU2022State->g=0;
- if (myData->isEmptySegment) {
- myData->isEmptySegment = false; /* we are handling it, reset to avoid future spurious errors */
- *err = U_ILLEGAL_ESCAPE_SEQUENCE;
- args->converter->toUCallbackReason = UCNV_IRREGULAR;
- args->converter->toUBytes[0] = static_cast<uint8_t>(mySourceChar);
- args->converter->toULength = 1;
- args->target = myTarget;
- args->source = mySource;
- return;
- }
- continue;
- case UCNV_SO:
- if(pToU2022State->cs[1] != 0) {
- pToU2022State->g=1;
- myData->isEmptySegment = true; /* Begin a new segment, empty so far */
- continue;
- } else {
- /* illegal to have SO before a matching designator */
- myData->isEmptySegment = false; /* Handling a different error, reset this to avoid future spurious errs */
- break;
- }
- case ESC_2022:
- mySource--;
- escape:
- {
- const char * mySourceBefore = mySource;
- int8_t toULengthBefore = args->converter->toULength;
- changeState_2022(args->converter,&(mySource),
- mySourceLimit, ISO_2022_CN,err);
- /* After SO there must be at least one character before a designator (designator error handled separately) */
- if(myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
- *err = U_ILLEGAL_ESCAPE_SEQUENCE;
- args->converter->toUCallbackReason = UCNV_IRREGULAR;
- args->converter->toULength = (int8_t)(toULengthBefore + (mySource - mySourceBefore));
- }
- }
- /* invalid or illegal escape sequence */
- if(U_FAILURE(*err)){
- args->target = myTarget;
- args->source = mySource;
- myData->isEmptySegment = false; /* Reset to avoid future spurious errors */
- return;
- }
- continue;
- /* ISO-2022-CN does not use single-byte (C1) SS2 and SS3 */
- case CR:
- case LF:
- uprv_memset(pToU2022State, 0, sizeof(ISO2022State));
- U_FALLTHROUGH;
- default:
- /* convert one or two bytes */
- myData->isEmptySegment = false;
- if(pToU2022State->g != 0) {
- if(mySource < mySourceLimit) {
- UConverterSharedData *cnv;
- StateEnum tempState;
- int32_t tempBufLen;
- int leadIsOk, trailIsOk;
- uint8_t trailByte;
- getTrailByte:
- trailByte = (uint8_t)*mySource;
- /*
- * Ticket 5691: consistent illegal sequences:
- * - We include at least the first byte in the illegal sequence.
- * - If any of the non-initial bytes could be the start of a character,
- * we stop the illegal sequence before the first one of those.
- *
- * In ISO-2022 DBCS, if the second byte is in the 21..7e range or is
- * an ESC/SO/SI, we report only the first byte as the illegal sequence.
- * Otherwise we convert or report the pair of bytes.
- */
- leadIsOk = (uint8_t)(mySourceChar - 0x21) <= (0x7e - 0x21);
- trailIsOk = (uint8_t)(trailByte - 0x21) <= (0x7e - 0x21);
- if (leadIsOk && trailIsOk) {
- ++mySource;
- tempState = (StateEnum)pToU2022State->cs[pToU2022State->g];
- if(tempState >= CNS_11643_0) {
- cnv = myData->myConverterArray[CNS_11643];
- tempBuf[0] = (char) (0x80+(tempState-CNS_11643_0));
- tempBuf[1] = (char) (mySourceChar);
- tempBuf[2] = (char) trailByte;
- tempBufLen = 3;
- }else{
- U_ASSERT(tempState<UCNV_2022_MAX_CONVERTERS);
- cnv = myData->myConverterArray[tempState];
- tempBuf[0] = (char) (mySourceChar);
- tempBuf[1] = (char) trailByte;
- tempBufLen = 2;
- }
- targetUniChar = ucnv_MBCSSimpleGetNextUChar(cnv, tempBuf, tempBufLen, false);
- mySourceChar = (mySourceChar << 8) | trailByte;
- } else if (!(trailIsOk || IS_2022_CONTROL(trailByte))) {
- /* report a pair of illegal bytes if the second byte is not a DBCS starter */
- ++mySource;
- /* add another bit so that the code below writes 2 bytes in case of error */
- mySourceChar = 0x10000 | (mySourceChar << 8) | trailByte;
- }
- if(pToU2022State->g>=2) {
- /* return from a single-shift state to the previous one */
- pToU2022State->g=pToU2022State->prevG;
- }
- } else {
- args->converter->toUBytes[0] = (uint8_t)mySourceChar;
- args->converter->toULength = 1;
- goto endloop;
- }
- }
- else{
- if(mySourceChar <= 0x7f) {
- targetUniChar = (char16_t) mySourceChar;
- }
- }
- break;
- }
- if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
- if(args->offsets){
- args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
- }
- *(myTarget++)=(char16_t)targetUniChar;
- }
- else if(targetUniChar > missingCharMarker){
- /* disassemble the surrogate pair and write to output*/
- targetUniChar-=0x0010000;
- *myTarget = (char16_t)(0xd800+(char16_t)(targetUniChar>>10));
- if(args->offsets){
- args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
- }
- ++myTarget;
- if(myTarget< args->targetLimit){
- *myTarget = (char16_t)(0xdc00+(char16_t)(targetUniChar&0x3ff));
- if(args->offsets){
- args->offsets[myTarget - args->target] = (int32_t)(mySource - args->source - (mySourceChar <= 0xff ? 1 : 2));
- }
- ++myTarget;
- }else{
- args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
- (char16_t)(0xdc00+(char16_t)(targetUniChar&0x3ff));
- }
- }
- else{
- /* Call the callback function*/
- toUnicodeCallback(args->converter,mySourceChar,targetUniChar,err);
- break;
- }
- }
- else{
- *err =U_BUFFER_OVERFLOW_ERROR;
- break;
- }
- }
- endloop:
- args->target = myTarget;
- args->source = mySource;
- }
- #endif /* #if !UCONFIG_ONLY_HTML_CONVERSION */
- static void U_CALLCONV
- _ISO_2022_WriteSub(UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode *err) {
- UConverter *cnv = args->converter;
- UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
- ISO2022State *pFromU2022State=&myConverterData->fromU2022State;
- char *p, *subchar;
- char buffer[8];
- int32_t length;
- subchar=(char *)cnv->subChars;
- length=cnv->subCharLen; /* assume length==1 for most variants */
- p = buffer;
- switch(myConverterData->locale[0]){
- case 'j':
- {
- int8_t cs;
- if(pFromU2022State->g == 1) {
- /* JIS7: switch from G1 to G0 */
- pFromU2022State->g = 0;
- *p++ = UCNV_SI;
- }
- cs = pFromU2022State->cs[0];
- if(cs != ASCII && cs != JISX201) {
- /* not in ASCII or JIS X 0201: switch to ASCII */
- pFromU2022State->cs[0] = (int8_t)ASCII;
- *p++ = '\x1b';
- *p++ = '\x28';
- *p++ = '\x42';
- }
- *p++ = subchar[0];
- break;
- }
- case 'c':
- if(pFromU2022State->g != 0) {
- /* not in ASCII mode: switch to ASCII */
- pFromU2022State->g = 0;
- *p++ = UCNV_SI;
- }
- *p++ = subchar[0];
- break;
- case 'k':
- if(myConverterData->version == 0) {
- if(length == 1) {
- if(args->converter->fromUnicodeStatus) {
- /* in DBCS mode: switch to SBCS */
- args->converter->fromUnicodeStatus = 0;
- *p++ = UCNV_SI;
- }
- *p++ = subchar[0];
- } else /* length == 2*/ {
- if(!args->converter->fromUnicodeStatus) {
- /* in SBCS mode: switch to DBCS */
- args->converter->fromUnicodeStatus = 1;
- *p++ = UCNV_SO;
- }
- *p++ = subchar[0];
- *p++ = subchar[1];
- }
- break;
- } else {
- /* save the subconverter's substitution string */
- uint8_t *currentSubChars = myConverterData->currentConverter->subChars;
- int8_t currentSubCharLen = myConverterData->currentConverter->subCharLen;
- /* set our substitution string into the subconverter */
- myConverterData->currentConverter->subChars = (uint8_t *)subchar;
- myConverterData->currentConverter->subCharLen = (int8_t)length;
- /* let the subconverter write the subchar, set/retrieve fromUChar32 state */
- args->converter = myConverterData->currentConverter;
- myConverterData->currentConverter->fromUChar32 = cnv->fromUChar32;
- ucnv_cbFromUWriteSub(args, 0, err);
- cnv->fromUChar32 = myConverterData->currentConverter->fromUChar32;
- args->converter = cnv;
- /* restore the subconverter's substitution string */
- myConverterData->currentConverter->subChars = currentSubChars;
- myConverterData->currentConverter->subCharLen = currentSubCharLen;
- if(*err == U_BUFFER_OVERFLOW_ERROR) {
- if(myConverterData->currentConverter->charErrorBufferLength > 0) {
- uprv_memcpy(
- cnv->charErrorBuffer,
- myConverterData->currentConverter->charErrorBuffer,
- myConverterData->currentConverter->charErrorBufferLength);
- }
- cnv->charErrorBufferLength = myConverterData->currentConverter->charErrorBufferLength;
- myConverterData->currentConverter->charErrorBufferLength = 0;
- }
- return;
- }
- default:
- /* not expected */
- break;
- }
- ucnv_cbFromUWriteBytes(args,
- buffer, (int32_t)(p - buffer),
- offsetIndex, err);
- }
- /*
- * Structure for cloning an ISO 2022 converter into a single memory block.
- * _ISO_2022_SafeClone() casts the caller-provided buffer to this layout
- * and reports sizeof(struct cloneStruct) as the required size when it is
- * called for preflighting.
- */
- struct cloneStruct
- {
- UConverter cnv; /* the clone itself; _ISO_2022_SafeClone() returns its address */
- UConverter currentConverter; /* storage passed to ucnv_safeClone() for the clone of the current subconverter */
- UConverterDataISO2022 mydata; /* copy of the source's extra data; the clone's extraInfo points here */
- };
- U_CDECL_BEGIN
- static UConverter * U_CALLCONV
- _ISO_2022_SafeClone(
- const UConverter *cnv,
- void *stackBuffer,
- int32_t *pBufferSize,
- UErrorCode *status)
- {
- struct cloneStruct * localClone;
- UConverterDataISO2022 *cnvData;
- int32_t i, size;
- if (U_FAILURE(*status)){
- return nullptr;
- }
- if (*pBufferSize == 0) { /* 'preflighting' request - set needed size into *pBufferSize */
- *pBufferSize = (int32_t)sizeof(struct cloneStruct);
- return nullptr;
- }
- cnvData = (UConverterDataISO2022 *)cnv->extraInfo;
- localClone = (struct cloneStruct *)stackBuffer;
- /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
- uprv_memcpy(&localClone->mydata, cnvData, sizeof(UConverterDataISO2022));
- localClone->cnv.extraInfo = &localClone->mydata; /* set pointer to extra data */
- localClone->cnv.isExtraLocal = true;
- /* share the subconverters */
- if(cnvData->currentConverter != nullptr) {
- size = (int32_t)sizeof(UConverter);
- localClone->mydata.currentConverter =
- ucnv_safeClone(cnvData->currentConverter,
- &localClone->currentConverter,
- &size, status);
- if(U_FAILURE(*status)) {
- return nullptr;
- }
- }
- for(i=0; i<UCNV_2022_MAX_CONVERTERS; ++i) {
- if(cnvData->myConverterArray[i] != nullptr) {
- ucnv_incrementRefCount(cnvData->myConverterArray[i]);
- }
- }
- return &localClone->cnv;
- }
- U_CDECL_END
- static void U_CALLCONV
- _ISO_2022_GetUnicodeSet(const UConverter *cnv,
- const USetAdder *sa,
- UConverterUnicodeSet which,
- UErrorCode *pErrorCode)
- {
- int32_t i;
- UConverterDataISO2022* cnvData;
- if (U_FAILURE(*pErrorCode)) {
- return;
- }
- #ifdef U_ENABLE_GENERIC_ISO_2022
- if (cnv->sharedData == &_ISO2022Data) {
- /* We use UTF-8 in this case */
- sa->addRange(sa->set, 0, 0xd7FF);
- sa->addRange(sa->set, 0xE000, 0x10FFFF);
- return;
- }
- #endif
- cnvData = (UConverterDataISO2022*)cnv->extraInfo;
- /* open a set and initialize it with code points that are algorithmically round-tripped */
- switch(cnvData->locale[0]){
- case 'j':
- /* include JIS X 0201 which is hardcoded */
- sa->add(sa->set, 0xa5);
- sa->add(sa->set, 0x203e);
- if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) {
- /* include Latin-1 for some variants of JP */
- sa->addRange(sa->set, 0, 0xff);
- } else {
- /* include ASCII for JP */
- sa->addRange(sa->set, 0, 0x7f);
- }
- if(cnvData->version==3 || cnvData->version==4 || which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) {
- /*
- * Do not test (jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT))!=0
- * because the bit is on for all JP versions although only versions 3 & 4 (JIS7 & JIS8)
- * use half-width Katakana.
- * This is because all ISO-2022-JP variants are lenient in that they accept (in toUnicode)
- * half-width Katakana via the ESC ( I sequence.
- * However, we only emit (fromUnicode) half-width Katakana according to the
- * definition of each variant.
- *
- * When including fallbacks,
- * we need to include half-width Katakana Unicode code points for all JP variants because
- * JIS X 0208 has hardcoded fallbacks for them (which map to full-width Katakana).
- */
- /* include half-width Katakana for JP */
- sa->addRange(sa->set, HWKANA_START, HWKANA_END);
- }
- break;
- #if !UCONFIG_ONLY_HTML_CONVERSION
- case 'c':
- case 'z':
- /* include ASCII for CN */
- sa->addRange(sa->set, 0, 0x7f);
- break;
- case 'k':
- /* there is only one converter for KR, and it is not in the myConverterArray[] */
- cnvData->currentConverter->sharedData->impl->getUnicodeSet(
- cnvData->currentConverter, sa, which, pErrorCode);
- /* the loop over myConverterArray[] will simply not find another converter */
- break;
- #endif
- default:
- break;
- }
- #if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */
- if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
- cnvData->version==0 && i==CNS_11643
- ) {
- /* special handling for non-EXT ISO-2022-CN: add only code points for CNS planes 1 and 2 */
- ucnv_MBCSGetUnicodeSetForBytes(
- cnvData->myConverterArray[i],
- sa, UCNV_ROUNDTRIP_SET,
- 0, 0x81, 0x82,
- pErrorCode);
- }
- #endif
- for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
- UConverterSetFilter filter;
- if(cnvData->myConverterArray[i]!=nullptr) {
- if(cnvData->locale[0]=='j' && i==JISX208) {
- /*
- * Only add code points that map to Shift-JIS codes
- * corresponding to JIS X 0208.
- */
- filter=UCNV_SET_FILTER_SJIS;
- #if !UCONFIG_ONLY_HTML_CONVERSION
- } else if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
- cnvData->version==0 && i==CNS_11643) {
- /*
- * Version-specific for CN:
- * CN version 0 does not map CNS planes 3..7 although
- * they are all available in the CNS conversion table;
- * CN version 1 (-EXT) does map them all.
- * The two versions create different Unicode sets.
- */
- filter=UCNV_SET_FILTER_2022_CN;
- } else if(i==KSC5601) {
- /*
- * Some of the KSC 5601 tables (convrtrs.txt has this alias on multiple tables)
- * are broader than GR94.
- */
- filter=UCNV_SET_FILTER_GR94DBCS;
- #endif
- } else {
- filter=UCNV_SET_FILTER_NONE;
- }
- ucnv_MBCSGetFilteredUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, filter, pErrorCode);
- }
- }
- /*
- * ISO 2022 converters must not convert SO/SI/ESC despite what
- * sub-converters do by themselves.
- * Remove these characters from the set.
- */
- sa->remove(sa->set, 0x0e);
- sa->remove(sa->set, 0x0f);
- sa->remove(sa->set, 0x1b);
- /* ISO 2022 converters do not convert C1 controls either */
- sa->removeRange(sa->set, 0x80, 0x9f);
- }
- static const UConverterImpl _ISO2022Impl={ /* positional function table for the generic ISO 2022 converter; slot names below assume the UConverterImpl layout declared outside this chunk (TODO confirm) */
- UCNV_ISO_2022, /* type */
- nullptr, /* load: not provided */
- nullptr, /* unload: not provided */
- _ISO2022Open, /* open */
- _ISO2022Close, /* close */
- _ISO2022Reset, /* reset */
- #ifdef U_ENABLE_GENERIC_ISO_2022
- T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC, /* toUnicode */
- T_UConverter_toUnicode_ISO_2022_OFFSETS_LOGIC, /* toUnicodeWithOffsets: same function as the previous slot */
- ucnv_fromUnicode_UTF8, /* fromUnicode: the generic variant reuses the UTF-8 writer */
- ucnv_fromUnicode_UTF8_OFFSETS_LOGIC, /* fromUnicodeWithOffsets */
- #else /* generic ISO 2022 not compiled in: the four conversion slots are left null */
- nullptr, /* toUnicode */
- nullptr, /* toUnicodeWithOffsets */
- nullptr, /* fromUnicode */
- nullptr, /* fromUnicodeWithOffsets */
- #endif
- nullptr, /* getNextUChar: not provided */
- nullptr, /* getStarters: not provided */
- _ISO2022getName, /* getName */
- _ISO_2022_WriteSub, /* writeSub */
- _ISO_2022_SafeClone, /* safeClone */
- _ISO_2022_GetUnicodeSet, /* getUnicodeSet */
- nullptr, /* toUTF8: not provided */
- nullptr /* fromUTF8: not provided */
- };
- static const UConverterStaticData _ISO2022StaticData={ /* static descriptor for the generic "ISO_2022" converter; field names below assume the UConverterStaticData layout declared outside this chunk (TODO confirm) */
- sizeof(UConverterStaticData), /* structSize */
- "ISO_2022", /* name */
- 2022, /* codepage */
- UCNV_IBM, /* platform */
- UCNV_ISO_2022, /* conversionType */
- 1, /* minBytesPerChar */
- 3, /* maxBytesPerChar: max 3 bytes per char16_t from UTF-8 (4 bytes from surrogate _pair_) */
- { 0x1a, 0, 0, 0 }, /* subChar: one meaningful byte, 0x1a */
- 1, /* subCharLen */
- false, /* hasToUnicodeFallback */
- false, /* hasFromUnicodeFallback */
- 0, /* unicodeMask */
- 0, /* subChar1 */
- { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
- };
- const UConverterSharedData _ISO2022Data= /* shared-data object for generic ISO 2022; its address is compared against cnv->sharedData earlier in this file when U_ENABLE_GENERIC_ISO_2022 is defined */
- UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022StaticData, &_ISO2022Impl); /* pairs the static descriptor with the function table */
- /*************JP****************/
- static const UConverterImpl _ISO2022JPImpl={ /* positional function table for ISO-2022-JP; slot names below assume the UConverterImpl layout declared outside this chunk (TODO confirm) */
- UCNV_ISO_2022, /* type */
- nullptr, /* load: not provided */
- nullptr, /* unload: not provided */
- _ISO2022Open, /* open: shared with the other ISO 2022 tables in this file */
- _ISO2022Close, /* close */
- _ISO2022Reset, /* reset */
- UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC, /* toUnicode */
- UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC, /* toUnicodeWithOffsets: same function as the previous slot */
- UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC, /* fromUnicode */
- UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC, /* fromUnicodeWithOffsets: same function as the previous slot */
- nullptr, /* getNextUChar: not provided */
- nullptr, /* getStarters: not provided */
- _ISO2022getName, /* getName */
- _ISO_2022_WriteSub, /* writeSub */
- _ISO_2022_SafeClone, /* safeClone */
- _ISO_2022_GetUnicodeSet, /* getUnicodeSet */
- nullptr, /* toUTF8: not provided */
- nullptr /* fromUTF8: not provided */
- };
- static const UConverterStaticData _ISO2022JPStaticData={ /* static descriptor for the "ISO_2022_JP" converter; field names below assume the UConverterStaticData layout declared outside this chunk (TODO confirm) */
- sizeof(UConverterStaticData), /* structSize */
- "ISO_2022_JP", /* name */
- 0, /* codepage: 0 here, unlike the generic descriptor's 2022 */
- UCNV_IBM, /* platform */
- UCNV_ISO_2022, /* conversionType */
- 1, /* minBytesPerChar */
- 6, /* maxBytesPerChar: max 6 bytes per char16_t: 4-byte escape sequence + DBCS */
- { 0x1a, 0, 0, 0 }, /* subChar: one meaningful byte, 0x1a */
- 1, /* subCharLen */
- false, /* hasToUnicodeFallback */
- false, /* hasFromUnicodeFallback */
- 0, /* unicodeMask */
- 0, /* subChar1 */
- { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
- };
- namespace { /* anonymous namespace: _ISO2022JPData is visible only inside this translation unit */
- const UConverterSharedData _ISO2022JPData= /* shared-data object for ISO-2022-JP */
- UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022JPStaticData, &_ISO2022JPImpl); /* pairs the JP static descriptor with the JP function table */
- } // namespace
- #if !UCONFIG_ONLY_HTML_CONVERSION
- /************* KR ***************/
- static const UConverterImpl _ISO2022KRImpl={ /* positional function table for ISO-2022-KR, compiled only when UCONFIG_ONLY_HTML_CONVERSION is 0; slot names below assume the UConverterImpl layout declared outside this chunk (TODO confirm) */
- UCNV_ISO_2022, /* type */
- nullptr, /* load: not provided */
- nullptr, /* unload: not provided */
- _ISO2022Open, /* open: shared with the other ISO 2022 tables in this file */
- _ISO2022Close, /* close */
- _ISO2022Reset, /* reset */
- UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC, /* toUnicode */
- UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC, /* toUnicodeWithOffsets: same function as the previous slot */
- UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC, /* fromUnicode */
- UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC, /* fromUnicodeWithOffsets: same function as the previous slot */
- nullptr, /* getNextUChar: not provided */
- nullptr, /* getStarters: not provided */
- _ISO2022getName, /* getName */
- _ISO_2022_WriteSub, /* writeSub */
- _ISO_2022_SafeClone, /* safeClone */
- _ISO_2022_GetUnicodeSet, /* getUnicodeSet */
- nullptr, /* toUTF8: not provided */
- nullptr /* fromUTF8: not provided */
- };
- static const UConverterStaticData _ISO2022KRStaticData={ /* static descriptor for the "ISO_2022_KR" converter; field names below assume the UConverterStaticData layout declared outside this chunk (TODO confirm) */
- sizeof(UConverterStaticData), /* structSize */
- "ISO_2022_KR", /* name */
- 0, /* codepage */
- UCNV_IBM, /* platform */
- UCNV_ISO_2022, /* conversionType */
- 1, /* minBytesPerChar */
- 8, /* maxBytesPerChar: max 8 bytes per char16_t */
- { 0x1a, 0, 0, 0 }, /* subChar: one meaningful byte, 0x1a */
- 1, /* subCharLen */
- false, /* hasToUnicodeFallback */
- false, /* hasFromUnicodeFallback */
- 0, /* unicodeMask */
- 0, /* subChar1 */
- { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
- };
- namespace { /* anonymous namespace: _ISO2022KRData is visible only inside this translation unit */
- const UConverterSharedData _ISO2022KRData= /* shared-data object for ISO-2022-KR */
- UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022KRStaticData, &_ISO2022KRImpl); /* pairs the KR static descriptor with the KR function table */
- } // namespace
- /*************** CN ***************/
- static const UConverterImpl _ISO2022CNImpl={ /* positional function table for ISO-2022-CN, compiled only when UCONFIG_ONLY_HTML_CONVERSION is 0; slot names below assume the UConverterImpl layout declared outside this chunk (TODO confirm) */
- UCNV_ISO_2022, /* type */
- nullptr, /* load: not provided */
- nullptr, /* unload: not provided */
- _ISO2022Open, /* open: shared with the other ISO 2022 tables in this file */
- _ISO2022Close, /* close */
- _ISO2022Reset, /* reset */
- UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC, /* toUnicode */
- UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC, /* toUnicodeWithOffsets: same function as the previous slot */
- UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC, /* fromUnicode */
- UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC, /* fromUnicodeWithOffsets: same function as the previous slot */
- nullptr, /* getNextUChar: not provided */
- nullptr, /* getStarters: not provided */
- _ISO2022getName, /* getName */
- _ISO_2022_WriteSub, /* writeSub */
- _ISO_2022_SafeClone, /* safeClone */
- _ISO_2022_GetUnicodeSet, /* getUnicodeSet */
- nullptr, /* toUTF8: not provided */
- nullptr /* fromUTF8: not provided */
- };
- static const UConverterStaticData _ISO2022CNStaticData={ /* static descriptor for the "ISO_2022_CN" converter; field names below assume the UConverterStaticData layout declared outside this chunk (TODO confirm) */
- sizeof(UConverterStaticData), /* structSize */
- "ISO_2022_CN", /* name */
- 0, /* codepage */
- UCNV_IBM, /* platform */
- UCNV_ISO_2022, /* conversionType */
- 1, /* minBytesPerChar */
- 8, /* maxBytesPerChar: max 8 bytes per char16_t: 4-byte CNS designator + 2 bytes for SS2/SS3 + DBCS */
- { 0x1a, 0, 0, 0 }, /* subChar: one meaningful byte, 0x1a */
- 1, /* subCharLen */
- false, /* hasToUnicodeFallback */
- false, /* hasFromUnicodeFallback */
- 0, /* unicodeMask */
- 0, /* subChar1 */
- { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
- };
- namespace { /* anonymous namespace: _ISO2022CNData is visible only inside this translation unit */
- const UConverterSharedData _ISO2022CNData= /* shared-data object for ISO-2022-CN */
- UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISO2022CNStaticData, &_ISO2022CNImpl); /* pairs the CN static descriptor with the CN function table */
- } // namespace
- #endif /* #if !UCONFIG_ONLY_HTML_CONVERSION */
- #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */
|