12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150 |
- /*
- * _codecs_iso2022.c: Codecs collection for ISO-2022 encodings.
- *
- * Written by Hye-Shik Chang <perky@FreeBSD.org>
- */
- #define USING_IMPORTED_MAPS
- #define USING_BINARY_PAIR_SEARCH
- #define EXTERN_JISX0213_PAIR
- #define EMULATE_JISX0213_2000_ENCODE_INVALID MAP_UNMAPPABLE
- #define EMULATE_JISX0213_2000_DECODE_INVALID MAP_UNMAPPABLE
- #define CJK_MOD_SPECIFIC_STATE \
- /* kr */ \
- const encode_map *cp949_encmap; \
- const decode_map *ksx1001_decmap; \
- \
- /* jp */ \
- const encode_map *jisxcommon_encmap; \
- const decode_map *jisx0208_decmap; \
- const decode_map *jisx0212_decmap; \
- const encode_map *jisx0213_bmp_encmap; \
- const decode_map *jisx0213_1_bmp_decmap; \
- const decode_map *jisx0213_2_bmp_decmap; \
- const encode_map *jisx0213_emp_encmap; \
- const decode_map *jisx0213_1_emp_decmap; \
- const decode_map *jisx0213_2_emp_decmap; \
- \
- /* cn */ \
- const encode_map *gbcommon_encmap; \
- const decode_map *gb2312_decmap;
- #include "cjkcodecs.h"
- #include "alg_jisx0201.h"
- #include "emu_jisx0213_2000.h"
- #include "mappings_jisx0213_pair.h"
- /* STATE
-    state->c[0-3]
-     00000000
-     ||^^^^^|
-     |+-----+----  G0-3 Character Set
-     +-----------  Is G0-3 double byte?
-    state->c[4]
-     00000000
-           ||
-           |+----  Locked-Shift? (F_SHIFTED, 0x01)
-           +-----  ESC Throughout (F_ESCTHROUGHOUT, 0x02)
- */
/* Control characters with special meaning to the ISO-2022 state machine. */
#define ESC                     0x1B
#define SO                      0x0E
#define SI                      0x0F
#define LF                      0x0A

/* Upper bound on the number of bytes examined while looking for the end of
 * an escape sequence. */
#define MAX_ESCSEQLEN           16

/* Bit OR-ed into a charset identifier to mark it as double-byte. */
#define CHARSET_DBCS            0x80

/* Single-byte character sets. */
#define CHARSET_ISO8859_1       'A'
#define CHARSET_ASCII           'B'
#define CHARSET_ISO8859_7       'F'
#define CHARSET_JISX0201_K      'I'
#define CHARSET_JISX0201_R      'J'

/* Double-byte character sets. */
#define CHARSET_JISX0208_O      ('@'|CHARSET_DBCS)
#define CHARSET_GB2312          ('A'|CHARSET_DBCS)
#define CHARSET_JISX0208        ('B'|CHARSET_DBCS)
#define CHARSET_KSX1001         ('C'|CHARSET_DBCS)
#define CHARSET_JISX0212        ('D'|CHARSET_DBCS)
#define CHARSET_GB2312_8565     ('E'|CHARSET_DBCS)
#define CHARSET_CNS11643_1      ('G'|CHARSET_DBCS)
#define CHARSET_CNS11643_2      ('H'|CHARSET_DBCS)
#define CHARSET_JISX0213_2000_1 ('O'|CHARSET_DBCS)
#define CHARSET_JISX0213_2      ('P'|CHARSET_DBCS)
#define CHARSET_JISX0213_2004_1 ('Q'|CHARSET_DBCS)

/* Drop the double-byte flag, leaving the byte written in escape sequences. */
#define ESCMARK(mark)           ((mark) & 0x7f)

/* True for bytes that terminate an escape sequence. */
#define IS_ESCEND(c)            (((c) >= 'A' && (c) <= 'Z') || (c) == '@')

/* True when `c2`, the byte following ESC, starts an escape sequence this
 * codec handles.  This is not a complete list of ISO-2022 escape sequence
 * headers, but it is enough to implement the CJK instances of ISO-2022. */
#define IS_ISO2022ESC(c2)                                   \
    ((c2) == '(' || (c2) == ')' || (c2) == '$' ||           \
     (c2) == '.' || (c2) == '&')

/* Sentinel results shared by the per-charset encoder/decoder callbacks. */
#define MAP_UNMAPPABLE          0xFFFF
#define MAP_MULTIPLE_AVAIL      0xFFFE  /* for JIS X 0213 */

/* Flag bits kept in state->c[4]. */
#define F_SHIFTED               0x01
#define F_ESCTHROUGHOUT         0x02

/* Accessors for the designated charsets G0..G3 held in state->c[0..3].
 * All of them expect a variable named `state` in the enclosing scope. */
#define STATE_SETG(dn, v)       do { ((state)->c[dn]) = (v); } while (0)
#define STATE_GETG(dn)          ((state)->c[dn])

#define STATE_G0                STATE_GETG(0)
#define STATE_G1                STATE_GETG(1)
#define STATE_G2                STATE_GETG(2)
#define STATE_G3                STATE_GETG(3)

#define STATE_SETG0(v)          STATE_SETG(0, v)
#define STATE_SETG1(v)          STATE_SETG(1, v)
#define STATE_SETG2(v)          STATE_SETG(2, v)
#define STATE_SETG3(v)          STATE_SETG(3, v)

/* Accessors for the flag byte state->c[4]. */
#define STATE_SETFLAG(f)        do { ((state)->c[4]) |= (f); } while (0)
#define STATE_GETFLAG(f)        ((state)->c[4] & (f))
#define STATE_CLEARFLAG(f)      do { ((state)->c[4]) &= ~(f); } while (0)
#define STATE_CLEARFLAGS()      do { ((state)->c[4]) = 0; } while (0)

/* Per-encoding configuration, reached through a variable named `codec`. */
#define ISO2022_CONFIG          ((const struct iso2022_config *)(codec->config))
#define CONFIG_ISSET(flag)      (ISO2022_CONFIG->flags & (flag))
#define CONFIG_DESIGNATIONS     (ISO2022_CONFIG->designations)

/* iso2022_config.flags */
#define NO_SHIFT                0x01
#define USE_G2                  0x02
#define USE_JISX0208_EXT        0x04
- /*-*- internal data structures -*-*/
- typedef int (*iso2022_init_func)(const MultibyteCodec *codec);
- typedef Py_UCS4 (*iso2022_decode_func)(const MultibyteCodec *codec,
- const unsigned char *data);
- typedef DBCHAR (*iso2022_encode_func)(const MultibyteCodec *codec,
- const Py_UCS4 *data,
- Py_ssize_t *length);
- struct iso2022_designation {
- unsigned char mark;
- unsigned char plane;
- unsigned char width;
- iso2022_init_func initializer;
- iso2022_decode_func decoder;
- iso2022_encode_func encoder;
- };
- struct iso2022_config {
- int flags;
- const struct iso2022_designation *designations; /* non-ascii desigs */
- };
- /*-*- iso-2022 codec implementation -*-*/
- CODEC_INIT(iso2022)
- {
- const struct iso2022_designation *desig;
- for (desig = CONFIG_DESIGNATIONS; desig->mark; desig++) {
- if (desig->initializer != NULL && desig->initializer(codec) != 0) {
- return -1;
- }
- }
- return 0;
- }
- ENCODER_INIT(iso2022)
- {
- STATE_CLEARFLAGS();
- STATE_SETG0(CHARSET_ASCII);
- STATE_SETG1(CHARSET_ASCII);
- return 0;
- }
- ENCODER_RESET(iso2022)
- {
- if (STATE_GETFLAG(F_SHIFTED)) {
- WRITEBYTE1(SI);
- NEXT_OUT(1);
- STATE_CLEARFLAG(F_SHIFTED);
- }
- if (STATE_G0 != CHARSET_ASCII) {
- WRITEBYTE3(ESC, '(', 'B');
- NEXT_OUT(3);
- STATE_SETG0(CHARSET_ASCII);
- }
- return 0;
- }
- ENCODER(iso2022)
- {
- while (*inpos < inlen) {
- const struct iso2022_designation *dsg;
- DBCHAR encoded;
- Py_UCS4 c = INCHAR1;
- Py_ssize_t insize;
- if (c < 0x80) {
- if (STATE_G0 != CHARSET_ASCII) {
- WRITEBYTE3(ESC, '(', 'B');
- STATE_SETG0(CHARSET_ASCII);
- NEXT_OUT(3);
- }
- if (STATE_GETFLAG(F_SHIFTED)) {
- WRITEBYTE1(SI);
- STATE_CLEARFLAG(F_SHIFTED);
- NEXT_OUT(1);
- }
- WRITEBYTE1((unsigned char)c);
- NEXT(1, 1);
- continue;
- }
- insize = 1;
- encoded = MAP_UNMAPPABLE;
- for (dsg = CONFIG_DESIGNATIONS; dsg->mark; dsg++) {
- Py_UCS4 buf[2] = {c, 0};
- Py_ssize_t length = 1;
- encoded = dsg->encoder(codec, buf, &length);
- if (encoded == MAP_MULTIPLE_AVAIL) {
- /* this implementation won't work for pair
- * of non-bmp characters. */
- if (inlen - *inpos < 2) {
- if (!(flags & MBENC_FLUSH))
- return MBERR_TOOFEW;
- length = -1;
- }
- else {
- buf[1] = INCHAR2;
- length = 2;
- }
- encoded = dsg->encoder(codec, buf, &length);
- if (encoded != MAP_UNMAPPABLE) {
- insize = length;
- break;
- }
- }
- else if (encoded != MAP_UNMAPPABLE)
- break;
- }
- if (!dsg->mark)
- return 1;
- assert(dsg->width == 1 || dsg->width == 2);
- switch (dsg->plane) {
- case 0: /* G0 */
- if (STATE_GETFLAG(F_SHIFTED)) {
- WRITEBYTE1(SI);
- STATE_CLEARFLAG(F_SHIFTED);
- NEXT_OUT(1);
- }
- if (STATE_G0 != dsg->mark) {
- if (dsg->width == 1) {
- WRITEBYTE3(ESC, '(', ESCMARK(dsg->mark));
- STATE_SETG0(dsg->mark);
- NEXT_OUT(3);
- }
- else if (dsg->mark == CHARSET_JISX0208) {
- WRITEBYTE3(ESC, '$', ESCMARK(dsg->mark));
- STATE_SETG0(dsg->mark);
- NEXT_OUT(3);
- }
- else {
- WRITEBYTE4(ESC, '$', '(',
- ESCMARK(dsg->mark));
- STATE_SETG0(dsg->mark);
- NEXT_OUT(4);
- }
- }
- break;
- case 1: /* G1 */
- if (STATE_G1 != dsg->mark) {
- if (dsg->width == 1) {
- WRITEBYTE3(ESC, ')', ESCMARK(dsg->mark));
- STATE_SETG1(dsg->mark);
- NEXT_OUT(3);
- }
- else {
- WRITEBYTE4(ESC, '$', ')', ESCMARK(dsg->mark));
- STATE_SETG1(dsg->mark);
- NEXT_OUT(4);
- }
- }
- if (!STATE_GETFLAG(F_SHIFTED)) {
- WRITEBYTE1(SO);
- STATE_SETFLAG(F_SHIFTED);
- NEXT_OUT(1);
- }
- break;
- default: /* G2 and G3 is not supported: no encoding in
- * CJKCodecs are using them yet */
- return MBERR_INTERNAL;
- }
- if (dsg->width == 1) {
- WRITEBYTE1((unsigned char)encoded);
- NEXT_OUT(1);
- }
- else {
- WRITEBYTE2(encoded >> 8, encoded & 0xff);
- NEXT_OUT(2);
- }
- NEXT_INCHAR(insize);
- }
- return 0;
- }
- DECODER_INIT(iso2022)
- {
- STATE_CLEARFLAGS();
- STATE_SETG0(CHARSET_ASCII);
- STATE_SETG1(CHARSET_ASCII);
- STATE_SETG2(CHARSET_ASCII);
- return 0;
- }
- DECODER_RESET(iso2022)
- {
- STATE_SETG0(CHARSET_ASCII);
- STATE_CLEARFLAG(F_SHIFTED);
- return 0;
- }
- static Py_ssize_t
- iso2022processesc(const MultibyteCodec *codec, MultibyteCodec_State *state,
- const unsigned char **inbuf, Py_ssize_t *inleft)
- {
- unsigned char charset, designation;
- Py_ssize_t i, esclen = 0;
- for (i = 1;i < MAX_ESCSEQLEN;i++) {
- if (i >= *inleft)
- return MBERR_TOOFEW;
- if (IS_ESCEND((*inbuf)[i])) {
- esclen = i + 1;
- break;
- }
- else if (CONFIG_ISSET(USE_JISX0208_EXT) && i+1 < *inleft &&
- (*inbuf)[i] == '&' && (*inbuf)[i+1] == '@') {
- i += 2;
- }
- }
- switch (esclen) {
- case 0:
- return 1; /* unterminated escape sequence */
- case 3:
- if (INBYTE2 == '$') {
- charset = INBYTE3 | CHARSET_DBCS;
- designation = 0;
- }
- else {
- charset = INBYTE3;
- if (INBYTE2 == '(')
- designation = 0;
- else if (INBYTE2 == ')')
- designation = 1;
- else if (CONFIG_ISSET(USE_G2) && INBYTE2 == '.')
- designation = 2;
- else
- return 3;
- }
- break;
- case 4:
- if (INBYTE2 != '$')
- return 4;
- charset = INBYTE4 | CHARSET_DBCS;
- if (INBYTE3 == '(')
- designation = 0;
- else if (INBYTE3 == ')')
- designation = 1;
- else
- return 4;
- break;
- case 6: /* designation with prefix */
- if (CONFIG_ISSET(USE_JISX0208_EXT) &&
- (*inbuf)[3] == ESC && (*inbuf)[4] == '$' &&
- (*inbuf)[5] == 'B') {
- charset = 'B' | CHARSET_DBCS;
- designation = 0;
- }
- else
- return 6;
- break;
- default:
- return esclen;
- }
- /* raise error when the charset is not designated for this encoding */
- if (charset != CHARSET_ASCII) {
- const struct iso2022_designation *dsg;
- for (dsg = CONFIG_DESIGNATIONS; dsg->mark; dsg++) {
- if (dsg->mark == charset)
- break;
- }
- if (!dsg->mark)
- return esclen;
- }
- STATE_SETG(designation, charset);
- *inleft -= esclen;
- (*inbuf) += esclen;
- return 0;
- }
/* Write the Unicode character for ISO-8859-7 byte value `c` via OUTCHAR.
 *
 * The expansion is an if / else-if chain WITHOUT a final else: the caller
 * appends its own `else` to handle values that have no mapping.
 *
 * The two bitmaps select which values in 0xa0..0xbf map to themselves and
 * which values in 0xb4..0xd3 map to 0x02d0 + c.  The shift count can reach
 * 31, so the shifts use unsigned long; `1L << 31` is undefined where long
 * is 32 bits wide.
 *
 * `writer` is not named in the expansion; OUTCHAR presumably picks up a
 * variable of that name from the enclosing scope.
 */
#define ISO8859_7_DECODE(c, writer)                                     \
    if ((c) < 0xa0) {                                                   \
        OUTCHAR(c);                                                     \
    } else if ((c) < 0xc0 && (0x288f3bc9UL & (1UL << ((c)-0xa0)))) {    \
        OUTCHAR(c);                                                     \
    } else if ((c) >= 0xb4 && (c) <= 0xfe && ((c) >= 0xd4 ||            \
               (0xbffffd77UL & (1UL << ((c)-0xb4))))) {                 \
        OUTCHAR(0x02d0 + (c));                                          \
    } else if ((c) == 0xa1) {                                           \
        OUTCHAR(0x2018);                                                \
    } else if ((c) == 0xa2) {                                           \
        OUTCHAR(0x2019);                                                \
    } else if ((c) == 0xaf) {                                           \
        OUTCHAR(0x2015);                                                \
    }
- static Py_ssize_t
- iso2022processg2(const MultibyteCodec *codec, MultibyteCodec_State *state,
- const unsigned char **inbuf, Py_ssize_t *inleft,
- _PyUnicodeWriter *writer)
- {
- /* not written to use encoder, decoder functions because only few
- * encodings use G2 designations in CJKCodecs */
- if (STATE_G2 == CHARSET_ISO8859_1) {
- if (INBYTE3 < 0x80)
- OUTCHAR(INBYTE3 + 0x80);
- else
- return 3;
- }
- else if (STATE_G2 == CHARSET_ISO8859_7) {
- ISO8859_7_DECODE(INBYTE3 ^ 0x80, writer)
- else
- return 3;
- }
- else if (STATE_G2 == CHARSET_ASCII) {
- if (INBYTE3 & 0x80)
- return 3;
- else
- OUTCHAR(INBYTE3);
- }
- else
- return MBERR_INTERNAL;
- (*inbuf) += 3;
- *inleft -= 3;
- return 0;
- }
- DECODER(iso2022)
- {
- const struct iso2022_designation *dsgcache = NULL;
- while (inleft > 0) {
- unsigned char c = INBYTE1;
- Py_ssize_t err;
- if (STATE_GETFLAG(F_ESCTHROUGHOUT)) {
- /* ESC throughout mode:
- * for non-iso2022 escape sequences */
- OUTCHAR(c); /* assume as ISO-8859-1 */
- NEXT_IN(1);
- if (IS_ESCEND(c)) {
- STATE_CLEARFLAG(F_ESCTHROUGHOUT);
- }
- continue;
- }
- switch (c) {
- case ESC:
- REQUIRE_INBUF(2);
- if (IS_ISO2022ESC(INBYTE2)) {
- err = iso2022processesc(codec, state,
- inbuf, &inleft);
- if (err != 0)
- return err;
- }
- else if (CONFIG_ISSET(USE_G2) && INBYTE2 == 'N') {/* SS2 */
- REQUIRE_INBUF(3);
- err = iso2022processg2(codec, state,
- inbuf, &inleft, writer);
- if (err != 0)
- return err;
- }
- else {
- OUTCHAR(ESC);
- STATE_SETFLAG(F_ESCTHROUGHOUT);
- NEXT_IN(1);
- }
- break;
- case SI:
- if (CONFIG_ISSET(NO_SHIFT))
- goto bypass;
- STATE_CLEARFLAG(F_SHIFTED);
- NEXT_IN(1);
- break;
- case SO:
- if (CONFIG_ISSET(NO_SHIFT))
- goto bypass;
- STATE_SETFLAG(F_SHIFTED);
- NEXT_IN(1);
- break;
- case LF:
- STATE_CLEARFLAG(F_SHIFTED);
- OUTCHAR(LF);
- NEXT_IN(1);
- break;
- default:
- if (c < 0x20) /* C0 */
- goto bypass;
- else if (c >= 0x80)
- return 1;
- else {
- const struct iso2022_designation *dsg;
- unsigned char charset;
- Py_UCS4 decoded;
- if (STATE_GETFLAG(F_SHIFTED))
- charset = STATE_G1;
- else
- charset = STATE_G0;
- if (charset == CHARSET_ASCII) {
- bypass:
- OUTCHAR(c);
- NEXT_IN(1);
- break;
- }
- if (dsgcache != NULL &&
- dsgcache->mark == charset)
- dsg = dsgcache;
- else {
- for (dsg = CONFIG_DESIGNATIONS;
- dsg->mark != charset
- #ifdef Py_DEBUG
- && dsg->mark != '\0'
- #endif
- ; dsg++)
- {
- /* noop */
- }
- assert(dsg->mark != '\0');
- dsgcache = dsg;
- }
- REQUIRE_INBUF(dsg->width);
- decoded = dsg->decoder(codec, *inbuf);
- if (decoded == MAP_UNMAPPABLE)
- return dsg->width;
- if (decoded < 0x10000) {
- OUTCHAR(decoded);
- }
- else if (decoded < 0x30000) {
- OUTCHAR(decoded);
- }
- else { /* JIS X 0213 pairs */
- OUTCHAR2(decoded >> 16, decoded & 0xffff);
- }
- NEXT_IN(dsg->width);
- }
- break;
- }
- }
- return 0;
- }
- /*-*- mapping access functions -*-*/
- static int
- ksx1001_init(const MultibyteCodec *codec)
- {
- cjkcodecs_module_state *st = codec->modstate;
- if (IMPORT_MAP(kr, cp949, &st->cp949_encmap, NULL) ||
- IMPORT_MAP(kr, ksx1001, NULL, &st->ksx1001_decmap))
- {
- return -1;
- }
- return 0;
- }
- static Py_UCS4
- ksx1001_decoder(const MultibyteCodec *codec, const unsigned char *data)
- {
- Py_UCS4 u;
- if (TRYMAP_DEC_ST(ksx1001, u, data[0], data[1]))
- return u;
- else
- return MAP_UNMAPPABLE;
- }
- static DBCHAR
- ksx1001_encoder(const MultibyteCodec *codec, const Py_UCS4 *data,
- Py_ssize_t *length)
- {
- DBCHAR coded;
- assert(*length == 1);
- if (*data < 0x10000) {
- if (TRYMAP_ENC_ST(cp949, coded, *data)) {
- if (!(coded & 0x8000))
- return coded;
- }
- }
- return MAP_UNMAPPABLE;
- }
- static int
- jisx0208_init(const MultibyteCodec *codec)
- {
- cjkcodecs_module_state *st = codec->modstate;
- if (IMPORT_MAP(jp, jisxcommon, &st->jisxcommon_encmap, NULL) ||
- IMPORT_MAP(jp, jisx0208, NULL, &st->jisx0208_decmap))
- {
- return -1;
- }
- return 0;
- }
- static Py_UCS4
- jisx0208_decoder(const MultibyteCodec *codec, const unsigned char *data)
- {
- Py_UCS4 u;
- if (data[0] == 0x21 && data[1] == 0x40) /* F/W REVERSE SOLIDUS */
- return 0xff3c;
- else if (TRYMAP_DEC_ST(jisx0208, u, data[0], data[1]))
- return u;
- else
- return MAP_UNMAPPABLE;
- }
- static DBCHAR
- jisx0208_encoder(const MultibyteCodec *codec, const Py_UCS4 *data,
- Py_ssize_t *length)
- {
- DBCHAR coded;
- assert(*length == 1);
- if (*data < 0x10000) {
- if (*data == 0xff3c) /* F/W REVERSE SOLIDUS */
- return 0x2140;
- else if (TRYMAP_ENC_ST(jisxcommon, coded, *data)) {
- if (!(coded & 0x8000))
- return coded;
- }
- }
- return MAP_UNMAPPABLE;
- }
- static int
- jisx0212_init(const MultibyteCodec *codec)
- {
- cjkcodecs_module_state *st = codec->modstate;
- if (IMPORT_MAP(jp, jisxcommon, &st->jisxcommon_encmap, NULL) ||
- IMPORT_MAP(jp, jisx0212, NULL, &st->jisx0212_decmap))
- {
- return -1;
- }
- return 0;
- }
- static Py_UCS4
- jisx0212_decoder(const MultibyteCodec *codec, const unsigned char *data)
- {
- Py_UCS4 u;
- if (TRYMAP_DEC_ST(jisx0212, u, data[0], data[1]))
- return u;
- else
- return MAP_UNMAPPABLE;
- }
- static DBCHAR
- jisx0212_encoder(const MultibyteCodec *codec, const Py_UCS4 *data,
- Py_ssize_t *length)
- {
- DBCHAR coded;
- assert(*length == 1);
- if (*data < 0x10000) {
- if (TRYMAP_ENC_ST(jisxcommon, coded, *data)) {
- if (coded & 0x8000)
- return coded & 0x7fff;
- }
- }
- return MAP_UNMAPPABLE;
- }
- static int
- jisx0213_init(const MultibyteCodec *codec)
- {
- cjkcodecs_module_state *st = codec->modstate;
- if (jisx0208_init(codec) ||
- IMPORT_MAP(jp, jisx0213_bmp, &st->jisx0213_bmp_encmap, NULL) ||
- IMPORT_MAP(jp, jisx0213_1_bmp, NULL, &st->jisx0213_1_bmp_decmap) ||
- IMPORT_MAP(jp, jisx0213_2_bmp, NULL, &st->jisx0213_2_bmp_decmap) ||
- IMPORT_MAP(jp, jisx0213_emp, &st->jisx0213_emp_encmap, NULL) ||
- IMPORT_MAP(jp, jisx0213_1_emp, NULL, &st->jisx0213_1_emp_decmap) ||
- IMPORT_MAP(jp, jisx0213_2_emp, NULL, &st->jisx0213_2_emp_decmap) ||
- IMPORT_MAP(jp, jisx0213_pair,
- &jisx0213_pair_encmap, &jisx0213_pair_decmap))
- {
- return -1;
- }
- return 0;
- }
- #define config ((void *)2000)
- static Py_UCS4
- jisx0213_2000_1_decoder(const MultibyteCodec *codec, const unsigned char *data)
- {
- Py_UCS4 u;
- EMULATE_JISX0213_2000_DECODE_PLANE1(config, u, data[0], data[1])
- else if (data[0] == 0x21 && data[1] == 0x40) /* F/W REVERSE SOLIDUS */
- return 0xff3c;
- else if (TRYMAP_DEC_ST(jisx0208, u, data[0], data[1]))
- ;
- else if (TRYMAP_DEC_ST(jisx0213_1_bmp, u, data[0], data[1]))
- ;
- else if (TRYMAP_DEC_ST(jisx0213_1_emp, u, data[0], data[1]))
- u |= 0x20000;
- else if (TRYMAP_DEC(jisx0213_pair, u, data[0], data[1]))
- ;
- else
- return MAP_UNMAPPABLE;
- return u;
- }
- static Py_UCS4
- jisx0213_2000_2_decoder(const MultibyteCodec *codec, const unsigned char *data)
- {
- Py_UCS4 u;
- EMULATE_JISX0213_2000_DECODE_PLANE2_CHAR(config, u, data[0], data[1])
- if (TRYMAP_DEC_ST(jisx0213_2_bmp, u, data[0], data[1]))
- ;
- else if (TRYMAP_DEC_ST(jisx0213_2_emp, u, data[0], data[1]))
- u |= 0x20000;
- else
- return MAP_UNMAPPABLE;
- return u;
- }
- #undef config
- static Py_UCS4
- jisx0213_2004_1_decoder(const MultibyteCodec *codec, const unsigned char *data)
- {
- Py_UCS4 u;
- if (data[0] == 0x21 && data[1] == 0x40) /* F/W REVERSE SOLIDUS */
- return 0xff3c;
- else if (TRYMAP_DEC_ST(jisx0208, u, data[0], data[1]))
- ;
- else if (TRYMAP_DEC_ST(jisx0213_1_bmp, u, data[0], data[1]))
- ;
- else if (TRYMAP_DEC_ST(jisx0213_1_emp, u, data[0], data[1]))
- u |= 0x20000;
- else if (TRYMAP_DEC(jisx0213_pair, u, data[0], data[1]))
- ;
- else
- return MAP_UNMAPPABLE;
- return u;
- }
- static Py_UCS4
- jisx0213_2004_2_decoder(const MultibyteCodec *codec, const unsigned char *data)
- {
- Py_UCS4 u;
- if (TRYMAP_DEC_ST(jisx0213_2_bmp, u, data[0], data[1]))
- ;
- else if (TRYMAP_DEC_ST(jisx0213_2_emp, u, data[0], data[1]))
- u |= 0x20000;
- else
- return MAP_UNMAPPABLE;
- return u;
- }
- static DBCHAR
- jisx0213_encoder(const MultibyteCodec *codec, const Py_UCS4 *data,
- Py_ssize_t *length, const void *config)
- {
- DBCHAR coded;
- switch (*length) {
- case 1: /* first character */
- if (*data >= 0x10000) {
- if ((*data) >> 16 == 0x20000 >> 16) {
- EMULATE_JISX0213_2000_ENCODE_EMP(config, coded, *data)
- else if (TRYMAP_ENC_ST(jisx0213_emp, coded, (*data) & 0xffff))
- return coded;
- }
- return MAP_UNMAPPABLE;
- }
- EMULATE_JISX0213_2000_ENCODE_BMP(config, coded, *data)
- else if (TRYMAP_ENC_ST(jisx0213_bmp, coded, *data)) {
- if (coded == MULTIC)
- return MAP_MULTIPLE_AVAIL;
- }
- else if (TRYMAP_ENC_ST(jisxcommon, coded, *data)) {
- if (coded & 0x8000)
- return MAP_UNMAPPABLE;
- }
- else
- return MAP_UNMAPPABLE;
- return coded;
- case 2: /* second character of unicode pair */
- coded = find_pairencmap((ucs2_t)data[0], (ucs2_t)data[1],
- jisx0213_pair_encmap, JISX0213_ENCPAIRS);
- if (coded != DBCINV)
- return coded;
- /* fall through */
- case -1: /* flush unterminated */
- *length = 1;
- coded = find_pairencmap((ucs2_t)data[0], 0,
- jisx0213_pair_encmap, JISX0213_ENCPAIRS);
- if (coded == DBCINV)
- return MAP_UNMAPPABLE;
- else
- return coded;
- break;
- default:
- return MAP_UNMAPPABLE;
- }
- }
- static DBCHAR
- jisx0213_2000_1_encoder(const MultibyteCodec *codec, const Py_UCS4 *data,
- Py_ssize_t *length)
- {
- DBCHAR coded = jisx0213_encoder(codec, data, length, (void *)2000);
- if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL)
- return coded;
- else if (coded & 0x8000)
- return MAP_UNMAPPABLE;
- else
- return coded;
- }
- static DBCHAR
- jisx0213_2000_1_encoder_paironly(const MultibyteCodec *codec,
- const Py_UCS4 *data, Py_ssize_t *length)
- {
- DBCHAR coded;
- Py_ssize_t ilength = *length;
- coded = jisx0213_encoder(codec, data, length, (void *)2000);
- switch (ilength) {
- case 1:
- if (coded == MAP_MULTIPLE_AVAIL)
- return MAP_MULTIPLE_AVAIL;
- else
- return MAP_UNMAPPABLE;
- case 2:
- if (*length != 2)
- return MAP_UNMAPPABLE;
- else
- return coded;
- default:
- return MAP_UNMAPPABLE;
- }
- }
- static DBCHAR
- jisx0213_2000_2_encoder(const MultibyteCodec *codec, const Py_UCS4 *data,
- Py_ssize_t *length)
- {
- DBCHAR coded = jisx0213_encoder(codec, data, length, (void *)2000);
- if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL)
- return coded;
- else if (coded & 0x8000)
- return coded & 0x7fff;
- else
- return MAP_UNMAPPABLE;
- }
- static DBCHAR
- jisx0213_2004_1_encoder(const MultibyteCodec *codec, const Py_UCS4 *data,
- Py_ssize_t *length)
- {
- DBCHAR coded = jisx0213_encoder(codec, data, length, NULL);
- if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL)
- return coded;
- else if (coded & 0x8000)
- return MAP_UNMAPPABLE;
- else
- return coded;
- }
- static DBCHAR
- jisx0213_2004_1_encoder_paironly(const MultibyteCodec *codec,
- const Py_UCS4 *data, Py_ssize_t *length)
- {
- DBCHAR coded;
- Py_ssize_t ilength = *length;
- coded = jisx0213_encoder(codec, data, length, NULL);
- switch (ilength) {
- case 1:
- if (coded == MAP_MULTIPLE_AVAIL)
- return MAP_MULTIPLE_AVAIL;
- else
- return MAP_UNMAPPABLE;
- case 2:
- if (*length != 2)
- return MAP_UNMAPPABLE;
- else
- return coded;
- default:
- return MAP_UNMAPPABLE;
- }
- }
- static DBCHAR
- jisx0213_2004_2_encoder(const MultibyteCodec *codec, const Py_UCS4 *data,
- Py_ssize_t *length)
- {
- DBCHAR coded = jisx0213_encoder(codec, data, length, NULL);
- if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL)
- return coded;
- else if (coded & 0x8000)
- return coded & 0x7fff;
- else
- return MAP_UNMAPPABLE;
- }
- static Py_UCS4
- jisx0201_r_decoder(const MultibyteCodec *codec, const unsigned char *data)
- {
- Py_UCS4 u;
- JISX0201_R_DECODE_CHAR(*data, u)
- else
- return MAP_UNMAPPABLE;
- return u;
- }
- static DBCHAR
- jisx0201_r_encoder(const MultibyteCodec *codec, const Py_UCS4 *data,
- Py_ssize_t *length)
- {
- DBCHAR coded;
- JISX0201_R_ENCODE(*data, coded)
- else
- return MAP_UNMAPPABLE;
- return coded;
- }
- static Py_UCS4
- jisx0201_k_decoder(const MultibyteCodec *codec, const unsigned char *data)
- {
- Py_UCS4 u;
- JISX0201_K_DECODE_CHAR(*data ^ 0x80, u)
- else
- return MAP_UNMAPPABLE;
- return u;
- }
- static DBCHAR
- jisx0201_k_encoder(const MultibyteCodec *codec, const Py_UCS4 *data,
- Py_ssize_t *length)
- {
- DBCHAR coded;
- JISX0201_K_ENCODE(*data, coded)
- else
- return MAP_UNMAPPABLE;
- return coded - 0x80;
- }
- static int
- gb2312_init(const MultibyteCodec *codec)
- {
- cjkcodecs_module_state *st = codec->modstate;
- if (IMPORT_MAP(cn, gbcommon, &st->gbcommon_encmap, NULL) ||
- IMPORT_MAP(cn, gb2312, NULL, &st->gb2312_decmap))
- {
- return -1;
- }
- return 0;
- }
- static Py_UCS4
- gb2312_decoder(const MultibyteCodec *codec, const unsigned char *data)
- {
- Py_UCS4 u;
- if (TRYMAP_DEC_ST(gb2312, u, data[0], data[1]))
- return u;
- else
- return MAP_UNMAPPABLE;
- }
- static DBCHAR
- gb2312_encoder(const MultibyteCodec *codec, const Py_UCS4 *data,
- Py_ssize_t *length)
- {
- DBCHAR coded;
- assert(*length == 1);
- if (*data < 0x10000) {
- if (TRYMAP_ENC_ST(gbcommon, coded, *data)) {
- if (!(coded & 0x8000))
- return coded;
- }
- }
- return MAP_UNMAPPABLE;
- }
- static Py_UCS4
- dummy_decoder(const MultibyteCodec *codec, const unsigned char *data)
- {
- return MAP_UNMAPPABLE;
- }
- static DBCHAR
- dummy_encoder(const MultibyteCodec *codec, const Py_UCS4 *data,
- Py_ssize_t *length)
- {
- return MAP_UNMAPPABLE;
- }
/*-*- registry tables -*-*/

/* Each REGISTRY_* macro expands to one positional initialiser of
 * struct iso2022_designation:
 *     { mark, plane, width, initializer, decoder, encoder }
 */

/* Korean */
#define REGISTRY_KSX1001_G0                                             \
    { CHARSET_KSX1001, 0, 2,                                            \
      ksx1001_init,                                                     \
      ksx1001_decoder, ksx1001_encoder }
#define REGISTRY_KSX1001_G1                                             \
    { CHARSET_KSX1001, 1, 2,                                            \
      ksx1001_init,                                                     \
      ksx1001_decoder, ksx1001_encoder }

/* Japanese, single byte */
#define REGISTRY_JISX0201_R                                             \
    { CHARSET_JISX0201_R, 0, 1,                                         \
      NULL,                                                             \
      jisx0201_r_decoder, jisx0201_r_encoder }
#define REGISTRY_JISX0201_K                                             \
    { CHARSET_JISX0201_K, 0, 1,                                         \
      NULL,                                                             \
      jisx0201_k_decoder, jisx0201_k_encoder }

/* Japanese, double byte */
#define REGISTRY_JISX0208                                               \
    { CHARSET_JISX0208, 0, 2,                                           \
      jisx0208_init,                                                    \
      jisx0208_decoder, jisx0208_encoder }
#define REGISTRY_JISX0208_O                                             \
    { CHARSET_JISX0208_O, 0, 2,                                         \
      jisx0208_init,                                                    \
      jisx0208_decoder, jisx0208_encoder }
#define REGISTRY_JISX0212                                               \
    { CHARSET_JISX0212, 0, 2,                                           \
      jisx0212_init,                                                    \
      jisx0212_decoder, jisx0212_encoder }

/* JIS X 0213, 2000 edition */
#define REGISTRY_JISX0213_2000_1                                        \
    { CHARSET_JISX0213_2000_1, 0, 2,                                    \
      jisx0213_init,                                                    \
      jisx0213_2000_1_decoder,                                          \
      jisx0213_2000_1_encoder }
#define REGISTRY_JISX0213_2000_1_PAIRONLY                               \
    { CHARSET_JISX0213_2000_1, 0, 2,                                    \
      jisx0213_init,                                                    \
      jisx0213_2000_1_decoder,                                          \
      jisx0213_2000_1_encoder_paironly }
#define REGISTRY_JISX0213_2000_2                                        \
    { CHARSET_JISX0213_2, 0, 2,                                         \
      jisx0213_init,                                                    \
      jisx0213_2000_2_decoder,                                          \
      jisx0213_2000_2_encoder }

/* JIS X 0213, 2004 edition */
#define REGISTRY_JISX0213_2004_1                                        \
    { CHARSET_JISX0213_2004_1, 0, 2,                                    \
      jisx0213_init,                                                    \
      jisx0213_2004_1_decoder,                                          \
      jisx0213_2004_1_encoder }
#define REGISTRY_JISX0213_2004_1_PAIRONLY                               \
    { CHARSET_JISX0213_2004_1, 0, 2,                                    \
      jisx0213_init,                                                    \
      jisx0213_2004_1_decoder,                                          \
      jisx0213_2004_1_encoder_paironly }
#define REGISTRY_JISX0213_2004_2                                        \
    { CHARSET_JISX0213_2, 0, 2,                                         \
      jisx0213_init,                                                    \
      jisx0213_2004_2_decoder,                                          \
      jisx0213_2004_2_encoder }

/* Chinese */
#define REGISTRY_GB2312                                                 \
    { CHARSET_GB2312, 0, 2,                                             \
      gb2312_init,                                                      \
      gb2312_decoder, gb2312_encoder }

/* NOTE(review): the two CNS 11643 entries are not used by any designation
 * table in this file, and the cns11643_* functions they name are not
 * defined in the code visible here. */
#define REGISTRY_CNS11643_1                                             \
    { CHARSET_CNS11643_1, 1, 2,                                         \
      cns11643_init,                                                    \
      cns11643_1_decoder, cns11643_1_encoder }
#define REGISTRY_CNS11643_2                                             \
    { CHARSET_CNS11643_2, 2, 2,                                         \
      cns11643_init,                                                    \
      cns11643_2_decoder, cns11643_2_encoder }

/* G2-only sets: decoded in iso2022processg2, hence the dummy callbacks */
#define REGISTRY_ISO8859_1                                              \
    { CHARSET_ISO8859_1, 2, 1,                                          \
      NULL, dummy_decoder, dummy_encoder }
#define REGISTRY_ISO8859_7                                              \
    { CHARSET_ISO8859_7, 2, 1,                                          \
      NULL, dummy_decoder, dummy_encoder }

/* Terminator: an entry whose mark is zero ends a designation table. */
#define REGISTRY_SENTINEL       { 0, }
- #define CONFIGDEF(var, attrs) \
- static const struct iso2022_config iso2022_##var##_config = { \
- attrs, iso2022_##var##_designations \
- };
- static const struct iso2022_designation iso2022_kr_designations[] = {
- REGISTRY_KSX1001_G1, REGISTRY_SENTINEL
- };
- CONFIGDEF(kr, 0)
- static const struct iso2022_designation iso2022_jp_designations[] = {
- REGISTRY_JISX0208, REGISTRY_JISX0201_R, REGISTRY_JISX0208_O,
- REGISTRY_SENTINEL
- };
- CONFIGDEF(jp, NO_SHIFT | USE_JISX0208_EXT)
- static const struct iso2022_designation iso2022_jp_1_designations[] = {
- REGISTRY_JISX0208, REGISTRY_JISX0212, REGISTRY_JISX0201_R,
- REGISTRY_JISX0208_O, REGISTRY_SENTINEL
- };
- CONFIGDEF(jp_1, NO_SHIFT | USE_JISX0208_EXT)
- static const struct iso2022_designation iso2022_jp_2_designations[] = {
- REGISTRY_JISX0208, REGISTRY_JISX0212, REGISTRY_KSX1001_G0,
- REGISTRY_GB2312, REGISTRY_JISX0201_R, REGISTRY_JISX0208_O,
- REGISTRY_ISO8859_1, REGISTRY_ISO8859_7, REGISTRY_SENTINEL
- };
- CONFIGDEF(jp_2, NO_SHIFT | USE_G2 | USE_JISX0208_EXT)
- static const struct iso2022_designation iso2022_jp_2004_designations[] = {
- REGISTRY_JISX0213_2004_1_PAIRONLY, REGISTRY_JISX0208,
- REGISTRY_JISX0213_2004_1, REGISTRY_JISX0213_2004_2, REGISTRY_SENTINEL
- };
- CONFIGDEF(jp_2004, NO_SHIFT | USE_JISX0208_EXT)
- static const struct iso2022_designation iso2022_jp_3_designations[] = {
- REGISTRY_JISX0213_2000_1_PAIRONLY, REGISTRY_JISX0208,
- REGISTRY_JISX0213_2000_1, REGISTRY_JISX0213_2000_2, REGISTRY_SENTINEL
- };
- CONFIGDEF(jp_3, NO_SHIFT | USE_JISX0208_EXT)
- static const struct iso2022_designation iso2022_jp_ext_designations[] = {
- REGISTRY_JISX0208, REGISTRY_JISX0212, REGISTRY_JISX0201_R,
- REGISTRY_JISX0201_K, REGISTRY_JISX0208_O, REGISTRY_SENTINEL
- };
- CONFIGDEF(jp_ext, NO_SHIFT | USE_JISX0208_EXT)
- BEGIN_MAPPINGS_LIST(0)
- /* no mapping table here */
- END_MAPPINGS_LIST
- #define ISO2022_CODEC(variation) \
- NEXT_CODEC = (MultibyteCodec){ \
- "iso2022_" #variation, \
- &iso2022_##variation##_config, \
- iso2022_codec_init, \
- _STATEFUL_METHODS(iso2022) \
- };
- BEGIN_CODECS_LIST(7)
- ISO2022_CODEC(kr)
- ISO2022_CODEC(jp)
- ISO2022_CODEC(jp_1)
- ISO2022_CODEC(jp_2)
- ISO2022_CODEC(jp_2004)
- ISO2022_CODEC(jp_3)
- ISO2022_CODEC(jp_ext)
- END_CODECS_LIST
- I_AM_A_MODULE_FOR(iso2022)
|