_codecs_iso2022.c 35 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150
  1. /*
  2. * _codecs_iso2022.c: Codecs collection for ISO-2022 encodings.
  3. *
  4. * Written by Hye-Shik Chang <perky@FreeBSD.org>
  5. */
  6. #define USING_IMPORTED_MAPS
  7. #define USING_BINARY_PAIR_SEARCH
  8. #define EXTERN_JISX0213_PAIR
  9. #define EMULATE_JISX0213_2000_ENCODE_INVALID MAP_UNMAPPABLE
  10. #define EMULATE_JISX0213_2000_DECODE_INVALID MAP_UNMAPPABLE
  11. #define CJK_MOD_SPECIFIC_STATE \
  12. /* kr */ \
  13. const encode_map *cp949_encmap; \
  14. const decode_map *ksx1001_decmap; \
  15. \
  16. /* jp */ \
  17. const encode_map *jisxcommon_encmap; \
  18. const decode_map *jisx0208_decmap; \
  19. const decode_map *jisx0212_decmap; \
  20. const encode_map *jisx0213_bmp_encmap; \
  21. const decode_map *jisx0213_1_bmp_decmap; \
  22. const decode_map *jisx0213_2_bmp_decmap; \
  23. const encode_map *jisx0213_emp_encmap; \
  24. const decode_map *jisx0213_1_emp_decmap; \
  25. const decode_map *jisx0213_2_emp_decmap; \
  26. \
  27. /* cn */ \
  28. const encode_map *gbcommon_encmap; \
  29. const decode_map *gb2312_decmap;
  30. #include "cjkcodecs.h"
  31. #include "alg_jisx0201.h"
  32. #include "emu_jisx0213_2000.h"
  33. #include "mappings_jisx0213_pair.h"
/* STATE

   state->c[0-3]

    00000000
    ||^^^^^|
    |+-----+----  G0-3 Character Set
    +-----------  Is G0-3 double byte?

   state->c[4]

    00000000
          ||
          |+----  Locked-Shift?
          +-----  ESC Throughout
*/
  46. #define ESC 0x1B
  47. #define SO 0x0E
  48. #define SI 0x0F
  49. #define LF 0x0A
  50. #define MAX_ESCSEQLEN 16
  51. #define CHARSET_ISO8859_1 'A'
  52. #define CHARSET_ASCII 'B'
  53. #define CHARSET_ISO8859_7 'F'
  54. #define CHARSET_JISX0201_K 'I'
  55. #define CHARSET_JISX0201_R 'J'
  56. #define CHARSET_GB2312 ('A'|CHARSET_DBCS)
  57. #define CHARSET_JISX0208 ('B'|CHARSET_DBCS)
  58. #define CHARSET_KSX1001 ('C'|CHARSET_DBCS)
  59. #define CHARSET_JISX0212 ('D'|CHARSET_DBCS)
  60. #define CHARSET_GB2312_8565 ('E'|CHARSET_DBCS)
  61. #define CHARSET_CNS11643_1 ('G'|CHARSET_DBCS)
  62. #define CHARSET_CNS11643_2 ('H'|CHARSET_DBCS)
  63. #define CHARSET_JISX0213_2000_1 ('O'|CHARSET_DBCS)
  64. #define CHARSET_JISX0213_2 ('P'|CHARSET_DBCS)
  65. #define CHARSET_JISX0213_2004_1 ('Q'|CHARSET_DBCS)
  66. #define CHARSET_JISX0208_O ('@'|CHARSET_DBCS)
  67. #define CHARSET_DBCS 0x80
  68. #define ESCMARK(mark) ((mark) & 0x7f)
  69. #define IS_ESCEND(c) (((c) >= 'A' && (c) <= 'Z') || (c) == '@')
  70. #define IS_ISO2022ESC(c2) \
  71. ((c2) == '(' || (c2) == ')' || (c2) == '$' || \
  72. (c2) == '.' || (c2) == '&')
  73. /* this is not a complete list of ISO-2022 escape sequence headers.
  74. * but, it's enough to implement CJK instances of iso-2022. */
  75. #define MAP_UNMAPPABLE 0xFFFF
  76. #define MAP_MULTIPLE_AVAIL 0xFFFE /* for JIS X 0213 */
  77. #define F_SHIFTED 0x01
  78. #define F_ESCTHROUGHOUT 0x02
  79. #define STATE_SETG(dn, v) do { ((state)->c[dn]) = (v); } while (0)
  80. #define STATE_GETG(dn) ((state)->c[dn])
  81. #define STATE_G0 STATE_GETG(0)
  82. #define STATE_G1 STATE_GETG(1)
  83. #define STATE_G2 STATE_GETG(2)
  84. #define STATE_G3 STATE_GETG(3)
  85. #define STATE_SETG0(v) STATE_SETG(0, v)
  86. #define STATE_SETG1(v) STATE_SETG(1, v)
  87. #define STATE_SETG2(v) STATE_SETG(2, v)
  88. #define STATE_SETG3(v) STATE_SETG(3, v)
  89. #define STATE_SETFLAG(f) do { ((state)->c[4]) |= (f); } while (0)
  90. #define STATE_GETFLAG(f) ((state)->c[4] & (f))
  91. #define STATE_CLEARFLAG(f) do { ((state)->c[4]) &= ~(f); } while (0)
  92. #define STATE_CLEARFLAGS() do { ((state)->c[4]) = 0; } while (0)
  93. #define ISO2022_CONFIG ((const struct iso2022_config *)(codec->config))
  94. #define CONFIG_ISSET(flag) (ISO2022_CONFIG->flags & (flag))
  95. #define CONFIG_DESIGNATIONS (ISO2022_CONFIG->designations)
  96. /* iso2022_config.flags */
  97. #define NO_SHIFT 0x01
  98. #define USE_G2 0x02
  99. #define USE_JISX0208_EXT 0x04
  100. /*-*- internal data structures -*-*/
  101. typedef int (*iso2022_init_func)(const MultibyteCodec *codec);
  102. typedef Py_UCS4 (*iso2022_decode_func)(const MultibyteCodec *codec,
  103. const unsigned char *data);
  104. typedef DBCHAR (*iso2022_encode_func)(const MultibyteCodec *codec,
  105. const Py_UCS4 *data,
  106. Py_ssize_t *length);
  107. struct iso2022_designation {
  108. unsigned char mark;
  109. unsigned char plane;
  110. unsigned char width;
  111. iso2022_init_func initializer;
  112. iso2022_decode_func decoder;
  113. iso2022_encode_func encoder;
  114. };
  115. struct iso2022_config {
  116. int flags;
  117. const struct iso2022_designation *designations; /* non-ascii desigs */
  118. };
  119. /*-*- iso-2022 codec implementation -*-*/
  120. CODEC_INIT(iso2022)
  121. {
  122. const struct iso2022_designation *desig;
  123. for (desig = CONFIG_DESIGNATIONS; desig->mark; desig++) {
  124. if (desig->initializer != NULL && desig->initializer(codec) != 0) {
  125. return -1;
  126. }
  127. }
  128. return 0;
  129. }
  130. ENCODER_INIT(iso2022)
  131. {
  132. STATE_CLEARFLAGS();
  133. STATE_SETG0(CHARSET_ASCII);
  134. STATE_SETG1(CHARSET_ASCII);
  135. return 0;
  136. }
  137. ENCODER_RESET(iso2022)
  138. {
  139. if (STATE_GETFLAG(F_SHIFTED)) {
  140. WRITEBYTE1(SI);
  141. NEXT_OUT(1);
  142. STATE_CLEARFLAG(F_SHIFTED);
  143. }
  144. if (STATE_G0 != CHARSET_ASCII) {
  145. WRITEBYTE3(ESC, '(', 'B');
  146. NEXT_OUT(3);
  147. STATE_SETG0(CHARSET_ASCII);
  148. }
  149. return 0;
  150. }
  151. ENCODER(iso2022)
  152. {
  153. while (*inpos < inlen) {
  154. const struct iso2022_designation *dsg;
  155. DBCHAR encoded;
  156. Py_UCS4 c = INCHAR1;
  157. Py_ssize_t insize;
  158. if (c < 0x80) {
  159. if (STATE_G0 != CHARSET_ASCII) {
  160. WRITEBYTE3(ESC, '(', 'B');
  161. STATE_SETG0(CHARSET_ASCII);
  162. NEXT_OUT(3);
  163. }
  164. if (STATE_GETFLAG(F_SHIFTED)) {
  165. WRITEBYTE1(SI);
  166. STATE_CLEARFLAG(F_SHIFTED);
  167. NEXT_OUT(1);
  168. }
  169. WRITEBYTE1((unsigned char)c);
  170. NEXT(1, 1);
  171. continue;
  172. }
  173. insize = 1;
  174. encoded = MAP_UNMAPPABLE;
  175. for (dsg = CONFIG_DESIGNATIONS; dsg->mark; dsg++) {
  176. Py_UCS4 buf[2] = {c, 0};
  177. Py_ssize_t length = 1;
  178. encoded = dsg->encoder(codec, buf, &length);
  179. if (encoded == MAP_MULTIPLE_AVAIL) {
  180. /* this implementation won't work for pair
  181. * of non-bmp characters. */
  182. if (inlen - *inpos < 2) {
  183. if (!(flags & MBENC_FLUSH))
  184. return MBERR_TOOFEW;
  185. length = -1;
  186. }
  187. else {
  188. buf[1] = INCHAR2;
  189. length = 2;
  190. }
  191. encoded = dsg->encoder(codec, buf, &length);
  192. if (encoded != MAP_UNMAPPABLE) {
  193. insize = length;
  194. break;
  195. }
  196. }
  197. else if (encoded != MAP_UNMAPPABLE)
  198. break;
  199. }
  200. if (!dsg->mark)
  201. return 1;
  202. assert(dsg->width == 1 || dsg->width == 2);
  203. switch (dsg->plane) {
  204. case 0: /* G0 */
  205. if (STATE_GETFLAG(F_SHIFTED)) {
  206. WRITEBYTE1(SI);
  207. STATE_CLEARFLAG(F_SHIFTED);
  208. NEXT_OUT(1);
  209. }
  210. if (STATE_G0 != dsg->mark) {
  211. if (dsg->width == 1) {
  212. WRITEBYTE3(ESC, '(', ESCMARK(dsg->mark));
  213. STATE_SETG0(dsg->mark);
  214. NEXT_OUT(3);
  215. }
  216. else if (dsg->mark == CHARSET_JISX0208) {
  217. WRITEBYTE3(ESC, '$', ESCMARK(dsg->mark));
  218. STATE_SETG0(dsg->mark);
  219. NEXT_OUT(3);
  220. }
  221. else {
  222. WRITEBYTE4(ESC, '$', '(',
  223. ESCMARK(dsg->mark));
  224. STATE_SETG0(dsg->mark);
  225. NEXT_OUT(4);
  226. }
  227. }
  228. break;
  229. case 1: /* G1 */
  230. if (STATE_G1 != dsg->mark) {
  231. if (dsg->width == 1) {
  232. WRITEBYTE3(ESC, ')', ESCMARK(dsg->mark));
  233. STATE_SETG1(dsg->mark);
  234. NEXT_OUT(3);
  235. }
  236. else {
  237. WRITEBYTE4(ESC, '$', ')', ESCMARK(dsg->mark));
  238. STATE_SETG1(dsg->mark);
  239. NEXT_OUT(4);
  240. }
  241. }
  242. if (!STATE_GETFLAG(F_SHIFTED)) {
  243. WRITEBYTE1(SO);
  244. STATE_SETFLAG(F_SHIFTED);
  245. NEXT_OUT(1);
  246. }
  247. break;
  248. default: /* G2 and G3 is not supported: no encoding in
  249. * CJKCodecs are using them yet */
  250. return MBERR_INTERNAL;
  251. }
  252. if (dsg->width == 1) {
  253. WRITEBYTE1((unsigned char)encoded);
  254. NEXT_OUT(1);
  255. }
  256. else {
  257. WRITEBYTE2(encoded >> 8, encoded & 0xff);
  258. NEXT_OUT(2);
  259. }
  260. NEXT_INCHAR(insize);
  261. }
  262. return 0;
  263. }
  264. DECODER_INIT(iso2022)
  265. {
  266. STATE_CLEARFLAGS();
  267. STATE_SETG0(CHARSET_ASCII);
  268. STATE_SETG1(CHARSET_ASCII);
  269. STATE_SETG2(CHARSET_ASCII);
  270. return 0;
  271. }
  272. DECODER_RESET(iso2022)
  273. {
  274. STATE_SETG0(CHARSET_ASCII);
  275. STATE_CLEARFLAG(F_SHIFTED);
  276. return 0;
  277. }
  278. static Py_ssize_t
  279. iso2022processesc(const MultibyteCodec *codec, MultibyteCodec_State *state,
  280. const unsigned char **inbuf, Py_ssize_t *inleft)
  281. {
  282. unsigned char charset, designation;
  283. Py_ssize_t i, esclen = 0;
  284. for (i = 1;i < MAX_ESCSEQLEN;i++) {
  285. if (i >= *inleft)
  286. return MBERR_TOOFEW;
  287. if (IS_ESCEND((*inbuf)[i])) {
  288. esclen = i + 1;
  289. break;
  290. }
  291. else if (CONFIG_ISSET(USE_JISX0208_EXT) && i+1 < *inleft &&
  292. (*inbuf)[i] == '&' && (*inbuf)[i+1] == '@') {
  293. i += 2;
  294. }
  295. }
  296. switch (esclen) {
  297. case 0:
  298. return 1; /* unterminated escape sequence */
  299. case 3:
  300. if (INBYTE2 == '$') {
  301. charset = INBYTE3 | CHARSET_DBCS;
  302. designation = 0;
  303. }
  304. else {
  305. charset = INBYTE3;
  306. if (INBYTE2 == '(')
  307. designation = 0;
  308. else if (INBYTE2 == ')')
  309. designation = 1;
  310. else if (CONFIG_ISSET(USE_G2) && INBYTE2 == '.')
  311. designation = 2;
  312. else
  313. return 3;
  314. }
  315. break;
  316. case 4:
  317. if (INBYTE2 != '$')
  318. return 4;
  319. charset = INBYTE4 | CHARSET_DBCS;
  320. if (INBYTE3 == '(')
  321. designation = 0;
  322. else if (INBYTE3 == ')')
  323. designation = 1;
  324. else
  325. return 4;
  326. break;
  327. case 6: /* designation with prefix */
  328. if (CONFIG_ISSET(USE_JISX0208_EXT) &&
  329. (*inbuf)[3] == ESC && (*inbuf)[4] == '$' &&
  330. (*inbuf)[5] == 'B') {
  331. charset = 'B' | CHARSET_DBCS;
  332. designation = 0;
  333. }
  334. else
  335. return 6;
  336. break;
  337. default:
  338. return esclen;
  339. }
  340. /* raise error when the charset is not designated for this encoding */
  341. if (charset != CHARSET_ASCII) {
  342. const struct iso2022_designation *dsg;
  343. for (dsg = CONFIG_DESIGNATIONS; dsg->mark; dsg++) {
  344. if (dsg->mark == charset)
  345. break;
  346. }
  347. if (!dsg->mark)
  348. return esclen;
  349. }
  350. STATE_SETG(designation, charset);
  351. *inleft -= esclen;
  352. (*inbuf) += esclen;
  353. return 0;
  354. }
  355. #define ISO8859_7_DECODE(c, writer) \
  356. if ((c) < 0xa0) { \
  357. OUTCHAR(c); \
  358. } else if ((c) < 0xc0 && (0x288f3bc9L & (1L << ((c)-0xa0)))) { \
  359. OUTCHAR(c); \
  360. } else if ((c) >= 0xb4 && (c) <= 0xfe && ((c) >= 0xd4 || \
  361. (0xbffffd77L & (1L << ((c)-0xb4))))) { \
  362. OUTCHAR(0x02d0 + (c)); \
  363. } else if ((c) == 0xa1) { \
  364. OUTCHAR(0x2018); \
  365. } else if ((c) == 0xa2) { \
  366. OUTCHAR(0x2019); \
  367. } else if ((c) == 0xaf) { \
  368. OUTCHAR(0x2015); \
  369. }
  370. static Py_ssize_t
  371. iso2022processg2(const MultibyteCodec *codec, MultibyteCodec_State *state,
  372. const unsigned char **inbuf, Py_ssize_t *inleft,
  373. _PyUnicodeWriter *writer)
  374. {
  375. /* not written to use encoder, decoder functions because only few
  376. * encodings use G2 designations in CJKCodecs */
  377. if (STATE_G2 == CHARSET_ISO8859_1) {
  378. if (INBYTE3 < 0x80)
  379. OUTCHAR(INBYTE3 + 0x80);
  380. else
  381. return 3;
  382. }
  383. else if (STATE_G2 == CHARSET_ISO8859_7) {
  384. ISO8859_7_DECODE(INBYTE3 ^ 0x80, writer)
  385. else
  386. return 3;
  387. }
  388. else if (STATE_G2 == CHARSET_ASCII) {
  389. if (INBYTE3 & 0x80)
  390. return 3;
  391. else
  392. OUTCHAR(INBYTE3);
  393. }
  394. else
  395. return MBERR_INTERNAL;
  396. (*inbuf) += 3;
  397. *inleft -= 3;
  398. return 0;
  399. }
  400. DECODER(iso2022)
  401. {
  402. const struct iso2022_designation *dsgcache = NULL;
  403. while (inleft > 0) {
  404. unsigned char c = INBYTE1;
  405. Py_ssize_t err;
  406. if (STATE_GETFLAG(F_ESCTHROUGHOUT)) {
  407. /* ESC throughout mode:
  408. * for non-iso2022 escape sequences */
  409. OUTCHAR(c); /* assume as ISO-8859-1 */
  410. NEXT_IN(1);
  411. if (IS_ESCEND(c)) {
  412. STATE_CLEARFLAG(F_ESCTHROUGHOUT);
  413. }
  414. continue;
  415. }
  416. switch (c) {
  417. case ESC:
  418. REQUIRE_INBUF(2);
  419. if (IS_ISO2022ESC(INBYTE2)) {
  420. err = iso2022processesc(codec, state,
  421. inbuf, &inleft);
  422. if (err != 0)
  423. return err;
  424. }
  425. else if (CONFIG_ISSET(USE_G2) && INBYTE2 == 'N') {/* SS2 */
  426. REQUIRE_INBUF(3);
  427. err = iso2022processg2(codec, state,
  428. inbuf, &inleft, writer);
  429. if (err != 0)
  430. return err;
  431. }
  432. else {
  433. OUTCHAR(ESC);
  434. STATE_SETFLAG(F_ESCTHROUGHOUT);
  435. NEXT_IN(1);
  436. }
  437. break;
  438. case SI:
  439. if (CONFIG_ISSET(NO_SHIFT))
  440. goto bypass;
  441. STATE_CLEARFLAG(F_SHIFTED);
  442. NEXT_IN(1);
  443. break;
  444. case SO:
  445. if (CONFIG_ISSET(NO_SHIFT))
  446. goto bypass;
  447. STATE_SETFLAG(F_SHIFTED);
  448. NEXT_IN(1);
  449. break;
  450. case LF:
  451. STATE_CLEARFLAG(F_SHIFTED);
  452. OUTCHAR(LF);
  453. NEXT_IN(1);
  454. break;
  455. default:
  456. if (c < 0x20) /* C0 */
  457. goto bypass;
  458. else if (c >= 0x80)
  459. return 1;
  460. else {
  461. const struct iso2022_designation *dsg;
  462. unsigned char charset;
  463. Py_UCS4 decoded;
  464. if (STATE_GETFLAG(F_SHIFTED))
  465. charset = STATE_G1;
  466. else
  467. charset = STATE_G0;
  468. if (charset == CHARSET_ASCII) {
  469. bypass:
  470. OUTCHAR(c);
  471. NEXT_IN(1);
  472. break;
  473. }
  474. if (dsgcache != NULL &&
  475. dsgcache->mark == charset)
  476. dsg = dsgcache;
  477. else {
  478. for (dsg = CONFIG_DESIGNATIONS;
  479. dsg->mark != charset
  480. #ifdef Py_DEBUG
  481. && dsg->mark != '\0'
  482. #endif
  483. ; dsg++)
  484. {
  485. /* noop */
  486. }
  487. assert(dsg->mark != '\0');
  488. dsgcache = dsg;
  489. }
  490. REQUIRE_INBUF(dsg->width);
  491. decoded = dsg->decoder(codec, *inbuf);
  492. if (decoded == MAP_UNMAPPABLE)
  493. return dsg->width;
  494. if (decoded < 0x10000) {
  495. OUTCHAR(decoded);
  496. }
  497. else if (decoded < 0x30000) {
  498. OUTCHAR(decoded);
  499. }
  500. else { /* JIS X 0213 pairs */
  501. OUTCHAR2(decoded >> 16, decoded & 0xffff);
  502. }
  503. NEXT_IN(dsg->width);
  504. }
  505. break;
  506. }
  507. }
  508. return 0;
  509. }
  510. /*-*- mapping access functions -*-*/
  511. static int
  512. ksx1001_init(const MultibyteCodec *codec)
  513. {
  514. cjkcodecs_module_state *st = codec->modstate;
  515. if (IMPORT_MAP(kr, cp949, &st->cp949_encmap, NULL) ||
  516. IMPORT_MAP(kr, ksx1001, NULL, &st->ksx1001_decmap))
  517. {
  518. return -1;
  519. }
  520. return 0;
  521. }
  522. static Py_UCS4
  523. ksx1001_decoder(const MultibyteCodec *codec, const unsigned char *data)
  524. {
  525. Py_UCS4 u;
  526. if (TRYMAP_DEC_ST(ksx1001, u, data[0], data[1]))
  527. return u;
  528. else
  529. return MAP_UNMAPPABLE;
  530. }
  531. static DBCHAR
  532. ksx1001_encoder(const MultibyteCodec *codec, const Py_UCS4 *data,
  533. Py_ssize_t *length)
  534. {
  535. DBCHAR coded;
  536. assert(*length == 1);
  537. if (*data < 0x10000) {
  538. if (TRYMAP_ENC_ST(cp949, coded, *data)) {
  539. if (!(coded & 0x8000))
  540. return coded;
  541. }
  542. }
  543. return MAP_UNMAPPABLE;
  544. }
  545. static int
  546. jisx0208_init(const MultibyteCodec *codec)
  547. {
  548. cjkcodecs_module_state *st = codec->modstate;
  549. if (IMPORT_MAP(jp, jisxcommon, &st->jisxcommon_encmap, NULL) ||
  550. IMPORT_MAP(jp, jisx0208, NULL, &st->jisx0208_decmap))
  551. {
  552. return -1;
  553. }
  554. return 0;
  555. }
  556. static Py_UCS4
  557. jisx0208_decoder(const MultibyteCodec *codec, const unsigned char *data)
  558. {
  559. Py_UCS4 u;
  560. if (data[0] == 0x21 && data[1] == 0x40) /* F/W REVERSE SOLIDUS */
  561. return 0xff3c;
  562. else if (TRYMAP_DEC_ST(jisx0208, u, data[0], data[1]))
  563. return u;
  564. else
  565. return MAP_UNMAPPABLE;
  566. }
  567. static DBCHAR
  568. jisx0208_encoder(const MultibyteCodec *codec, const Py_UCS4 *data,
  569. Py_ssize_t *length)
  570. {
  571. DBCHAR coded;
  572. assert(*length == 1);
  573. if (*data < 0x10000) {
  574. if (*data == 0xff3c) /* F/W REVERSE SOLIDUS */
  575. return 0x2140;
  576. else if (TRYMAP_ENC_ST(jisxcommon, coded, *data)) {
  577. if (!(coded & 0x8000))
  578. return coded;
  579. }
  580. }
  581. return MAP_UNMAPPABLE;
  582. }
  583. static int
  584. jisx0212_init(const MultibyteCodec *codec)
  585. {
  586. cjkcodecs_module_state *st = codec->modstate;
  587. if (IMPORT_MAP(jp, jisxcommon, &st->jisxcommon_encmap, NULL) ||
  588. IMPORT_MAP(jp, jisx0212, NULL, &st->jisx0212_decmap))
  589. {
  590. return -1;
  591. }
  592. return 0;
  593. }
  594. static Py_UCS4
  595. jisx0212_decoder(const MultibyteCodec *codec, const unsigned char *data)
  596. {
  597. Py_UCS4 u;
  598. if (TRYMAP_DEC_ST(jisx0212, u, data[0], data[1]))
  599. return u;
  600. else
  601. return MAP_UNMAPPABLE;
  602. }
  603. static DBCHAR
  604. jisx0212_encoder(const MultibyteCodec *codec, const Py_UCS4 *data,
  605. Py_ssize_t *length)
  606. {
  607. DBCHAR coded;
  608. assert(*length == 1);
  609. if (*data < 0x10000) {
  610. if (TRYMAP_ENC_ST(jisxcommon, coded, *data)) {
  611. if (coded & 0x8000)
  612. return coded & 0x7fff;
  613. }
  614. }
  615. return MAP_UNMAPPABLE;
  616. }
  617. static int
  618. jisx0213_init(const MultibyteCodec *codec)
  619. {
  620. cjkcodecs_module_state *st = codec->modstate;
  621. if (jisx0208_init(codec) ||
  622. IMPORT_MAP(jp, jisx0213_bmp, &st->jisx0213_bmp_encmap, NULL) ||
  623. IMPORT_MAP(jp, jisx0213_1_bmp, NULL, &st->jisx0213_1_bmp_decmap) ||
  624. IMPORT_MAP(jp, jisx0213_2_bmp, NULL, &st->jisx0213_2_bmp_decmap) ||
  625. IMPORT_MAP(jp, jisx0213_emp, &st->jisx0213_emp_encmap, NULL) ||
  626. IMPORT_MAP(jp, jisx0213_1_emp, NULL, &st->jisx0213_1_emp_decmap) ||
  627. IMPORT_MAP(jp, jisx0213_2_emp, NULL, &st->jisx0213_2_emp_decmap) ||
  628. IMPORT_MAP(jp, jisx0213_pair,
  629. &jisx0213_pair_encmap, &jisx0213_pair_decmap))
  630. {
  631. return -1;
  632. }
  633. return 0;
  634. }
  635. #define config ((void *)2000)
  636. static Py_UCS4
  637. jisx0213_2000_1_decoder(const MultibyteCodec *codec, const unsigned char *data)
  638. {
  639. Py_UCS4 u;
  640. EMULATE_JISX0213_2000_DECODE_PLANE1(config, u, data[0], data[1])
  641. else if (data[0] == 0x21 && data[1] == 0x40) /* F/W REVERSE SOLIDUS */
  642. return 0xff3c;
  643. else if (TRYMAP_DEC_ST(jisx0208, u, data[0], data[1]))
  644. ;
  645. else if (TRYMAP_DEC_ST(jisx0213_1_bmp, u, data[0], data[1]))
  646. ;
  647. else if (TRYMAP_DEC_ST(jisx0213_1_emp, u, data[0], data[1]))
  648. u |= 0x20000;
  649. else if (TRYMAP_DEC(jisx0213_pair, u, data[0], data[1]))
  650. ;
  651. else
  652. return MAP_UNMAPPABLE;
  653. return u;
  654. }
  655. static Py_UCS4
  656. jisx0213_2000_2_decoder(const MultibyteCodec *codec, const unsigned char *data)
  657. {
  658. Py_UCS4 u;
  659. EMULATE_JISX0213_2000_DECODE_PLANE2_CHAR(config, u, data[0], data[1])
  660. if (TRYMAP_DEC_ST(jisx0213_2_bmp, u, data[0], data[1]))
  661. ;
  662. else if (TRYMAP_DEC_ST(jisx0213_2_emp, u, data[0], data[1]))
  663. u |= 0x20000;
  664. else
  665. return MAP_UNMAPPABLE;
  666. return u;
  667. }
  668. #undef config
  669. static Py_UCS4
  670. jisx0213_2004_1_decoder(const MultibyteCodec *codec, const unsigned char *data)
  671. {
  672. Py_UCS4 u;
  673. if (data[0] == 0x21 && data[1] == 0x40) /* F/W REVERSE SOLIDUS */
  674. return 0xff3c;
  675. else if (TRYMAP_DEC_ST(jisx0208, u, data[0], data[1]))
  676. ;
  677. else if (TRYMAP_DEC_ST(jisx0213_1_bmp, u, data[0], data[1]))
  678. ;
  679. else if (TRYMAP_DEC_ST(jisx0213_1_emp, u, data[0], data[1]))
  680. u |= 0x20000;
  681. else if (TRYMAP_DEC(jisx0213_pair, u, data[0], data[1]))
  682. ;
  683. else
  684. return MAP_UNMAPPABLE;
  685. return u;
  686. }
  687. static Py_UCS4
  688. jisx0213_2004_2_decoder(const MultibyteCodec *codec, const unsigned char *data)
  689. {
  690. Py_UCS4 u;
  691. if (TRYMAP_DEC_ST(jisx0213_2_bmp, u, data[0], data[1]))
  692. ;
  693. else if (TRYMAP_DEC_ST(jisx0213_2_emp, u, data[0], data[1]))
  694. u |= 0x20000;
  695. else
  696. return MAP_UNMAPPABLE;
  697. return u;
  698. }
  699. static DBCHAR
  700. jisx0213_encoder(const MultibyteCodec *codec, const Py_UCS4 *data,
  701. Py_ssize_t *length, const void *config)
  702. {
  703. DBCHAR coded;
  704. switch (*length) {
  705. case 1: /* first character */
  706. if (*data >= 0x10000) {
  707. if ((*data) >> 16 == 0x20000 >> 16) {
  708. EMULATE_JISX0213_2000_ENCODE_EMP(config, coded, *data)
  709. else if (TRYMAP_ENC_ST(jisx0213_emp, coded, (*data) & 0xffff))
  710. return coded;
  711. }
  712. return MAP_UNMAPPABLE;
  713. }
  714. EMULATE_JISX0213_2000_ENCODE_BMP(config, coded, *data)
  715. else if (TRYMAP_ENC_ST(jisx0213_bmp, coded, *data)) {
  716. if (coded == MULTIC)
  717. return MAP_MULTIPLE_AVAIL;
  718. }
  719. else if (TRYMAP_ENC_ST(jisxcommon, coded, *data)) {
  720. if (coded & 0x8000)
  721. return MAP_UNMAPPABLE;
  722. }
  723. else
  724. return MAP_UNMAPPABLE;
  725. return coded;
  726. case 2: /* second character of unicode pair */
  727. coded = find_pairencmap((ucs2_t)data[0], (ucs2_t)data[1],
  728. jisx0213_pair_encmap, JISX0213_ENCPAIRS);
  729. if (coded != DBCINV)
  730. return coded;
  731. /* fall through */
  732. case -1: /* flush unterminated */
  733. *length = 1;
  734. coded = find_pairencmap((ucs2_t)data[0], 0,
  735. jisx0213_pair_encmap, JISX0213_ENCPAIRS);
  736. if (coded == DBCINV)
  737. return MAP_UNMAPPABLE;
  738. else
  739. return coded;
  740. break;
  741. default:
  742. return MAP_UNMAPPABLE;
  743. }
  744. }
  745. static DBCHAR
  746. jisx0213_2000_1_encoder(const MultibyteCodec *codec, const Py_UCS4 *data,
  747. Py_ssize_t *length)
  748. {
  749. DBCHAR coded = jisx0213_encoder(codec, data, length, (void *)2000);
  750. if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL)
  751. return coded;
  752. else if (coded & 0x8000)
  753. return MAP_UNMAPPABLE;
  754. else
  755. return coded;
  756. }
  757. static DBCHAR
  758. jisx0213_2000_1_encoder_paironly(const MultibyteCodec *codec,
  759. const Py_UCS4 *data, Py_ssize_t *length)
  760. {
  761. DBCHAR coded;
  762. Py_ssize_t ilength = *length;
  763. coded = jisx0213_encoder(codec, data, length, (void *)2000);
  764. switch (ilength) {
  765. case 1:
  766. if (coded == MAP_MULTIPLE_AVAIL)
  767. return MAP_MULTIPLE_AVAIL;
  768. else
  769. return MAP_UNMAPPABLE;
  770. case 2:
  771. if (*length != 2)
  772. return MAP_UNMAPPABLE;
  773. else
  774. return coded;
  775. default:
  776. return MAP_UNMAPPABLE;
  777. }
  778. }
  779. static DBCHAR
  780. jisx0213_2000_2_encoder(const MultibyteCodec *codec, const Py_UCS4 *data,
  781. Py_ssize_t *length)
  782. {
  783. DBCHAR coded = jisx0213_encoder(codec, data, length, (void *)2000);
  784. if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL)
  785. return coded;
  786. else if (coded & 0x8000)
  787. return coded & 0x7fff;
  788. else
  789. return MAP_UNMAPPABLE;
  790. }
  791. static DBCHAR
  792. jisx0213_2004_1_encoder(const MultibyteCodec *codec, const Py_UCS4 *data,
  793. Py_ssize_t *length)
  794. {
  795. DBCHAR coded = jisx0213_encoder(codec, data, length, NULL);
  796. if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL)
  797. return coded;
  798. else if (coded & 0x8000)
  799. return MAP_UNMAPPABLE;
  800. else
  801. return coded;
  802. }
  803. static DBCHAR
  804. jisx0213_2004_1_encoder_paironly(const MultibyteCodec *codec,
  805. const Py_UCS4 *data, Py_ssize_t *length)
  806. {
  807. DBCHAR coded;
  808. Py_ssize_t ilength = *length;
  809. coded = jisx0213_encoder(codec, data, length, NULL);
  810. switch (ilength) {
  811. case 1:
  812. if (coded == MAP_MULTIPLE_AVAIL)
  813. return MAP_MULTIPLE_AVAIL;
  814. else
  815. return MAP_UNMAPPABLE;
  816. case 2:
  817. if (*length != 2)
  818. return MAP_UNMAPPABLE;
  819. else
  820. return coded;
  821. default:
  822. return MAP_UNMAPPABLE;
  823. }
  824. }
  825. static DBCHAR
  826. jisx0213_2004_2_encoder(const MultibyteCodec *codec, const Py_UCS4 *data,
  827. Py_ssize_t *length)
  828. {
  829. DBCHAR coded = jisx0213_encoder(codec, data, length, NULL);
  830. if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL)
  831. return coded;
  832. else if (coded & 0x8000)
  833. return coded & 0x7fff;
  834. else
  835. return MAP_UNMAPPABLE;
  836. }
  837. static Py_UCS4
  838. jisx0201_r_decoder(const MultibyteCodec *codec, const unsigned char *data)
  839. {
  840. Py_UCS4 u;
  841. JISX0201_R_DECODE_CHAR(*data, u)
  842. else
  843. return MAP_UNMAPPABLE;
  844. return u;
  845. }
  846. static DBCHAR
  847. jisx0201_r_encoder(const MultibyteCodec *codec, const Py_UCS4 *data,
  848. Py_ssize_t *length)
  849. {
  850. DBCHAR coded;
  851. JISX0201_R_ENCODE(*data, coded)
  852. else
  853. return MAP_UNMAPPABLE;
  854. return coded;
  855. }
  856. static Py_UCS4
  857. jisx0201_k_decoder(const MultibyteCodec *codec, const unsigned char *data)
  858. {
  859. Py_UCS4 u;
  860. JISX0201_K_DECODE_CHAR(*data ^ 0x80, u)
  861. else
  862. return MAP_UNMAPPABLE;
  863. return u;
  864. }
  865. static DBCHAR
  866. jisx0201_k_encoder(const MultibyteCodec *codec, const Py_UCS4 *data,
  867. Py_ssize_t *length)
  868. {
  869. DBCHAR coded;
  870. JISX0201_K_ENCODE(*data, coded)
  871. else
  872. return MAP_UNMAPPABLE;
  873. return coded - 0x80;
  874. }
  875. static int
  876. gb2312_init(const MultibyteCodec *codec)
  877. {
  878. cjkcodecs_module_state *st = codec->modstate;
  879. if (IMPORT_MAP(cn, gbcommon, &st->gbcommon_encmap, NULL) ||
  880. IMPORT_MAP(cn, gb2312, NULL, &st->gb2312_decmap))
  881. {
  882. return -1;
  883. }
  884. return 0;
  885. }
  886. static Py_UCS4
  887. gb2312_decoder(const MultibyteCodec *codec, const unsigned char *data)
  888. {
  889. Py_UCS4 u;
  890. if (TRYMAP_DEC_ST(gb2312, u, data[0], data[1]))
  891. return u;
  892. else
  893. return MAP_UNMAPPABLE;
  894. }
  895. static DBCHAR
  896. gb2312_encoder(const MultibyteCodec *codec, const Py_UCS4 *data,
  897. Py_ssize_t *length)
  898. {
  899. DBCHAR coded;
  900. assert(*length == 1);
  901. if (*data < 0x10000) {
  902. if (TRYMAP_ENC_ST(gbcommon, coded, *data)) {
  903. if (!(coded & 0x8000))
  904. return coded;
  905. }
  906. }
  907. return MAP_UNMAPPABLE;
  908. }
  909. static Py_UCS4
  910. dummy_decoder(const MultibyteCodec *codec, const unsigned char *data)
  911. {
  912. return MAP_UNMAPPABLE;
  913. }
  914. static DBCHAR
  915. dummy_encoder(const MultibyteCodec *codec, const Py_UCS4 *data,
  916. Py_ssize_t *length)
  917. {
  918. return MAP_UNMAPPABLE;
  919. }
  920. /*-*- registry tables -*-*/
  921. #define REGISTRY_KSX1001_G0 { CHARSET_KSX1001, 0, 2, \
  922. ksx1001_init, \
  923. ksx1001_decoder, ksx1001_encoder }
  924. #define REGISTRY_KSX1001_G1 { CHARSET_KSX1001, 1, 2, \
  925. ksx1001_init, \
  926. ksx1001_decoder, ksx1001_encoder }
  927. #define REGISTRY_JISX0201_R { CHARSET_JISX0201_R, 0, 1, \
  928. NULL, \
  929. jisx0201_r_decoder, jisx0201_r_encoder }
  930. #define REGISTRY_JISX0201_K { CHARSET_JISX0201_K, 0, 1, \
  931. NULL, \
  932. jisx0201_k_decoder, jisx0201_k_encoder }
  933. #define REGISTRY_JISX0208 { CHARSET_JISX0208, 0, 2, \
  934. jisx0208_init, \
  935. jisx0208_decoder, jisx0208_encoder }
  936. #define REGISTRY_JISX0208_O { CHARSET_JISX0208_O, 0, 2, \
  937. jisx0208_init, \
  938. jisx0208_decoder, jisx0208_encoder }
  939. #define REGISTRY_JISX0212 { CHARSET_JISX0212, 0, 2, \
  940. jisx0212_init, \
  941. jisx0212_decoder, jisx0212_encoder }
  /*
   * JIS X 0213 registry entries, for the 2000 and 2004 editions.
   *
   * - The *_PAIRONLY entries reuse the id and decoder of the matching *_1
   *   entry but plug in the *_encoder_paironly encoder.
   * - Both *_2 entries carry the same id, CHARSET_JISX0213_2, and differ
   *   only in their decoder/encoder functions.
   * All entries share jisx0213_init.
   */
  #define REGISTRY_JISX0213_2000_1 { \
          CHARSET_JISX0213_2000_1, 0, 2, \
          jisx0213_init, \
          jisx0213_2000_1_decoder, jisx0213_2000_1_encoder \
      }
  #define REGISTRY_JISX0213_2000_1_PAIRONLY { \
          CHARSET_JISX0213_2000_1, 0, 2, \
          jisx0213_init, \
          jisx0213_2000_1_decoder, jisx0213_2000_1_encoder_paironly \
      }
  #define REGISTRY_JISX0213_2000_2 { \
          CHARSET_JISX0213_2, 0, 2, \
          jisx0213_init, \
          jisx0213_2000_2_decoder, jisx0213_2000_2_encoder \
      }
  #define REGISTRY_JISX0213_2004_1 { \
          CHARSET_JISX0213_2004_1, 0, 2, \
          jisx0213_init, \
          jisx0213_2004_1_decoder, jisx0213_2004_1_encoder \
      }
  #define REGISTRY_JISX0213_2004_1_PAIRONLY { \
          CHARSET_JISX0213_2004_1, 0, 2, \
          jisx0213_init, \
          jisx0213_2004_1_decoder, jisx0213_2004_1_encoder_paironly \
      }
  #define REGISTRY_JISX0213_2004_2 { \
          CHARSET_JISX0213_2, 0, 2, \
          jisx0213_init, \
          jisx0213_2004_2_decoder, jisx0213_2004_2_encoder \
      }
  /*
   * GB2312 and CNS 11643 registry entries.  The CNS 11643 entries use 1
   * and 2 as their second value, where GB2312 uses 0.
   */
  #define REGISTRY_GB2312 { \
          CHARSET_GB2312, 0, 2, \
          gb2312_init, gb2312_decoder, gb2312_encoder \
      }
  #define REGISTRY_CNS11643_1 { \
          CHARSET_CNS11643_1, 1, 2, \
          cns11643_init, cns11643_1_decoder, cns11643_1_encoder \
      }
  #define REGISTRY_CNS11643_2 { \
          CHARSET_CNS11643_2, 2, 2, \
          cns11643_init, cns11643_2_decoder, cns11643_2_encoder \
      }
  /*
   * ISO 8859-1 / ISO 8859-7 registry entries.  Both have no init function
   * and are wired to dummy_decoder/dummy_encoder, which always report
   * MAP_UNMAPPABLE.
   */
  #define REGISTRY_ISO8859_1 { \
          CHARSET_ISO8859_1, 2, 1, \
          NULL, dummy_decoder, dummy_encoder \
      }
  #define REGISTRY_ISO8859_7 { \
          CHARSET_ISO8859_7, 2, 1, \
          NULL, dummy_decoder, dummy_encoder \
      }

  /* Terminating entry for a designation table: first value is 0. */
  #define REGISTRY_SENTINEL { 0, }
  /*
   * CONFIGDEF(var, attrs) defines the object iso2022_<var>_config, a
   * static const struct iso2022_config initialised with `attrs` followed by
   * the array iso2022_<var>_designations (which must already be declared).
   * The expansion ends with its own semicolon, so invocations need none.
   */
  #define CONFIGDEF(var, attrs) \
      static const struct iso2022_config iso2022_##var##_config = { \
          attrs, \
          iso2022_##var##_designations \
      };
  984. static const struct iso2022_designation iso2022_kr_designations[] = {
  985. REGISTRY_KSX1001_G1, REGISTRY_SENTINEL
  986. };
  987. CONFIGDEF(kr, 0)
  988. static const struct iso2022_designation iso2022_jp_designations[] = {
  989. REGISTRY_JISX0208, REGISTRY_JISX0201_R, REGISTRY_JISX0208_O,
  990. REGISTRY_SENTINEL
  991. };
  992. CONFIGDEF(jp, NO_SHIFT | USE_JISX0208_EXT)
  993. static const struct iso2022_designation iso2022_jp_1_designations[] = {
  994. REGISTRY_JISX0208, REGISTRY_JISX0212, REGISTRY_JISX0201_R,
  995. REGISTRY_JISX0208_O, REGISTRY_SENTINEL
  996. };
  997. CONFIGDEF(jp_1, NO_SHIFT | USE_JISX0208_EXT)
  998. static const struct iso2022_designation iso2022_jp_2_designations[] = {
  999. REGISTRY_JISX0208, REGISTRY_JISX0212, REGISTRY_KSX1001_G0,
  1000. REGISTRY_GB2312, REGISTRY_JISX0201_R, REGISTRY_JISX0208_O,
  1001. REGISTRY_ISO8859_1, REGISTRY_ISO8859_7, REGISTRY_SENTINEL
  1002. };
  1003. CONFIGDEF(jp_2, NO_SHIFT | USE_G2 | USE_JISX0208_EXT)
  1004. static const struct iso2022_designation iso2022_jp_2004_designations[] = {
  1005. REGISTRY_JISX0213_2004_1_PAIRONLY, REGISTRY_JISX0208,
  1006. REGISTRY_JISX0213_2004_1, REGISTRY_JISX0213_2004_2, REGISTRY_SENTINEL
  1007. };
  1008. CONFIGDEF(jp_2004, NO_SHIFT | USE_JISX0208_EXT)
  1009. static const struct iso2022_designation iso2022_jp_3_designations[] = {
  1010. REGISTRY_JISX0213_2000_1_PAIRONLY, REGISTRY_JISX0208,
  1011. REGISTRY_JISX0213_2000_1, REGISTRY_JISX0213_2000_2, REGISTRY_SENTINEL
  1012. };
  1013. CONFIGDEF(jp_3, NO_SHIFT | USE_JISX0208_EXT)
  1014. static const struct iso2022_designation iso2022_jp_ext_designations[] = {
  1015. REGISTRY_JISX0208, REGISTRY_JISX0212, REGISTRY_JISX0201_R,
  1016. REGISTRY_JISX0201_K, REGISTRY_JISX0208_O, REGISTRY_SENTINEL
  1017. };
  1018. CONFIGDEF(jp_ext, NO_SHIFT | USE_JISX0208_EXT)
  1019. BEGIN_MAPPINGS_LIST(0)
  1020. /* no mapping table here */
  1021. END_MAPPINGS_LIST
  /*
   * ISO2022_CODEC(variation) assigns one MultibyteCodec compound literal
   * to NEXT_CODEC.  The literal holds, in order: the name string
   * "iso2022_<variation>", the address of iso2022_<variation>_config,
   * iso2022_codec_init, and whatever _STATEFUL_METHODS(iso2022) expands to.
   * The expansion ends with its own semicolon.
   */
  #define ISO2022_CODEC(variation) \
      NEXT_CODEC = (MultibyteCodec){ \
          "iso2022_" #variation, \
          &iso2022_##variation##_config, \
          iso2022_codec_init, \
          _STATEFUL_METHODS(iso2022) \
      };
  1029. BEGIN_CODECS_LIST(7)
  1030. ISO2022_CODEC(kr)
  1031. ISO2022_CODEC(jp)
  1032. ISO2022_CODEC(jp_1)
  1033. ISO2022_CODEC(jp_2)
  1034. ISO2022_CODEC(jp_2004)
  1035. ISO2022_CODEC(jp_3)
  1036. ISO2022_CODEC(jp_ext)
  1037. END_CODECS_LIST
  1038. I_AM_A_MODULE_FOR(iso2022)