_codecs_cn.c 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470
  1. /*
  2. * _codecs_cn.c: Codecs collection for Mainland Chinese encodings
  3. *
  4. * Written by Hye-Shik Chang <perky@FreeBSD.org>
  5. */
  6. #include "cjkcodecs.h"
  7. #include "mappings_cn.h"
  /*
   * `hz` is predefined as 100 on AIX, so undefine it to avoid a clash with
   * the name of the hz codec defined in this file.
   */
  12. #ifdef _AIX
  13. #undef hz
  14. #endif
  /*
   * GBK and GB2312 disagree on a handful of code points:
   *
   *   bytes  gb2312                         gbk
   *   A1A4   U+30FB KATAKANA MIDDLE DOT     U+00B7 MIDDLE DOT
   *   A1AA   U+2015 HORIZONTAL BAR          U+2014 EM DASH
   *   A844   undefined                      U+2015 HORIZONTAL BAR
   *
   * GBK_DECODE(dc1, dc2, writer): decode the byte pair dc1 dc2 with GBK
   * semantics.  The three special cases above are tested first, then the
   * gb2312 table (both bytes with their high bit flipped), then the gbkext
   * table.
   *
   * The expansion is an if / else-if chain with no final else, so that the
   * call site can append its own `else` for an unmapped pair.  It uses a
   * variable named `decoded` from the enclosing scope, plus whatever OUTCHAR
   * and TRYMAP_DEC (defined elsewhere) require.  The `writer` argument is
   * not referenced by the expansion itself.
   *
   * dc1 and dc2 are parenthesized at every use so that an argument holding
   * a low-precedence operator (e.g. `a | b`) still expands as intended.
   * Both may be evaluated several times and must be free of side effects.
   */
  #define GBK_DECODE(dc1, dc2, writer)                                      \
      if ((dc1) == 0xa1 && (dc2) == 0xaa) {                                 \
          OUTCHAR(0x2014);                                                  \
      }                                                                     \
      else if ((dc1) == 0xa8 && (dc2) == 0x44) {                            \
          OUTCHAR(0x2015);                                                  \
      }                                                                     \
      else if ((dc1) == 0xa1 && (dc2) == 0xa4) {                            \
          OUTCHAR(0x00b7);                                                  \
      }                                                                     \
      else if (TRYMAP_DEC(gb2312, decoded, (dc1) ^ 0x80, (dc2) ^ 0x80)) {   \
          OUTCHAR(decoded);                                                 \
      }                                                                     \
      else if (TRYMAP_DEC(gbkext, decoded, (dc1), (dc2))) {                 \
          OUTCHAR(decoded);                                                 \
      }
  /*
   * GBK_ENCODE(code, assi): look up the two-byte GBK code for the code
   * point `code` and store it in `assi`.
   *
   * U+2014, U+2015 and U+00B7 are mapped to fixed values; U+30FB is never
   * looked up; everything else goes through the gbcommon table via
   * TRYMAP_ENC (defined elsewhere).
   *
   * Like GBK_DECODE, the expansion is an if / else-if chain without a final
   * else, so that the call site can append the `else` that handles a code
   * point with no mapping.  `code` may be evaluated several times.
   */
  #define GBK_ENCODE(code, assi)                                            \
      if ((code) == 0x2014) {                                               \
          (assi) = 0xa1aa;                                                  \
      }                                                                     \
      else if ((code) == 0x2015) {                                          \
          (assi) = 0xa844;                                                  \
      }                                                                     \
      else if ((code) == 0x00b7) {                                          \
          (assi) = 0xa1a4;                                                  \
      }                                                                     \
      else if ((code) != 0x30fb && TRYMAP_ENC(gbcommon, assi, code)) {      \
          /* table hit: nothing further to do in this branch */            \
      }
  /*
   * Index into MultibyteCodec_State.c[8] of the byte in which the codecs of
   * this file keep their state value (0 or 1).  Only the first byte is used.
   */
  #define CN_STATE_OFFSET 0
  53. /*
  54. * GB2312 codec
  55. */
  56. ENCODER(gb2312)
  57. {
  58. while (*inpos < inlen) {
  59. Py_UCS4 c = INCHAR1;
  60. DBCHAR code;
  61. if (c < 0x80) {
  62. WRITEBYTE1((unsigned char)c);
  63. NEXT(1, 1);
  64. continue;
  65. }
  66. if (c > 0xFFFF)
  67. return 1;
  68. REQUIRE_OUTBUF(2);
  69. if (TRYMAP_ENC(gbcommon, code, c))
  70. ;
  71. else
  72. return 1;
  73. if (code & 0x8000) /* MSB set: GBK */
  74. return 1;
  75. OUTBYTE1((code >> 8) | 0x80);
  76. OUTBYTE2((code & 0xFF) | 0x80);
  77. NEXT(1, 2);
  78. }
  79. return 0;
  80. }
  81. DECODER(gb2312)
  82. {
  83. while (inleft > 0) {
  84. unsigned char c = **inbuf;
  85. Py_UCS4 decoded;
  86. if (c < 0x80) {
  87. OUTCHAR(c);
  88. NEXT_IN(1);
  89. continue;
  90. }
  91. REQUIRE_INBUF(2);
  92. if (TRYMAP_DEC(gb2312, decoded, c ^ 0x80, INBYTE2 ^ 0x80)) {
  93. OUTCHAR(decoded);
  94. NEXT_IN(2);
  95. }
  96. else
  97. return 1;
  98. }
  99. return 0;
  100. }
  101. /*
  102. * GBK codec
  103. */
  104. ENCODER(gbk)
  105. {
  106. while (*inpos < inlen) {
  107. Py_UCS4 c = INCHAR1;
  108. DBCHAR code;
  109. if (c < 0x80) {
  110. WRITEBYTE1((unsigned char)c);
  111. NEXT(1, 1);
  112. continue;
  113. }
  114. if (c > 0xFFFF)
  115. return 1;
  116. REQUIRE_OUTBUF(2);
  117. GBK_ENCODE(c, code)
  118. else
  119. return 1;
  120. OUTBYTE1((code >> 8) | 0x80);
  121. if (code & 0x8000)
  122. OUTBYTE2((code & 0xFF)); /* MSB set: GBK */
  123. else
  124. OUTBYTE2((code & 0xFF) | 0x80); /* MSB unset: GB2312 */
  125. NEXT(1, 2);
  126. }
  127. return 0;
  128. }
  129. DECODER(gbk)
  130. {
  131. while (inleft > 0) {
  132. unsigned char c = INBYTE1;
  133. Py_UCS4 decoded;
  134. if (c < 0x80) {
  135. OUTCHAR(c);
  136. NEXT_IN(1);
  137. continue;
  138. }
  139. REQUIRE_INBUF(2);
  140. GBK_DECODE(c, INBYTE2, writer)
  141. else
  142. return 1;
  143. NEXT_IN(2);
  144. }
  145. return 0;
  146. }
  147. /*
  148. * GB18030 codec
  149. */
  150. ENCODER(gb18030)
  151. {
  152. while (*inpos < inlen) {
  153. Py_UCS4 c = INCHAR1;
  154. DBCHAR code;
  155. if (c < 0x80) {
  156. WRITEBYTE1(c);
  157. NEXT(1, 1);
  158. continue;
  159. }
  160. if (c >= 0x10000) {
  161. Py_UCS4 tc = c - 0x10000;
  162. assert (c <= 0x10FFFF);
  163. REQUIRE_OUTBUF(4);
  164. OUTBYTE4((unsigned char)(tc % 10) + 0x30);
  165. tc /= 10;
  166. OUTBYTE3((unsigned char)(tc % 126) + 0x81);
  167. tc /= 126;
  168. OUTBYTE2((unsigned char)(tc % 10) + 0x30);
  169. tc /= 10;
  170. OUTBYTE1((unsigned char)(tc + 0x90));
  171. NEXT(1, 4);
  172. continue;
  173. }
  174. REQUIRE_OUTBUF(2);
  175. GBK_ENCODE(c, code)
  176. else if (TRYMAP_ENC(gb18030ext, code, c))
  177. ;
  178. else {
  179. const struct _gb18030_to_unibmp_ranges *utrrange;
  180. REQUIRE_OUTBUF(4);
  181. for (utrrange = gb18030_to_unibmp_ranges;
  182. utrrange->first != 0;
  183. utrrange++)
  184. if (utrrange->first <= c &&
  185. c <= utrrange->last) {
  186. Py_UCS4 tc;
  187. tc = c - utrrange->first +
  188. utrrange->base;
  189. OUTBYTE4((unsigned char)(tc % 10) + 0x30);
  190. tc /= 10;
  191. OUTBYTE3((unsigned char)(tc % 126) + 0x81);
  192. tc /= 126;
  193. OUTBYTE2((unsigned char)(tc % 10) + 0x30);
  194. tc /= 10;
  195. OUTBYTE1((unsigned char)tc + 0x81);
  196. NEXT(1, 4);
  197. break;
  198. }
  199. if (utrrange->first == 0)
  200. return 1;
  201. continue;
  202. }
  203. OUTBYTE1((code >> 8) | 0x80);
  204. if (code & 0x8000)
  205. OUTBYTE2((code & 0xFF)); /* MSB set: GBK or GB18030ext */
  206. else
  207. OUTBYTE2((code & 0xFF) | 0x80); /* MSB unset: GB2312 */
  208. NEXT(1, 2);
  209. }
  210. return 0;
  211. }
  212. DECODER(gb18030)
  213. {
  214. while (inleft > 0) {
  215. unsigned char c = INBYTE1, c2;
  216. Py_UCS4 decoded;
  217. if (c < 0x80) {
  218. OUTCHAR(c);
  219. NEXT_IN(1);
  220. continue;
  221. }
  222. REQUIRE_INBUF(2);
  223. c2 = INBYTE2;
  224. if (c2 >= 0x30 && c2 <= 0x39) { /* 4 bytes seq */
  225. const struct _gb18030_to_unibmp_ranges *utr;
  226. unsigned char c3, c4;
  227. Py_UCS4 lseq;
  228. REQUIRE_INBUF(4);
  229. c3 = INBYTE3;
  230. c4 = INBYTE4;
  231. if (c < 0x81 || c > 0xFE ||
  232. c3 < 0x81 || c3 > 0xFE ||
  233. c4 < 0x30 || c4 > 0x39)
  234. return 1;
  235. c -= 0x81; c2 -= 0x30;
  236. c3 -= 0x81; c4 -= 0x30;
  237. if (c < 4) { /* U+0080 - U+FFFF */
  238. lseq = ((Py_UCS4)c * 10 + c2) * 1260 +
  239. (Py_UCS4)c3 * 10 + c4;
  240. if (lseq < 39420) {
  241. for (utr = gb18030_to_unibmp_ranges;
  242. lseq >= (utr + 1)->base;
  243. utr++) ;
  244. OUTCHAR(utr->first - utr->base + lseq);
  245. NEXT_IN(4);
  246. continue;
  247. }
  248. }
  249. else if (c >= 15) { /* U+10000 - U+10FFFF */
  250. lseq = 0x10000 + (((Py_UCS4)c-15) * 10 + c2)
  251. * 1260 + (Py_UCS4)c3 * 10 + c4;
  252. if (lseq <= 0x10FFFF) {
  253. OUTCHAR(lseq);
  254. NEXT_IN(4);
  255. continue;
  256. }
  257. }
  258. return 1;
  259. }
  260. GBK_DECODE(c, c2, writer)
  261. else if (TRYMAP_DEC(gb18030ext, decoded, c, c2))
  262. OUTCHAR(decoded);
  263. else
  264. return 1;
  265. NEXT_IN(2);
  266. }
  267. return 0;
  268. }
  269. /*
  270. * HZ codec
  271. */
  272. ENCODER_INIT(hz)
  273. {
  274. state->c[CN_STATE_OFFSET] = 0;
  275. return 0;
  276. }
  277. ENCODER_RESET(hz)
  278. {
  279. if (state->c[CN_STATE_OFFSET] != 0) {
  280. WRITEBYTE2('~', '}');
  281. state->c[CN_STATE_OFFSET] = 0;
  282. NEXT_OUT(2);
  283. }
  284. return 0;
  285. }
  286. ENCODER(hz)
  287. {
  288. while (*inpos < inlen) {
  289. Py_UCS4 c = INCHAR1;
  290. DBCHAR code;
  291. if (c < 0x80) {
  292. if (state->c[CN_STATE_OFFSET]) {
  293. WRITEBYTE2('~', '}');
  294. NEXT_OUT(2);
  295. state->c[CN_STATE_OFFSET] = 0;
  296. }
  297. WRITEBYTE1((unsigned char)c);
  298. NEXT(1, 1);
  299. if (c == '~') {
  300. WRITEBYTE1('~');
  301. NEXT_OUT(1);
  302. }
  303. continue;
  304. }
  305. if (c > 0xFFFF)
  306. return 1;
  307. if (TRYMAP_ENC(gbcommon, code, c))
  308. ;
  309. else
  310. return 1;
  311. if (code & 0x8000) /* MSB set: GBK */
  312. return 1;
  313. if (state->c[CN_STATE_OFFSET] == 0) {
  314. WRITEBYTE4('~', '{', code >> 8, code & 0xff);
  315. NEXT(1, 4);
  316. state->c[CN_STATE_OFFSET] = 1;
  317. }
  318. else {
  319. WRITEBYTE2(code >> 8, code & 0xff);
  320. NEXT(1, 2);
  321. }
  322. }
  323. return 0;
  324. }
  325. DECODER_INIT(hz)
  326. {
  327. state->c[CN_STATE_OFFSET] = 0;
  328. return 0;
  329. }
  330. DECODER_RESET(hz)
  331. {
  332. state->c[CN_STATE_OFFSET] = 0;
  333. return 0;
  334. }
  335. DECODER(hz)
  336. {
  337. while (inleft > 0) {
  338. unsigned char c = INBYTE1;
  339. Py_UCS4 decoded;
  340. if (c == '~') {
  341. unsigned char c2 = INBYTE2;
  342. REQUIRE_INBUF(2);
  343. if (c2 == '~' && state->c[CN_STATE_OFFSET] == 0)
  344. OUTCHAR('~');
  345. else if (c2 == '{' && state->c[CN_STATE_OFFSET] == 0)
  346. state->c[CN_STATE_OFFSET] = 1; /* set GB */
  347. else if (c2 == '\n' && state->c[CN_STATE_OFFSET] == 0)
  348. ; /* line-continuation */
  349. else if (c2 == '}' && state->c[CN_STATE_OFFSET] == 1)
  350. state->c[CN_STATE_OFFSET] = 0; /* set ASCII */
  351. else
  352. return 1;
  353. NEXT_IN(2);
  354. continue;
  355. }
  356. if (c & 0x80)
  357. return 1;
  358. if (state->c[CN_STATE_OFFSET] == 0) { /* ASCII mode */
  359. OUTCHAR(c);
  360. NEXT_IN(1);
  361. }
  362. else { /* GB mode */
  363. REQUIRE_INBUF(2);
  364. if (TRYMAP_DEC(gb2312, decoded, c, INBYTE2)) {
  365. OUTCHAR(decoded);
  366. NEXT_IN(2);
  367. }
  368. else
  369. return 1;
  370. }
  371. }
  372. return 0;
  373. }
  374. BEGIN_MAPPINGS_LIST(4)
  375. MAPPING_DECONLY(gb2312)
  376. MAPPING_DECONLY(gbkext)
  377. MAPPING_ENCONLY(gbcommon)
  378. MAPPING_ENCDEC(gb18030ext)
  379. END_MAPPINGS_LIST
  380. BEGIN_CODECS_LIST(4)
  381. CODEC_STATELESS(gb2312)
  382. CODEC_STATELESS(gbk)
  383. CODEC_STATELESS(gb18030)
  384. CODEC_STATEFUL(hz)
  385. END_CODECS_LIST
  386. I_AM_A_MODULE_FOR(cn)