_codecs_hk.c 5.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190
  /*
   * _codecs_hk.c: Codecs collection for encodings from Hong Kong
   *
   * Written by Hye-Shik Chang <perky@FreeBSD.org>
   */
  /*
   * Configuration macros that must be defined ahead of the cjkcodecs.h
   * include that follows.
   *
   * CJK_MOD_SPECIFIC_STATE declares the two Big5 map pointers that
   * CODEC_INIT(big5hkscs) fills in through codec->modstate.
   * NOTE(review): presumably cjkcodecs.h splices these members into
   * cjkcodecs_module_state and uses USING_IMPORTED_MAPS to enable
   * IMPORT_MAP -- confirm against the header.
   */
  #define USING_IMPORTED_MAPS

  #define CJK_MOD_SPECIFIC_STATE              \
      const encode_map *big5_encmap;          \
      const decode_map *big5_decmap;
  10. #include "cjkcodecs.h"
  11. #include "mappings_hk.h"
  /*
   * BIG5HKSCS codec
   */
  15. CODEC_INIT(big5hkscs)
  16. {
  17. cjkcodecs_module_state *st = codec->modstate;
  18. if (IMPORT_MAP(tw, big5, &st->big5_encmap, &st->big5_decmap)) {
  19. return -1;
  20. }
  21. return 0;
  22. }
  23. /*
  24. * There are four possible pair unicode -> big5hkscs maps as in HKSCS 2004:
  25. * U+00CA U+0304 -> 8862 (U+00CA alone is mapped to 8866)
  26. * U+00CA U+030C -> 8864
  27. * U+00EA U+0304 -> 88a3 (U+00EA alone is mapped to 88a7)
  28. * U+00EA U+030C -> 88a5
  29. * These are handled by not mapping tables but a hand-written code.
  30. */
  31. static const DBCHAR big5hkscs_pairenc_table[4] = {0x8862, 0x8864, 0x88a3, 0x88a5};
  32. ENCODER(big5hkscs)
  33. {
  34. while (*inpos < inlen) {
  35. Py_UCS4 c = INCHAR1;
  36. DBCHAR code;
  37. Py_ssize_t insize;
  38. if (c < 0x80) {
  39. REQUIRE_OUTBUF(1);
  40. **outbuf = (unsigned char)c;
  41. NEXT(1, 1);
  42. continue;
  43. }
  44. insize = 1;
  45. REQUIRE_OUTBUF(2);
  46. if (c < 0x10000) {
  47. if (TRYMAP_ENC(big5hkscs_bmp, code, c)) {
  48. if (code == MULTIC) {
  49. Py_UCS4 c2;
  50. if (inlen - *inpos >= 2)
  51. c2 = INCHAR2;
  52. else
  53. c2 = 0;
  54. if (inlen - *inpos >= 2 &&
  55. ((c & 0xffdf) == 0x00ca) &&
  56. ((c2 & 0xfff7) == 0x0304)) {
  57. code = big5hkscs_pairenc_table[
  58. ((c >> 4) |
  59. (c2 >> 3)) & 3];
  60. insize = 2;
  61. }
  62. else if (inlen - *inpos < 2 &&
  63. !(flags & MBENC_FLUSH))
  64. return MBERR_TOOFEW;
  65. else {
  66. if (c == 0xca)
  67. code = 0x8866;
  68. else /* c == 0xea */
  69. code = 0x88a7;
  70. }
  71. }
  72. }
  73. else if (TRYMAP_ENC_ST(big5, code, c))
  74. ;
  75. else
  76. return 1;
  77. }
  78. else if (c < 0x20000)
  79. return insize;
  80. else if (c < 0x30000) {
  81. if (TRYMAP_ENC(big5hkscs_nonbmp, code, c & 0xffff))
  82. ;
  83. else
  84. return insize;
  85. }
  86. else
  87. return insize;
  88. OUTBYTE1(code >> 8);
  89. OUTBYTE2(code & 0xFF);
  90. NEXT(insize, 2);
  91. }
  92. return 0;
  93. }
  94. #define BH2S(c1, c2) (((c1) - 0x87) * (0xfe - 0x40 + 1) + ((c2) - 0x40))
  95. DECODER(big5hkscs)
  96. {
  97. while (inleft > 0) {
  98. unsigned char c = INBYTE1;
  99. Py_UCS4 decoded;
  100. if (c < 0x80) {
  101. OUTCHAR(c);
  102. NEXT_IN(1);
  103. continue;
  104. }
  105. REQUIRE_INBUF(2);
  106. if (0xc6 > c || c > 0xc8 || (c < 0xc7 && INBYTE2 < 0xa1)) {
  107. if (TRYMAP_DEC_ST(big5, decoded, c, INBYTE2)) {
  108. OUTCHAR(decoded);
  109. NEXT_IN(2);
  110. continue;
  111. }
  112. }
  113. if (TRYMAP_DEC(big5hkscs, decoded, c, INBYTE2))
  114. {
  115. int s = BH2S(c, INBYTE2);
  116. const unsigned char *hintbase;
  117. assert(0x87 <= c && c <= 0xfe);
  118. assert(0x40 <= INBYTE2 && INBYTE2 <= 0xfe);
  119. if (BH2S(0x87, 0x40) <= s && s <= BH2S(0xa0, 0xfe)) {
  120. hintbase = big5hkscs_phint_0;
  121. s -= BH2S(0x87, 0x40);
  122. }
  123. else if (BH2S(0xc6,0xa1) <= s && s <= BH2S(0xc8,0xfe)){
  124. hintbase = big5hkscs_phint_12130;
  125. s -= BH2S(0xc6, 0xa1);
  126. }
  127. else if (BH2S(0xf9,0xd6) <= s && s <= BH2S(0xfe,0xfe)){
  128. hintbase = big5hkscs_phint_21924;
  129. s -= BH2S(0xf9, 0xd6);
  130. }
  131. else
  132. return MBERR_INTERNAL;
  133. if (hintbase[s >> 3] & (1 << (s & 7))) {
  134. OUTCHAR(decoded | 0x20000);
  135. NEXT_IN(2);
  136. }
  137. else {
  138. OUTCHAR(decoded);
  139. NEXT_IN(2);
  140. }
  141. continue;
  142. }
  143. switch ((c << 8) | INBYTE2) {
  144. case 0x8862: OUTCHAR2(0x00ca, 0x0304); break;
  145. case 0x8864: OUTCHAR2(0x00ca, 0x030c); break;
  146. case 0x88a3: OUTCHAR2(0x00ea, 0x0304); break;
  147. case 0x88a5: OUTCHAR2(0x00ea, 0x030c); break;
  148. default: return 1;
  149. }
  150. NEXT_IN(2); /* all decoded code points are pairs, above. */
  151. }
  152. return 0;
  153. }
  154. BEGIN_MAPPINGS_LIST(3)
  155. MAPPING_DECONLY(big5hkscs)
  156. MAPPING_ENCONLY(big5hkscs_bmp)
  157. MAPPING_ENCONLY(big5hkscs_nonbmp)
  158. END_MAPPINGS_LIST
  159. BEGIN_CODECS_LIST(1)
  160. CODEC_STATELESS_WINIT(big5hkscs)
  161. END_CODECS_LIST
  162. I_AM_A_MODULE_FOR(hk)