johab_hangul.h 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262
  1. /*
  2. * Copyright (C) 1999-2001 Free Software Foundation, Inc.
  3. * This file is part of the GNU LIBICONV Library.
  4. *
  5. * The GNU LIBICONV Library is free software; you can redistribute it
  6. * and/or modify it under the terms of the GNU Library General Public
  7. * License as published by the Free Software Foundation; either version 2
  8. * of the License, or (at your option) any later version.
  9. *
  10. * The GNU LIBICONV Library is distributed in the hope that it will be
  11. * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  13. * Library General Public License for more details.
  14. *
  15. * You should have received a copy of the GNU Library General Public
  16. * License along with the GNU LIBICONV Library; see the file COPYING.LIB.
  17. * If not, write to the Free Software Foundation, Inc., 51 Franklin Street,
  18. * Fifth Floor, Boston, MA 02110-1301, USA.
  19. */
  20. /*
  21. * JOHAB Hangul
  22. *
  23. * Ken Lunde writes in his "CJKV Information Processing" book, p. 114:
  24. * "Hangul can be composed of two or three jamo (some jamo are considered
  25. * compound). Johab uses 19 initial jamo (consonants), 21 medial jamo (vowels)
  26. * and 27 final jamo (consonants; 28 when you include the "fill" character
  27. * for Hangul containing only two jamo). Multiplying these numbers results in
  28. * 11172."
  29. *
  30. * Structure of the Johab encoding (see p. 181-184):
  31. * bit 15 = 1
  32. * bit 14..10 = initial jamo, only 19+1 out of 32 possible values are used
  33. * bit 9..5 = medial jamo, only 21+1 out of 32 possible values are used
  34. * bit 4..0 = final jamo, only 27+1 out of 32 possible values are used
  35. *
  36. * Structure of the Unicode encoding:
  37. * grep '^0x\([8-C]...\|D[0-7]..\)' unicode.org-mappings/EASTASIA/KSC/JOHAB.TXT
  38. * You see that all characters there are marked "HANGUL LETTER" or "HANGUL
  39. * SYLLABLE". If you eliminate the "HANGUL LETTER"s, the table is sorted
  40. * in ascending order according to Johab encoding and according to the Unicode
  41. * encoding. Now look a little more carefully, and you see that the following
  42. * formula holds:
  43. * unicode == 0xAC00
  44. * + 21 * 28 * (jamo_initial_index[(johab >> 10) & 31] - 1)
  45. * + 28 * (jamo_medial_index[(johab >> 5) & 31] - 1)
  46. * + jamo_final_index[johab & 31]
  47. * where the index tables are defined as below.
  48. */
  /* Tables mapping 5-bit groups to jamo letters. */
  /* Note that Jamo XX = UHC 0xA4A0+XX = Unicode 0x3130+XX */

  /* Sentinel entries of the jamo letter tables.  Both lie outside the
     range 0x01..0x33 of real jamo letters stored in those tables. */
  #define NONE 0xfd   /* the 5-bit value is not assigned */
  #define FILL 0xff   /* the 5-bit value is the "fill" code (no jamo here) */
  53. static const unsigned char jamo_initial[32] = {
  54. NONE, FILL, 0x01, 0x02, 0x04, 0x07, 0x08, 0x09,
  55. 0x11, 0x12, 0x13, 0x15, 0x16, 0x17, 0x18, 0x19,
  56. 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, NONE, NONE, NONE,
  57. NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE,
  58. };
  59. static const unsigned char jamo_medial[32] = {
  60. NONE, NONE, FILL, 0x1f, 0x20, 0x21, 0x22, 0x23,
  61. NONE, NONE, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29,
  62. NONE, NONE, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
  63. NONE, NONE, 0x30, 0x31, 0x32, 0x33, NONE, NONE,
  64. };
  65. static const unsigned char jamo_final[32] = {
  66. NONE, FILL, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
  67. 0x07, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  68. 0x10, 0x11, NONE, 0x12, 0x14, 0x15, 0x16, 0x17,
  69. 0x18, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, NONE, NONE,
  70. };
  71. /* Same as jamo_final, except that it excludes characters already
  72. contained in jamo_initial. 11 characters instead of 27. */
  73. static const unsigned char jamo_final_notinitial[32] = {
  74. NONE, NONE, NONE, NONE, 0x03, NONE, 0x05, 0x06,
  75. NONE, NONE, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  76. 0x10, NONE, NONE, NONE, 0x14, NONE, NONE, NONE,
  77. NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE,
  78. };
  /* Tables mapping 5-bit groups to packed indices. */

  /* Sentinel entries of the packed index tables. */
  #define none (-1)   /* the 5-bit value is not assigned */
  #define fill 0      /* the 5-bit value is the "fill" code */
  82. static const signed char jamo_initial_index[32] = {
  83. none, fill, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
  84. 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
  85. 0x0f, 0x10, 0x11, 0x12, 0x13, none, none, none,
  86. none, none, none, none, none, none, none, none,
  87. };
  88. static const signed char jamo_medial_index[32] = {
  89. none, none, fill, 0x01, 0x02, 0x03, 0x04, 0x05,
  90. none, none, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b,
  91. none, none, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11,
  92. none, none, 0x12, 0x13, 0x14, 0x15, none, none,
  93. };
  94. static const signed char jamo_final_index[32] = {
  95. none, fill, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
  96. 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
  97. 0x0f, 0x10, none, 0x11, 0x12, 0x13, 0x14, 0x15,
  98. 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, none, none,
  99. };
  100. static int
  101. johab_hangul_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, int n)
  102. {
  103. unsigned char c1 = s[0];
  104. if ((c1 >= 0x84 && c1 <= 0xd3)) {
  105. if (n >= 2) {
  106. unsigned char c2 = s[1];
  107. if ((c2 >= 0x41 && c2 < 0x7f) || (c2 >= 0x81 && c2 < 0xff)) {
  108. unsigned int johab = (c1 << 8) | c2;
  109. unsigned int bitspart1 = (johab >> 10) & 31;
  110. unsigned int bitspart2 = (johab >> 5) & 31;
  111. unsigned int bitspart3 = johab & 31;
  112. int index1 = jamo_initial_index[bitspart1];
  113. int index2 = jamo_medial_index[bitspart2];
  114. int index3 = jamo_final_index[bitspart3];
  115. /* Exclude "none" values. */
  116. if (index1 >= 0 && index2 >= 0 && index3 >= 0) {
  117. /* Deal with "fill" values in initial or medial position. */
  118. if (index1 == fill) {
  119. if (index2 == fill) {
  120. unsigned char jamo3 = jamo_final_notinitial[bitspart3];
  121. if (jamo3 != NONE) {
  122. *pwc = (ucs4_t) 0x3130 + jamo3;
  123. return 2;
  124. }
  125. } else if (index3 == fill) {
  126. unsigned char jamo2 = jamo_medial[bitspart2];
  127. if (jamo2 != NONE && jamo2 != FILL) {
  128. *pwc = (ucs4_t) 0x3130 + jamo2;
  129. return 2;
  130. }
  131. }
  132. /* Syllables composed only of medial and final don't exist. */
  133. } else if (index2 == fill) {
  134. if (index3 == fill) {
  135. unsigned char jamo1 = jamo_initial[bitspart1];
  136. if (jamo1 != NONE && jamo1 != FILL) {
  137. *pwc = (ucs4_t) 0x3130 + jamo1;
  138. return 2;
  139. }
  140. }
  141. /* Syllables composed only of initial and final don't exist. */
  142. } else {
  143. /* index1 and index2 are not fill, but index3 may be fill. */
  144. /* Nothing more to exclude. All 11172 code points are valid. */
  145. *pwc = 0xac00 + ((index1 - 1) * 21 + (index2 - 1)) * 28 + index3;
  146. return 2;
  147. }
  148. }
  149. }
  150. return RET_ILSEQ;
  151. }
  152. return RET_TOOFEW(0);
  153. }
  154. return RET_ILSEQ;
  155. }
  /* Johab codes of the 51 jamo: 19 initial, 21 medial, 11 final not initial.
     Entry i is the code for Unicode 0x3131+i. */
  static const unsigned short johab_hangul_page31[51] = {
    /* 0x3131..0x3137 */
    0x8841, 0x8c41, 0x8444, 0x9041, 0x8446, 0x8447, 0x9441,
    /* 0x3138..0x313f */
    0x9841, 0x9c41, 0x844a, 0x844b, 0x844c, 0x844d, 0x844e, 0x844f,
    /* 0x3140..0x3147 */
    0x8450, 0xa041, 0xa441, 0xa841, 0x8454, 0xac41, 0xb041, 0xb441,
    /* 0x3148..0x314f */
    0xb841, 0xbc41, 0xc041, 0xc441, 0xc841, 0xcc41, 0xd041, 0x8461,
    /* 0x3150..0x3157 */
    0x8481, 0x84a1, 0x84c1, 0x84e1, 0x8541, 0x8561, 0x8581, 0x85a1,
    /* 0x3158..0x315f */
    0x85c1, 0x85e1, 0x8641, 0x8661, 0x8681, 0x86a1, 0x86c1, 0x86e1,
    /* 0x3160..0x3163 */
    0x8741, 0x8761, 0x8781, 0x87a1,
  };
  /* Tables mapping packed indices to 5-bit groups. */

  /* Inverse of jamo_initial_index for the 19 initial consonants:
       index1+1 = jamo_initial_index[bitspart1]  <==>
       bitspart1 = jamo_initial_index_inverse[index1] */
  static const char jamo_initial_index_inverse[19] = {
    /*  0.. 7 */ 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09,
    /*  8..15 */ 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11,
    /* 16..18 */ 0x12, 0x13, 0x14,
  };
  /* Inverse of jamo_medial_index for the 21 vowels:
       index2+1 = jamo_medial_index[bitspart2]  <==>
       bitspart2 = jamo_medial_index_inverse[index2]
     Each row is one run of consecutive 5-bit values. */
  static const char jamo_medial_index_inverse[21] = {
    /*  0.. 4 */ 0x03, 0x04, 0x05, 0x06, 0x07,
    /*  5..10 */ 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    /* 11..16 */ 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
    /* 17..20 */ 0x1a, 0x1b, 0x1c, 0x1d,
  };
  /* Inverse of jamo_final_index, fill included (entry 0):
       index3 = jamo_final_index[bitspart3]  <==>
       bitspart3 = jamo_final_index_inverse[index3]
     The 5-bit value 0x12 is skipped because it is unassigned. */
  static const char jamo_final_index_inverse[28] = {
    /*  0.. 7 */ 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
    /*  8..15 */ 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10,
    /* 16..23 */ 0x11, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
    /* 24..27 */ 0x1a, 0x1b, 0x1c, 0x1d,
  };
  190. static int
  191. johab_hangul_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n)
  192. {
  193. if (n >= 2) {
  194. if (wc >= 0x3131 && wc < 0x3164) {
  195. unsigned short c = johab_hangul_page31[wc-0x3131];
  196. r[0] = (c >> 8); r[1] = (c & 0xff);
  197. return 2;
  198. } else if (wc >= 0xac00 && wc < 0xd7a4) {
  199. unsigned int index1;
  200. unsigned int index2;
  201. unsigned int index3;
  202. unsigned short c;
  203. unsigned int tmp = wc - 0xac00;
  204. index3 = tmp % 28; tmp = tmp / 28;
  205. index2 = tmp % 21; tmp = tmp / 21;
  206. index1 = tmp;
  207. c = (((((1 << 5)
  208. | jamo_initial_index_inverse[index1]) << 5)
  209. | jamo_medial_index_inverse[index2]) << 5)
  210. | jamo_final_index_inverse[index3];
  211. r[0] = (c >> 8); r[1] = (c & 0xff);
  212. return 2;
  213. }
  214. return RET_ILUNI;
  215. }
  216. return RET_TOOSMALL;
  217. }
  218. /*
  219. * Decomposition of JOHAB Hangul in one to three Johab Jamo elements.
  220. */
  221. /* Decompose wc into r[0..2], and return the number of resulting Jamo elements.
  222. Return RET_ILUNI if decomposition is not possible. */
  223. static int johab_hangul_decompose (conv_t conv, ucs4_t* r, ucs4_t wc)
  224. {
  225. unsigned char buf[2];
  226. int ret = johab_hangul_wctomb(conv,buf,wc,2);
  227. if (ret != RET_ILUNI) {
  228. unsigned int hangul = (buf[0] << 8) | buf[1];
  229. unsigned char jamo1 = jamo_initial[(hangul >> 10) & 31];
  230. unsigned char jamo2 = jamo_medial[(hangul >> 5) & 31];
  231. unsigned char jamo3 = jamo_final[hangul & 31];
  232. if ((hangul >> 15) != 1) abort();
  233. if (jamo1 != NONE && jamo2 != NONE && jamo3 != NONE) {
  234. /* They are not all three == FILL because that would correspond to
  235. johab = 0x8441, which doesn't exist. */
  236. ucs4_t* p = r;
  237. if (jamo1 != FILL)
  238. *p++ = 0x3130 + jamo1;
  239. if (jamo2 != FILL)
  240. *p++ = 0x3130 + jamo2;
  241. if (jamo3 != FILL)
  242. *p++ = 0x3130 + jamo3;
  243. return p-r;
  244. }
  245. }
  246. return RET_ILUNI;
  247. }
  /* Drop the table sentinels again so that they cannot leak into whatever
     is compiled after this file. */
  #undef NONE
  #undef FILL
  #undef none
  #undef fill