iso2022_jp3.h 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538
  1. /*
  2. * Copyright (C) 1999-2004, 2008 Free Software Foundation, Inc.
  3. * This file is part of the GNU LIBICONV Library.
  4. *
  5. * The GNU LIBICONV Library is free software; you can redistribute it
  6. * and/or modify it under the terms of the GNU Library General Public
  7. * License as published by the Free Software Foundation; either version 2
  8. * of the License, or (at your option) any later version.
  9. *
  10. * The GNU LIBICONV Library is distributed in the hope that it will be
  11. * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  13. * Library General Public License for more details.
  14. *
  15. * You should have received a copy of the GNU Library General Public
  16. * License along with the GNU LIBICONV Library; see the file COPYING.LIB.
  17. * If not, write to the Free Software Foundation, Inc., 51 Franklin Street,
  18. * Fifth Floor, Boston, MA 02110-1301, USA.
  19. */
  20. /*
  21. * ISO-2022-JP-3
  22. */
  23. #include "jisx0213.h"
  /* The escape character that starts every designation sequence below. */
  #define ESC 0x1b

  /*
   * Shift states. The state is exactly one of these values; the comment next
   * to each value shows the escape sequence(s) that select it.
   */
  #define STATE_ASCII             0  /* Esc ( B */
  #define STATE_JISX0201ROMAN     1  /* Esc ( J */
  #define STATE_JISX0201KATAKANA  2  /* Esc ( I */
  #define STATE_JISX0208          3  /* Esc $ @ or Esc $ B */
  #define STATE_JISX02131         4  /* Esc $ ( O or Esc $ ( Q */
  #define STATE_JISX02132         5  /* Esc $ ( P */
  34. /*
  35. * In the ISO-2022-JP-3 to UCS-4 direction, the state also holds the last
  36. * character to be output, shifted by 3 bits.
  37. */
  38. static int
  39. iso2022_jp3_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, int n)
  40. {
  41. ucs4_t last_wc = conv->istate >> 3;
  42. if (last_wc) {
  43. /* Output the buffered character. */
  44. conv->istate &= 7;
  45. *pwc = last_wc;
  46. return 0; /* Don't advance the input pointer. */
  47. } else {
  48. state_t state = conv->istate;
  49. int count = 0;
  50. unsigned char c;
  51. for (;;) {
  52. c = *s;
  53. if (c == ESC) {
  54. if (n < count+3)
  55. goto none;
  56. if (s[1] == '(') {
  57. if (s[2] == 'B') {
  58. state = STATE_ASCII;
  59. s += 3; count += 3;
  60. if (n < count+1)
  61. goto none;
  62. continue;
  63. }
  64. if (s[2] == 'J') {
  65. state = STATE_JISX0201ROMAN;
  66. s += 3; count += 3;
  67. if (n < count+1)
  68. goto none;
  69. continue;
  70. }
  71. if (s[2] == 'I') {
  72. state = STATE_JISX0201KATAKANA;
  73. s += 3; count += 3;
  74. if (n < count+1)
  75. goto none;
  76. continue;
  77. }
  78. goto ilseq;
  79. }
  80. if (s[1] == '$') {
  81. if (s[2] == '@' || s[2] == 'B') {
  82. /* We don't distinguish JIS X 0208-1978 and JIS X 0208-1983. */
  83. state = STATE_JISX0208;
  84. s += 3; count += 3;
  85. if (n < count+1)
  86. goto none;
  87. continue;
  88. }
  89. if (s[2] == '(') {
  90. if (n < count+4)
  91. goto none;
  92. if (s[3] == 'O' || s[3] == 'Q') {
  93. state = STATE_JISX02131;
  94. s += 4; count += 4;
  95. if (n < count+1)
  96. goto none;
  97. continue;
  98. }
  99. if (s[3] == 'P') {
  100. state = STATE_JISX02132;
  101. s += 4; count += 4;
  102. if (n < count+1)
  103. goto none;
  104. continue;
  105. }
  106. }
  107. goto ilseq;
  108. }
  109. goto ilseq;
  110. }
  111. break;
  112. }
  113. switch (state) {
  114. case STATE_ASCII:
  115. if (c < 0x80) {
  116. int ret = ascii_mbtowc(conv,pwc,s,1);
  117. if (ret == RET_ILSEQ)
  118. goto ilseq;
  119. if (ret != 1) abort();
  120. conv->istate = state;
  121. return count+1;
  122. } else
  123. goto ilseq;
  124. case STATE_JISX0201ROMAN:
  125. if (c < 0x80) {
  126. int ret = jisx0201_mbtowc(conv,pwc,s,1);
  127. if (ret == RET_ILSEQ)
  128. goto ilseq;
  129. if (ret != 1) abort();
  130. conv->istate = state;
  131. return count+1;
  132. } else
  133. goto ilseq;
  134. case STATE_JISX0201KATAKANA:
  135. if (c < 0x80) {
  136. unsigned char buf = c+0x80;
  137. int ret = jisx0201_mbtowc(conv,pwc,&buf,1);
  138. if (ret == RET_ILSEQ)
  139. goto ilseq;
  140. if (ret != 1) abort();
  141. conv->istate = state;
  142. return count+1;
  143. } else
  144. goto ilseq;
  145. case STATE_JISX0208:
  146. if (n < count+2)
  147. goto none;
  148. if (s[0] < 0x80 && s[1] < 0x80) {
  149. int ret = jisx0208_mbtowc(conv,pwc,s,2);
  150. if (ret == RET_ILSEQ)
  151. goto ilseq;
  152. if (ret != 2) abort();
  153. conv->istate = state;
  154. return count+2;
  155. } else
  156. goto ilseq;
  157. case STATE_JISX02131:
  158. case STATE_JISX02132:
  159. if (n < count+2)
  160. goto none;
  161. if (s[0] < 0x80 && s[1] < 0x80) {
  162. ucs4_t wc = jisx0213_to_ucs4(((state-STATE_JISX02131+1)<<8)+s[0],s[1]);
  163. if (wc) {
  164. if (wc < 0x80) {
  165. /* It's a combining character. */
  166. ucs4_t wc1 = jisx0213_to_ucs_combining[wc - 1][0];
  167. ucs4_t wc2 = jisx0213_to_ucs_combining[wc - 1][1];
  168. /* We cannot output two Unicode characters at once. So,
  169. output the first character and buffer the second one. */
  170. *pwc = wc1;
  171. conv->istate = (wc2 << 3) | state;
  172. } else {
  173. *pwc = wc;
  174. conv->istate = state;
  175. }
  176. return count+2;
  177. }
  178. }
  179. goto ilseq;
  180. default: abort();
  181. }
  182. none:
  183. conv->istate = state;
  184. return RET_TOOFEW(count);
  185. ilseq:
  186. conv->istate = state;
  187. return RET_SHIFT_ILSEQ(count);
  188. }
  189. }
  190. static int
  191. iso2022_jp3_flushwc (conv_t conv, ucs4_t *pwc)
  192. {
  193. ucs4_t last_wc = conv->istate >> 3;
  194. if (last_wc) {
  195. /* Output the buffered character. */
  196. conv->istate &= 7;
  197. *pwc = last_wc;
  198. return 1;
  199. } else
  200. return 0;
  201. }
  202. /*
  203. * In the UCS-4 to ISO-2022-JP-3 direction, the state also holds the last two
  204. * bytes to be output, shifted by 3 bits, and the STATE_xxxxx value that was
  205. * effective before this buffered character, shifted by 19 bits.
  206. */
  /*
   * Composition tables for each of the relevant combining characters.
   *
   * All tables live in one array. For the combining character U+xxxx, the
   * macros iso2022_jp3_comp_tablexxxx_idx and iso2022_jp3_comp_tablexxxx_len
   * give the first index and the number of entries of its section. Each index
   * is derived from the previous section's index and length, so the sections
   * must stay in the order used here.
   */
  #define iso2022_jp3_comp_table02e5_idx 0
  #define iso2022_jp3_comp_table02e5_len 1
  #define iso2022_jp3_comp_table02e9_idx (iso2022_jp3_comp_table02e5_idx+iso2022_jp3_comp_table02e5_len)
  #define iso2022_jp3_comp_table02e9_len 1
  #define iso2022_jp3_comp_table0300_idx (iso2022_jp3_comp_table02e9_idx+iso2022_jp3_comp_table02e9_len)
  #define iso2022_jp3_comp_table0300_len 5
  #define iso2022_jp3_comp_table0301_idx (iso2022_jp3_comp_table0300_idx+iso2022_jp3_comp_table0300_len)
  #define iso2022_jp3_comp_table0301_len 4
  #define iso2022_jp3_comp_table309a_idx (iso2022_jp3_comp_table0301_idx+iso2022_jp3_comp_table0301_len)
  #define iso2022_jp3_comp_table309a_len 14

  /* Each entry pairs a base code with the code of the composed character;
     the trailing comments read "base + combining character -> composed". */
  static const struct {
    unsigned short base;
    unsigned short composed;
  } iso2022_jp3_comp_table_data[] = {
    /* U+02E5 */
    { 0x2b64, 0x2b65 }, /* 0x12B64 + U+02E5 -> 0x12B65 */
    /* U+02E9 */
    { 0x2b60, 0x2b66 }, /* 0x12B60 + U+02E9 -> 0x12B66 */
    /* U+0300 */
    { 0x295c, 0x2b44 }, /* 0x1295C + U+0300 -> 0x12B44 */
    { 0x2b38, 0x2b48 }, /* 0x12B38 + U+0300 -> 0x12B48 */
    { 0x2b37, 0x2b4a }, /* 0x12B37 + U+0300 -> 0x12B4A */
    { 0x2b30, 0x2b4c }, /* 0x12B30 + U+0300 -> 0x12B4C */
    { 0x2b43, 0x2b4e }, /* 0x12B43 + U+0300 -> 0x12B4E */
    /* U+0301 */
    { 0x2b38, 0x2b49 }, /* 0x12B38 + U+0301 -> 0x12B49 */
    { 0x2b37, 0x2b4b }, /* 0x12B37 + U+0301 -> 0x12B4B */
    { 0x2b30, 0x2b4d }, /* 0x12B30 + U+0301 -> 0x12B4D */
    { 0x2b43, 0x2b4f }, /* 0x12B43 + U+0301 -> 0x12B4F */
    /* U+309A */
    { 0x242b, 0x2477 }, /* 0x1242B + U+309A -> 0x12477 */
    { 0x242d, 0x2478 }, /* 0x1242D + U+309A -> 0x12478 */
    { 0x242f, 0x2479 }, /* 0x1242F + U+309A -> 0x12479 */
    { 0x2431, 0x247a }, /* 0x12431 + U+309A -> 0x1247A */
    { 0x2433, 0x247b }, /* 0x12433 + U+309A -> 0x1247B */
    { 0x252b, 0x2577 }, /* 0x1252B + U+309A -> 0x12577 */
    { 0x252d, 0x2578 }, /* 0x1252D + U+309A -> 0x12578 */
    { 0x252f, 0x2579 }, /* 0x1252F + U+309A -> 0x12579 */
    { 0x2531, 0x257a }, /* 0x12531 + U+309A -> 0x1257A */
    { 0x2533, 0x257b }, /* 0x12533 + U+309A -> 0x1257B */
    { 0x253b, 0x257c }, /* 0x1253B + U+309A -> 0x1257C */
    { 0x2544, 0x257d }, /* 0x12544 + U+309A -> 0x1257D */
    { 0x2548, 0x257e }, /* 0x12548 + U+309A -> 0x1257E */
    { 0x2675, 0x2678 }, /* 0x12675 + U+309A -> 0x12678 */
  };
  /*
   * Helpers for the packed output state. They operate on a local variable
   * named 'state' that must hold the packed value when SPLIT_STATE is used.
   *
   * SPLIT_STATE declares 'lasttwo' (the value stored from bit 3 upwards,
   * truncated to an unsigned short) and 'prevstate' (the value from bit 19
   * upwards), then reduces 'state' to its low 3 bits. Because it declares
   * variables, it has to be placed among the declarations of a block.
   */
  #define SPLIT_STATE \
    unsigned short lasttwo = state >> 3; \
    state_t prevstate = state >> 19; \
    state &= 7

  /* COMBINE_STATE packs 'lasttwo' and 'prevstate' back into 'state'. */
  #define COMBINE_STATE \
    state |= (lasttwo << 3) | (prevstate << 19)

  /* COMBINE_STATE_NO_LASTTWO expands to nothing: it marks the places where
     'state' is stored without a buffered character. */
  #define COMBINE_STATE_NO_LASTTWO \
    /* assume lasttwo == 0, then prevstate is ignored */
  251. static int
  252. iso2022_jp3_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n)
  253. {
  254. int count = 0;
  255. unsigned char buf[2];
  256. unsigned short jch;
  257. int ret;
  258. state_t state = conv->ostate;
  259. SPLIT_STATE;
  260. if (lasttwo) {
  261. /* Attempt to combine the last character with this one. */
  262. unsigned int idx;
  263. unsigned int len;
  264. if (wc == 0x02e5)
  265. idx = iso2022_jp3_comp_table02e5_idx,
  266. len = iso2022_jp3_comp_table02e5_len;
  267. else if (wc == 0x02e9)
  268. idx = iso2022_jp3_comp_table02e9_idx,
  269. len = iso2022_jp3_comp_table02e9_len;
  270. else if (wc == 0x0300)
  271. idx = iso2022_jp3_comp_table0300_idx,
  272. len = iso2022_jp3_comp_table0300_len;
  273. else if (wc == 0x0301)
  274. idx = iso2022_jp3_comp_table0301_idx,
  275. len = iso2022_jp3_comp_table0301_len;
  276. else if (wc == 0x309a)
  277. idx = iso2022_jp3_comp_table309a_idx,
  278. len = iso2022_jp3_comp_table309a_len;
  279. else
  280. goto not_combining;
  281. do
  282. if (iso2022_jp3_comp_table_data[idx].base == lasttwo)
  283. break;
  284. while (++idx, --len > 0);
  285. if (len > 0) {
  286. /* Output the combined character. */
  287. /* We know the combined character is in JISX0213 plane 1, but
  288. the buffered character may have been in JISX0208 or in
  289. JISX0213 plane 1. */
  290. count = (state != STATE_JISX02131 ? 4 : 0) + 2;
  291. if (n < count)
  292. return RET_TOOSMALL;
  293. if (state != STATE_JISX02131) {
  294. r[0] = ESC;
  295. r[1] = '$';
  296. r[2] = '(';
  297. r[3] = 'Q';
  298. r += 4;
  299. state = STATE_JISX02131;
  300. }
  301. lasttwo = iso2022_jp3_comp_table_data[idx].composed;
  302. r[0] = (lasttwo >> 8) & 0xff;
  303. r[1] = lasttwo & 0xff;
  304. COMBINE_STATE_NO_LASTTWO;
  305. conv->ostate = state;
  306. return count;
  307. }
  308. not_combining:
  309. /* Output the buffered character. */
  310. /* We know it is in JISX0208 or in JISX0213 plane 1. */
  311. count = (prevstate != state ? 3 : 0) + 2;
  312. if (n < count)
  313. return RET_TOOSMALL;
  314. if (prevstate != state) {
  315. if (state != STATE_JISX0208) abort();
  316. r[0] = ESC;
  317. r[1] = '$';
  318. r[2] = 'B';
  319. r += 3;
  320. }
  321. r[0] = (lasttwo >> 8) & 0xff;
  322. r[1] = lasttwo & 0xff;
  323. r += 2;
  324. }
  325. /* Try ASCII. */
  326. ret = ascii_wctomb(conv,buf,wc,1);
  327. if (ret != RET_ILUNI) {
  328. if (ret != 1) abort();
  329. if (buf[0] < 0x80) {
  330. count += (state == STATE_ASCII ? 1 : 4);
  331. if (n < count)
  332. return RET_TOOSMALL;
  333. if (state != STATE_ASCII) {
  334. r[0] = ESC;
  335. r[1] = '(';
  336. r[2] = 'B';
  337. r += 3;
  338. state = STATE_ASCII;
  339. }
  340. r[0] = buf[0];
  341. COMBINE_STATE_NO_LASTTWO;
  342. conv->ostate = state;
  343. return count;
  344. }
  345. }
  346. /* Try JIS X 0201-1976 Roman. */
  347. ret = jisx0201_wctomb(conv,buf,wc,1);
  348. if (ret != RET_ILUNI) {
  349. if (ret != 1) abort();
  350. if (buf[0] < 0x80) {
  351. count += (state == STATE_JISX0201ROMAN ? 1 : 4);
  352. if (n < count)
  353. return RET_TOOSMALL;
  354. if (state != STATE_JISX0201ROMAN) {
  355. r[0] = ESC;
  356. r[1] = '(';
  357. r[2] = 'J';
  358. r += 3;
  359. state = STATE_JISX0201ROMAN;
  360. }
  361. r[0] = buf[0];
  362. COMBINE_STATE_NO_LASTTWO;
  363. conv->ostate = state;
  364. return count;
  365. }
  366. }
  367. jch = ucs4_to_jisx0213(wc);
  368. /* Try JIS X 0208-1990 in place of JIS X 0208-1978 and JIS X 0208-1983. */
  369. ret = jisx0208_wctomb(conv,buf,wc,2);
  370. if (ret != RET_ILUNI) {
  371. if (ret != 2) abort();
  372. if (buf[0] < 0x80 && buf[1] < 0x80) {
  373. if (jch & 0x0080) {
  374. /* A possible match in comp_table_data. Buffer it. */
  375. prevstate = state;
  376. lasttwo = jch & 0x7f7f;
  377. state = STATE_JISX0208;
  378. COMBINE_STATE;
  379. conv->ostate = state;
  380. return count;
  381. } else {
  382. count += (state == STATE_JISX0208 ? 2 : 5);
  383. if (n < count)
  384. return RET_TOOSMALL;
  385. if (state != STATE_JISX0208) {
  386. r[0] = ESC;
  387. r[1] = '$';
  388. r[2] = 'B';
  389. r += 3;
  390. state = STATE_JISX0208;
  391. }
  392. r[0] = buf[0];
  393. r[1] = buf[1];
  394. COMBINE_STATE_NO_LASTTWO;
  395. conv->ostate = state;
  396. return count;
  397. }
  398. }
  399. }
  400. /* Try JISX 0213 plane 1 and JISX 0213 plane 2. */
  401. if (jch != 0) {
  402. if (jch & 0x8000) {
  403. /* JISX 0213 plane 2. */
  404. if (state != STATE_JISX02132) {
  405. count += 4;
  406. if (n < count)
  407. return RET_TOOSMALL;
  408. r[0] = ESC;
  409. r[1] = '$';
  410. r[2] = '(';
  411. r[3] = 'P';
  412. r += 4;
  413. state = STATE_JISX02132;
  414. }
  415. } else {
  416. /* JISX 0213 plane 1. */
  417. if (state != STATE_JISX02131) {
  418. count += 4;
  419. if (n < count)
  420. return RET_TOOSMALL;
  421. r[0] = ESC;
  422. r[1] = '$';
  423. r[2] = '(';
  424. r[3] = 'Q';
  425. r += 4;
  426. state = STATE_JISX02131;
  427. }
  428. }
  429. if (jch & 0x0080) {
  430. /* A possible match in comp_table_data. We have to buffer it. */
  431. /* We know it's a JISX 0213 plane 1 character. */
  432. if (jch & 0x8000) abort();
  433. prevstate = state;
  434. lasttwo = jch & 0x7f7f;
  435. COMBINE_STATE;
  436. conv->ostate = state;
  437. return count;
  438. }
  439. count += 2;
  440. if (n < count)
  441. return RET_TOOSMALL;
  442. r[0] = (jch >> 8) & 0x7f;
  443. r[1] = jch & 0x7f;
  444. COMBINE_STATE_NO_LASTTWO;
  445. conv->ostate = state;
  446. return count;
  447. }
  448. /* Try JIS X 0201-1976 Katakana. This is not officially part of
  449. ISO-2022-JP-3. Therefore we try it after all other attempts. */
  450. ret = jisx0201_wctomb(conv,buf,wc,1);
  451. if (ret != RET_ILUNI) {
  452. if (ret != 1) abort();
  453. if (buf[0] >= 0x80) {
  454. count += (state == STATE_JISX0201KATAKANA ? 1 : 4);
  455. if (n < count)
  456. return RET_TOOSMALL;
  457. if (state != STATE_JISX0201KATAKANA) {
  458. r[0] = ESC;
  459. r[1] = '(';
  460. r[2] = 'I';
  461. r += 3;
  462. state = STATE_JISX0201KATAKANA;
  463. }
  464. r[0] = buf[0]-0x80;
  465. COMBINE_STATE_NO_LASTTWO;
  466. conv->ostate = state;
  467. return count;
  468. }
  469. }
  470. return RET_ILUNI;
  471. }
  472. static int
  473. iso2022_jp3_reset (conv_t conv, unsigned char *r, int n)
  474. {
  475. state_t state = conv->ostate;
  476. SPLIT_STATE;
  477. {
  478. int count =
  479. (lasttwo ? (prevstate != state ? 3 : 0) + 2 : 0)
  480. + (state != STATE_ASCII ? 3 : 0);
  481. if (n < count)
  482. return RET_TOOSMALL;
  483. if (lasttwo) {
  484. if (prevstate != state) {
  485. if (state != STATE_JISX0208) abort();
  486. r[0] = ESC;
  487. r[1] = '$';
  488. r[2] = 'B';
  489. r += 3;
  490. }
  491. r[0] = (lasttwo >> 8) & 0xff;
  492. r[1] = lasttwo & 0xff;
  493. r += 2;
  494. }
  495. if (state != STATE_ASCII) {
  496. r[0] = ESC;
  497. r[1] = '(';
  498. r[2] = 'B';
  499. }
  500. /* conv->ostate = 0; will be done by the caller */
  501. return count;
  502. }
  503. }
  /* Remove the helper macros and the state names again, so that they are not
     visible to whatever follows this file. */
  #undef SPLIT_STATE
  #undef COMBINE_STATE
  #undef COMBINE_STATE_NO_LASTTWO
  #undef STATE_ASCII
  #undef STATE_JISX0201ROMAN
  #undef STATE_JISX0201KATAKANA
  #undef STATE_JISX0208
  #undef STATE_JISX02131
  #undef STATE_JISX02132