iconv.c 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595
  1. /*
  2. * Copyright (C) 1999-2008 Free Software Foundation, Inc.
  3. * This file is part of the GNU LIBICONV Library.
  4. *
  5. * The GNU LIBICONV Library is free software; you can redistribute it
  6. * and/or modify it under the terms of the GNU Library General Public
  7. * License as published by the Free Software Foundation; either version 2
  8. * of the License, or (at your option) any later version.
  9. *
  10. * The GNU LIBICONV Library is distributed in the hope that it will be
  11. * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  13. * Library General Public License for more details.
  14. *
  15. * You should have received a copy of the GNU Library General Public
  16. * License along with the GNU LIBICONV Library; see the file COPYING.LIB.
  17. * If not, write to the Free Software Foundation, Inc., 51 Franklin Street,
  18. * Fifth Floor, Boston, MA 02110-1301, USA.
  19. */
  20. #include "iconv.h"
  21. #include <stdlib.h>
  22. #include <string.h>
  23. #include "config.h"
  24. #include "localcharset.h"
/*
 * Select which groups of system dependent encodings get compiled in.
 * Each USE_xxx macro defined here enables the matching encodings_xxx.def
 * include further down in this file.
 */
#if ENABLE_EXTRA
/*
 * Consider all system dependent encodings, for any system,
 * and the extra encodings.
 */
#define USE_AIX
#define USE_OSF1
#define USE_DOS
#define USE_EXTRA
#else
/*
 * Consider those system dependent encodings that are needed for the
 * current system.
 */
/* AIX encodings: only when compiling on AIX. */
#ifdef _AIX
#define USE_AIX
#endif
/* OSF/1 encodings: when compiling on OSF/1 or VMS. */
#if defined(__osf__) || defined(VMS)
#define USE_OSF1
#endif
/* DOS encodings: for DJGPP, and for Win32 builds with MSVC or MinGW. */
#if defined(__DJGPP__) || (defined(_WIN32) && (defined(_MSC_VER) || defined(__MINGW32__)))
#define USE_DOS
#endif
#endif
/*
 * Data type for general conversion loop.
 * A pair of function pointers; iconv() in this file dispatches to members
 * of these names through cd->lfuncs.
 */
struct loop_funcs {
  /* Invoked by iconv() when an input buffer is supplied. The arguments
     are passed through from iconv() unchanged. */
  size_t (*loop_convert) (iconv_t icd,
                          const char* * inbuf, size_t *inbytesleft,
                          char* * outbuf, size_t *outbytesleft);
  /* Invoked by iconv() when inbuf or *inbuf is NULL; it receives only the
     output buffer arguments. */
  size_t (*loop_reset) (iconv_t icd,
                        char* * outbuf, size_t *outbytesleft);
};
  59. /*
  60. * Converters.
  61. */
  62. #include "converters.h"
  63. /*
  64. * Transliteration tables.
  65. */
  66. #include "cjk_variants.h"
  67. #include "translit.h"
  68. /*
  69. * Table of all supported encodings.
  70. */
/*
 * Description of one encoding: its decoder, its encoder and the flags of
 * the encoder. The all_encodings table holds one of these per encoding
 * and fills the members positionally, so their order must not change.
 */
struct encoding {
  struct mbtowc_funcs ifuncs;  /* conversion multibyte -> unicode */
  struct wctomb_funcs ofuncs;  /* conversion unicode -> multibyte */
  int oflags;                  /* flags for unicode -> multibyte conversion */
};
/* The .def files also contain DEFALIAS entries; they contribute nothing to
   the tables generated from these files, so DEFALIAS expands to nothing. */
#define DEFALIAS(xxx_alias,xxx) /* nothing */
/*
 * Encoding indices. Every DEFENCODING entry of the included .def files
 * expands to one enumerator ei_<xxx>, so the enumerators are numbered in
 * the order in which the entries appear: encodings.def first, then the
 * enabled system dependent files, then encodings_local.def.
 */
enum {
#define DEFENCODING(xxx_names,xxx,xxx_ifuncs1,xxx_ifuncs2,xxx_ofuncs1,xxx_ofuncs2) \
  ei_##xxx ,
#include "encodings.def"
#ifdef USE_AIX
# include "encodings_aix.def"
#endif
#ifdef USE_OSF1
# include "encodings_osf1.def"
#endif
#ifdef USE_DOS
# include "encodings_dos.def"
#endif
#ifdef USE_EXTRA
# include "encodings_extra.def"
#endif
#include "encodings_local.def"
#undef DEFENCODING
/* Final enumerator without a trailing comma; it terminates the list that
   the macro expansions above leave ending in a comma. */
ei_for_broken_compilers_that_dont_like_trailing_commas
};
  97. #include "flags.h"
/*
 * One 'struct encoding' per encoding. The .def files are included in the
 * same order as in the enum of ei_<xxx> indices above, so the element at
 * position ei_<xxx> describes encoding <xxx>.
 */
static struct encoding const all_encodings[] = {
/* Regular encodings: the output flags come from the ei_<xxx>_oflags name. */
#define DEFENCODING(xxx_names,xxx,xxx_ifuncs1,xxx_ifuncs2,xxx_ofuncs1,xxx_ofuncs2) \
  { xxx_ifuncs1,xxx_ifuncs2, xxx_ofuncs1,xxx_ofuncs2, ei_##xxx##_oflags },
#include "encodings.def"
#ifdef USE_AIX
# include "encodings_aix.def"
#endif
#ifdef USE_OSF1
# include "encodings_osf1.def"
#endif
#ifdef USE_DOS
# include "encodings_dos.def"
#endif
#ifdef USE_EXTRA
# include "encodings_extra.def"
#endif
#undef DEFENCODING
/* Entries from encodings_local.def: same layout, but the output flags are
   always 0. */
#define DEFENCODING(xxx_names,xxx,xxx_ifuncs1,xxx_ifuncs2,xxx_ofuncs1,xxx_ofuncs2) \
  { xxx_ifuncs1,xxx_ifuncs2, xxx_ofuncs1,xxx_ofuncs2, 0 },
#include "encodings_local.def"
#undef DEFENCODING
};
/* The .def files are not included again after this point. */
#undef DEFALIAS
  121. /*
  122. * Conversion loops.
  123. */
  124. #include "loops.h"
  125. /*
  126. * Alias lookup function.
  127. * Defines
  128. * struct alias { int name; unsigned int encoding_index; };
  129. * const struct alias * aliases_lookup (const char *str, unsigned int len);
  130. * #define MAX_WORD_LENGTH ...
  131. */
  132. #if defined _AIX
  133. # include "aliases_sysaix.h"
  134. #elif defined hpux || defined __hpux
  135. # include "aliases_syshpux.h"
  136. #elif defined __osf__
  137. # include "aliases_sysosf1.h"
  138. #elif defined __sun
  139. # include "aliases_syssolaris.h"
  140. #elif defined(ARCADIA_ICONV_NOCJK)
  141. # include "aliases_nocjk.h"
  142. #else
  143. # include "aliases.h"
  144. #endif
  145. /*
  146. * System dependent alias lookup function.
  147. * Defines
  148. * const struct alias * aliases2_lookup (const char *str);
  149. */
  150. #if defined(USE_AIX) || defined(USE_OSF1) || defined(USE_DOS) || defined(USE_EXTRA) /* || ... */
/*
 * "aliases2.h" is a list of S(tag,name,encoding_index) entries. It is
 * included three times below, each time with a different definition of S.
 */

/* First expansion: a struct with one char array member per alias, each
   sized to hold its name including the terminating NUL. */
struct stringpool2_t {
#define S(tag,name,encoding_index) char stringpool_##tag[sizeof(name)];
#include "aliases2.h"
#undef S
};
/* Second expansion: the struct's contents, i.e. the alias names themselves,
   in the same order as the members. */
static const struct stringpool2_t stringpool2_contents = {
#define S(tag,name,encoding_index) name,
#include "aliases2.h"
#undef S
};
/* View of the string pool as a plain byte array, so that a name can be
   addressed as stringpool2 + offset. */
#define stringpool2 ((const char *) &stringpool2_contents)
/* Third expansion: the alias table. An entry's 'name' is the byte offset of
   its string inside struct stringpool2_t, computed as the address of the
   member in a struct located at address 0. */
static const struct alias sysdep_aliases[] = {
#define S(tag,name,encoding_index) { (int)(long)&((struct stringpool2_t *)0)->stringpool_##tag, encoding_index },
#include "aliases2.h"
#undef S
};
  167. const struct alias *
  168. aliases2_lookup (const char *str)
  169. {
  170. const struct alias * ptr;
  171. unsigned int count;
  172. for (ptr = sysdep_aliases, count = sizeof(sysdep_aliases)/sizeof(sysdep_aliases[0]); count > 0; ptr++, count--)
  173. if (!strcmp(str, stringpool2 + ptr->name))
  174. return ptr;
  175. return NULL;
  176. }
  177. #else
  178. #define aliases2_lookup(str) NULL
  179. #define stringpool2 NULL
  180. #endif
  181. #if 0
  182. /* Like !strcasecmp, except that the both strings can be assumed to be ASCII
  183. and the first string can be assumed to be in uppercase. */
  184. static int strequal (const char* str1, const char* str2)
  185. {
  186. unsigned char c1;
  187. unsigned char c2;
  188. for (;;) {
  189. c1 = * (unsigned char *) str1++;
  190. c2 = * (unsigned char *) str2++;
  191. if (c1 == 0)
  192. break;
  193. if (c2 >= 'a' && c2 <= 'z')
  194. c2 -= 'a'-'A';
  195. if (c1 != c2)
  196. break;
  197. }
  198. return (c1 == c2);
  199. }
  200. #endif
/*
 * Creates a conversion descriptor for converting from encoding FROMCODE to
 * encoding TOCODE.
 *
 * The work is done by two included code fragments that operate on the
 * local variables declared here, so the names and order of these locals
 * must not be changed:
 *   - "iconv_open1.h" is expected to set from_index, from_wchar, to_index,
 *     to_wchar, transliterate and discard_ilseq from the two names
 *     (NOTE(review): inferred from usage, the fragment is not visible here);
 *   - "iconv_open2.h" is expected to fill in the allocated descriptor.
 *
 * Returns the new descriptor. On failure returns (iconv_t)(-1) with errno
 * set to ENOMEM if the allocation failed, or to EINVAL if control reaches
 * the 'invalid' label. Nothing visible in this function jumps to that
 * label, so the jumps presumably come from the included fragments.
 */
iconv_t iconv_open (const char* tocode, const char* fromcode)
{
  struct conv_struct * cd;
  unsigned int from_index;
  int from_wchar;
  unsigned int to_index;
  int to_wchar;
  int transliterate;
  int discard_ilseq;

#include "iconv_open1.h"

  /* The larger wchar_conv_struct is allocated only when exactly one of the
     two sides is flagged as wchar. */
  cd = (struct conv_struct *) malloc(from_wchar != to_wchar
                                     ? sizeof(struct wchar_conv_struct)
                                     : sizeof(struct conv_struct));
  if (cd == NULL) {
    errno = ENOMEM;
    return (iconv_t)(-1);
  }

#include "iconv_open2.h"

  return (iconv_t)cd;
invalid:
  errno = EINVAL;
  return (iconv_t)(-1);
}
  224. size_t iconv (iconv_t icd,
  225. ICONV_CONST char* * inbuf, size_t *inbytesleft,
  226. char* * outbuf, size_t *outbytesleft)
  227. {
  228. conv_t cd = (conv_t) icd;
  229. if (inbuf == NULL || *inbuf == NULL)
  230. return cd->lfuncs.loop_reset(icd,outbuf,outbytesleft);
  231. else
  232. return cd->lfuncs.loop_convert(icd,
  233. (const char* *)inbuf,inbytesleft,
  234. outbuf,outbytesleft);
  235. }
  236. int iconv_close (iconv_t icd)
  237. {
  238. conv_t cd = (conv_t) icd;
  239. free(cd);
  240. return 0;
  241. }
  242. #ifndef LIBICONV_PLUG
/*
 * Verify that a 'struct conv_struct' and a 'struct wchar_conv_struct' each
 * fit in an iconv_allocation_t.
 * If this verification fails, iconv_allocation_t must be made larger and
 * the major version in LIBICONV_VERSION_INFO must be bumped.
 * Currently 'struct conv_struct' has 21 integer/pointer fields, and
 * 'struct wchar_conv_struct' additionally has an 'mbstate_t' field.
 *
 * How the check works: the comparison yields 1 or 0, so the array size
 * 2 * (comparison) - 1 is either 1 or -1. An array type of size -1 is
 * invalid, so compilation fails when a struct does not fit. The typedefs
 * themselves are not used anywhere else in this file.
 */
typedef int verify_size_1[2 * (sizeof (struct conv_struct) <= sizeof (iconv_allocation_t)) - 1];
typedef int verify_size_2[2 * (sizeof (struct wchar_conv_struct) <= sizeof (iconv_allocation_t)) - 1];
/*
 * Variant of iconv_open() that does not allocate memory: the descriptor is
 * set up in the storage pointed to by RESULTP, which the caller provides.
 *
 * As in iconv_open(), the included fragments "iconv_open1.h" and
 * "iconv_open2.h" operate on the local variables declared here, so their
 * names and order must not be changed.
 *
 * Returns 0 on success. Returns -1 with errno set to EINVAL if control
 * reaches the 'invalid' label; nothing visible here jumps to it, so the
 * jumps presumably come from the included fragments.
 */
int iconv_open_into (const char* tocode, const char* fromcode,
                     iconv_allocation_t* resultp)
{
  struct conv_struct * cd;
  unsigned int from_index;
  int from_wchar;
  unsigned int to_index;
  int to_wchar;
  int transliterate;
  int discard_ilseq;

#include "iconv_open1.h"

  /* Use the caller's storage as the descriptor. */
  cd = (struct conv_struct *) resultp;

#include "iconv_open2.h"

  return 0;
invalid:
  errno = EINVAL;
  return -1;
}
  271. int iconvctl (iconv_t icd, int request, void* argument)
  272. {
  273. conv_t cd = (conv_t) icd;
  274. switch (request) {
  275. case ICONV_TRIVIALP:
  276. *(int *)argument =
  277. ((cd->lfuncs.loop_convert == unicode_loop_convert
  278. && cd->iindex == cd->oindex)
  279. || cd->lfuncs.loop_convert == wchar_id_loop_convert
  280. ? 1 : 0);
  281. return 0;
  282. case ICONV_GET_TRANSLITERATE:
  283. *(int *)argument = cd->transliterate;
  284. return 0;
  285. case ICONV_SET_TRANSLITERATE:
  286. cd->transliterate = (*(const int *)argument ? 1 : 0);
  287. return 0;
  288. case ICONV_GET_DISCARD_ILSEQ:
  289. *(int *)argument = cd->discard_ilseq;
  290. return 0;
  291. case ICONV_SET_DISCARD_ILSEQ:
  292. cd->discard_ilseq = (*(const int *)argument ? 1 : 0);
  293. return 0;
  294. case ICONV_SET_HOOKS:
  295. if (argument != NULL) {
  296. cd->hooks = *(const struct iconv_hooks *)argument;
  297. } else {
  298. cd->hooks.uc_hook = NULL;
  299. cd->hooks.wc_hook = NULL;
  300. cd->hooks.data = NULL;
  301. }
  302. return 0;
  303. case ICONV_SET_FALLBACKS:
  304. if (argument != NULL) {
  305. cd->fallbacks = *(const struct iconv_fallbacks *)argument;
  306. } else {
  307. cd->fallbacks.mb_to_uc_fallback = NULL;
  308. cd->fallbacks.uc_to_mb_fallback = NULL;
  309. cd->fallbacks.mb_to_wc_fallback = NULL;
  310. cd->fallbacks.wc_to_mb_fallback = NULL;
  311. cd->fallbacks.data = NULL;
  312. }
  313. return 0;
  314. default:
  315. errno = EINVAL;
  316. return -1;
  317. }
  318. }
  319. /* An alias after its name has been converted from 'int' to 'const char*'. */
  320. struct nalias { const char* name; unsigned int encoding_index; };
  321. static int compare_by_index (const void * arg1, const void * arg2)
  322. {
  323. const struct nalias * alias1 = (const struct nalias *) arg1;
  324. const struct nalias * alias2 = (const struct nalias *) arg2;
  325. return (int)alias1->encoding_index - (int)alias2->encoding_index;
  326. }
  327. static int compare_by_name (const void * arg1, const void * arg2)
  328. {
  329. const char * name1 = *(const char **)arg1;
  330. const char * name2 = *(const char **)arg2;
  331. /* Compare alphabetically, but put "CS" names at the end. */
  332. int sign = strcmp(name1,name2);
  333. if (sign != 0) {
  334. sign = ((name1[0]=='C' && name1[1]=='S') - (name2[0]=='C' && name2[1]=='S'))
  335. * 4 + (sign >= 0 ? 1 : -1);
  336. }
  337. return sign;
  338. }
  339. void iconvlist (int (*do_one) (unsigned int namescount,
  340. const char * const * names,
  341. void* data),
  342. void* data)
  343. {
  344. #define aliascount1 sizeof(aliases)/sizeof(aliases[0])
  345. #ifndef aliases2_lookup
  346. #define aliascount2 sizeof(sysdep_aliases)/sizeof(sysdep_aliases[0])
  347. #else
  348. #define aliascount2 0
  349. #endif
  350. #define aliascount (aliascount1+aliascount2)
  351. struct nalias aliasbuf[aliascount];
  352. const char * namesbuf[aliascount];
  353. size_t num_aliases;
  354. {
  355. /* Put all existing aliases into a buffer. */
  356. size_t i;
  357. size_t j;
  358. j = 0;
  359. for (i = 0; i < aliascount1; i++) {
  360. const struct alias * p = &aliases[i];
  361. if (p->name >= 0
  362. && p->encoding_index != ei_local_char
  363. && p->encoding_index != ei_local_wchar_t) {
  364. aliasbuf[j].name = stringpool + p->name;
  365. aliasbuf[j].encoding_index = p->encoding_index;
  366. j++;
  367. }
  368. }
  369. #ifndef aliases2_lookup
  370. for (i = 0; i < aliascount2; i++) {
  371. aliasbuf[j].name = stringpool2 + sysdep_aliases[i].name;
  372. aliasbuf[j].encoding_index = sysdep_aliases[i].encoding_index;
  373. j++;
  374. }
  375. #endif
  376. num_aliases = j;
  377. }
  378. /* Sort by encoding_index. */
  379. if (num_aliases > 1)
  380. qsort(aliasbuf, num_aliases, sizeof(struct nalias), compare_by_index);
  381. {
  382. /* Process all aliases with the same encoding_index together. */
  383. size_t j;
  384. j = 0;
  385. while (j < num_aliases) {
  386. unsigned int ei = aliasbuf[j].encoding_index;
  387. size_t i = 0;
  388. do
  389. namesbuf[i++] = aliasbuf[j++].name;
  390. while (j < num_aliases && aliasbuf[j].encoding_index == ei);
  391. if (i > 1)
  392. qsort(namesbuf, i, sizeof(const char *), compare_by_name);
  393. /* Call the callback. */
  394. if (do_one(i,namesbuf,data))
  395. break;
  396. }
  397. }
  398. #undef aliascount
  399. #undef aliascount2
  400. #undef aliascount1
  401. }
/*
 * Table of canonical names of encodings.
 * Instead of strings, it contains offsets into stringpool and stringpool2.
 * iconv_canonicalize() indexes this table by encoding index and adds the
 * offset to the pool in which the alias was found.
 * The table is assembled from included headers in three parts, chosen by
 * the same platform tests as the alias tables and encoding lists above.
 */
static const unsigned short all_canonical[] = {
/* Part 1: platform specific variant of the main list. */
#if defined _AIX
# include "canonical_sysaix.h"
#elif defined hpux || defined __hpux
# include "canonical_syshpux.h"
#elif defined __osf__
# include "canonical_sysosf1.h"
#elif defined __sun
# include "canonical_syssolaris.h"
#elif defined(ARCADIA_ICONV_NOCJK)
# include "canonical_nocjk.h"
#else
# include "canonical.h"
#endif
/* Part 2: the enabled groups of system dependent encodings. */
#ifdef USE_AIX
# if defined _AIX
#  include "canonical_aix_sysaix.h"
# else
#  include "canonical_aix.h"
# endif
#endif
#ifdef USE_OSF1
# if defined __osf__
#  include "canonical_osf1_sysosf1.h"
# else
#  include "canonical_osf1.h"
# endif
#endif
#ifdef USE_DOS
# include "canonical_dos.h"
#endif
#ifdef USE_EXTRA
# include "canonical_extra.h"
#endif
/* Part 3: platform specific variant of the "local" list. */
#if defined _AIX
# include "canonical_local_sysaix.h"
#elif defined hpux || defined __hpux
# include "canonical_local_syshpux.h"
#elif defined __osf__
# include "canonical_local_sysosf1.h"
#elif defined __sun
# include "canonical_local_syssolaris.h"
#elif defined(ARCADIA_ICONV_NOCJK)
# include "canonical_local_nocjk.h"
#else
# include "canonical_local.h"
#endif
};
/*
 * Returns the canonical name of the encoding called NAME.
 *
 * NAME is matched case-insensitively (ASCII letters only), and trailing
 * "//TRANSLIT" and "//IGNORE" suffixes are ignored. An empty name, or an
 * alias of the locale's character encoding, is replaced by the result of
 * locale_charset() and the lookup is repeated.
 *
 * The result points into one of the string pools of this file. If NAME
 * cannot be resolved (non-ASCII character, too long, unknown alias, or
 * locale_charset() returning an empty string), NAME itself is returned.
 */
const char * iconv_canonicalize (const char * name)
{
  const char* code;               /* name currently being looked up */
  char buf[MAX_WORD_LENGTH+10+1]; /* upper-cased copy of 'code' */
  const char* cp;
  char* bp;
  const struct alias * ap;
  unsigned int count;
  unsigned int index;
  const char* pool;               /* string pool that 'ap' refers to */

  /* Before calling aliases_lookup, convert the input string to upper case,
   * and check whether it's entirely ASCII (we call gperf with option "-7"
   * to achieve a smaller table) and non-empty. If it's not entirely ASCII,
   * or if it's too long, it is not a valid encoding name.
   */
  /* Each 'continue' of this outer loop restarts the lookup with a new
     'code'; each 'break' leaves it with 'index' and 'pool' set. */
  for (code = name;;) {
    /* Search code in the table. */
    /* Copy 'code' into buf in upper case, including the terminating NUL.
       'count' bounds the number of non-NUL characters by the size of buf. */
    for (cp = code, bp = buf, count = MAX_WORD_LENGTH+10+1; ; cp++, bp++) {
      unsigned char c = * (unsigned char *) cp;
      if (c >= 0x80)
        goto invalid;
      if (c >= 'a' && c <= 'z')
        c -= 'a'-'A';
      *bp = c;
      if (c == '\0')
        break;
      if (--count == 0)
        goto invalid;
    }
    /* Here bp points to the terminating NUL in buf. Strip any number of
       trailing "//TRANSLIT" and "//IGNORE" suffixes, in any order. */
    for (;;) {
      if (bp-buf >= 10 && memcmp(bp-10,"//TRANSLIT",10)==0) {
        bp -= 10;
        *bp = '\0';
        continue;
      }
      if (bp-buf >= 8 && memcmp(bp-8,"//IGNORE",8)==0) {
        bp -= 8;
        *bp = '\0';
        continue;
      }
      break;
    }
    /* An empty name stands for the locale's character encoding. */
    if (buf[0] == '\0') {
      code = locale_charset();
      /* Avoid an endless loop that could occur when using an older version
         of localcharset.c. */
      if (code[0] == '\0')
        goto invalid;
      continue;
    }
    /* Look in the main alias table first, then in the system dependent
       one; 'pool' records which string pool the match belongs to. */
    pool = stringpool;
    ap = aliases_lookup(buf,bp-buf);
    if (ap == NULL) {
      pool = stringpool2;
      ap = aliases2_lookup(buf);
      if (ap == NULL)
        goto invalid;
    }
    /* An alias of the locale's encoding: look up the locale's encoding
       name instead. */
    if (ap->encoding_index == ei_local_char) {
      code = locale_charset();
      /* Avoid an endless loop that could occur when using an older version
         of localcharset.c. */
      if (code[0] == '\0')
        goto invalid;
      continue;
    }
    if (ap->encoding_index == ei_local_wchar_t) {
      /* On systems which define __STDC_ISO_10646__, wchar_t is Unicode.
         This is also the case on native Woe32 systems. */
      /* In that case, report the Unicode encoding matching the size of
         wchar_t instead of the wchar_t pseudo-encoding. */
#if __STDC_ISO_10646__ || ((defined _WIN32 || defined __WIN32__) && !defined __CYGWIN__)
      if (sizeof(wchar_t) == 4) {
        index = ei_ucs4internal;
        break;
      }
      if (sizeof(wchar_t) == 2) {
        index = ei_ucs2internal;
        break;
      }
      if (sizeof(wchar_t) == 1) {
        index = ei_iso8859_1;
        break;
      }
#endif
    }
    index = ap->encoding_index;
    break;
  }
  /* all_canonical[index] is an offset into the selected string pool. */
  return all_canonical[index] + pool;
invalid:
  /* Not resolvable: hand the caller's string back unchanged. */
  return name;
}
/* Variable with external linkage holding the library version, initialised
   from the _LIBICONV_VERSION macro. */
int _libiconv_version = _LIBICONV_VERSION;
#if defined __FreeBSD__ && !defined __gnu_freebsd__
/* GNU libiconv is the native FreeBSD iconv implementation since 2002.
   It wants to define the symbols 'iconv_open', 'iconv', 'iconv_close'. */
/* strong_alias(name, aliasname) declares 'aliasname' as an alias of 'name',
   with the same type, using the compiler's 'alias' attribute. It goes
   through _strong_alias so that its arguments are macro-expanded before
   #name turns the target into a string. */
#define strong_alias(name, aliasname) _strong_alias(name, aliasname)
#define _strong_alias(name, aliasname) \
  extern __typeof (name) aliasname __attribute__ ((alias (#name)));
/* Remove any macro definitions of these three names, so that the alias
   names used below are the plain identifiers. */
#undef iconv_open
#undef iconv
#undef iconv_close
strong_alias (libiconv_open, iconv_open)
strong_alias (libiconv, iconv)
strong_alias (libiconv_close, iconv_close)
#endif
  559. #endif