charsets.c 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571
  1. /* Text conversion from one charset to another.
  2. Copyright (C) 2001 Walery Studennikov <despair@sama.ru>
  3. This program is free software; you can redistribute it and/or modify
  4. it under the terms of the GNU General Public License as published by
  5. the Free Software Foundation; either version 2 of the License, or
  6. (at your option) any later version.
  7. This program is distributed in the hope that it will be useful,
  8. but WITHOUT ANY WARRANTY; without even the implied warranty of
  9. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  10. GNU General Public License for more details.
  11. You should have received a copy of the GNU General Public License
  12. along with this program; if not, write to the Free Software
  13. Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  14. */
  15. /** \file charsets.c
  16. * \brief Source: Text conversion from one charset to another
  17. */
  18. #include <config.h>
  19. #ifdef HAVE_CHARSET
  20. #include <stdio.h>
  21. #include <stdlib.h>
  22. #include <string.h>
  23. #include "lib/global.h"
  24. #include "lib/strutil.h" /* utf-8 functions */
  25. #include "lib/fileloc.h"
  26. #include "lib/charsets.h"
  27. #include "src/main.h"
  28. /*** global variables ****************************************************************************/
  /* Array of codepage_desc pointers; stays NULL until a charsets file is loaded. */
  29. GPtrArray *codepages = NULL;
  /* Byte-to-byte table applied by convert_to_display(). */
  30. unsigned char conv_displ[256];
  /* Byte-to-byte table applied by convert_from_input(). */
  31. unsigned char conv_input[256];
  /* Id of the display codepage, assigned by init_translation_table(). */
  32. const char *cp_display = NULL;
  /* Id of the source codepage, assigned by init_translation_table(). */
  33. const char *cp_source = NULL;
  34. /*** file scope macro definitions ****************************************************************/
  /* Pseudo codepage id: get_codepage_id() returns it for negative indexes and
   * get_codepage_index() maps it back to -1. */
  35. #define OTHER_8BIT "Other_8_bit"
  36. /*
  37. * FIXME: This assumes that ASCII is always the first encoding
  38. * in mc.charsets
  39. */
  40. #define CP_ASCII 0
  41. /*** file scope type declarations ****************************************************************/
  42. /*** file scope variables ************************************************************************/
  43. /*** file scope functions ************************************************************************/
  44. /* --------------------------------------------------------------------------------------------- */
  45. static codepage_desc *
  46. new_codepage_desc (const char *id, const char *name)
  47. {
  48. codepage_desc *desc;
  49. desc = g_new (codepage_desc, 1);
  50. desc->id = g_strdup (id);
  51. desc->name = g_strdup (name);
  52. return desc;
  53. }
  54. /* --------------------------------------------------------------------------------------------- */
  55. static void
  56. free_codepage_desc (gpointer data, gpointer user_data)
  57. {
  58. codepage_desc *desc = (codepage_desc *) data;
  59. (void) user_data;
  60. g_free (desc->id);
  61. g_free (desc->name);
  62. g_free (desc);
  63. }
  64. /* --------------------------------------------------------------------------------------------- */
  65. /* returns display codepage */
  66. static void
  67. load_codepages_list_from_file (GPtrArray ** list, const char *fname)
  68. {
  69. FILE *f;
  70. guint i;
  71. char buf[BUF_MEDIUM];
  72. char *default_codepage = NULL;
  73. f = fopen (fname, "r");
  74. if (f == NULL)
  75. return;
  76. for (i = 0; fgets (buf, sizeof buf, f) != NULL;)
  77. {
  78. /* split string into id and cpname */
  79. char *p = buf;
  80. size_t buflen = strlen (buf);
  81. if (*p == '\n' || *p == '\0' || *p == '#')
  82. continue;
  83. if (buflen > 0 && buf[buflen - 1] == '\n')
  84. buf[buflen - 1] = '\0';
  85. while (*p != '\t' && *p != ' ' && *p != '\0')
  86. ++p;
  87. if (*p == '\0')
  88. goto fail;
  89. *p++ = '\0';
  90. g_strstrip (p);
  91. if (*p == '\0')
  92. goto fail;
  93. if (strcmp (buf, "default") == 0)
  94. default_codepage = g_strdup (p);
  95. else
  96. {
  97. const char *id = buf;
  98. if (*list == NULL)
  99. {
  100. *list = g_ptr_array_sized_new (16);
  101. g_ptr_array_add (*list, new_codepage_desc (id, p));
  102. }
  103. else
  104. {
  105. /* whether id is already present in list */
  106. /* if yes, overwrite description */
  107. for (i = 0; i < (*list)->len; i++)
  108. {
  109. codepage_desc *desc;
  110. desc = (codepage_desc *) g_ptr_array_index (*list, i);
  111. if (strcmp (id, desc->id) == 0)
  112. {
  113. /* found */
  114. g_free (desc->name);
  115. desc->name = g_strdup (p);
  116. break;
  117. }
  118. }
  119. /* not found */
  120. if (i == (*list)->len)
  121. g_ptr_array_add (*list, new_codepage_desc (id, p));
  122. }
  123. }
  124. }
  125. if (default_codepage != NULL)
  126. {
  127. display_codepage = get_codepage_index (default_codepage);
  128. g_free (default_codepage);
  129. }
  130. fail:
  131. fclose (f);
  132. }
  133. /* --------------------------------------------------------------------------------------------- */
  134. static char
  135. translate_character (GIConv cd, char c)
  136. {
  137. gchar *tmp_buff = NULL;
  138. gsize bytes_read, bytes_written = 0;
  139. const char *ibuf = &c;
  140. char ch = UNKNCHAR;
  141. int ibuflen = 1;
  142. tmp_buff = g_convert_with_iconv (ibuf, ibuflen, cd, &bytes_read, &bytes_written, NULL);
  143. if (tmp_buff)
  144. ch = tmp_buff[0];
  145. g_free (tmp_buff);
  146. return ch;
  147. }
  148. /* --------------------------------------------------------------------------------------------- */
  149. /*** public functions ****************************************************************************/
  150. /* --------------------------------------------------------------------------------------------- */
  151. void
  152. load_codepages_list (void)
  153. {
  154. char *fname;
  155. /* 1: try load /usr/share/mc/mc.charsets */
  156. fname = g_build_filename (mc_home_alt, CHARSETS_LIST, (char *) NULL);
  157. load_codepages_list_from_file (&codepages, fname);
  158. g_free (fname);
  159. /* 2: try load /etc/mc/mc.charsets */
  160. fname = g_build_filename (mc_home, CHARSETS_LIST, (char *) NULL);
  161. load_codepages_list_from_file (&codepages, fname);
  162. g_free (fname);
  163. if (codepages == NULL)
  164. {
  165. /* files are not found, add defaullt codepage */
  166. fprintf (stderr, "%s\n", _("Warning: cannot load codepages list"));
  167. codepages = g_ptr_array_new ();
  168. g_ptr_array_add (codepages, new_codepage_desc ("ASCII", _("7-bit ASCII")));
  169. }
  170. }
  171. /* --------------------------------------------------------------------------------------------- */
  172. void
  173. free_codepages_list (void)
  174. {
  175. g_ptr_array_foreach (codepages, free_codepage_desc, NULL);
  176. g_ptr_array_free (codepages, TRUE);
  177. }
  178. /* --------------------------------------------------------------------------------------------- */
  179. const char *
  180. get_codepage_id (const int n)
  181. {
  182. return (n < 0) ? OTHER_8BIT : ((codepage_desc *) g_ptr_array_index (codepages, n))->id;
  183. }
  184. /* --------------------------------------------------------------------------------------------- */
  185. int
  186. get_codepage_index (const char *id)
  187. {
  188. size_t i;
  189. if (strcmp (id, OTHER_8BIT) == 0)
  190. return -1;
  191. if (codepages == NULL)
  192. return -1;
  193. for (i = 0; i < codepages->len; i++)
  194. if (strcmp (id, ((codepage_desc *) g_ptr_array_index (codepages, i))->id) == 0)
  195. return i;
  196. return -1;
  197. }
  198. /* --------------------------------------------------------------------------------------------- */
  199. /** Check if specified encoding can be used in mc.
  200. * @param encoding name of encoding
  201. * @returns TRUE if encoding has supported by mc, FALSE otherwise
  202. */
  203. gboolean
  204. is_supported_encoding (const char *encoding)
  205. {
  206. gboolean result = FALSE;
  207. guint t;
  208. for (t = 0; t < codepages->len; t++)
  209. {
  210. const char *id = ((codepage_desc *) g_ptr_array_index (codepages, t))->id;
  211. result |= (g_ascii_strncasecmp (encoding, id, strlen (id)) == 0);
  212. }
  213. return result;
  214. }
  215. /* --------------------------------------------------------------------------------------------- */
  216. char *
  217. init_translation_table (int cpsource, int cpdisplay)
  218. {
  219. int i;
  220. GIConv cd;
  221. /* Fill inpit <-> display tables */
  222. if (cpsource < 0 || cpdisplay < 0 || cpsource == cpdisplay)
  223. {
  224. for (i = 0; i <= 255; ++i)
  225. {
  226. conv_displ[i] = i;
  227. conv_input[i] = i;
  228. cp_source = cp_display;
  229. }
  230. return NULL;
  231. }
  232. for (i = 0; i <= 127; ++i)
  233. {
  234. conv_displ[i] = i;
  235. conv_input[i] = i;
  236. }
  237. cp_source = ((codepage_desc *) g_ptr_array_index (codepages, cpsource))->id;
  238. cp_display = ((codepage_desc *) g_ptr_array_index (codepages, cpdisplay))->id;
  239. /* display <- inpit table */
  240. cd = g_iconv_open (cp_display, cp_source);
  241. if (cd == INVALID_CONV)
  242. return g_strdup_printf (_("Cannot translate from %s to %s"), cp_source, cp_display);
  243. for (i = 128; i <= 255; ++i)
  244. conv_displ[i] = translate_character (cd, i);
  245. g_iconv_close (cd);
  246. /* inpit <- display table */
  247. cd = g_iconv_open (cp_source, cp_display);
  248. if (cd == INVALID_CONV)
  249. return g_strdup_printf (_("Cannot translate from %s to %s"), cp_display, cp_source);
  250. for (i = 128; i <= 255; ++i)
  251. {
  252. unsigned char ch;
  253. ch = translate_character (cd, i);
  254. conv_input[i] = (ch == UNKNCHAR) ? i : ch;
  255. }
  256. g_iconv_close (cd);
  257. return NULL;
  258. }
  259. /* --------------------------------------------------------------------------------------------- */
  260. void
  261. convert_to_display (char *str)
  262. {
  263. if (!str)
  264. return;
  265. while (*str)
  266. {
  267. *str = conv_displ[(unsigned char) *str];
  268. str++;
  269. }
  270. }
  271. /* --------------------------------------------------------------------------------------------- */
  272. GString *
  273. str_convert_to_display (char *str)
  274. {
  275. return str_nconvert_to_display (str, -1);
  276. }
  277. /* --------------------------------------------------------------------------------------------- */
  278. GString *
  279. str_nconvert_to_display (char *str, int len)
  280. {
  281. GString *buff;
  282. GIConv conv;
  283. if (!str)
  284. return g_string_new ("");
  285. if (cp_display == cp_source)
  286. return g_string_new (str);
  287. conv = str_crt_conv_from (cp_source);
  288. buff = g_string_new ("");
  289. str_nconvert (conv, str, len, buff);
  290. str_close_conv (conv);
  291. return buff;
  292. }
  293. /* --------------------------------------------------------------------------------------------- */
  294. void
  295. convert_from_input (char *str)
  296. {
  297. if (!str)
  298. return;
  299. while (*str)
  300. {
  301. *str = conv_input[(unsigned char) *str];
  302. str++;
  303. }
  304. }
  305. /* --------------------------------------------------------------------------------------------- */
  306. GString *
  307. str_convert_to_input (char *str)
  308. {
  309. return str_nconvert_to_input (str, -1);
  310. }
  311. /* --------------------------------------------------------------------------------------------- */
  312. GString *
  313. str_nconvert_to_input (char *str, int len)
  314. {
  315. GString *buff;
  316. GIConv conv;
  317. if (!str)
  318. return g_string_new ("");
  319. if (cp_display == cp_source)
  320. return g_string_new (str);
  321. conv = str_crt_conv_to (cp_source);
  322. buff = g_string_new ("");
  323. str_nconvert (conv, str, len, buff);
  324. str_close_conv (conv);
  325. return buff;
  326. }
  327. /* --------------------------------------------------------------------------------------------- */
  328. unsigned char
  329. convert_from_utf_to_current (const char *str)
  330. {
  331. unsigned char buf_ch[6 + 1];
  332. unsigned char ch = '.';
  333. GIConv conv;
  334. const char *cp_to;
  335. if (!str)
  336. return '.';
  337. cp_to = get_codepage_id (source_codepage);
  338. conv = str_crt_conv_to (cp_to);
  339. if (conv != INVALID_CONV)
  340. {
  341. switch (str_translate_char (conv, str, -1, (char *) buf_ch, sizeof (buf_ch)))
  342. {
  343. case ESTR_SUCCESS:
  344. ch = buf_ch[0];
  345. break;
  346. case ESTR_PROBLEM:
  347. case ESTR_FAILURE:
  348. ch = '.';
  349. break;
  350. }
  351. str_close_conv (conv);
  352. }
  353. return ch;
  354. }
  355. /* --------------------------------------------------------------------------------------------- */
  356. unsigned char
  357. convert_from_utf_to_current_c (const int input_char, GIConv conv)
  358. {
  359. unsigned char str[6 + 1];
  360. unsigned char buf_ch[6 + 1];
  361. unsigned char ch = '.';
  362. int res = 0;
  363. res = g_unichar_to_utf8 (input_char, (char *) str);
  364. if (res == 0)
  365. {
  366. return ch;
  367. }
  368. str[res] = '\0';
  369. switch (str_translate_char (conv, (char *) str, -1, (char *) buf_ch, sizeof (buf_ch)))
  370. {
  371. case ESTR_SUCCESS:
  372. ch = buf_ch[0];
  373. break;
  374. case ESTR_PROBLEM:
  375. case ESTR_FAILURE:
  376. ch = '.';
  377. break;
  378. }
  379. return ch;
  380. }
  381. /* --------------------------------------------------------------------------------------------- */
  382. int
  383. convert_from_8bit_to_utf_c (const char input_char, GIConv conv)
  384. {
  385. unsigned char str[2];
  386. unsigned char buf_ch[6 + 1];
  387. int ch = '.';
  388. int res = 0;
  389. str[0] = (unsigned char) input_char;
  390. str[1] = '\0';
  391. switch (str_translate_char (conv, (char *) str, -1, (char *) buf_ch, sizeof (buf_ch)))
  392. {
  393. case ESTR_SUCCESS:
  394. res = g_utf8_get_char_validated ((char *) buf_ch, -1);
  395. if (res < 0)
  396. {
  397. ch = buf_ch[0];
  398. }
  399. else
  400. {
  401. ch = res;
  402. }
  403. break;
  404. case ESTR_PROBLEM:
  405. case ESTR_FAILURE:
  406. ch = '.';
  407. break;
  408. }
  409. return ch;
  410. }
  411. /* --------------------------------------------------------------------------------------------- */
  412. int
  413. convert_from_8bit_to_utf_c2 (const char input_char)
  414. {
  415. unsigned char str[2];
  416. unsigned char buf_ch[6 + 1];
  417. int ch = '.';
  418. int res = 0;
  419. GIConv conv;
  420. const char *cp_from;
  421. str[0] = (unsigned char) input_char;
  422. str[1] = '\0';
  423. cp_from = get_codepage_id (source_codepage);
  424. conv = str_crt_conv_to (cp_from);
  425. if (conv != INVALID_CONV)
  426. {
  427. switch (str_translate_char (conv, (char *) str, -1, (char *) buf_ch, sizeof (buf_ch)))
  428. {
  429. case ESTR_SUCCESS:
  430. res = g_utf8_get_char_validated ((char *) buf_ch, -1);
  431. if (res < 0)
  432. {
  433. ch = buf_ch[0];
  434. }
  435. else
  436. {
  437. ch = res;
  438. }
  439. break;
  440. case ESTR_PROBLEM:
  441. case ESTR_FAILURE:
  442. ch = '.';
  443. break;
  444. }
  445. str_close_conv (conv);
  446. }
  447. return ch;
  448. }
  449. /* --------------------------------------------------------------------------------------------- */
  450. #endif /* HAVE_CHARSET */