charsets.c 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569
  1. /* Text conversion from one charset to another.
  2. Copyright (C) 2001 Walery Studennikov <despair@sama.ru>
  3. This program is free software; you can redistribute it and/or modify
  4. it under the terms of the GNU General Public License as published by
  5. the Free Software Foundation; either version 2 of the License, or
  6. (at your option) any later version.
  7. This program is distributed in the hope that it will be useful,
  8. but WITHOUT ANY WARRANTY; without even the implied warranty of
  9. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  10. GNU General Public License for more details.
  11. You should have received a copy of the GNU General Public License
  12. along with this program; if not, write to the Free Software
  13. Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  14. */
  15. /** \file charsets.c
  16. * \brief Source: Text conversion from one charset to another
  17. */
  18. #include <config.h>
  19. #ifdef HAVE_CHARSET
  20. #include <stdio.h>
  21. #include <stdlib.h>
  22. #include <string.h>
  23. #include "lib/global.h"
  24. #include "lib/strutil.h" /* utf-8 functions */
  25. #include "lib/fileloc.h"
  26. #include "lib/charsets.h"
  27. /*** global variables ****************************************************************************/
  /* List of codepage_desc pointers: filled by load_codepages_list(), released by free_codepages_list() */
  28. GPtrArray *codepages = NULL;
  /* 256-entry byte translation tables built by init_translation_table(): */
  /* conv_displ is applied by convert_to_display(), conv_input by convert_from_input() */
  29. unsigned char conv_displ[256];
  30. unsigned char conv_input[256];
  /* Ids of the display and source codepages; init_translation_table() points them at */
  /* id strings owned by entries of the codepages list */
  31. const char *cp_display = NULL;
  32. const char *cp_source = NULL;
  33. /*** file scope macro definitions ****************************************************************/
  /* Id that get_codepage_id() returns for a negative index; get_codepage_index() maps it back to -1 */
  34. #define OTHER_8BIT "Other_8_bit"
  35. /*
  36. * FIXME: This assumes that ASCII is always the first encoding
  37. * in mc.charsets
  38. */
  39. #define CP_ASCII 0
  40. /*** file scope type declarations ****************************************************************/
  41. /*** file scope variables ************************************************************************/
  42. /*** file scope functions ************************************************************************/
  43. /* --------------------------------------------------------------------------------------------- */
  44. static codepage_desc *
  45. new_codepage_desc (const char *id, const char *name)
  46. {
  47. codepage_desc *desc;
  48. desc = g_new (codepage_desc, 1);
  49. desc->id = g_strdup (id);
  50. desc->name = g_strdup (name);
  51. return desc;
  52. }
  53. /* --------------------------------------------------------------------------------------------- */
  54. static void
  55. free_codepage_desc (gpointer data, gpointer user_data)
  56. {
  57. codepage_desc *desc = (codepage_desc *) data;
  58. (void) user_data;
  59. g_free (desc->id);
  60. g_free (desc->name);
  61. g_free (desc);
  62. }
  63. /* --------------------------------------------------------------------------------------------- */
  64. /* returns display codepage */
  65. static void
  66. load_codepages_list_from_file (GPtrArray ** list, const char *fname)
  67. {
  68. FILE *f;
  69. guint i;
  70. char buf[BUF_MEDIUM];
  71. char *default_codepage = NULL;
  72. f = fopen (fname, "r");
  73. if (f == NULL)
  74. return;
  75. for (i = 0; fgets (buf, sizeof buf, f) != NULL;)
  76. {
  77. /* split string into id and cpname */
  78. char *p = buf;
  79. size_t buflen = strlen (buf);
  80. if (*p == '\n' || *p == '\0' || *p == '#')
  81. continue;
  82. if (buflen > 0 && buf[buflen - 1] == '\n')
  83. buf[buflen - 1] = '\0';
  84. while (*p != '\t' && *p != ' ' && *p != '\0')
  85. ++p;
  86. if (*p == '\0')
  87. goto fail;
  88. *p++ = '\0';
  89. g_strstrip (p);
  90. if (*p == '\0')
  91. goto fail;
  92. if (strcmp (buf, "default") == 0)
  93. default_codepage = g_strdup (p);
  94. else
  95. {
  96. const char *id = buf;
  97. if (*list == NULL)
  98. {
  99. *list = g_ptr_array_sized_new (16);
  100. g_ptr_array_add (*list, new_codepage_desc (id, p));
  101. }
  102. else
  103. {
  104. /* whether id is already present in list */
  105. /* if yes, overwrite description */
  106. for (i = 0; i < (*list)->len; i++)
  107. {
  108. codepage_desc *desc;
  109. desc = (codepage_desc *) g_ptr_array_index (*list, i);
  110. if (strcmp (id, desc->id) == 0)
  111. {
  112. /* found */
  113. g_free (desc->name);
  114. desc->name = g_strdup (p);
  115. break;
  116. }
  117. }
  118. /* not found */
  119. if (i == (*list)->len)
  120. g_ptr_array_add (*list, new_codepage_desc (id, p));
  121. }
  122. }
  123. }
  124. if (default_codepage != NULL)
  125. {
  126. mc_global.display_codepage = get_codepage_index (default_codepage);
  127. g_free (default_codepage);
  128. }
  129. fail:
  130. fclose (f);
  131. }
  132. /* --------------------------------------------------------------------------------------------- */
  133. static char
  134. translate_character (GIConv cd, char c)
  135. {
  136. gchar *tmp_buff = NULL;
  137. gsize bytes_read, bytes_written = 0;
  138. const char *ibuf = &c;
  139. char ch = UNKNCHAR;
  140. int ibuflen = 1;
  141. tmp_buff = g_convert_with_iconv (ibuf, ibuflen, cd, &bytes_read, &bytes_written, NULL);
  142. if (tmp_buff)
  143. ch = tmp_buff[0];
  144. g_free (tmp_buff);
  145. return ch;
  146. }
  147. /* --------------------------------------------------------------------------------------------- */
  148. /*** public functions ****************************************************************************/
  149. /* --------------------------------------------------------------------------------------------- */
  150. void
  151. load_codepages_list (void)
  152. {
  153. char *fname;
  154. /* 1: try load /usr/share/mc/mc.charsets */
  155. fname = g_build_filename (mc_global.share_data_dir, CHARSETS_LIST, (char *) NULL);
  156. load_codepages_list_from_file (&codepages, fname);
  157. g_free (fname);
  158. /* 2: try load /etc/mc/mc.charsets */
  159. fname = g_build_filename (mc_global.sysconfig_dir, CHARSETS_LIST, (char *) NULL);
  160. load_codepages_list_from_file (&codepages, fname);
  161. g_free (fname);
  162. if (codepages == NULL)
  163. {
  164. /* files are not found, add defaullt codepage */
  165. fprintf (stderr, "%s\n", _("Warning: cannot load codepages list"));
  166. codepages = g_ptr_array_new ();
  167. g_ptr_array_add (codepages, new_codepage_desc ("ASCII", _("7-bit ASCII")));
  168. }
  169. }
  170. /* --------------------------------------------------------------------------------------------- */
  171. void
  172. free_codepages_list (void)
  173. {
  174. g_ptr_array_foreach (codepages, free_codepage_desc, NULL);
  175. g_ptr_array_free (codepages, TRUE);
  176. }
  177. /* --------------------------------------------------------------------------------------------- */
  178. const char *
  179. get_codepage_id (const int n)
  180. {
  181. return (n < 0) ? OTHER_8BIT : ((codepage_desc *) g_ptr_array_index (codepages, n))->id;
  182. }
  183. /* --------------------------------------------------------------------------------------------- */
  184. int
  185. get_codepage_index (const char *id)
  186. {
  187. size_t i;
  188. if (strcmp (id, OTHER_8BIT) == 0)
  189. return -1;
  190. if (codepages == NULL)
  191. return -1;
  192. for (i = 0; i < codepages->len; i++)
  193. if (strcmp (id, ((codepage_desc *) g_ptr_array_index (codepages, i))->id) == 0)
  194. return i;
  195. return -1;
  196. }
  197. /* --------------------------------------------------------------------------------------------- */
  198. /** Check if specified encoding can be used in mc.
  199. * @param encoding name of encoding
  200. * @returns TRUE if encoding has supported by mc, FALSE otherwise
  201. */
  202. gboolean
  203. is_supported_encoding (const char *encoding)
  204. {
  205. gboolean result = FALSE;
  206. guint t;
  207. for (t = 0; t < codepages->len; t++)
  208. {
  209. const char *id = ((codepage_desc *) g_ptr_array_index (codepages, t))->id;
  210. result |= (g_ascii_strncasecmp (encoding, id, strlen (id)) == 0);
  211. }
  212. return result;
  213. }
  214. /* --------------------------------------------------------------------------------------------- */
  215. char *
  216. init_translation_table (int cpsource, int cpdisplay)
  217. {
  218. int i;
  219. GIConv cd;
  220. /* Fill inpit <-> display tables */
  221. if (cpsource < 0 || cpdisplay < 0 || cpsource == cpdisplay)
  222. {
  223. for (i = 0; i <= 255; ++i)
  224. {
  225. conv_displ[i] = i;
  226. conv_input[i] = i;
  227. cp_source = cp_display;
  228. }
  229. return NULL;
  230. }
  231. for (i = 0; i <= 127; ++i)
  232. {
  233. conv_displ[i] = i;
  234. conv_input[i] = i;
  235. }
  236. cp_source = ((codepage_desc *) g_ptr_array_index (codepages, cpsource))->id;
  237. cp_display = ((codepage_desc *) g_ptr_array_index (codepages, cpdisplay))->id;
  238. /* display <- inpit table */
  239. cd = g_iconv_open (cp_display, cp_source);
  240. if (cd == INVALID_CONV)
  241. return g_strdup_printf (_("Cannot translate from %s to %s"), cp_source, cp_display);
  242. for (i = 128; i <= 255; ++i)
  243. conv_displ[i] = translate_character (cd, i);
  244. g_iconv_close (cd);
  245. /* inpit <- display table */
  246. cd = g_iconv_open (cp_source, cp_display);
  247. if (cd == INVALID_CONV)
  248. return g_strdup_printf (_("Cannot translate from %s to %s"), cp_display, cp_source);
  249. for (i = 128; i <= 255; ++i)
  250. {
  251. unsigned char ch;
  252. ch = translate_character (cd, i);
  253. conv_input[i] = (ch == UNKNCHAR) ? i : ch;
  254. }
  255. g_iconv_close (cd);
  256. return NULL;
  257. }
  258. /* --------------------------------------------------------------------------------------------- */
  259. void
  260. convert_to_display (char *str)
  261. {
  262. if (!str)
  263. return;
  264. while (*str)
  265. {
  266. *str = conv_displ[(unsigned char) *str];
  267. str++;
  268. }
  269. }
  270. /* --------------------------------------------------------------------------------------------- */
  271. GString *
  272. str_convert_to_display (char *str)
  273. {
  274. return str_nconvert_to_display (str, -1);
  275. }
  276. /* --------------------------------------------------------------------------------------------- */
  277. GString *
  278. str_nconvert_to_display (char *str, int len)
  279. {
  280. GString *buff;
  281. GIConv conv;
  282. if (!str)
  283. return g_string_new ("");
  284. if (cp_display == cp_source)
  285. return g_string_new (str);
  286. conv = str_crt_conv_from (cp_source);
  287. buff = g_string_new ("");
  288. str_nconvert (conv, str, len, buff);
  289. str_close_conv (conv);
  290. return buff;
  291. }
  292. /* --------------------------------------------------------------------------------------------- */
  293. void
  294. convert_from_input (char *str)
  295. {
  296. if (!str)
  297. return;
  298. while (*str)
  299. {
  300. *str = conv_input[(unsigned char) *str];
  301. str++;
  302. }
  303. }
  304. /* --------------------------------------------------------------------------------------------- */
  305. GString *
  306. str_convert_to_input (char *str)
  307. {
  308. return str_nconvert_to_input (str, -1);
  309. }
  310. /* --------------------------------------------------------------------------------------------- */
  311. GString *
  312. str_nconvert_to_input (char *str, int len)
  313. {
  314. GString *buff;
  315. GIConv conv;
  316. if (!str)
  317. return g_string_new ("");
  318. if (cp_display == cp_source)
  319. return g_string_new (str);
  320. conv = str_crt_conv_to (cp_source);
  321. buff = g_string_new ("");
  322. str_nconvert (conv, str, len, buff);
  323. str_close_conv (conv);
  324. return buff;
  325. }
  326. /* --------------------------------------------------------------------------------------------- */
  327. unsigned char
  328. convert_from_utf_to_current (const char *str)
  329. {
  330. unsigned char buf_ch[6 + 1];
  331. unsigned char ch = '.';
  332. GIConv conv;
  333. const char *cp_to;
  334. if (!str)
  335. return '.';
  336. cp_to = get_codepage_id (mc_global.source_codepage);
  337. conv = str_crt_conv_to (cp_to);
  338. if (conv != INVALID_CONV)
  339. {
  340. switch (str_translate_char (conv, str, -1, (char *) buf_ch, sizeof (buf_ch)))
  341. {
  342. case ESTR_SUCCESS:
  343. ch = buf_ch[0];
  344. break;
  345. case ESTR_PROBLEM:
  346. case ESTR_FAILURE:
  347. ch = '.';
  348. break;
  349. }
  350. str_close_conv (conv);
  351. }
  352. return ch;
  353. }
  354. /* --------------------------------------------------------------------------------------------- */
  355. unsigned char
  356. convert_from_utf_to_current_c (const int input_char, GIConv conv)
  357. {
  358. unsigned char str[6 + 1];
  359. unsigned char buf_ch[6 + 1];
  360. unsigned char ch = '.';
  361. int res = 0;
  362. res = g_unichar_to_utf8 (input_char, (char *) str);
  363. if (res == 0)
  364. {
  365. return ch;
  366. }
  367. str[res] = '\0';
  368. switch (str_translate_char (conv, (char *) str, -1, (char *) buf_ch, sizeof (buf_ch)))
  369. {
  370. case ESTR_SUCCESS:
  371. ch = buf_ch[0];
  372. break;
  373. case ESTR_PROBLEM:
  374. case ESTR_FAILURE:
  375. ch = '.';
  376. break;
  377. }
  378. return ch;
  379. }
  380. /* --------------------------------------------------------------------------------------------- */
  381. int
  382. convert_from_8bit_to_utf_c (const char input_char, GIConv conv)
  383. {
  384. unsigned char str[2];
  385. unsigned char buf_ch[6 + 1];
  386. int ch = '.';
  387. int res = 0;
  388. str[0] = (unsigned char) input_char;
  389. str[1] = '\0';
  390. switch (str_translate_char (conv, (char *) str, -1, (char *) buf_ch, sizeof (buf_ch)))
  391. {
  392. case ESTR_SUCCESS:
  393. res = g_utf8_get_char_validated ((char *) buf_ch, -1);
  394. if (res < 0)
  395. {
  396. ch = buf_ch[0];
  397. }
  398. else
  399. {
  400. ch = res;
  401. }
  402. break;
  403. case ESTR_PROBLEM:
  404. case ESTR_FAILURE:
  405. ch = '.';
  406. break;
  407. }
  408. return ch;
  409. }
  410. /* --------------------------------------------------------------------------------------------- */
  411. int
  412. convert_from_8bit_to_utf_c2 (const char input_char)
  413. {
  414. unsigned char str[2];
  415. unsigned char buf_ch[6 + 1];
  416. int ch = '.';
  417. int res = 0;
  418. GIConv conv;
  419. const char *cp_from;
  420. str[0] = (unsigned char) input_char;
  421. str[1] = '\0';
  422. cp_from = get_codepage_id (mc_global.source_codepage);
  423. conv = str_crt_conv_to (cp_from);
  424. if (conv != INVALID_CONV)
  425. {
  426. switch (str_translate_char (conv, (char *) str, -1, (char *) buf_ch, sizeof (buf_ch)))
  427. {
  428. case ESTR_SUCCESS:
  429. res = g_utf8_get_char_validated ((char *) buf_ch, -1);
  430. if (res < 0)
  431. {
  432. ch = buf_ch[0];
  433. }
  434. else
  435. {
  436. ch = res;
  437. }
  438. break;
  439. case ESTR_PROBLEM:
  440. case ESTR_FAILURE:
  441. ch = '.';
  442. break;
  443. }
  444. str_close_conv (conv);
  445. }
  446. return ch;
  447. }
  448. /* --------------------------------------------------------------------------------------------- */
  449. #endif /* HAVE_CHARSET */