charsets.c 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572
  1. /* Text conversion from one charset to another.
  2. Copyright (C) 2001 Walery Studennikov <despair@sama.ru>
  3. This program is free software; you can redistribute it and/or modify
  4. it under the terms of the GNU General Public License as published by
  5. the Free Software Foundation; either version 2 of the License, or
  6. (at your option) any later version.
  7. This program is distributed in the hope that it will be useful,
  8. but WITHOUT ANY WARRANTY; without even the implied warranty of
  9. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  10. GNU General Public License for more details.
  11. You should have received a copy of the GNU General Public License
  12. along with this program; if not, write to the Free Software
  13. Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  14. */
  15. /** \file charsets.c
  16. * \brief Source: Text conversion from one charset to another
  17. */
  18. #include <config.h>
  19. #ifdef HAVE_CHARSET
  20. #include <stdio.h>
  21. #include <stdlib.h>
  22. #include <string.h>
  23. #include "lib/global.h"
  24. #include "lib/strutil.h" /* utf-8 functions */
  25. #include "lib/fileloc.h"
  26. #include "lib/charsets.h"
  27. #include "lib/mcconfig.h"
  28. #include "src/main.h" /* display_codepage, source_codepage */
  29. /*** global variables ****************************************************************************/
  30. GPtrArray *codepages = NULL;
  31. unsigned char conv_displ[256];
  32. unsigned char conv_input[256];
  33. const char *cp_display = NULL;
  34. const char *cp_source = NULL;
  35. /*** file scope macro definitions ****************************************************************/
  36. #define OTHER_8BIT "Other_8_bit"
  37. /*
  38. * FIXME: This assumes that ASCII is always the first encoding
  39. * in mc.charsets
  40. */
  41. #define CP_ASCII 0
  42. /*** file scope type declarations ****************************************************************/
  43. /*** file scope variables ************************************************************************/
  44. /*** file scope functions ************************************************************************/
  45. /* --------------------------------------------------------------------------------------------- */
  46. static codepage_desc *
  47. new_codepage_desc (const char *id, const char *name)
  48. {
  49. codepage_desc *desc;
  50. desc = g_new (codepage_desc, 1);
  51. desc->id = g_strdup (id);
  52. desc->name = g_strdup (name);
  53. return desc;
  54. }
  55. /* --------------------------------------------------------------------------------------------- */
  56. static void
  57. free_codepage_desc (gpointer data, gpointer user_data)
  58. {
  59. codepage_desc *desc = (codepage_desc *) data;
  60. (void) user_data;
  61. g_free (desc->id);
  62. g_free (desc->name);
  63. g_free (desc);
  64. }
  65. /* --------------------------------------------------------------------------------------------- */
  66. /* returns display codepage */
  67. static void
  68. load_codepages_list_from_file (GPtrArray ** list, const char *fname)
  69. {
  70. FILE *f;
  71. guint i;
  72. char buf[BUF_MEDIUM];
  73. char *default_codepage = NULL;
  74. f = fopen (fname, "r");
  75. if (f == NULL)
  76. return;
  77. for (i = 0; fgets (buf, sizeof buf, f) != NULL;)
  78. {
  79. /* split string into id and cpname */
  80. char *p = buf;
  81. size_t buflen = strlen (buf);
  82. if (*p == '\n' || *p == '\0' || *p == '#')
  83. continue;
  84. if (buflen > 0 && buf[buflen - 1] == '\n')
  85. buf[buflen - 1] = '\0';
  86. while (*p != '\t' && *p != ' ' && *p != '\0')
  87. ++p;
  88. if (*p == '\0')
  89. goto fail;
  90. *p++ = '\0';
  91. g_strstrip (p);
  92. if (*p == '\0')
  93. goto fail;
  94. if (strcmp (buf, "default") == 0)
  95. default_codepage = g_strdup (p);
  96. else
  97. {
  98. const char *id = buf;
  99. if (*list == NULL)
  100. {
  101. *list = g_ptr_array_sized_new (16);
  102. g_ptr_array_add (*list, new_codepage_desc (id, p));
  103. }
  104. else
  105. {
  106. /* whether id is already present in list */
  107. /* if yes, overwrite description */
  108. for (i = 0; i < (*list)->len; i++)
  109. {
  110. codepage_desc *desc;
  111. desc = (codepage_desc *) g_ptr_array_index (*list, i);
  112. if (strcmp (id, desc->id) == 0)
  113. {
  114. /* found */
  115. g_free (desc->name);
  116. desc->name = g_strdup (p);
  117. break;
  118. }
  119. }
  120. /* not found */
  121. if (i == (*list)->len)
  122. g_ptr_array_add (*list, new_codepage_desc (id, p));
  123. }
  124. }
  125. }
  126. if (default_codepage != NULL)
  127. {
  128. display_codepage = get_codepage_index (default_codepage);
  129. g_free (default_codepage);
  130. }
  131. fail:
  132. fclose (f);
  133. }
  134. /* --------------------------------------------------------------------------------------------- */
  135. static char
  136. translate_character (GIConv cd, char c)
  137. {
  138. gchar *tmp_buff = NULL;
  139. gsize bytes_read, bytes_written = 0;
  140. const char *ibuf = &c;
  141. char ch = UNKNCHAR;
  142. int ibuflen = 1;
  143. tmp_buff = g_convert_with_iconv (ibuf, ibuflen, cd, &bytes_read, &bytes_written, NULL);
  144. if (tmp_buff)
  145. ch = tmp_buff[0];
  146. g_free (tmp_buff);
  147. return ch;
  148. }
  149. /* --------------------------------------------------------------------------------------------- */
  150. /*** public functions ****************************************************************************/
  151. /* --------------------------------------------------------------------------------------------- */
  152. void
  153. load_codepages_list (void)
  154. {
  155. char *fname;
  156. /* 1: try load /usr/share/mc/mc.charsets */
  157. fname = g_build_filename (mc_share_data_dir, CHARSETS_LIST, (char *) NULL);
  158. load_codepages_list_from_file (&codepages, fname);
  159. g_free (fname);
  160. /* 2: try load /etc/mc/mc.charsets */
  161. fname = g_build_filename (mc_sysconfig_dir, CHARSETS_LIST, (char *) NULL);
  162. load_codepages_list_from_file (&codepages, fname);
  163. g_free (fname);
  164. if (codepages == NULL)
  165. {
  166. /* files are not found, add defaullt codepage */
  167. fprintf (stderr, "%s\n", _("Warning: cannot load codepages list"));
  168. codepages = g_ptr_array_new ();
  169. g_ptr_array_add (codepages, new_codepage_desc ("ASCII", _("7-bit ASCII")));
  170. }
  171. }
  172. /* --------------------------------------------------------------------------------------------- */
  173. void
  174. free_codepages_list (void)
  175. {
  176. g_ptr_array_foreach (codepages, free_codepage_desc, NULL);
  177. g_ptr_array_free (codepages, TRUE);
  178. }
  179. /* --------------------------------------------------------------------------------------------- */
  180. const char *
  181. get_codepage_id (const int n)
  182. {
  183. return (n < 0) ? OTHER_8BIT : ((codepage_desc *) g_ptr_array_index (codepages, n))->id;
  184. }
  185. /* --------------------------------------------------------------------------------------------- */
  186. int
  187. get_codepage_index (const char *id)
  188. {
  189. size_t i;
  190. if (strcmp (id, OTHER_8BIT) == 0)
  191. return -1;
  192. if (codepages == NULL)
  193. return -1;
  194. for (i = 0; i < codepages->len; i++)
  195. if (strcmp (id, ((codepage_desc *) g_ptr_array_index (codepages, i))->id) == 0)
  196. return i;
  197. return -1;
  198. }
  199. /* --------------------------------------------------------------------------------------------- */
  200. /** Check if specified encoding can be used in mc.
  201. * @param encoding name of encoding
  202. * @returns TRUE if encoding has supported by mc, FALSE otherwise
  203. */
  204. gboolean
  205. is_supported_encoding (const char *encoding)
  206. {
  207. gboolean result = FALSE;
  208. guint t;
  209. for (t = 0; t < codepages->len; t++)
  210. {
  211. const char *id = ((codepage_desc *) g_ptr_array_index (codepages, t))->id;
  212. result |= (g_ascii_strncasecmp (encoding, id, strlen (id)) == 0);
  213. }
  214. return result;
  215. }
  216. /* --------------------------------------------------------------------------------------------- */
  217. char *
  218. init_translation_table (int cpsource, int cpdisplay)
  219. {
  220. int i;
  221. GIConv cd;
  222. /* Fill inpit <-> display tables */
  223. if (cpsource < 0 || cpdisplay < 0 || cpsource == cpdisplay)
  224. {
  225. for (i = 0; i <= 255; ++i)
  226. {
  227. conv_displ[i] = i;
  228. conv_input[i] = i;
  229. cp_source = cp_display;
  230. }
  231. return NULL;
  232. }
  233. for (i = 0; i <= 127; ++i)
  234. {
  235. conv_displ[i] = i;
  236. conv_input[i] = i;
  237. }
  238. cp_source = ((codepage_desc *) g_ptr_array_index (codepages, cpsource))->id;
  239. cp_display = ((codepage_desc *) g_ptr_array_index (codepages, cpdisplay))->id;
  240. /* display <- inpit table */
  241. cd = g_iconv_open (cp_display, cp_source);
  242. if (cd == INVALID_CONV)
  243. return g_strdup_printf (_("Cannot translate from %s to %s"), cp_source, cp_display);
  244. for (i = 128; i <= 255; ++i)
  245. conv_displ[i] = translate_character (cd, i);
  246. g_iconv_close (cd);
  247. /* inpit <- display table */
  248. cd = g_iconv_open (cp_source, cp_display);
  249. if (cd == INVALID_CONV)
  250. return g_strdup_printf (_("Cannot translate from %s to %s"), cp_display, cp_source);
  251. for (i = 128; i <= 255; ++i)
  252. {
  253. unsigned char ch;
  254. ch = translate_character (cd, i);
  255. conv_input[i] = (ch == UNKNCHAR) ? i : ch;
  256. }
  257. g_iconv_close (cd);
  258. return NULL;
  259. }
  260. /* --------------------------------------------------------------------------------------------- */
  261. void
  262. convert_to_display (char *str)
  263. {
  264. if (!str)
  265. return;
  266. while (*str)
  267. {
  268. *str = conv_displ[(unsigned char) *str];
  269. str++;
  270. }
  271. }
  272. /* --------------------------------------------------------------------------------------------- */
  273. GString *
  274. str_convert_to_display (char *str)
  275. {
  276. return str_nconvert_to_display (str, -1);
  277. }
  278. /* --------------------------------------------------------------------------------------------- */
  279. GString *
  280. str_nconvert_to_display (char *str, int len)
  281. {
  282. GString *buff;
  283. GIConv conv;
  284. if (!str)
  285. return g_string_new ("");
  286. if (cp_display == cp_source)
  287. return g_string_new (str);
  288. conv = str_crt_conv_from (cp_source);
  289. buff = g_string_new ("");
  290. str_nconvert (conv, str, len, buff);
  291. str_close_conv (conv);
  292. return buff;
  293. }
  294. /* --------------------------------------------------------------------------------------------- */
  295. void
  296. convert_from_input (char *str)
  297. {
  298. if (!str)
  299. return;
  300. while (*str)
  301. {
  302. *str = conv_input[(unsigned char) *str];
  303. str++;
  304. }
  305. }
  306. /* --------------------------------------------------------------------------------------------- */
  307. GString *
  308. str_convert_to_input (char *str)
  309. {
  310. return str_nconvert_to_input (str, -1);
  311. }
  312. /* --------------------------------------------------------------------------------------------- */
  313. GString *
  314. str_nconvert_to_input (char *str, int len)
  315. {
  316. GString *buff;
  317. GIConv conv;
  318. if (!str)
  319. return g_string_new ("");
  320. if (cp_display == cp_source)
  321. return g_string_new (str);
  322. conv = str_crt_conv_to (cp_source);
  323. buff = g_string_new ("");
  324. str_nconvert (conv, str, len, buff);
  325. str_close_conv (conv);
  326. return buff;
  327. }
  328. /* --------------------------------------------------------------------------------------------- */
  329. unsigned char
  330. convert_from_utf_to_current (const char *str)
  331. {
  332. unsigned char buf_ch[6 + 1];
  333. unsigned char ch = '.';
  334. GIConv conv;
  335. const char *cp_to;
  336. if (!str)
  337. return '.';
  338. cp_to = get_codepage_id (source_codepage);
  339. conv = str_crt_conv_to (cp_to);
  340. if (conv != INVALID_CONV)
  341. {
  342. switch (str_translate_char (conv, str, -1, (char *) buf_ch, sizeof (buf_ch)))
  343. {
  344. case ESTR_SUCCESS:
  345. ch = buf_ch[0];
  346. break;
  347. case ESTR_PROBLEM:
  348. case ESTR_FAILURE:
  349. ch = '.';
  350. break;
  351. }
  352. str_close_conv (conv);
  353. }
  354. return ch;
  355. }
  356. /* --------------------------------------------------------------------------------------------- */
  357. unsigned char
  358. convert_from_utf_to_current_c (const int input_char, GIConv conv)
  359. {
  360. unsigned char str[6 + 1];
  361. unsigned char buf_ch[6 + 1];
  362. unsigned char ch = '.';
  363. int res = 0;
  364. res = g_unichar_to_utf8 (input_char, (char *) str);
  365. if (res == 0)
  366. {
  367. return ch;
  368. }
  369. str[res] = '\0';
  370. switch (str_translate_char (conv, (char *) str, -1, (char *) buf_ch, sizeof (buf_ch)))
  371. {
  372. case ESTR_SUCCESS:
  373. ch = buf_ch[0];
  374. break;
  375. case ESTR_PROBLEM:
  376. case ESTR_FAILURE:
  377. ch = '.';
  378. break;
  379. }
  380. return ch;
  381. }
  382. /* --------------------------------------------------------------------------------------------- */
  383. int
  384. convert_from_8bit_to_utf_c (const char input_char, GIConv conv)
  385. {
  386. unsigned char str[2];
  387. unsigned char buf_ch[6 + 1];
  388. int ch = '.';
  389. int res = 0;
  390. str[0] = (unsigned char) input_char;
  391. str[1] = '\0';
  392. switch (str_translate_char (conv, (char *) str, -1, (char *) buf_ch, sizeof (buf_ch)))
  393. {
  394. case ESTR_SUCCESS:
  395. res = g_utf8_get_char_validated ((char *) buf_ch, -1);
  396. if (res < 0)
  397. {
  398. ch = buf_ch[0];
  399. }
  400. else
  401. {
  402. ch = res;
  403. }
  404. break;
  405. case ESTR_PROBLEM:
  406. case ESTR_FAILURE:
  407. ch = '.';
  408. break;
  409. }
  410. return ch;
  411. }
  412. /* --------------------------------------------------------------------------------------------- */
  413. int
  414. convert_from_8bit_to_utf_c2 (const char input_char)
  415. {
  416. unsigned char str[2];
  417. unsigned char buf_ch[6 + 1];
  418. int ch = '.';
  419. int res = 0;
  420. GIConv conv;
  421. const char *cp_from;
  422. str[0] = (unsigned char) input_char;
  423. str[1] = '\0';
  424. cp_from = get_codepage_id (source_codepage);
  425. conv = str_crt_conv_to (cp_from);
  426. if (conv != INVALID_CONV)
  427. {
  428. switch (str_translate_char (conv, (char *) str, -1, (char *) buf_ch, sizeof (buf_ch)))
  429. {
  430. case ESTR_SUCCESS:
  431. res = g_utf8_get_char_validated ((char *) buf_ch, -1);
  432. if (res < 0)
  433. {
  434. ch = buf_ch[0];
  435. }
  436. else
  437. {
  438. ch = res;
  439. }
  440. break;
  441. case ESTR_PROBLEM:
  442. case ESTR_FAILURE:
  443. ch = '.';
  444. break;
  445. }
  446. str_close_conv (conv);
  447. }
  448. return ch;
  449. }
  450. /* --------------------------------------------------------------------------------------------- */
  451. #endif /* HAVE_CHARSET */