charsets.c 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527
  1. /*
  2. Text conversion from one charset to another.
  3. Copyright (C) 2001-2024
  4. Free Software Foundation, Inc.
  5. Written by:
  6. Walery Studennikov <despair@sama.ru>
  7. This file is part of the Midnight Commander.
  8. The Midnight Commander is free software: you can redistribute it
  9. and/or modify it under the terms of the GNU General Public License as
  10. published by the Free Software Foundation, either version 3 of the License,
  11. or (at your option) any later version.
  12. The Midnight Commander is distributed in the hope that it will be useful,
  13. but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15. GNU General Public License for more details.
  16. You should have received a copy of the GNU General Public License
  17. along with this program. If not, see <http://www.gnu.org/licenses/>.
  18. */
  19. /** \file charsets.c
  20. * \brief Source: Text conversion from one charset to another
  21. */
  22. #include <config.h>
  23. #include <stdio.h>
  24. #include <stdlib.h>
  25. #include <string.h>
  26. #include "lib/global.h"
  27. #include "lib/strutil.h" /* utf-8 functions */
  28. #include "lib/fileloc.h"
  29. #include "lib/util.h" /* whitespace() */
  30. #include "lib/charsets.h"
  31. /*** global variables ****************************************************************************/
  /* List of known codepages; elements are codepage_desc pointers.
   * Filled by load_codepages_list() and released by free_codepages_list(). */
  GPtrArray *codepages = NULL;
  /* Byte translation table applied by convert_to_display();
   * filled by init_translation_table(). */
  unsigned char conv_displ[256];
  /* Byte translation table applied by convert_from_input();
   * filled by init_translation_table(). */
  unsigned char conv_input[256];
  /* Ids of the display and source codepages currently selected by
   * init_translation_table(); they point into entries of `codepages'. */
  const char *cp_display = NULL;
  const char *cp_source = NULL;
  37. /*** file scope macro definitions ****************************************************************/
  /* Value returned by translate_character() when a byte cannot be converted. */
  #define UNKNCHAR '\001'
  /* Pseudo codepage id: get_codepage_id() returns it for a negative index and
   * get_codepage_index() maps it back to -1. */
  #define OTHER_8BIT "Other_8_bit"
  40. /*** file scope type declarations ****************************************************************/
  41. /*** forward declarations (file scope functions) *************************************************/
  42. /*** file scope variables ************************************************************************/
  43. /* --------------------------------------------------------------------------------------------- */
  44. /*** file scope functions ************************************************************************/
  45. /* --------------------------------------------------------------------------------------------- */
  46. static codepage_desc *
  47. new_codepage_desc (const char *id, const char *name)
  48. {
  49. codepage_desc *desc;
  50. desc = g_new (codepage_desc, 1);
  51. desc->id = g_strdup (id);
  52. desc->name = g_strdup (name);
  53. return desc;
  54. }
  55. /* --------------------------------------------------------------------------------------------- */
  56. static void
  57. free_codepage_desc (gpointer data)
  58. {
  59. codepage_desc *desc = (codepage_desc *) data;
  60. g_free (desc->id);
  61. g_free (desc->name);
  62. g_free (desc);
  63. }
  64. /* --------------------------------------------------------------------------------------------- */
  65. /* returns display codepage */
  66. static void
  67. load_codepages_list_from_file (GPtrArray **list, const char *fname)
  68. {
  69. FILE *f;
  70. char buf[BUF_MEDIUM];
  71. char *default_codepage = NULL;
  72. f = fopen (fname, "r");
  73. if (f == NULL)
  74. return;
  75. while (fgets (buf, sizeof buf, f) != NULL)
  76. {
  77. /* split string into id and cpname */
  78. char *p = buf;
  79. size_t buflen;
  80. if (*p == '\n' || *p == '\0' || *p == '#')
  81. continue;
  82. buflen = strlen (buf);
  83. if (buflen != 0 && buf[buflen - 1] == '\n')
  84. buf[buflen - 1] = '\0';
  85. while (*p != '\0' && !whitespace (*p))
  86. ++p;
  87. if (*p == '\0')
  88. goto fail;
  89. *p++ = '\0';
  90. g_strstrip (p);
  91. if (*p == '\0')
  92. goto fail;
  93. if (strcmp (buf, "default") == 0)
  94. default_codepage = g_strdup (p);
  95. else
  96. {
  97. const char *id = buf;
  98. if (*list == NULL)
  99. {
  100. *list = g_ptr_array_new_full (16, free_codepage_desc);
  101. g_ptr_array_add (*list, new_codepage_desc (id, p));
  102. }
  103. else
  104. {
  105. unsigned int i;
  106. /* whether id is already present in list */
  107. /* if yes, overwrite description */
  108. for (i = 0; i < (*list)->len; i++)
  109. {
  110. codepage_desc *desc;
  111. desc = (codepage_desc *) g_ptr_array_index (*list, i);
  112. if (strcmp (id, desc->id) == 0)
  113. {
  114. /* found */
  115. g_free (desc->name);
  116. desc->name = g_strdup (p);
  117. break;
  118. }
  119. }
  120. /* not found */
  121. if (i == (*list)->len)
  122. g_ptr_array_add (*list, new_codepage_desc (id, p));
  123. }
  124. }
  125. }
  126. if (default_codepage != NULL)
  127. {
  128. mc_global.display_codepage = get_codepage_index (default_codepage);
  129. g_free (default_codepage);
  130. }
  131. fail:
  132. fclose (f);
  133. }
  134. /* --------------------------------------------------------------------------------------------- */
  135. static char
  136. translate_character (GIConv cd, char c)
  137. {
  138. gchar *tmp_buff = NULL;
  139. gsize bytes_read, bytes_written = 0;
  140. const char *ibuf = &c;
  141. char ch = UNKNCHAR;
  142. int ibuflen = 1;
  143. tmp_buff = g_convert_with_iconv (ibuf, ibuflen, cd, &bytes_read, &bytes_written, NULL);
  144. if (tmp_buff != NULL)
  145. ch = tmp_buff[0];
  146. g_free (tmp_buff);
  147. return ch;
  148. }
  149. /* --------------------------------------------------------------------------------------------- */
  150. /*** public functions ****************************************************************************/
  151. /* --------------------------------------------------------------------------------------------- */
  152. void
  153. load_codepages_list (void)
  154. {
  155. char *fname;
  156. /* 1: try load /usr/share/mc/mc.charsets */
  157. fname = g_build_filename (mc_global.share_data_dir, CHARSETS_LIST, (char *) NULL);
  158. load_codepages_list_from_file (&codepages, fname);
  159. g_free (fname);
  160. /* 2: try load /etc/mc/mc.charsets */
  161. fname = g_build_filename (mc_global.sysconfig_dir, CHARSETS_LIST, (char *) NULL);
  162. load_codepages_list_from_file (&codepages, fname);
  163. g_free (fname);
  164. if (codepages == NULL)
  165. {
  166. /* files are not found, add default codepage */
  167. fprintf (stderr, "%s\n", _("Warning: cannot load codepages list"));
  168. codepages = g_ptr_array_new_with_free_func (free_codepage_desc);
  169. g_ptr_array_add (codepages, new_codepage_desc (DEFAULT_CHARSET, _("7-bit ASCII")));
  170. }
  171. }
  172. /* --------------------------------------------------------------------------------------------- */
  173. void
  174. free_codepages_list (void)
  175. {
  176. g_ptr_array_free (codepages, TRUE);
  177. /* NULL-ize pointer to make unit tests happy */
  178. codepages = NULL;
  179. }
  180. /* --------------------------------------------------------------------------------------------- */
  181. const char *
  182. get_codepage_id (const int n)
  183. {
  184. return (n < 0) ? OTHER_8BIT : ((codepage_desc *) g_ptr_array_index (codepages, n))->id;
  185. }
  186. /* --------------------------------------------------------------------------------------------- */
  187. int
  188. get_codepage_index (const char *id)
  189. {
  190. size_t i;
  191. if (codepages == NULL)
  192. return -1;
  193. if (strcmp (id, OTHER_8BIT) == 0)
  194. return -1;
  195. for (i = 0; i < codepages->len; i++)
  196. if (strcmp (id, ((codepage_desc *) g_ptr_array_index (codepages, i))->id) == 0)
  197. return i;
  198. return -1;
  199. }
  200. /* --------------------------------------------------------------------------------------------- */
  201. /** Check if specified encoding can be used in mc.
  202. * @param encoding name of encoding
  203. * @return TRUE if encoding is supported by mc, FALSE otherwise
  204. */
  205. gboolean
  206. is_supported_encoding (const char *encoding)
  207. {
  208. gboolean result = FALSE;
  209. guint t;
  210. for (t = 0; t < codepages->len; t++)
  211. {
  212. const char *id;
  213. id = ((codepage_desc *) g_ptr_array_index (codepages, t))->id;
  214. result |= (g_ascii_strncasecmp (encoding, id, strlen (id)) == 0);
  215. }
  216. return result;
  217. }
  218. /* --------------------------------------------------------------------------------------------- */
  219. char *
  220. init_translation_table (int cpsource, int cpdisplay)
  221. {
  222. int i;
  223. GIConv cd;
  224. /* Fill inpit <-> display tables */
  225. if (cpsource < 0 || cpdisplay < 0 || cpsource == cpdisplay)
  226. {
  227. for (i = 0; i <= 255; ++i)
  228. {
  229. conv_displ[i] = i;
  230. conv_input[i] = i;
  231. }
  232. cp_source = cp_display;
  233. return NULL;
  234. }
  235. for (i = 0; i <= 127; ++i)
  236. {
  237. conv_displ[i] = i;
  238. conv_input[i] = i;
  239. }
  240. cp_source = ((codepage_desc *) g_ptr_array_index (codepages, cpsource))->id;
  241. cp_display = ((codepage_desc *) g_ptr_array_index (codepages, cpdisplay))->id;
  242. /* display <- inpit table */
  243. cd = g_iconv_open (cp_display, cp_source);
  244. if (cd == INVALID_CONV)
  245. return g_strdup_printf (_("Cannot translate from %s to %s"), cp_source, cp_display);
  246. for (i = 128; i <= 255; ++i)
  247. conv_displ[i] = translate_character (cd, i);
  248. g_iconv_close (cd);
  249. /* inpit <- display table */
  250. cd = g_iconv_open (cp_source, cp_display);
  251. if (cd == INVALID_CONV)
  252. return g_strdup_printf (_("Cannot translate from %s to %s"), cp_display, cp_source);
  253. for (i = 128; i <= 255; ++i)
  254. {
  255. unsigned char ch;
  256. ch = translate_character (cd, i);
  257. conv_input[i] = (ch == UNKNCHAR) ? i : ch;
  258. }
  259. g_iconv_close (cd);
  260. return NULL;
  261. }
  262. /* --------------------------------------------------------------------------------------------- */
  263. void
  264. convert_to_display (char *str)
  265. {
  266. if (str != NULL)
  267. for (; *str != '\0'; str++)
  268. *str = conv_displ[(unsigned char) *str];
  269. }
  270. /* --------------------------------------------------------------------------------------------- */
  271. GString *
  272. str_nconvert_to_display (const char *str, int len)
  273. {
  274. GString *buff;
  275. GIConv conv;
  276. if (str == NULL)
  277. return NULL;
  278. if (cp_display == cp_source)
  279. return g_string_new (str);
  280. conv = str_crt_conv_from (cp_source);
  281. buff = g_string_new ("");
  282. str_nconvert (conv, str, len, buff);
  283. str_close_conv (conv);
  284. return buff;
  285. }
  286. /* --------------------------------------------------------------------------------------------- */
  287. void
  288. convert_from_input (char *str)
  289. {
  290. if (str != NULL)
  291. for (; *str != '\0'; str++)
  292. *str = conv_input[(unsigned char) *str];
  293. }
  294. /* --------------------------------------------------------------------------------------------- */
  295. GString *
  296. str_nconvert_to_input (const char *str, int len)
  297. {
  298. GString *buff;
  299. GIConv conv;
  300. if (str == NULL)
  301. return NULL;
  302. if (cp_display == cp_source)
  303. return g_string_new (str);
  304. conv = str_crt_conv_to (cp_source);
  305. buff = g_string_new ("");
  306. str_nconvert (conv, str, len, buff);
  307. str_close_conv (conv);
  308. return buff;
  309. }
  310. /* --------------------------------------------------------------------------------------------- */
  311. unsigned char
  312. convert_from_utf_to_current (const char *str)
  313. {
  314. unsigned char buf_ch[UTF8_CHAR_LEN + 1];
  315. unsigned char ch = '.';
  316. GIConv conv;
  317. const char *cp_to;
  318. if (str == NULL)
  319. return '.';
  320. cp_to = get_codepage_id (mc_global.source_codepage);
  321. conv = str_crt_conv_to (cp_to);
  322. if (conv != INVALID_CONV)
  323. {
  324. switch (str_translate_char (conv, str, -1, (char *) buf_ch, sizeof (buf_ch)))
  325. {
  326. case ESTR_SUCCESS:
  327. ch = buf_ch[0];
  328. break;
  329. case ESTR_PROBLEM:
  330. case ESTR_FAILURE:
  331. ch = '.';
  332. break;
  333. default:
  334. break;
  335. }
  336. str_close_conv (conv);
  337. }
  338. return ch;
  339. }
  340. /* --------------------------------------------------------------------------------------------- */
  341. unsigned char
  342. convert_from_utf_to_current_c (int input_char, GIConv conv)
  343. {
  344. unsigned char str[UTF8_CHAR_LEN + 1];
  345. unsigned char buf_ch[UTF8_CHAR_LEN + 1];
  346. unsigned char ch = '.';
  347. int res;
  348. res = g_unichar_to_utf8 (input_char, (char *) str);
  349. if (res == 0)
  350. return ch;
  351. str[res] = '\0';
  352. switch (str_translate_char (conv, (char *) str, -1, (char *) buf_ch, sizeof (buf_ch)))
  353. {
  354. case ESTR_SUCCESS:
  355. ch = buf_ch[0];
  356. break;
  357. case ESTR_PROBLEM:
  358. case ESTR_FAILURE:
  359. ch = '.';
  360. break;
  361. default:
  362. break;
  363. }
  364. return ch;
  365. }
  366. /* --------------------------------------------------------------------------------------------- */
  367. int
  368. convert_from_8bit_to_utf_c (char input_char, GIConv conv)
  369. {
  370. unsigned char str[2];
  371. unsigned char buf_ch[UTF8_CHAR_LEN + 1];
  372. int ch;
  373. str[0] = (unsigned char) input_char;
  374. str[1] = '\0';
  375. switch (str_translate_char (conv, (char *) str, -1, (char *) buf_ch, sizeof (buf_ch)))
  376. {
  377. case ESTR_SUCCESS:
  378. {
  379. int res;
  380. res = g_utf8_get_char_validated ((char *) buf_ch, -1);
  381. ch = res >= 0 ? res : buf_ch[0];
  382. break;
  383. }
  384. case ESTR_PROBLEM:
  385. case ESTR_FAILURE:
  386. default:
  387. ch = '.';
  388. break;
  389. }
  390. return ch;
  391. }
  392. /* --------------------------------------------------------------------------------------------- */
  393. int
  394. convert_from_8bit_to_utf_c2 (char input_char)
  395. {
  396. int ch = '.';
  397. GIConv conv;
  398. const char *cp_from;
  399. cp_from = get_codepage_id (mc_global.source_codepage);
  400. conv = str_crt_conv_to (cp_from);
  401. if (conv != INVALID_CONV)
  402. {
  403. ch = convert_from_8bit_to_utf_c (input_char, conv);
  404. str_close_conv (conv);
  405. }
  406. return ch;
  407. }
  408. /* --------------------------------------------------------------------------------------------- */