striconv.c 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451
/* Charset conversion.
   Copyright (C) 2001-2007, 2010-2024 Free Software Foundation, Inc.
   Written by Bruno Haible and Simon Josefsson.
   This file is free software: you can redistribute it and/or modify
   it under the terms of the GNU Lesser General Public License as
   published by the Free Software Foundation; either version 2.1 of the
   License, or (at your option) any later version.
   This file is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU Lesser General Public License for more details.
   You should have received a copy of the GNU Lesser General Public License
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
  14. #include <config.h>
  15. /* Specification. */
  16. #include "striconv.h"
  17. #include <errno.h>
  18. #include <stdlib.h>
  19. #include <string.h>
  20. #if HAVE_ICONV
  21. # include <iconv.h>
  22. /* Get MB_LEN_MAX, CHAR_BIT. */
  23. # include <limits.h>
  24. #endif
  25. #include "c-strcase.h"
  26. #ifndef SIZE_MAX
  27. # define SIZE_MAX ((size_t) -1)
  28. #endif
  29. #if HAVE_ICONV
  /* Converts the SRCLEN bytes starting at SRC through the iconv conversion
     descriptor CD.

     The input is run through iconv() twice: a first pass into a scratch
     buffer only measures the output length, a second pass writes into the
     final buffer.

     *RESULTP and *LENGTHP are in/out parameters.  If *RESULTP is non-NULL
     and *LENGTHP is at least the required length, the output is written
     into that buffer; otherwise a buffer of exactly the required length is
     obtained from malloc().  On success *RESULTP and *LENGTHP are set to the
     buffer used and the number of bytes written.  If the output is empty,
     only *LENGTHP is set (to 0) and *RESULTP is left untouched.

     Returns 0 on success.  Returns -1 with errno set on failure: ENOMEM if
     malloc() fails, EILSEQ if a non-GNU iconv() reports a lossy conversion,
     otherwise the errno left by iconv().  On failure the values of *RESULTP
     and *LENGTHP are not modified.  An EINVAL from iconv() is not treated as
     a failure; it merely ends the conversion loop.  */
  int
  mem_cd_iconv (const char *src, size_t srclen, iconv_t cd,
                char **resultp, size_t *lengthp)
  {
  # define tmpbufsize 4096
    size_t length;
    char *result;

    /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug.  */
  # if defined _LIBICONV_VERSION \
       || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
            || defined __sun)
    /* Set to the initial state.  */
    iconv (cd, NULL, NULL, NULL, NULL);
  # endif

    /* Determine the length we need.  */
    {
      size_t count = 0;
      /* The alignment is needed when converting e.g. to glibc's WCHAR_T or
         libiconv's UCS-4-INTERNAL encoding.  */
      union { unsigned int align; char buf[tmpbufsize]; } tmp;
  # define tmpbuf tmp.buf
      const char *inptr = src;
      size_t insize = srclen;

      while (insize > 0)
        {
          /* Each round overwrites the scratch buffer from its start; only
             the number of bytes produced is of interest here.  */
          char *outptr = tmpbuf;
          size_t outsize = tmpbufsize;
          size_t res = iconv (cd,
                              (ICONV_CONST char **) &inptr, &insize,
                              &outptr, &outsize);

          if (res == (size_t)(-1))
            {
              if (errno == E2BIG)
                /* Scratch buffer full: count what was produced and go on.  */
                ;
              else if (errno == EINVAL)
                break;
              else
                return -1;
            }
  # if !defined _LIBICONV_VERSION && !(defined __GLIBC__ && !defined __UCLIBC__)
          /* Irix iconv() inserts a NUL byte if it cannot convert.
             NetBSD iconv() inserts a question mark if it cannot convert.
             Only GNU libiconv and GNU libc are known to prefer to fail rather
             than doing a lossy conversion.  */
          else if (res > 0)
            {
              errno = EILSEQ;
              return -1;
            }
  # endif
          count += outptr - tmpbuf;
        }
      /* Avoid glibc-2.1 bug and Solaris 2.7 bug.  */
  # if defined _LIBICONV_VERSION \
       || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
            || defined __sun)
      {
        /* Flush the conversion state; whatever this emits is part of the
           output and must be counted too.  */
        char *outptr = tmpbuf;
        size_t outsize = tmpbufsize;
        size_t res = iconv (cd, NULL, NULL, &outptr, &outsize);

        if (res == (size_t)(-1))
          return -1;
        count += outptr - tmpbuf;
      }
  # endif
      length = count;
  # undef tmpbuf
    }

    if (length == 0)
      {
        *lengthp = 0;
        return 0;
      }
    /* Reuse the caller's buffer when it is large enough.  */
    if (*resultp != NULL && *lengthp >= length)
      result = *resultp;
    else
      {
        result = (char *) malloc (length);
        if (result == NULL)
          {
            errno = ENOMEM;
            return -1;
          }
      }

    /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug.  */
  # if defined _LIBICONV_VERSION \
       || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
            || defined __sun)
    /* Return to the initial state.  */
    iconv (cd, NULL, NULL, NULL, NULL);
  # endif

    /* Do the conversion for real.  */
    {
      const char *inptr = src;
      size_t insize = srclen;
      char *outptr = result;
      size_t outsize = length;

      while (insize > 0)
        {
          size_t res = iconv (cd,
                              (ICONV_CONST char **) &inptr, &insize,
                              &outptr, &outsize);

          if (res == (size_t)(-1))
            {
              if (errno == EINVAL)
                break;
              else
                /* This includes E2BIG: the buffer was sized by the first
                   pass, so running out of room here is an error.  */
                goto fail;
            }
  # if !defined _LIBICONV_VERSION && !(defined __GLIBC__ && !defined __UCLIBC__)
          /* Irix iconv() inserts a NUL byte if it cannot convert.
             NetBSD iconv() inserts a question mark if it cannot convert.
             Only GNU libiconv and GNU libc are known to prefer to fail rather
             than doing a lossy conversion.  */
          else if (res > 0)
            {
              errno = EILSEQ;
              goto fail;
            }
  # endif
        }
      /* Avoid glibc-2.1 bug and Solaris 2.7 bug.  */
  # if defined _LIBICONV_VERSION \
       || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
            || defined __sun)
      {
        size_t res = iconv (cd, NULL, NULL, &outptr, &outsize);

        if (res == (size_t)(-1))
          goto fail;
      }
  # endif
      /* Both passes must have produced the same number of bytes.  */
      if (outsize != 0)
        abort ();
    }

    *resultp = result;
    *lengthp = length;

    return 0;

   fail:
    {
      /* Only POSIX.1-2024 guarantees that free() leaves errno alone; save and
         restore it so that the caller always sees the reason for the
         failure.  */
      int saved_errno = errno;

      /* Release the buffer only if it was allocated here, not if it is the
         caller's.  */
      if (result != *resultp)
        free (result);
      errno = saved_errno;
      return -1;
    }
  # undef tmpbufsize
  }
  /* Converts the NUL-terminated string SRC through the iconv conversion
     descriptor CD and returns the converted bytes followed by a NUL byte.

     The returned buffer is obtained from malloc()/realloc(); it is not
     released anywhere in this function, so presumably the caller is expected
     to free() it.

     Returns NULL with errno set on failure: ENOMEM if an allocation fails or
     the output buffer size would overflow, otherwise the errno left by
     iconv() (or by mem_cd_iconv() in the first variant below).  */
  char *
  str_cd_iconv (const char *src, iconv_t cd)
  {
    /* For most encodings, a trailing NUL byte in the input will be converted
       to a trailing NUL byte in the output.  But not for UTF-7.  So that this
       function is usable for UTF-7, we have to exclude the NUL byte from the
       conversion and add it by hand afterwards.  */

  # if !defined _LIBICONV_VERSION && !(defined __GLIBC__ && !defined __UCLIBC__)
    /* Irix iconv() inserts a NUL byte if it cannot convert.
       NetBSD iconv() inserts a question mark if it cannot convert.
       Only GNU libiconv and GNU libc are known to prefer to fail rather
       than doing a lossy conversion.  For other iconv() implementations,
       we have to look at the number of irreversible conversions returned;
       but this information is lost when iconv() returns for an E2BIG reason.
       Therefore we cannot use the second, faster algorithm.  */

    char *result = NULL;
    size_t length = 0;
    int retval = mem_cd_iconv (src, strlen (src), cd, &result, &length);
    char *final_result;

    if (retval < 0)
      {
        /* mem_cd_iconv must not hand back a buffer when it fails.  */
        if (result != NULL)
          abort ();
        return NULL;
      }

    /* Add the terminating NUL byte.  RESULT is still NULL when the converted
       output is empty, hence the malloc alternative.  */
    final_result =
      (result != NULL ? realloc (result, length + 1) : malloc (length + 1));
    if (final_result == NULL)
      {
        free (result);
        errno = ENOMEM;
        return NULL;
      }
    final_result[length] = '\0';

    return final_result;

  # else
    /* This algorithm is likely faster than the one above.  But it may produce
       iconv() returns for an E2BIG reason, when the output size guess is too
       small.  Therefore it can only be used when we don't need the number of
       irreversible conversions performed.  */
    char *result;
    size_t result_size;
    size_t length;
    const char *inptr = src;
    size_t inbytes_remaining = strlen (src);

    /* Make a guess for the worst-case output size, in order to avoid a
       realloc.  It's OK if the guess is wrong as long as it is not zero and
       doesn't lead to an integer overflow.  */
    result_size = inbytes_remaining;
    {
      size_t approx_sqrt_SIZE_MAX = SIZE_MAX >> (sizeof (size_t) * CHAR_BIT / 2);
      if (result_size <= approx_sqrt_SIZE_MAX / MB_LEN_MAX)
        result_size *= MB_LEN_MAX;
    }
    result_size += 1; /* for the terminating NUL */

    result = (char *) malloc (result_size);
    if (result == NULL)
      {
        errno = ENOMEM;
        return NULL;
      }

    /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug.  */
  # if defined _LIBICONV_VERSION \
       || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
            || defined __sun)
    /* Set to the initial state.  */
    iconv (cd, NULL, NULL, NULL, NULL);
  # endif

    /* Do the conversion.  */
    {
      char *outptr = result;
      /* One byte is always held back for the terminating NUL.  */
      size_t outbytes_remaining = result_size - 1;

      for (;;)
        {
          /* Here inptr + inbytes_remaining = src + strlen (src),
                  outptr + outbytes_remaining = result + result_size - 1.  */
          size_t res = iconv (cd,
                              (ICONV_CONST char **) &inptr, &inbytes_remaining,
                              &outptr, &outbytes_remaining);

          if (res == (size_t)(-1))
            {
              if (errno == EINVAL)
                break;
              else if (errno == E2BIG)
                {
                  /* Double the buffer and resume where iconv() stopped.  */
                  size_t used = outptr - result;
                  size_t newsize = result_size * 2;
                  char *newresult;

                  /* Detect wrap-around of the doubled size.  */
                  if (!(newsize > result_size))
                    {
                      errno = ENOMEM;
                      goto failed;
                    }
                  newresult = (char *) realloc (result, newsize);
                  if (newresult == NULL)
                    {
                      errno = ENOMEM;
                      goto failed;
                    }
                  result = newresult;
                  result_size = newsize;
                  outptr = result + used;
                  outbytes_remaining = result_size - 1 - used;
                }
              else
                goto failed;
            }
          else
            break;
        }
      /* Avoid glibc-2.1 bug and Solaris 2.7 bug.  */
  # if defined _LIBICONV_VERSION \
       || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
            || defined __sun)
      for (;;)
        {
          /* Here outptr + outbytes_remaining = result + result_size - 1.  */
          size_t res = iconv (cd, NULL, NULL, &outptr, &outbytes_remaining);

          if (res == (size_t)(-1))
            {
              if (errno == E2BIG)
                {
                  /* Same growth step as above, for the final flush.  */
                  size_t used = outptr - result;
                  size_t newsize = result_size * 2;
                  char *newresult;

                  if (!(newsize > result_size))
                    {
                      errno = ENOMEM;
                      goto failed;
                    }
                  newresult = (char *) realloc (result, newsize);
                  if (newresult == NULL)
                    {
                      errno = ENOMEM;
                      goto failed;
                    }
                  result = newresult;
                  result_size = newsize;
                  outptr = result + used;
                  outbytes_remaining = result_size - 1 - used;
                }
              else
                goto failed;
            }
          else
            break;
        }
  # endif

      /* Add the terminating NUL byte.  */
      *outptr++ = '\0';

      /* LENGTH includes the terminating NUL.  */
      length = outptr - result;
    }

    /* Give away unused memory.  */
    if (length < result_size)
      {
        char *smaller_result = (char *) realloc (result, length);

        /* If shrinking fails, the larger buffer is still valid; keep it.  */
        if (smaller_result != NULL)
          result = smaller_result;
      }

    return result;

   failed:
    {
      /* Only POSIX.1-2024 guarantees that free() leaves errno alone; save and
         restore it so that the caller always sees the reason for the
         failure.  */
      int saved_errno = errno;

      free (result);
      errno = saved_errno;
      return NULL;
    }

  # endif
  }
  341. #endif
  /* Converts the NUL-terminated string SRC from encoding FROM_CODESET to
     encoding TO_CODESET.

     If SRC is empty or the two codeset names are equal ignoring case (per
     c_strcasecmp), the result is simply a strdup() copy of SRC.  Otherwise
     a conversion descriptor is opened, SRC is converted with str_cd_iconv(),
     and the descriptor is closed again.

     Returns the converted string; it comes from strdup() or str_cd_iconv()
     and is not released here, so presumably the caller is expected to free()
     it.  Returns NULL with errno set on failure:
       ENOMEM  if strdup() fails,
       EINVAL  for EUC-KR on glibc 2.1 (worked around, see below),
       ENOSYS  if the package was built without iconv support,
       or the errno left by iconv_open(), str_cd_iconv() or iconv_close().  */
  char *
  str_iconv (const char *src, const char *from_codeset, const char *to_codeset)
  {
    if (*src == '\0' || c_strcasecmp (from_codeset, to_codeset) == 0)
      {
        /* Nothing to convert: hand back a copy.  */
        char *result = strdup (src);

        if (result == NULL)
          errno = ENOMEM;

        return result;
      }
    else
      {
  #if HAVE_ICONV
        iconv_t cd;
        char *result;

        /* Avoid glibc-2.1 bug with EUC-KR.  */
  # if ((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
       && !defined _LIBICONV_VERSION
        if (c_strcasecmp (from_codeset, "EUC-KR") == 0
            || c_strcasecmp (to_codeset, "EUC-KR") == 0)
          {
            errno = EINVAL;
            return NULL;
          }
  # endif
        cd = iconv_open (to_codeset, from_codeset);
        if (cd == (iconv_t) -1)
          return NULL;

        result = str_cd_iconv (src, cd);

        if (result == NULL)
          {
            /* Close cd, but preserve the errno from str_cd_iconv.  */
            int saved_errno = errno;
            iconv_close (cd);
            errno = saved_errno;
          }
        else
          {
            if (iconv_close (cd) < 0)
              {
                /* Return NULL, but free the allocated memory, and while
                   doing that, preserve the errno from iconv_close: only
                   POSIX.1-2024 guarantees that free() leaves errno alone.  */
                int saved_errno = errno;
                free (result);
                errno = saved_errno;
                return NULL;
              }
          }
        return result;
  #else
        /* This is a different error code than if iconv_open existed but didn't
           support from_codeset and to_codeset, so that the caller can emit
           an error message such as
             "iconv() is not supported. Installing GNU libiconv and
              then reinstalling this package would fix this."  */
        errno = ENOSYS;
        return NULL;
  #endif
      }
  }