123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451 |
/* Charset conversion.
   Copyright (C) 2001-2007, 2010-2024 Free Software Foundation, Inc.
   Written by Bruno Haible and Simon Josefsson.

   This file is free software: you can redistribute it and/or modify
   it under the terms of the GNU Lesser General Public License as
   published by the Free Software Foundation; either version 2.1 of the
   License, or (at your option) any later version.

   This file is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public License
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
- #include <config.h>
- /* Specification. */
- #include "striconv.h"
- #include <errno.h>
- #include <stdlib.h>
- #include <string.h>
- #if HAVE_ICONV
- # include <iconv.h>
- /* Get MB_LEN_MAX, CHAR_BIT. */
- # include <limits.h>
- #endif
- #include "c-strcase.h"
- #ifndef SIZE_MAX
- # define SIZE_MAX ((size_t) -1)
- #endif
- #if HAVE_ICONV
/* Convert the SRCLEN bytes starting at SRC through the conversion
   descriptor CD.

   On entry, *RESULTP may be NULL or point to a caller-provided buffer of
   *LENGTHP bytes.  If that buffer is large enough for the converted
   output it is used; otherwise a new buffer is obtained with malloc()
   (the caller's buffer is neither written to nor freed).  On success the
   buffer actually used is stored in *RESULTP and the number of converted
   bytes in *LENGTHP.  If the converted output is empty, only *LENGTHP is
   set (to 0) and *RESULTP is left untouched.

   The conversion is done in two passes: the first converts into a
   scratch buffer merely to count the output bytes, the second converts
   into the final buffer of exactly that size.

   An incomplete multibyte sequence at the end of the input (EINVAL from
   iconv()) ends the conversion without being treated as an error.

   Return 0 if successful, otherwise -1 with errno set; in that case
   *RESULTP and *LENGTHP are unchanged.  */
int
mem_cd_iconv (const char *src, size_t srclen, iconv_t cd,
              char **resultp, size_t *lengthp)
{
# define tmpbufsize 4096
  size_t length;
  char *result;

  /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug.  */
# if defined _LIBICONV_VERSION \
     || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
          || defined __sun)
  /* Set to the initial state.  */
  iconv (cd, NULL, NULL, NULL, NULL);
# endif

  /* Determine the length we need.  */
  {
    size_t count = 0;
    /* The alignment is needed when converting e.g. to glibc's WCHAR_T or
       libiconv's UCS-4-INTERNAL encoding.  */
    union { unsigned int align; char buf[tmpbufsize]; } tmp;
# define tmpbuf tmp.buf
    const char *inptr = src;
    size_t insize = srclen;

    while (insize > 0)
      {
        /* Each round starts again at the beginning of the scratch buffer;
           only the number of bytes produced matters here.  */
        char *outptr = tmpbuf;
        size_t outsize = tmpbufsize;
        size_t res = iconv (cd,
                            (ICONV_CONST char **) &inptr, &insize,
                            &outptr, &outsize);

        if (res == (size_t)(-1))
          {
            if (errno == E2BIG)
              /* Scratch buffer full: count what was produced and go on.  */
              ;
            else if (errno == EINVAL)
              /* Incomplete input at the end: stop converting.  */
              break;
            else
              return -1;
          }
# if !defined _LIBICONV_VERSION && !(defined __GLIBC__ && !defined __UCLIBC__)
        /* Irix iconv() inserts a NUL byte if it cannot convert.
           NetBSD iconv() inserts a question mark if it cannot convert.
           Only GNU libiconv and GNU libc are known to prefer to fail rather
           than doing a lossy conversion.  */
        else if (res > 0)
          {
            errno = EILSEQ;
            return -1;
          }
# endif
        count += outptr - tmpbuf;
      }
    /* Avoid glibc-2.1 bug and Solaris 2.7 bug.  */
# if defined _LIBICONV_VERSION \
     || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
          || defined __sun)
    {
      /* Account for the bytes needed to return to the initial shift
         state.  */
      char *outptr = tmpbuf;
      size_t outsize = tmpbufsize;
      size_t res = iconv (cd, NULL, NULL, &outptr, &outsize);

      if (res == (size_t)(-1))
        return -1;
      count += outptr - tmpbuf;
    }
# endif
    length = count;
# undef tmpbuf
  }

  if (length == 0)
    {
      /* Nothing to store; *resultp is deliberately left as it was.  */
      *lengthp = 0;
      return 0;
    }
  if (*resultp != NULL && *lengthp >= length)
    /* The caller's buffer is large enough.  */
    result = *resultp;
  else
    {
      result = (char *) malloc (length);
      if (result == NULL)
        {
          errno = ENOMEM;
          return -1;
        }
    }

  /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug.  */
# if defined _LIBICONV_VERSION \
     || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
          || defined __sun)
  /* Return to the initial state.  */
  iconv (cd, NULL, NULL, NULL, NULL);
# endif

  /* Do the conversion for real.  */
  {
    const char *inptr = src;
    size_t insize = srclen;
    char *outptr = result;
    size_t outsize = length;

    while (insize > 0)
      {
        size_t res = iconv (cd,
                            (ICONV_CONST char **) &inptr, &insize,
                            &outptr, &outsize);

        if (res == (size_t)(-1))
          {
            if (errno == EINVAL)
              break;
            else
              /* E2BIG cannot legitimately happen here, since the buffer
                 was sized by the first pass; treat it like any error.  */
              goto fail;
          }
# if !defined _LIBICONV_VERSION && !(defined __GLIBC__ && !defined __UCLIBC__)
        /* Irix iconv() inserts a NUL byte if it cannot convert.
           NetBSD iconv() inserts a question mark if it cannot convert.
           Only GNU libiconv and GNU libc are known to prefer to fail rather
           than doing a lossy conversion.  */
        else if (res > 0)
          {
            errno = EILSEQ;
            goto fail;
          }
# endif
      }
    /* Avoid glibc-2.1 bug and Solaris 2.7 bug.  */
# if defined _LIBICONV_VERSION \
     || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
          || defined __sun)
    {
      size_t res = iconv (cd, NULL, NULL, &outptr, &outsize);

      if (res == (size_t)(-1))
        goto fail;
    }
# endif
    /* Both passes must have produced exactly the same number of bytes.  */
    if (outsize != 0)
      abort ();
  }

  *resultp = result;
  *lengthp = length;

  return 0;

 fail:
  {
    /* free() is not guaranteed to leave errno alone on every platform,
       so keep the error code that iconv() reported for the caller.  */
    int saved_errno = errno;

    /* Only release the buffer if we allocated it ourselves.  */
    if (result != *resultp)
      free (result);
    errno = saved_errno;
    return -1;
  }
# undef tmpbufsize
}
- char *
- str_cd_iconv (const char *src, iconv_t cd)
- {
- /* For most encodings, a trailing NUL byte in the input will be converted
- to a trailing NUL byte in the output. But not for UTF-7. So that this
- function is usable for UTF-7, we have to exclude the NUL byte from the
- conversion and add it by hand afterwards. */
- # if !defined _LIBICONV_VERSION && !(defined __GLIBC__ && !defined __UCLIBC__)
- /* Irix iconv() inserts a NUL byte if it cannot convert.
- NetBSD iconv() inserts a question mark if it cannot convert.
- Only GNU libiconv and GNU libc are known to prefer to fail rather
- than doing a lossy conversion. For other iconv() implementations,
- we have to look at the number of irreversible conversions returned;
- but this information is lost when iconv() returns for an E2BIG reason.
- Therefore we cannot use the second, faster algorithm. */
- char *result = NULL;
- size_t length = 0;
- int retval = mem_cd_iconv (src, strlen (src), cd, &result, &length);
- char *final_result;
- if (retval < 0)
- {
- if (result != NULL)
- abort ();
- return NULL;
- }
- /* Add the terminating NUL byte. */
- final_result =
- (result != NULL ? realloc (result, length + 1) : malloc (length + 1));
- if (final_result == NULL)
- {
- free (result);
- errno = ENOMEM;
- return NULL;
- }
- final_result[length] = '\0';
- return final_result;
- # else
- /* This algorithm is likely faster than the one above. But it may produce
- iconv() returns for an E2BIG reason, when the output size guess is too
- small. Therefore it can only be used when we don't need the number of
- irreversible conversions performed. */
- char *result;
- size_t result_size;
- size_t length;
- const char *inptr = src;
- size_t inbytes_remaining = strlen (src);
- /* Make a guess for the worst-case output size, in order to avoid a
- realloc. It's OK if the guess is wrong as long as it is not zero and
- doesn't lead to an integer overflow. */
- result_size = inbytes_remaining;
- {
- size_t approx_sqrt_SIZE_MAX = SIZE_MAX >> (sizeof (size_t) * CHAR_BIT / 2);
- if (result_size <= approx_sqrt_SIZE_MAX / MB_LEN_MAX)
- result_size *= MB_LEN_MAX;
- }
- result_size += 1; /* for the terminating NUL */
- result = (char *) malloc (result_size);
- if (result == NULL)
- {
- errno = ENOMEM;
- return NULL;
- }
- /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug. */
- # if defined _LIBICONV_VERSION \
- || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
- || defined __sun)
- /* Set to the initial state. */
- iconv (cd, NULL, NULL, NULL, NULL);
- # endif
- /* Do the conversion. */
- {
- char *outptr = result;
- size_t outbytes_remaining = result_size - 1;
- for (;;)
- {
- /* Here inptr + inbytes_remaining = src + strlen (src),
- outptr + outbytes_remaining = result + result_size - 1. */
- size_t res = iconv (cd,
- (ICONV_CONST char **) &inptr, &inbytes_remaining,
- &outptr, &outbytes_remaining);
- if (res == (size_t)(-1))
- {
- if (errno == EINVAL)
- break;
- else if (errno == E2BIG)
- {
- size_t used = outptr - result;
- size_t newsize = result_size * 2;
- char *newresult;
- if (!(newsize > result_size))
- {
- errno = ENOMEM;
- goto failed;
- }
- newresult = (char *) realloc (result, newsize);
- if (newresult == NULL)
- {
- errno = ENOMEM;
- goto failed;
- }
- result = newresult;
- result_size = newsize;
- outptr = result + used;
- outbytes_remaining = result_size - 1 - used;
- }
- else
- goto failed;
- }
- else
- break;
- }
- /* Avoid glibc-2.1 bug and Solaris 2.7 bug. */
- # if defined _LIBICONV_VERSION \
- || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
- || defined __sun)
- for (;;)
- {
- /* Here outptr + outbytes_remaining = result + result_size - 1. */
- size_t res = iconv (cd, NULL, NULL, &outptr, &outbytes_remaining);
- if (res == (size_t)(-1))
- {
- if (errno == E2BIG)
- {
- size_t used = outptr - result;
- size_t newsize = result_size * 2;
- char *newresult;
- if (!(newsize > result_size))
- {
- errno = ENOMEM;
- goto failed;
- }
- newresult = (char *) realloc (result, newsize);
- if (newresult == NULL)
- {
- errno = ENOMEM;
- goto failed;
- }
- result = newresult;
- result_size = newsize;
- outptr = result + used;
- outbytes_remaining = result_size - 1 - used;
- }
- else
- goto failed;
- }
- else
- break;
- }
- # endif
- /* Add the terminating NUL byte. */
- *outptr++ = '\0';
- length = outptr - result;
- }
- /* Give away unused memory. */
- if (length < result_size)
- {
- char *smaller_result = (char *) realloc (result, length);
- if (smaller_result != NULL)
- result = smaller_result;
- }
- return result;
- failed:
- free (result);
- return NULL;
- # endif
- }
- #endif
/* Convert the NUL-terminated string SRC from the encoding FROM_CODESET
   to the encoding TO_CODESET.

   If SRC is empty, or if the two codeset names are equal according to
   c_strcasecmp, no conversion is attempted and a copy of SRC made with
   strdup() is returned.

   Return the converted string, allocated with malloc(), or NULL with
   errno set.  When built without iconv support (HAVE_ICONV unset), any
   real conversion fails with errno set to ENOSYS.  */
char *
str_iconv (const char *src, const char *from_codeset, const char *to_codeset)
{
  if (*src == '\0' || c_strcasecmp (from_codeset, to_codeset) == 0)
    {
      char *result = strdup (src);

      if (result == NULL)
        errno = ENOMEM;
      return result;
    }
  else
    {
#if HAVE_ICONV
      iconv_t cd;
      char *result;

      /* Avoid glibc-2.1 bug with EUC-KR.  */
# if ((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
     && !defined _LIBICONV_VERSION
      if (c_strcasecmp (from_codeset, "EUC-KR") == 0
          || c_strcasecmp (to_codeset, "EUC-KR") == 0)
        {
          errno = EINVAL;
          return NULL;
        }
# endif
      cd = iconv_open (to_codeset, from_codeset);
      if (cd == (iconv_t) -1)
        /* errno has been set by iconv_open.  */
        return NULL;

      result = str_cd_iconv (src, cd);

      if (result == NULL)
        {
          /* Close cd, but preserve the errno from str_cd_iconv.  */
          int saved_errno = errno;
          iconv_close (cd);
          errno = saved_errno;
        }
      else
        {
          if (iconv_close (cd) < 0)
            {
              /* Return NULL, but free the allocated memory, and while
                 doing that, preserve the errno from iconv_close: free()
                 is not guaranteed to leave errno alone on every
                 platform.  */
              int saved_errno = errno;
              free (result);
              errno = saved_errno;
              return NULL;
            }
        }
      return result;
#else
      /* This is a different error code than if iconv_open existed but didn't
         support from_codeset and to_codeset, so that the caller can emit
         an error message such as
           "iconv() is not supported. Installing GNU libiconv and
            then reinstalling this package would fix this."  */
      errno = ENOSYS;
      return NULL;
#endif
    }
}
|