123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485 |
- /* $OpenLDAP$ */
- /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
- *
- * Copyright 1998-2022 The OpenLDAP Foundation.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted only as authorized by the OpenLDAP
- * Public License.
- *
- * A copy of this license is available in the file LICENSE in the
- * top-level directory of the distribution or, alternatively, at
- * <http://www.OpenLDAP.org/license.html>.
- */
- /* Portions Copyright (C) 1999, 2000 Novell, Inc. All Rights Reserved.
- *
- * THIS WORK IS SUBJECT TO U.S. AND INTERNATIONAL COPYRIGHT LAWS AND
- * TREATIES. USE, MODIFICATION, AND REDISTRIBUTION OF THIS WORK IS SUBJECT
- * TO VERSION 2.0.1 OF THE OPENLDAP PUBLIC LICENSE, A COPY OF WHICH IS
- * AVAILABLE AT HTTP://WWW.OPENLDAP.ORG/LICENSE.HTML OR IN THE FILE "LICENSE"
- * IN THE TOP-LEVEL DIRECTORY OF THE DISTRIBUTION. ANY USE OR EXPLOITATION
- * OF THIS WORK OTHER THAN AS AUTHORIZED IN VERSION 2.0.1 OF THE OPENLDAP
- * PUBLIC LICENSE, OR OTHER PRIOR WRITTEN CONSENT FROM NOVELL, COULD SUBJECT
- * THE PERPETRATOR TO CRIMINAL AND CIVIL LIABILITY.
- *---
- * Note: A verbatim copy of version 2.0.1 of the OpenLDAP Public License
- * can be found in the file "build/LICENSE-2.0.1" in this distribution
- * of OpenLDAP Software.
- */
- /*
- * UTF-8 Conversion Routines
- *
- * These routines convert between Wide Character and UTF-8,
- * or between MultiByte and UTF-8 encodings.
- *
- * Both single character and string versions of the functions are provided.
- * All functions return -1 if the character or string cannot be converted.
- */
- #include "portable.h"
- #if SIZEOF_WCHAR_T >= 4
- /* These routines assume ( sizeof(wchar_t) >= 4 ) */
#include <limits.h>		/* For MB_LEN_MAX */
#include <stdio.h>
#include <ac/stdlib.h>		/* For wctomb, wcstombs, mbtowc, mbstowcs */
#include <ac/string.h>
#include <ac/time.h>		/* for time_t */
#include "ldap-int.h"
#include <ldap_utf8.h>
/*
 * Mask selecting the payload bits of the first byte of a UTF-8 sequence,
 * indexed by the sequence length in bytes (1..6).  Entry 0 is a placeholder
 * so that the length can be used directly as the index.  The table is only
 * ever read, hence const.
 */
static const unsigned char mask[] = { 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
/*-----------------------------------------------------------------------------
					UTF-8 Format Summary

1-byte sequence (ASCII):	 7 bits
    0xxxxxxx

2-byte UTF-8 sequence:		11 bits
    110xxxxx  10xxxxxx

3-byte UTF-8 sequence:		16 bits
    1110xxxx  10xxxxxx  10xxxxxx

4-byte UTF-8 sequence:		21 bits
    11110xxx  10xxxxxx  10xxxxxx  10xxxxxx

5-byte UTF-8 sequence:		26 bits
    111110xx  10xxxxxx  10xxxxxx  10xxxxxx  10xxxxxx

6-byte UTF-8 sequence:		31 bits
    1111110x  10xxxxxx  10xxxxxx  10xxxxxx  10xxxxxx  10xxxxxx

Unicode address space   (0 - 0x10FFFF)		21 bits
ISO-10646 address space (0 - 0x7FFFFFFF)	31 bits

Note: This code does not prevent overlong UTF-8 sequences (sequences that
      use more bytes than necessary for the value) from being decoded.
*/
- /*-----------------------------------------------------------------------------
- Convert a UTF-8 character to a wide char.
- Return the length of the UTF-8 input character in bytes.
- */
- int
- ldap_x_utf8_to_wc ( wchar_t *wchar, const char *utf8char )
- {
- int utflen, i;
- wchar_t ch;
- if (utf8char == NULL) return -1;
- /* Get UTF-8 sequence length from 1st byte */
- utflen = LDAP_UTF8_CHARLEN2(utf8char, utflen);
-
- if( utflen==0 || utflen > (int)LDAP_MAX_UTF8_LEN ) return -1;
- /* First byte minus length tag */
- ch = (wchar_t)(utf8char[0] & mask[utflen]);
-
- for(i=1; i < utflen; i++) {
- /* Subsequent bytes must start with 10 */
- if ((utf8char[i] & 0xc0) != 0x80) return -1;
-
- ch <<= 6; /* 6 bits of data in each subsequent byte */
- ch |= (wchar_t)(utf8char[i] & 0x3f);
- }
-
- if (wchar) *wchar = ch;
- return utflen;
- }
- /*-----------------------------------------------------------------------------
- Convert a UTF-8 string to a wide char string.
- No more than 'count' wide chars will be written to the output buffer.
- Return the size of the converted string in wide chars, excl null terminator.
- */
- int
- ldap_x_utf8s_to_wcs ( wchar_t *wcstr, const char *utf8str, size_t count )
- {
- size_t wclen = 0;
- int utflen, i;
- wchar_t ch;
- /* If input ptr is NULL or empty... */
- if (utf8str == NULL || !*utf8str) {
- if ( wcstr )
- *wcstr = 0;
- return 0;
- }
- /* Examine next UTF-8 character. If output buffer is NULL, ignore count */
- while ( *utf8str && (wcstr==NULL || wclen<count) ) {
- /* Get UTF-8 sequence length from 1st byte */
- utflen = LDAP_UTF8_CHARLEN2(utf8str, utflen);
-
- if( utflen==0 || utflen > (int)LDAP_MAX_UTF8_LEN ) return -1;
- /* First byte minus length tag */
- ch = (wchar_t)(utf8str[0] & mask[utflen]);
-
- for(i=1; i < utflen; i++) {
- /* Subsequent bytes must start with 10 */
- if ((utf8str[i] & 0xc0) != 0x80) return -1;
-
- ch <<= 6; /* 6 bits of data in each subsequent byte */
- ch |= (wchar_t)(utf8str[i] & 0x3f);
- }
-
- if (wcstr) wcstr[wclen] = ch;
-
- utf8str += utflen; /* Move to next UTF-8 character */
- wclen++; /* Count number of wide chars stored/required */
- }
- /* Add null terminator if there's room in the buffer. */
- if (wcstr && wclen < count) wcstr[wclen] = 0;
- return wclen;
- }
/*-----------------------------------------------------------------------------
   Convert one wide char to a UTF-8 character.

   utf8char  Output buffer; may be NULL, in which case nothing is written,
             'count' is ignored and only the required length is returned.
   wchar     Value to encode; must lie in 0 .. 0x7FFFFFFF.
   count     Number of bytes available in 'utf8char'.

   Returns the length of the UTF-8 character in bytes (1..6), 0 if the
   character does not fit in 'count' bytes (nothing is written), or -1 if
   'wchar' is outside the encodable range.  No terminator is written.

   The range check is performed explicitly on both ends so that it also
   holds when wchar_t is an unsigned type: there 'wchar < 0' can never be
   true, and values of 0x80000000 and above must not be encoded, because
   they would produce the invalid lead bytes 0xFE / 0xFF.
*/
int
ldap_x_wc_to_utf8 ( char *utf8char, wchar_t wchar, size_t count )
{
	/* Length-tag bits of the first byte, indexed by sequence length. */
	static const unsigned char lead_tag[] =
		{ 0, 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc };
	int len, i, shift;

	/* Only 31-bit values can be represented in (at most) 6 bytes. */
	if ( wchar < 0 || wchar > (wchar_t)0x7fffffffL ) {
		return -1;
	}

	/* Number of bytes needed for this value. */
	if ( wchar < 0x80 ) {
		len = 1;
	} else if ( wchar < 0x800 ) {
		len = 2;
	} else if ( wchar < 0x10000 ) {
		len = 3;
	} else if ( wchar < 0x200000 ) {
		len = 4;
	} else if ( wchar < 0x4000000 ) {
		len = 5;
	} else {
		len = 6;
	}

	/* Caller only wants to know the required length. */
	if ( utf8char == NULL ) {
		return len;
	}

	/* Won't fit: write nothing and report 0 bytes. */
	if ( count < (size_t)len ) {
		return 0;
	}

	/* First byte: length tag plus the most significant payload bits. */
	shift = 6 * ( len - 1 );
	utf8char[0] = (char)( lead_tag[len] | ( wchar >> shift ) );

	/* Remaining bytes: 10xxxxxx, 6 payload bits each, high bits first. */
	for ( i = 1; i < len; i++ ) {
		shift -= 6;
		utf8char[i] = (char)( 0x80 | ( ( wchar >> shift ) & 0x3f ) );
	}

	return len;
}
- /*-----------------------------------------------------------------------------
- Convert a wide char string to a UTF-8 string.
- No more than 'count' bytes will be written to the output buffer.
- Return the # of bytes written to the output buffer, excl null terminator.
- */
- int
- ldap_x_wcs_to_utf8s ( char *utf8str, const wchar_t *wcstr, size_t count )
- {
- int len = 0;
- int n;
- char *p = utf8str;
- wchar_t empty = 0; /* To avoid use of L"" construct */
- if (wcstr == NULL) /* Treat input ptr NULL as an empty string */
- wcstr = ∅
- if (utf8str == NULL) /* Just compute size of output, excl null */
- {
- while (*wcstr)
- {
- /* Get UTF-8 size of next wide char */
- n = ldap_x_wc_to_utf8( NULL, *wcstr++, LDAP_MAX_UTF8_LEN);
- if (n == -1)
- return -1;
- len += n;
- }
- return len;
- }
-
- /* Do the actual conversion. */
- n = 1; /* In case of empty wcstr */
- while (*wcstr)
- {
- n = ldap_x_wc_to_utf8( p, *wcstr++, count);
-
- if (n <= 0) /* If encoding error (-1) or won't fit (0), quit */
- break;
-
- p += n;
- count -= n; /* Space left in output buffer */
- }
- /* If not enough room for last character, pad remainder with null
- so that return value = original count, indicating buffer full. */
- if (n == 0)
- {
- while (count--)
- *p++ = 0;
- }
- /* Add a null terminator if there's room. */
- else if (count)
- *p = 0;
- if (n == -1) /* Conversion encountered invalid wide char. */
- return -1;
- /* Return the number of bytes written to output buffer, excl null. */
- return (p - utf8str);
- }
#ifdef ANDROID
/*
 * Android-only definitions of wctomb() and mbtowc(), expressed in terms of
 * the restartable wcrtomb() / mbrtowc() with a NULL conversion-state
 * argument.  NOTE(review): presumably provided because the target C
 * library lacks these two functions -- confirm against the build.
 */
int
wctomb ( char *s, wchar_t wc )
{
	return wcrtomb( s, wc, NULL );
}

int
mbtowc ( wchar_t *pwc, const char *s, size_t n )
{
	return mbrtowc( pwc, s, n, NULL );
}
#endif /* ANDROID */
/*-----------------------------------------------------------------------------
   Convert a UTF-8 character to a MultiByte character.

   mbchar    Output buffer; may be NULL, in which case the conversion is
             performed into a private scratch buffer and only the size is
             reported.  A caller-supplied buffer must be large enough for
             whatever 'f_wctomb' writes; its size cannot be checked here.
   utf8char  Start of the UTF-8 character to convert.
   f_wctomb  Wide-char to multibyte converter; NULL selects wctomb().

   Returns whatever 'f_wctomb' returns (the size of the converted character
   in bytes), or -1 if 'utf8char' is not a valid UTF-8 character.
*/
int
ldap_x_utf8_to_mb ( char *mbchar, const char *utf8char,
		int (*f_wctomb)(char *mbchar, wchar_t wchar) )
{
	wchar_t wchar;
	/* MB_LEN_MAX is the C library's upper bound on the bytes of a single
	   multibyte character, so the probe call can never overrun this. */
	char scratch[MB_LEN_MAX];

	if ( f_wctomb == NULL ) {	/* If no conversion function was given... */
		f_wctomb = wctomb;	/* use the local ANSI C function */
	}

	/* First convert the UTF-8 char to a wide char. */
	if ( ldap_x_utf8_to_wc( &wchar, utf8char ) == -1 ) {
		return -1;		/* Invalid UTF-8 character */
	}

	/* Then convert the wide char, into the scratch buffer if the caller
	   gave no output buffer. */
	return f_wctomb( mbchar != NULL ? mbchar : scratch, wchar );
}
/*-----------------------------------------------------------------------------
   Convert a UTF-8 string to a MultiByte string, going through a temporary
   wide char string.

   mbstr       Output buffer, passed straight to 'f_wcstombs'.
   utf8str     Input string; NULL or empty yields an empty result.
   count       Output limit in bytes, passed straight to 'f_wcstombs'.
   f_wcstombs  Wide string to multibyte converter; NULL selects wcstombs().

   Returns the value of 'f_wcstombs' converted to int (the size of the
   converted string in bytes, excl null terminator), 0 for NULL/empty
   input, or -1 on allocation failure or invalid UTF-8.
*/
int
ldap_x_utf8s_to_mbs ( char *mbstr, const char *utf8str, size_t count,
	size_t (*f_wcstombs)(char *mbstr, const wchar_t *wcstr, size_t count) )
{
	wchar_t *wide;
	size_t capacity;
	int rc;

	if ( f_wcstombs == NULL ) {
		f_wcstombs = wcstombs;	/* fall back to the ANSI C function */
	}

	/* NULL or empty input: produce an empty output string. */
	if ( utf8str == NULL || *utf8str == 0 ) {
		if ( mbstr != NULL ) {
			*mbstr = 0;
		}
		return 0;
	}

	/* One wide char per input byte, plus the terminator, is the most the
	   intermediate string can ever need. */
	capacity = strlen( utf8str ) + 1;
	wide = (wchar_t *)LDAP_MALLOC( capacity * sizeof *wide );
	if ( wide == NULL ) {
		return -1;		/* memory allocation failure */
	}

	/* UTF-8 -> wide chars, then wide chars -> multibyte. */
	rc = ldap_x_utf8s_to_wcs( wide, utf8str, capacity );
	if ( rc != -1 ) {
		rc = f_wcstombs( mbstr, wide, count );
	}

	LDAP_FREE( wide );
	return rc;
}
- /*-----------------------------------------------------------------------------
- Convert a MultiByte character to a UTF-8 character.
- 'mbsize' indicates the number of bytes of 'mbchar' to check.
- Returns the number of bytes written to the output character.
- */
- int
- ldap_x_mb_to_utf8 ( char *utf8char, const char *mbchar, size_t mbsize,
- int (*f_mbtowc)(wchar_t *wchar, const char *mbchar, size_t count) )
- {
- wchar_t wchar;
- int n;
- if (f_mbtowc == NULL) /* If no conversion function was given... */
- f_mbtowc = mbtowc; /* use the local ANSI C function */
-
- if (mbsize == 0) /* 0 is not valid. */
- return -1;
- if (mbchar == NULL || *mbchar == 0)
- {
- if (utf8char)
- *utf8char = 0;
- return 1;
- }
- /* First convert the MB char to a Wide Char */
- n = f_mbtowc( &wchar, mbchar, mbsize);
- if (n == -1)
- return -1;
- /* Convert the Wide Char to a UTF-8 character. */
- n = ldap_x_wc_to_utf8( utf8char, wchar, LDAP_MAX_UTF8_LEN);
- return n;
- }
/*-----------------------------------------------------------------------------
   Convert a MultiByte string to a UTF-8 string, going through a temporary
   wide char string.

   utf8str     Output buffer, passed straight to ldap_x_wcs_to_utf8s.
   mbstr       Input string; NULL is treated as an empty string.
   count       Output limit in bytes, passed straight to ldap_x_wcs_to_utf8s.
   f_mbstowcs  Multibyte to wide string converter; NULL selects mbstowcs().

   Returns the result of ldap_x_wcs_to_utf8s (the size of the converted
   string in bytes, excl null terminator), or -1 on allocation failure or
   when 'f_mbstowcs' reports an error.
*/
int
ldap_x_mbs_to_utf8s ( char *utf8str, const char *mbstr, size_t count,
	size_t (*f_mbstowcs)(wchar_t *wcstr, const char *mbstr, size_t count) )
{
	wchar_t *wide;
	size_t capacity;
	int rc;

	if ( mbstr == NULL ) {
		mbstr = "";		/* NULL input behaves like an empty string */
	}

	if ( f_mbstowcs == NULL ) {
		f_mbstowcs = mbstowcs;	/* fall back to the ANSI C function */
	}

	/* One wide char per input byte, plus the terminator, is the most the
	   intermediate string can ever need. */
	capacity = strlen( mbstr ) + 1;
	wide = (wchar_t *)LDAP_MALLOC( capacity * sizeof *wide );
	if ( wide == NULL ) {
		return -1;
	}

	/* Multibyte -> wide chars, then wide chars -> UTF-8. */
	rc = f_mbstowcs( wide, mbstr, capacity );
	if ( rc != -1 ) {
		rc = ldap_x_wcs_to_utf8s( utf8str, wide, count );
	}

	LDAP_FREE( wide );
	return rc;
}
- #endif /* SIZEOF_WCHAR_T >= 4 */
|