utf-8-conv.c 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485
  1. /* $OpenLDAP$ */
  2. /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
  3. *
  4. * Copyright 1998-2022 The OpenLDAP Foundation.
  5. * All rights reserved.
  6. *
  7. * Redistribution and use in source and binary forms, with or without
  8. * modification, are permitted only as authorized by the OpenLDAP
  9. * Public License.
  10. *
  11. * A copy of this license is available in the file LICENSE in the
  12. * top-level directory of the distribution or, alternatively, at
  13. * <http://www.OpenLDAP.org/license.html>.
  14. */
  15. /* Portions Copyright (C) 1999, 2000 Novell, Inc. All Rights Reserved.
  16. *
  17. * THIS WORK IS SUBJECT TO U.S. AND INTERNATIONAL COPYRIGHT LAWS AND
  18. * TREATIES. USE, MODIFICATION, AND REDISTRIBUTION OF THIS WORK IS SUBJECT
  19. * TO VERSION 2.0.1 OF THE OPENLDAP PUBLIC LICENSE, A COPY OF WHICH IS
  20. * AVAILABLE AT HTTP://WWW.OPENLDAP.ORG/LICENSE.HTML OR IN THE FILE "LICENSE"
  21. * IN THE TOP-LEVEL DIRECTORY OF THE DISTRIBUTION. ANY USE OR EXPLOITATION
  22. * OF THIS WORK OTHER THAN AS AUTHORIZED IN VERSION 2.0.1 OF THE OPENLDAP
  23. * PUBLIC LICENSE, OR OTHER PRIOR WRITTEN CONSENT FROM NOVELL, COULD SUBJECT
  24. * THE PERPETRATOR TO CRIMINAL AND CIVIL LIABILITY.
  25. *---
  26. * Note: A verbatim copy of version 2.0.1 of the OpenLDAP Public License
  27. * can be found in the file "build/LICENSE-2.0.1" in this distribution
  28. * of OpenLDAP Software.
  29. */
  30. /*
  31. * UTF-8 Conversion Routines
  32. *
  33. * These routines convert between Wide Character and UTF-8,
  34. * or between MultiByte and UTF-8 encodings.
  35. *
  36. * Both single character and string versions of the functions are provided.
  37. * All functions return -1 if the character or string cannot be converted.
  38. */
  39. #include "portable.h"
  40. #if SIZEOF_WCHAR_T >= 4
  41. /* These routines assume ( sizeof(wchar_t) >= 4 ) */
  #include <limits.h> /* For MB_LEN_MAX */
  #include <stdio.h>
  #include <ac/stdlib.h> /* For wctomb, wcstombs, mbtowc, mbstowcs */
  #include <ac/string.h>
  #include <ac/time.h> /* for time_t */
  #include "ldap-int.h"
  #include <ldap_utf8.h>
  /*
   * Payload-bit masks for the first byte of a UTF-8 sequence, indexed by the
   * sequence length in bytes (1..6).  Entry 0 is a placeholder: the decoders
   * reject a length of 0 before they index this table.
   * The table is read-only, hence const.
   */
  static const unsigned char mask[] = { 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
  49. /*-----------------------------------------------------------------------------
  50. UTF-8 Format Summary
  51. ASCII chars 7 bits
  52. 0xxxxxxx
  53. 2-character UTF-8 sequence: 11 bits
  54. 110xxxxx 10xxxxxx
  55. 3-character UTF-8 16 bits
  56. 1110xxxx 10xxxxxx 10xxxxxx
  57. 4-char UTF-8 21 bits
  58. 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  59. 5-char UTF-8 26 bits
  60. 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
  61. 6-char UTF-8 31 bits
  62. 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
  63. Unicode address space (0 - 0x10FFFF) 21 bits
  64. ISO-10646 address space (0 - 0x7FFFFFFF) 31 bits
  65. Note: This code does not prevent UTF-8 sequences which are longer than
  66. necessary from being decoded.
  67. */
  68. /*-----------------------------------------------------------------------------
  69. Convert a UTF-8 character to a wide char.
  70. Return the length of the UTF-8 input character in bytes.
  71. */
  72. int
  73. ldap_x_utf8_to_wc ( wchar_t *wchar, const char *utf8char )
  74. {
  75. int utflen, i;
  76. wchar_t ch;
  77. if (utf8char == NULL) return -1;
  78. /* Get UTF-8 sequence length from 1st byte */
  79. utflen = LDAP_UTF8_CHARLEN2(utf8char, utflen);
  80. if( utflen==0 || utflen > (int)LDAP_MAX_UTF8_LEN ) return -1;
  81. /* First byte minus length tag */
  82. ch = (wchar_t)(utf8char[0] & mask[utflen]);
  83. for(i=1; i < utflen; i++) {
  84. /* Subsequent bytes must start with 10 */
  85. if ((utf8char[i] & 0xc0) != 0x80) return -1;
  86. ch <<= 6; /* 6 bits of data in each subsequent byte */
  87. ch |= (wchar_t)(utf8char[i] & 0x3f);
  88. }
  89. if (wchar) *wchar = ch;
  90. return utflen;
  91. }
  92. /*-----------------------------------------------------------------------------
  93. Convert a UTF-8 string to a wide char string.
  94. No more than 'count' wide chars will be written to the output buffer.
  95. Return the size of the converted string in wide chars, excl null terminator.
  96. */
  97. int
  98. ldap_x_utf8s_to_wcs ( wchar_t *wcstr, const char *utf8str, size_t count )
  99. {
  100. size_t wclen = 0;
  101. int utflen, i;
  102. wchar_t ch;
  103. /* If input ptr is NULL or empty... */
  104. if (utf8str == NULL || !*utf8str) {
  105. if ( wcstr )
  106. *wcstr = 0;
  107. return 0;
  108. }
  109. /* Examine next UTF-8 character. If output buffer is NULL, ignore count */
  110. while ( *utf8str && (wcstr==NULL || wclen<count) ) {
  111. /* Get UTF-8 sequence length from 1st byte */
  112. utflen = LDAP_UTF8_CHARLEN2(utf8str, utflen);
  113. if( utflen==0 || utflen > (int)LDAP_MAX_UTF8_LEN ) return -1;
  114. /* First byte minus length tag */
  115. ch = (wchar_t)(utf8str[0] & mask[utflen]);
  116. for(i=1; i < utflen; i++) {
  117. /* Subsequent bytes must start with 10 */
  118. if ((utf8str[i] & 0xc0) != 0x80) return -1;
  119. ch <<= 6; /* 6 bits of data in each subsequent byte */
  120. ch |= (wchar_t)(utf8str[i] & 0x3f);
  121. }
  122. if (wcstr) wcstr[wclen] = ch;
  123. utf8str += utflen; /* Move to next UTF-8 character */
  124. wclen++; /* Count number of wide chars stored/required */
  125. }
  126. /* Add null terminator if there's room in the buffer. */
  127. if (wcstr && wclen < count) wcstr[wclen] = 0;
  128. return wclen;
  129. }
  /*-----------------------------------------------------------------------------
     Helper: number of UTF-8 bytes needed to encode 'wchar', or -1 if the value
     lies outside the 31-bit ISO-10646 range (0 - 0x7FFFFFFF).
  */
  static int
  wc_utf8_len ( wchar_t wchar )
  {
      /* wchar_t may be a signed or an unsigned type depending on the platform.
         Checking both ends rejects out-of-range values either way; with an
         unsigned 32-bit wchar_t a plain "< 0" test would never fire and values
         >= 0x80000000 would be encoded with an invalid lead byte. */
      if ( wchar < 0 || wchar > (wchar_t)0x7fffffffL )
          return -1;
      if ( wchar < 0x80 )
          return 1;
      if ( wchar < 0x800 )
          return 2;
      if ( wchar < 0x10000 )
          return 3;
      if ( wchar < 0x200000 )
          return 4;
      if ( wchar < 0x4000000 )
          return 5;
      return 6;
  }

  /*-----------------------------------------------------------------------------
     Convert one wide char to a UTF-8 character.

     utf8char  Output buffer.  If NULL, nothing is written, 'count' is ignored
               and the length the encoding would need is returned.
     wchar     Wide character to encode.
     count     Capacity of utf8char in bytes; no more than this is written.

     Returns the length of the UTF-8 character in bytes,
             0 if the character does not fit into 'count' bytes
               (nothing is written in that case),
            -1 if wchar is not a valid character (negative or > 0x7FFFFFFF).
     No null terminator is written.
  */
  int
  ldap_x_wc_to_utf8 ( char *utf8char, wchar_t wchar, size_t count )
  {
      /* Length-tag bits of the first byte, indexed by sequence length. */
      static const unsigned char lead_tag[] =
          { 0, 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc };
      /* Write through unsigned char so bytes >= 0x80 are stored exactly. */
      unsigned char *out = (unsigned char *)utf8char;
      int len, i;

      len = wc_utf8_len( wchar );
      if ( len < 0 )
          return -1;              /* Invalid wide character */

      if ( out == NULL )
          return len;             /* Caller only wants the required length */

      if ( count < (size_t)len )
          return 0;               /* Won't fit; leave the buffer untouched */

      /* Fill the continuation bytes from the last one backwards, six bits
         at a time; what remains of wchar belongs in the first byte. */
      for ( i = len - 1; i > 0; i-- ) {
          out[i] = (unsigned char)( 0x80 | (wchar & 0x3f) );
          wchar >>= 6;
      }
      out[0] = (unsigned char)( lead_tag[len] | wchar );

      return len;
  }
  213. /*-----------------------------------------------------------------------------
  214. Convert a wide char string to a UTF-8 string.
  215. No more than 'count' bytes will be written to the output buffer.
  216. Return the # of bytes written to the output buffer, excl null terminator.
  217. */
  218. int
  219. ldap_x_wcs_to_utf8s ( char *utf8str, const wchar_t *wcstr, size_t count )
  220. {
  221. int len = 0;
  222. int n;
  223. char *p = utf8str;
  224. wchar_t empty = 0; /* To avoid use of L"" construct */
  225. if (wcstr == NULL) /* Treat input ptr NULL as an empty string */
  226. wcstr = &empty;
  227. if (utf8str == NULL) /* Just compute size of output, excl null */
  228. {
  229. while (*wcstr)
  230. {
  231. /* Get UTF-8 size of next wide char */
  232. n = ldap_x_wc_to_utf8( NULL, *wcstr++, LDAP_MAX_UTF8_LEN);
  233. if (n == -1)
  234. return -1;
  235. len += n;
  236. }
  237. return len;
  238. }
  239. /* Do the actual conversion. */
  240. n = 1; /* In case of empty wcstr */
  241. while (*wcstr)
  242. {
  243. n = ldap_x_wc_to_utf8( p, *wcstr++, count);
  244. if (n <= 0) /* If encoding error (-1) or won't fit (0), quit */
  245. break;
  246. p += n;
  247. count -= n; /* Space left in output buffer */
  248. }
  249. /* If not enough room for last character, pad remainder with null
  250. so that return value = original count, indicating buffer full. */
  251. if (n == 0)
  252. {
  253. while (count--)
  254. *p++ = 0;
  255. }
  256. /* Add a null terminator if there's room. */
  257. else if (count)
  258. *p = 0;
  259. if (n == -1) /* Conversion encountered invalid wide char. */
  260. return -1;
  261. /* Return the number of bytes written to output buffer, excl null. */
  262. return (p - utf8str);
  263. }
  #ifdef ANDROID
  /*
   * Stand-ins for the ANSI C wctomb()/mbtowc(), built on the restartable
   * wcrtomb()/mbrtowc().
   */

  /* Store the multibyte form of 'wc' at 's'; returns what wcrtomb() returns,
     converted to int (so its (size_t)-1 failure value becomes -1). */
  int wctomb(char *s, wchar_t wc)
  {
      return (int)wcrtomb(s, wc, NULL);
  }

  /* Decode at most 'n' bytes at 's' into '*pwc'.
     Returns the number of bytes consumed, 0 for a null character, or -1 if
     the bytes do not form a complete, valid multibyte character. */
  int mbtowc(wchar_t *pwc, const char *s, size_t n)
  {
      mbstate_t st;
      size_t rc;

      /* Use a fresh conversion state for every call: with the shared internal
         state of mbrtowc(), the bytes of an incomplete character would be
         remembered and corrupt the decoding of the next, unrelated call. */
      memset(&st, 0, sizeof(st));

      rc = mbrtowc(pwc, s, n, &st);

      /* mbrtowc() reports an invalid sequence as (size_t)-1 and an incomplete
         one as (size_t)-2.  Callers of mbtowc() only expect -1 on failure. */
      if (rc == (size_t)-1 || rc == (size_t)-2)
          return -1;

      return (int)rc;
  }
  #endif
  /*-----------------------------------------------------------------------------
     Convert a UTF-8 character to a MultiByte character.

     mbchar    Output buffer for the multibyte character.  If NULL, the
               character is converted into an internal scratch buffer, so only
               the resulting size is obtained.
     utf8char  UTF-8 character to convert.
     f_wctomb  Wide char to multibyte conversion function; if NULL, the local
               ANSI C wctomb() is used.

     Returns the value returned by f_wctomb (for wctomb(): the size of the
     converted character in bytes), or -1 if utf8char is not a valid UTF-8
     character.
  */
  int
  ldap_x_utf8_to_mb ( char *mbchar, const char *utf8char,
      int (*f_wctomb)(char *mbchar, wchar_t wchar) )
  {
      wchar_t wchar;
      /* MB_LEN_MAX is the largest size any multibyte character can have in
         any supported locale; a smaller fixed size could be overrun. */
      char scratch[MB_LEN_MAX];

      if ( f_wctomb == NULL )     /* If no conversion function was given... */
          f_wctomb = wctomb;      /* use the local ANSI C function */

      /* First convert UTF-8 char to a wide char */
      if ( ldap_x_utf8_to_wc( &wchar, utf8char ) == -1 )
          return -1;              /* Invalid UTF-8 character */

      /* Then convert the wide char, into the scratch buffer if the caller
         did not supply an output buffer. */
      return f_wctomb( mbchar != NULL ? mbchar : scratch, wchar );
  }
  /*-----------------------------------------------------------------------------
     Convert a UTF-8 string to a MultiByte string, going through a temporary
     wide character string.

     mbstr       Output buffer, handed to f_wcstombs unchanged.
     utf8str     Input string; NULL or empty produces an empty result.
     count       Passed to f_wcstombs as the output limit in bytes.
     f_wcstombs  Wide string to multibyte conversion function; if NULL, the
                 local ANSI C wcstombs() is used.

     Returns 0 for NULL/empty input, -1 if memory cannot be allocated or the
     input is not valid UTF-8, otherwise the value returned by f_wcstombs.
  */
  int
  ldap_x_utf8s_to_mbs ( char *mbstr, const char *utf8str, size_t count,
      size_t (*f_wcstombs)(char *mbstr, const wchar_t *wcstr, size_t count) )
  {
      wchar_t *wide;
      size_t wide_cap;
      int rc;

      if ( f_wcstombs == NULL ) {
          f_wcstombs = wcstombs;  /* fall back to the ANSI C routine */
      }

      /* Nothing to convert. */
      if ( utf8str == NULL || *utf8str == '\0' ) {
          if ( mbstr != NULL ) {
              *mbstr = 0;
          }
          return 0;
      }

      /* One wide char per input byte, plus a terminator, is the most the
         intermediate string can need. */
      wide_cap = strlen( utf8str ) + 1;
      wide = (wchar_t *)LDAP_MALLOC( wide_cap * sizeof(wchar_t) );
      if ( wide == NULL ) {
          return -1;              /* out of memory */
      }

      /* UTF-8 -> wide chars, then wide chars -> multibyte. */
      rc = ldap_x_utf8s_to_wcs( wide, utf8str, wide_cap );
      if ( rc != -1 ) {
          rc = f_wcstombs( mbstr, wide, count );
      }

      LDAP_FREE( wide );
      return rc;
  }
  326. /*-----------------------------------------------------------------------------
  327. Convert a MultiByte character to a UTF-8 character.
  328. 'mbsize' indicates the number of bytes of 'mbchar' to check.
  329. Returns the number of bytes written to the output character.
  330. */
  331. int
  332. ldap_x_mb_to_utf8 ( char *utf8char, const char *mbchar, size_t mbsize,
  333. int (*f_mbtowc)(wchar_t *wchar, const char *mbchar, size_t count) )
  334. {
  335. wchar_t wchar;
  336. int n;
  337. if (f_mbtowc == NULL) /* If no conversion function was given... */
  338. f_mbtowc = mbtowc; /* use the local ANSI C function */
  339. if (mbsize == 0) /* 0 is not valid. */
  340. return -1;
  341. if (mbchar == NULL || *mbchar == 0)
  342. {
  343. if (utf8char)
  344. *utf8char = 0;
  345. return 1;
  346. }
  347. /* First convert the MB char to a Wide Char */
  348. n = f_mbtowc( &wchar, mbchar, mbsize);
  349. if (n == -1)
  350. return -1;
  351. /* Convert the Wide Char to a UTF-8 character. */
  352. n = ldap_x_wc_to_utf8( utf8char, wchar, LDAP_MAX_UTF8_LEN);
  353. return n;
  354. }
  /*-----------------------------------------------------------------------------
     Convert a MultiByte string to a UTF-8 string, going through a temporary
     wide character string.

     utf8str     Output buffer, handed to ldap_x_wcs_to_utf8s unchanged.
     mbstr       Input string; NULL is treated like an empty string.
     count       Output limit in bytes, passed to ldap_x_wcs_to_utf8s.
     f_mbstowcs  Multibyte to wide string conversion function; if NULL, the
                 local ANSI C mbstowcs() is used.

     Returns -1 if memory cannot be allocated or either conversion step fails,
     otherwise the value returned by ldap_x_wcs_to_utf8s.
  */
  int
  ldap_x_mbs_to_utf8s ( char *utf8str, const char *mbstr, size_t count,
      size_t (*f_mbstowcs)(wchar_t *wcstr, const char *mbstr, size_t count) )
  {
      wchar_t *wide;
      size_t wide_cap;
      int rc;

      if ( mbstr == NULL ) {
          mbstr = "";             /* NULL input counts as an empty string */
      }
      if ( f_mbstowcs == NULL ) {
          f_mbstowcs = mbstowcs;  /* fall back to the ANSI C routine */
      }

      /* One wide char per input byte, plus a terminator, is the most the
         intermediate string can need. */
      wide_cap = strlen( mbstr ) + 1;
      wide = (wchar_t *)LDAP_MALLOC( wide_cap * sizeof(wchar_t) );
      if ( wide == NULL ) {
          return -1;
      }

      /* Multibyte -> wide chars, then wide chars -> UTF-8. */
      rc = f_mbstowcs( wide, mbstr, wide_cap );
      if ( rc != -1 ) {
          rc = ldap_x_wcs_to_utf8s( utf8str, wide, count );
      }

      LDAP_FREE( wide );
      return rc;
  }
  386. #endif /* SIZEOF_WCHAR_T >= 4 */