nfkc.c 28 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111
  1. /* nfkc.c --- Unicode normalization utilities.
  2. Copyright (C) 2002-2022 Simon Josefsson
  3. This file is part of GNU Libidn.
  4. GNU Libidn is free software: you can redistribute it and/or
  5. modify it under the terms of either:
  6. * the GNU Lesser General Public License as published by the Free
  7. Software Foundation; either version 3 of the License, or (at
  8. your option) any later version.
  9. or
  10. * the GNU General Public License as published by the Free
  11. Software Foundation; either version 2 of the License, or (at
  12. your option) any later version.
  13. or both in parallel, as here.
  14. GNU Libidn is distributed in the hope that it will be useful,
  15. but WITHOUT ANY WARRANTY; without even the implied warranty of
  16. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  17. General Public License for more details.
  18. You should have received copies of the GNU General Public License and
  19. the GNU Lesser General Public License along with this program. If
  20. not, see <https://www.gnu.org/licenses/>. */
  21. #ifdef HAVE_CONFIG_H
  22. # include "config.h"
  23. #endif
  24. #include <stdlib.h>
  25. #include <string.h>
  26. #include "stringprep.h"
  /* Hacks to make syncing with GLIB code easier: the GLIB type and
     allocator names used by the imported code below are mapped onto
     their plain C counterparts.  */
  #define gboolean int
  #define gchar char
  #define guchar unsigned char
  #define glong long
  #define gint int
  #define guint unsigned int
  #define gushort unsigned short
  #define gint16 int16_t
  #define guint16 uint16_t
  #define gunichar uint32_t
  #define gsize size_t
  #define gssize ssize_t
  #define g_malloc malloc
  #define g_free free
  /* Return VAL from the enclosing function unless EXPR holds.  The body
     is wrapped in do { } while (0) so that the expansion together with
     the caller's trailing semicolon is exactly one statement; a bare
     { } block would break when used in an unbraced if/else.  */
  #define g_return_val_if_fail(expr,val) \
    do \
      { \
        if (!(expr)) \
          return (val); \
      } \
    while (0)
  46. /* Code from GLIB gmacros.h starts here. */
  47. /* GLIB - Library of useful routines for C programming
  48. * Copyright (C) 1995-1997 Peter Mattis, Spencer Kimball and Josh MacDonald
  49. *
  50. * This library is free software; you can redistribute it and/or
  51. * modify it under the terms of the GNU Lesser General Public
  52. * License as published by the Free Software Foundation; either
  53. * version 2 of the License, or (at your option) any later version.
  54. *
  55. * This library is distributed in the hope that it will be useful,
  56. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  57. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  58. * Lesser General Public License for more details.
  59. *
  60. * You should have received a copy of the GNU Lesser General Public
  61. * License along with this library; if not, write to the
  62. * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  63. * Boston, MA 02111-1307, USA.
  64. */
  /* Boolean constants; each is defined only if nothing earlier in the
     translation unit already provided it.  */
  #if !defined FALSE
  # define FALSE (0)
  #endif
  #if !defined TRUE
  # define TRUE (!FALSE)
  #endif
  /* Number of elements in ARR, which must be a real array (not a
     pointer).  */
  #define G_N_ELEMENTS(arr) (sizeof (arr) / sizeof ((arr)[0]))
  /* No-op stand-in: expands to its argument unchanged.  */
  #define G_UNLIKELY(expr) (expr)
  73. /* Code from GLIB gunicode.h starts here. */
  74. /* gunicode.h - Unicode manipulation functions
  75. *
  76. * Copyright (C) 1999, 2000 Tom Tromey
  77. * Copyright 2000, 2005 Red Hat, Inc.
  78. *
  79. * The Gnome Library is free software; you can redistribute it and/or
  80. * modify it under the terms of the GNU Lesser General Public License as
  81. * published by the Free Software Foundation; either version 2 of the
  82. * License, or (at your option) any later version.
  83. *
  84. * The Gnome Library is distributed in the hope that it will be useful,
  85. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  86. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  87. * Lesser General Public License for more details.
  88. *
  89. * You should have received a copy of the GNU Lesser General Public
  90. * License along with the Gnome Library; see the file COPYING.LIB. If not,
  91. * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  92. * Boston, MA 02111-1307, USA.
  93. */
  /* Normalization modes.  Every mode has two spellings with the same
     value: the GLIB name and the Unicode normalization-form name.  */
  typedef enum
  {
    G_NORMALIZE_DEFAULT = 0,
    G_NORMALIZE_NFD = G_NORMALIZE_DEFAULT,
    G_NORMALIZE_DEFAULT_COMPOSE = 1,
    G_NORMALIZE_NFC = G_NORMALIZE_DEFAULT_COMPOSE,
    G_NORMALIZE_ALL = 2,
    G_NORMALIZE_NFKD = G_NORMALIZE_ALL,
    G_NORMALIZE_ALL_COMPOSE = 3,
    G_NORMALIZE_NFKC = G_NORMALIZE_ALL_COMPOSE
  } GNormalizeMode;
  /* Advance P past one character, using the lead byte at P to look up
     the sequence length in g_utf8_skip.  */
  #define g_utf8_next_char(p) ((p) + g_utf8_skip[*(const guchar *)(p)])
  107. /* Code from GLIB gutf8.c starts here. */
  108. /* gutf8.c - Operations on UTF-8 strings.
  109. *
  110. * Copyright (C) 1999 Tom Tromey
  111. * Copyright (C) 2000 Red Hat, Inc.
  112. *
  113. * This library is free software; you can redistribute it and/or
  114. * modify it under the terms of the GNU Lesser General Public
  115. * License as published by the Free Software Foundation; either
  116. * version 2 of the License, or (at your option) any later version.
  117. *
  118. * This library is distributed in the hope that it will be useful,
  119. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  120. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  121. * Lesser General Public License for more details.
  122. *
  123. * You should have received a copy of the GNU Lesser General Public
  124. * License along with this library; if not, write to the
  125. * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  126. * Boston, MA 02111-1307, USA.
  127. */
  /* Classify the lead byte Char of a UTF-8 sequence.  On return Len is
     the sequence length in bytes (1-6) and Mask selects the payload
     bits of the lead byte; Len is -1 (and Mask is left untouched) when
     Char cannot start a sequence.  Wrapped in do { } while (0) so the
     macro behaves as a single statement.  */
  #define UTF8_COMPUTE(Char, Mask, Len) \
    do \
      { \
        if ((Char) < 128) \
          { \
            (Len) = 1; \
            (Mask) = 0x7f; \
          } \
        else if (((Char) & 0xe0) == 0xc0) \
          { \
            (Len) = 2; \
            (Mask) = 0x1f; \
          } \
        else if (((Char) & 0xf0) == 0xe0) \
          { \
            (Len) = 3; \
            (Mask) = 0x0f; \
          } \
        else if (((Char) & 0xf8) == 0xf0) \
          { \
            (Len) = 4; \
            (Mask) = 0x07; \
          } \
        else if (((Char) & 0xfc) == 0xf8) \
          { \
            (Len) = 5; \
            (Mask) = 0x03; \
          } \
        else if (((Char) & 0xfe) == 0xfc) \
          { \
            (Len) = 6; \
            (Mask) = 0x01; \
          } \
        else \
          (Len) = -1; \
      } \
    while (0)
  /* Number of bytes needed to encode the code point Char.  */
  #define UTF8_LENGTH(Char) \
    ((Char) < 0x80 ? 1 : \
     ((Char) < 0x800 ? 2 : \
      ((Char) < 0x10000 ? 3 : \
       ((Char) < 0x200000 ? 4 : \
        ((Char) < 0x4000000 ? 5 : 6)))))
  /* Decode the Len-byte sequence at Chars into Result, given the
     lead-byte Mask from UTF8_COMPUTE.  Count is a scratch index.  If a
     trailing byte is not of the form 10xxxxxx, Result is set to -1 and
     decoding stops at that byte.  */
  #define UTF8_GET(Result, Chars, Count, Mask, Len) \
    do \
      { \
        (Result) = (Chars)[0] & (Mask); \
        for ((Count) = 1; (Count) < (Len); ++(Count)) \
          { \
            if (((Chars)[(Count)] & 0xc0) != 0x80) \
              { \
                (Result) = -1; \
                break; \
              } \
            (Result) <<= 6; \
            (Result) |= ((Chars)[(Count)] & 0x3f); \
          } \
      } \
    while (0)
  179. static const gchar utf8_skip_data[256] = {
  180. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  181. 1, 1, 1, 1, 1, 1, 1,
  182. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  183. 1, 1, 1, 1, 1, 1, 1,
  184. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  185. 1, 1, 1, 1, 1, 1, 1,
  186. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  187. 1, 1, 1, 1, 1, 1, 1,
  188. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  189. 1, 1, 1, 1, 1, 1, 1,
  190. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  191. 1, 1, 1, 1, 1, 1, 1,
  192. 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  193. 2, 2, 2, 2, 2, 2, 2,
  194. 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5,
  195. 5, 5, 5, 6, 6, 1, 1
  196. };
  197. static const gchar *const g_utf8_skip = utf8_skip_data;
  198. /*
  199. * g_utf8_strlen:
  200. * @p: pointer to the start of a UTF-8 encoded string
  201. * @max: the maximum number of bytes to examine. If @max
  202. * is less than 0, then the string is assumed to be
  203. * nul-terminated. If @max is 0, @p will not be examined and
  204. * may be %NULL.
  205. *
  206. * Computes the length of the string in characters, not including
  207. * the terminating nul character.
  208. *
  209. * Return value: the length of the string in characters
  210. **/
  211. static glong
  212. g_utf8_strlen (const gchar * p)
  213. {
  214. glong len = 0;
  215. g_return_val_if_fail (p != NULL, 0);
  216. while (*p)
  217. {
  218. p = g_utf8_next_char (p);
  219. ++len;
  220. }
  221. return len;
  222. }
  223. /*
  224. * g_utf8_get_char:
  225. * @p: a pointer to Unicode character encoded as UTF-8
  226. *
  227. * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
  228. * If @p does not point to a valid UTF-8 encoded character, results are
  229. * undefined. If you are not sure that the bytes are complete
  230. * valid Unicode characters, you should use g_utf8_get_char_validated()
  231. * instead.
  232. *
  233. * Return value: the resulting character
  234. **/
  235. static gunichar
  236. g_utf8_get_char (const gchar * p)
  237. {
  238. int i, mask = 0, len;
  239. gunichar result;
  240. unsigned char c = (unsigned char) *p;
  241. UTF8_COMPUTE (c, mask, len);
  242. if (len == -1)
  243. return (gunichar) - 1;
  244. UTF8_GET (result, p, i, mask, len);
  245. return result;
  246. }
  247. /*
  248. * g_unichar_to_utf8:
  249. * @c: a Unicode character code
  250. * @outbuf: output buffer, must have at least 6 bytes of space.
  251. * If %NULL, the length will be computed and returned
  252. * and nothing will be written to @outbuf.
  253. *
  254. * Converts a single character to UTF-8.
  255. *
  256. * Return value: number of bytes written
  257. **/
  258. static int
  259. g_unichar_to_utf8 (gunichar c, gchar * outbuf)
  260. {
  261. /* If this gets modified, also update the copy in g_string_insert_unichar() */
  262. guint len = 0;
  263. int first;
  264. int i;
  265. if (c < 0x80)
  266. {
  267. first = 0;
  268. len = 1;
  269. }
  270. else if (c < 0x800)
  271. {
  272. first = 0xc0;
  273. len = 2;
  274. }
  275. else if (c < 0x10000)
  276. {
  277. first = 0xe0;
  278. len = 3;
  279. }
  280. else if (c < 0x200000)
  281. {
  282. first = 0xf0;
  283. len = 4;
  284. }
  285. else if (c < 0x4000000)
  286. {
  287. first = 0xf8;
  288. len = 5;
  289. }
  290. else
  291. {
  292. first = 0xfc;
  293. len = 6;
  294. }
  295. if (outbuf)
  296. {
  297. for (i = len - 1; i > 0; --i)
  298. {
  299. outbuf[i] = (c & 0x3f) | 0x80;
  300. c >>= 6;
  301. }
  302. outbuf[0] = c | first;
  303. }
  304. return len;
  305. }
  306. /*
  307. * g_utf8_to_ucs4_fast:
  308. * @str: a UTF-8 encoded string
  309. * @len: the maximum length of @str to use, in bytes. If @len < 0,
  310. * then the string is nul-terminated.
  311. * @items_written: location to store the number of characters in the
  312. * result, or %NULL.
  313. *
  314. * Convert a string from UTF-8 to a 32-bit fixed width
  315. * representation as UCS-4, assuming valid UTF-8 input.
  316. * This function is roughly twice as fast as g_utf8_to_ucs4()
  317. * but does no error checking on the input. A trailing 0 character
  318. * will be added to the string after the converted text.
  319. *
  320. * Return value: a pointer to a newly allocated UCS-4 string.
  321. * This value must be freed with g_free().
  322. **/
  323. static gunichar *
  324. g_utf8_to_ucs4_fast (const gchar * str, glong len, glong * items_written)
  325. {
  326. gunichar *result;
  327. gsize n_chars, i;
  328. const gchar *p;
  329. g_return_val_if_fail (str != NULL, NULL);
  330. p = str;
  331. n_chars = 0;
  332. if (len < 0)
  333. {
  334. while (*p)
  335. {
  336. p = g_utf8_next_char (p);
  337. ++n_chars;
  338. }
  339. }
  340. else
  341. {
  342. while (p < str + len && *p)
  343. {
  344. p = g_utf8_next_char (p);
  345. ++n_chars;
  346. }
  347. }
  348. result = g_malloc (sizeof (gunichar) * (n_chars + 1));
  349. if (!result)
  350. return NULL;
  351. p = str;
  352. for (i = 0; i < n_chars; i++)
  353. {
  354. gunichar wc = (guchar) * p++;
  355. if (wc < 0x80)
  356. {
  357. result[i] = wc;
  358. }
  359. else
  360. {
  361. gunichar mask = 0x40;
  362. if (G_UNLIKELY ((wc & mask) == 0))
  363. {
  364. /* It's an out-of-sequence 10xxxxxxx byte.
  365. * Rather than making an ugly hash of this and the next byte
  366. * and overrunning the buffer, it's more useful to treat it
  367. * with a replacement character */
  368. result[i] = 0xfffd;
  369. continue;
  370. }
  371. do
  372. {
  373. wc <<= 6;
  374. wc |= (guchar) (*p++) & 0x3f;
  375. mask <<= 5;
  376. }
  377. while ((wc & mask) != 0);
  378. wc &= mask - 1;
  379. result[i] = wc;
  380. }
  381. }
  382. result[i] = 0;
  383. if (items_written)
  384. *items_written = i;
  385. return result;
  386. }
  387. /*
  388. * g_ucs4_to_utf8:
  389. * @str: a UCS-4 encoded string
  390. * @len: the maximum length (number of characters) of @str to use.
  391. * If @len < 0, then the string is nul-terminated.
  392. * @items_read: location to store number of characters read, or %NULL.
  393. * @items_written: location to store number of bytes written or %NULL.
  394. * The value here stored does not include the trailing 0
  395. * byte.
  396. * @error: location to store the error occurring, or %NULL to ignore
  397. * errors. Any of the errors in #GConvertError other than
  398. * %G_CONVERT_ERROR_NO_CONVERSION may occur.
  399. *
  400. * Convert a string from a 32-bit fixed width representation as UCS-4.
  401. * to UTF-8. The result will be terminated with a 0 byte.
  402. *
  403. * Return value: a pointer to a newly allocated UTF-8 string.
  404. * This value must be freed with g_free(). If an
  405. * error occurs, %NULL will be returned and
  406. * @error set. In that case, @items_read will be
  407. * set to the position of the first invalid input
  408. * character.
  409. **/
  410. static gchar *
  411. g_ucs4_to_utf8 (const gunichar * str,
  412. glong len, glong * items_read, glong * items_written)
  413. {
  414. gint result_length;
  415. gchar *result = NULL;
  416. gchar *p;
  417. gint i;
  418. result_length = 0;
  419. for (i = 0; len < 0 || i < len; i++)
  420. {
  421. if (!str[i])
  422. break;
  423. if (str[i] >= 0x80000000)
  424. goto err_out;
  425. result_length += UTF8_LENGTH (str[i]);
  426. }
  427. result = g_malloc (result_length + 1);
  428. if (!result)
  429. return NULL;
  430. p = result;
  431. i = 0;
  432. while (p < result + result_length)
  433. p += g_unichar_to_utf8 (str[i++], p);
  434. *p = '\0';
  435. if (items_written)
  436. *items_written = p - result;
  437. err_out:
  438. if (items_read)
  439. *items_read = i;
  440. return result;
  441. }
  442. /* Code from GLIB gunidecomp.c starts here. */
  443. /* decomp.c - Character decomposition.
  444. *
  445. * Copyright (C) 1999, 2000 Tom Tromey
  446. * Copyright 2000 Red Hat, Inc.
  447. *
  448. * The Gnome Library is free software; you can redistribute it and/or
  449. * modify it under the terms of the GNU Lesser General Public License as
  450. * published by the Free Software Foundation; either version 2 of the
  451. * License, or (at your option) any later version.
  452. *
  453. * The Gnome Library is distributed in the hope that it will be useful,
  454. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  455. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  456. * Lesser General Public License for more details.
  457. *
  458. * You should have received a copy of the GNU Lesser General Public
  459. * License along with the Gnome Library; see the file COPYING.LIB. If not,
  460. * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  461. * Boston, MA 02111-1307, USA.
  462. */
  463. #include "gunidecomp.h"
  464. #include "gunicomp.h"
  /* Two-level lookup shared by both halves of the combining-class
     table.  A page entry at or above G_UNICODE_MAX_TABLE_INDEX encodes
     one class for the whole page (entry minus that bias); any other
     entry is the row of cclass_data holding per-character classes.  */
  #define CC_LOOKUP(Table, Page, Char) \
    (((Table)[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
     ? ((Table)[Page] - G_UNICODE_MAX_TABLE_INDEX) \
     : (cclass_data[(Table)[Page]][Char]))
  #define CC_PART1(Page, Char) \
    CC_LOOKUP (combining_class_table_part1, Page, Char)
  #define CC_PART2(Page, Char) \
    CC_LOOKUP (combining_class_table_part2, Page, Char)
  /* Combining class of Char.  Part 1 covers code points up to
     G_UNICODE_LAST_CHAR_PART1, part 2 covers 0xe0000 up to
     G_UNICODE_LAST_CHAR; everything else is class 0.  Char is
     evaluated more than once.  */
  #define COMBINING_CLASS(Char) \
    (((Char) > G_UNICODE_LAST_CHAR_PART1) \
     ? (((Char) >= 0xe0000 && (Char) <= G_UNICODE_LAST_CHAR) \
        ? CC_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff) \
        : 0) \
     : CC_PART1 ((Char) >> 8, (Char) & 0xff))
  /* Constants for Hangul syllable [de]composition: the base code point
     of the syllable block and of the leading (L), vowel (V) and
     trailing (T) ranges, followed by the size of each range and the
     counts derived from them.  */
  enum
  {
    SBase = 0xAC00,
    LBase = 0x1100,
    VBase = 0x1161,
    TBase = 0x11A7,
    LCount = 19,
    VCount = 21,
    TCount = 28,
    NCount = VCount * TCount,
    SCount = LCount * NCount
  };
  489. /*
  490. * g_unicode_canonical_ordering:
  491. * @string: a UCS-4 encoded string.
  492. * @len: the maximum length of @string to use.
  493. *
  494. * Computes the canonical ordering of a string in-place.
  495. * This rearranges decomposed characters in the string
  496. * according to their combining classes. See the Unicode
  497. * manual for more information.
  498. **/
  499. static void
  500. g_unicode_canonical_ordering (gunichar * string, gsize len)
  501. {
  502. gsize i;
  503. int swap = 1;
  504. while (swap)
  505. {
  506. int last;
  507. swap = 0;
  508. last = COMBINING_CLASS (string[0]);
  509. for (i = 0; i < len - 1; ++i)
  510. {
  511. int next = COMBINING_CLASS (string[i + 1]);
  512. if (next != 0 && last > next)
  513. {
  514. gsize j;
  515. /* Percolate item leftward through string. */
  516. for (j = i + 1; j > 0; --j)
  517. {
  518. gunichar t;
  519. if (COMBINING_CLASS (string[j - 1]) <= next)
  520. break;
  521. t = string[j];
  522. string[j] = string[j - 1];
  523. string[j - 1] = t;
  524. swap = 1;
  525. }
  526. /* We're re-entering the loop looking at the old
  527. character again. */
  528. next = last;
  529. }
  530. last = next;
  531. }
  532. }
  533. }
  534. /* http://www.unicode.org/unicode/reports/tr15/#Hangul
  535. * r should be null or have sufficient space. Calling with r == NULL will
  536. * only calculate the result_len; however, a buffer with space for three
  537. * characters will always be big enough. */
  538. static void
  539. decompose_hangul (gunichar s, gunichar * r, gsize * result_len)
  540. {
  541. gint SIndex = s - SBase;
  542. gint TIndex = SIndex % TCount;
  543. if (r)
  544. {
  545. r[0] = LBase + SIndex / NCount;
  546. r[1] = VBase + (SIndex % NCount) / TCount;
  547. }
  548. if (TIndex)
  549. {
  550. if (r)
  551. r[2] = TBase + TIndex;
  552. *result_len = 3;
  553. }
  554. else
  555. *result_len = 2;
  556. }
  557. /* returns a pointer to a null-terminated UTF-8 string */
  558. static const gchar *
  559. find_decomposition (gunichar ch, gboolean compat)
  560. {
  561. int start = 0;
  562. int end = G_N_ELEMENTS (decomp_table);
  563. if (ch >= decomp_table[start].ch && ch <= decomp_table[end - 1].ch)
  564. {
  565. while (TRUE)
  566. {
  567. int half = (start + end) / 2;
  568. if (ch == decomp_table[half].ch)
  569. {
  570. int offset;
  571. if (compat)
  572. {
  573. offset = decomp_table[half].compat_offset;
  574. if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
  575. offset = decomp_table[half].canon_offset;
  576. }
  577. else
  578. {
  579. offset = decomp_table[half].canon_offset;
  580. if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
  581. return NULL;
  582. }
  583. return &(decomp_expansion_string[offset]);
  584. }
  585. else if (half == start)
  586. break;
  587. else if (ch > decomp_table[half].ch)
  588. start = half;
  589. else
  590. end = half;
  591. }
  592. }
  593. return NULL;
  594. }
  595. /* L,V => LV and LV,T => LVT */
  596. static gboolean
  597. combine_hangul (gunichar a, gunichar b, gunichar * result)
  598. {
  599. if (a >= LBase && a < LCount + LBase && b >= VBase && b < VCount + VBase)
  600. {
  601. gint LIndex = a - LBase;
  602. gint VIndex = b - VBase;
  603. *result = SBase + (LIndex * VCount + VIndex) * TCount;
  604. return TRUE;
  605. }
  606. if (a >= SBase && a < SCount + SBase && b > TBase && b < TCount + TBase)
  607. {
  608. gint SIndex = a - SBase;
  609. if ((SIndex % TCount) == 0)
  610. {
  611. gint TIndex = b - TBase;
  612. *result = a + TIndex;
  613. return TRUE;
  614. }
  615. }
  616. return FALSE;
  617. }
  /* Two-level lookup into the composition index.  A page entry at or
     above G_UNICODE_MAX_TABLE_INDEX encodes one index for the whole
     page (entry minus that bias); any other entry selects the row of
     compose_data holding per-character indices.  */
  #define CI(Page, Char) \
    ((compose_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
     ? (compose_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \
     : (compose_data[compose_table[Page]][Char]))
  /* Composition index of Char, or 0 when Char lies beyond the last
     page of compose_table.  Every use of Char is parenthesized so that
     an expression argument (e.g. `x | y') is shifted as a whole.  Char
     is evaluated more than once.  */
  #define COMPOSE_INDEX(Char) \
    ((((Char) >> 8) > (COMPOSE_TABLE_LAST)) \
     ? 0 \
     : CI ((Char) >> 8, (Char) & 0xff))
  624. static gboolean
  625. combine (gunichar a, gunichar b, gunichar * result)
  626. {
  627. gushort index_a, index_b;
  628. if (combine_hangul (a, b, result))
  629. return TRUE;
  630. index_a = COMPOSE_INDEX (a);
  631. if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START)
  632. {
  633. if (b == compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][0])
  634. {
  635. *result =
  636. compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][1];
  637. return TRUE;
  638. }
  639. else
  640. return FALSE;
  641. }
  642. index_b = COMPOSE_INDEX (b);
  643. if (index_b >= COMPOSE_SECOND_SINGLE_START)
  644. {
  645. if (a ==
  646. compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][0])
  647. {
  648. *result =
  649. compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][1];
  650. return TRUE;
  651. }
  652. else
  653. return FALSE;
  654. }
  655. if (index_a >= COMPOSE_FIRST_START && index_a < COMPOSE_FIRST_SINGLE_START
  656. && index_b >= COMPOSE_SECOND_START
  657. && index_b < COMPOSE_SECOND_SINGLE_START)
  658. {
  659. gunichar res =
  660. compose_array[index_a - COMPOSE_FIRST_START][index_b -
  661. COMPOSE_SECOND_START];
  662. if (res)
  663. {
  664. *result = res;
  665. return TRUE;
  666. }
  667. }
  668. return FALSE;
  669. }
  670. static gunichar *
  671. _g_utf8_normalize_wc (const gchar * str, gssize max_len, GNormalizeMode mode)
  672. {
  673. gsize n_wc;
  674. gunichar *wc_buffer;
  675. const char *p;
  676. gsize last_start;
  677. gboolean do_compat = (mode == G_NORMALIZE_NFKC || mode == G_NORMALIZE_NFKD);
  678. gboolean do_compose = (mode == G_NORMALIZE_NFC || mode == G_NORMALIZE_NFKC);
  679. n_wc = 0;
  680. p = str;
  681. while ((max_len < 0 || p < str + max_len) && *p)
  682. {
  683. const gchar *decomp;
  684. gunichar wc = g_utf8_get_char (p);
  685. if (wc >= SBase && wc < SBase + SCount)
  686. {
  687. gsize result_len;
  688. decompose_hangul (wc, NULL, &result_len);
  689. n_wc += result_len;
  690. }
  691. else
  692. {
  693. decomp = find_decomposition (wc, do_compat);
  694. if (decomp)
  695. n_wc += g_utf8_strlen (decomp);
  696. else
  697. n_wc++;
  698. }
  699. p = g_utf8_next_char (p);
  700. }
  701. wc_buffer = g_malloc (sizeof (gunichar) * (n_wc + 1));
  702. if (!wc_buffer)
  703. return NULL;
  704. last_start = 0;
  705. n_wc = 0;
  706. p = str;
  707. while ((max_len < 0 || p < str + max_len) && *p)
  708. {
  709. gunichar wc = g_utf8_get_char (p);
  710. const gchar *decomp;
  711. int cc;
  712. gsize old_n_wc = n_wc;
  713. if (wc >= SBase && wc < SBase + SCount)
  714. {
  715. gsize result_len;
  716. decompose_hangul (wc, wc_buffer + n_wc, &result_len);
  717. n_wc += result_len;
  718. }
  719. else
  720. {
  721. decomp = find_decomposition (wc, do_compat);
  722. if (decomp)
  723. {
  724. const char *pd;
  725. for (pd = decomp; *pd != '\0'; pd = g_utf8_next_char (pd))
  726. wc_buffer[n_wc++] = g_utf8_get_char (pd);
  727. }
  728. else
  729. wc_buffer[n_wc++] = wc;
  730. }
  731. if (n_wc > 0)
  732. {
  733. cc = COMBINING_CLASS (wc_buffer[old_n_wc]);
  734. if (cc == 0)
  735. {
  736. g_unicode_canonical_ordering (wc_buffer + last_start,
  737. n_wc - last_start);
  738. last_start = old_n_wc;
  739. }
  740. }
  741. p = g_utf8_next_char (p);
  742. }
  743. if (n_wc > 0)
  744. {
  745. g_unicode_canonical_ordering (wc_buffer + last_start,
  746. n_wc - last_start);
  747. /* dead assignment: last_start = n_wc; */
  748. }
  749. wc_buffer[n_wc] = 0;
  750. /* All decomposed and reordered */
  751. if (do_compose && n_wc > 0)
  752. {
  753. gsize i, j;
  754. int last_cc = 0;
  755. last_start = 0;
  756. for (i = 0; i < n_wc; i++)
  757. {
  758. int cc = COMBINING_CLASS (wc_buffer[i]);
  759. if (i > 0 &&
  760. (last_cc == 0 || last_cc != cc) &&
  761. combine (wc_buffer[last_start], wc_buffer[i],
  762. &wc_buffer[last_start]))
  763. {
  764. for (j = i + 1; j < n_wc; j++)
  765. wc_buffer[j - 1] = wc_buffer[j];
  766. n_wc--;
  767. i--;
  768. if (i == last_start)
  769. last_cc = 0;
  770. else
  771. last_cc = COMBINING_CLASS (wc_buffer[i - 1]);
  772. continue;
  773. }
  774. if (cc == 0)
  775. last_start = i;
  776. last_cc = cc;
  777. }
  778. }
  779. wc_buffer[n_wc] = 0;
  780. return wc_buffer;
  781. }
  782. /*
  783. * g_utf8_normalize:
  784. * @str: a UTF-8 encoded string.
  785. * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
  786. * @mode: the type of normalization to perform.
  787. *
  788. * Converts a string into canonical form, standardizing
  789. * such issues as whether a character with an accent
  790. * is represented as a base character and combining
  791. * accent or as a single precomposed character. The
  792. * string has to be valid UTF-8, otherwise %NULL is
  793. * returned. You should generally call g_utf8_normalize()
  794. * before comparing two Unicode strings.
  795. *
  796. * The normalization mode %G_NORMALIZE_DEFAULT only
  797. * standardizes differences that do not affect the
  798. * text content, such as the above-mentioned accent
  799. * representation. %G_NORMALIZE_ALL also standardizes
  800. * the "compatibility" characters in Unicode, such
  801. * as SUPERSCRIPT THREE to the standard forms
  802. * (in this case DIGIT THREE). Formatting information
  803. * may be lost but for most text operations such
  804. * characters should be considered the same.
  805. *
  806. * %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE
  807. * are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL,
  808. * but returned a result with composed forms rather
  809. * than a maximally decomposed form. This is often
  810. * useful if you intend to convert the string to
  811. * a legacy encoding or pass it to a system with
  812. * less capable Unicode handling.
  813. *
  814. * Return value: a newly allocated string, that is the
  815. * normalized form of @str, or %NULL if @str is not
  816. * valid UTF-8.
  817. **/
  818. static gchar *
  819. g_utf8_normalize (const gchar * str, gssize len, GNormalizeMode mode)
  820. {
  821. gunichar *result_wc = _g_utf8_normalize_wc (str, len, mode);
  822. gchar *result = NULL;
  823. if (result_wc)
  824. result = g_ucs4_to_utf8 (result_wc, -1, NULL, NULL);
  825. g_free (result_wc);
  826. return result;
  827. }
  828. /* Public Libidn API starts here. */
  /**
   * stringprep_utf8_to_unichar:
   * @p: a pointer to Unicode character encoded as UTF-8
   *
   * Decodes the UTF-8 byte sequence starting at @p into a Unicode
   * character.  If @p does not point to a valid UTF-8 encoded
   * character, results are undefined.
   *
   * Return value: the resulting character.
   **/
  uint32_t
  stringprep_utf8_to_unichar (const char *p)
  {
    uint32_t ch = g_utf8_get_char (p);

    return ch;
  }
  /**
   * stringprep_unichar_to_utf8:
   * @c: a ISO10646 character code
   * @outbuf: output buffer, must have at least 6 bytes of space.
   *       If %NULL, only the length is computed and returned, and
   *       nothing is written.
   *
   * Encodes the single character @c as UTF-8.
   *
   * Return value: number of bytes written.
   **/
  int
  stringprep_unichar_to_utf8 (uint32_t c, char *outbuf)
  {
    int n_bytes = g_unichar_to_utf8 (c, outbuf);

    return n_bytes;
  }
  860. #include <unistr.h>
  861. /**
  862. * stringprep_utf8_to_ucs4:
  863. * @str: a UTF-8 encoded string
  864. * @len: the maximum length of @str to use. If @len < 0, then
  865. * the string is nul-terminated.
  866. * @items_written: location to store the number of characters in the
  867. * result, or %NULL.
  868. *
  869. * Convert a string from UTF-8 to a 32-bit fixed width representation
  870. * as UCS-4. The function now performs error checking to verify that
  871. * the input is valid UTF-8 (before it was documented to not do error
  872. * checking).
  873. *
  874. * Return value: a pointer to a newly allocated UCS-4 string.
  875. * This value must be deallocated by the caller.
  876. **/
  877. uint32_t *
  878. stringprep_utf8_to_ucs4 (const char *str, ssize_t len, size_t *items_written)
  879. {
  880. size_t n;
  881. if (len < 0)
  882. n = strlen (str);
  883. else
  884. n = len;
  885. if (u8_check ((const uint8_t *) str, n))
  886. return NULL;
  887. return g_utf8_to_ucs4_fast (str, (glong) len, (glong *) items_written);
  888. }
  889. /**
  890. * stringprep_ucs4_to_utf8:
  891. * @str: a UCS-4 encoded string
  892. * @len: the maximum length of @str to use. If @len < 0, then
  893. * the string is terminated with a 0 character.
  894. * @items_read: location to store number of characters read read, or %NULL.
  895. * @items_written: location to store number of bytes written or %NULL.
  896. * The value here stored does not include the trailing 0
  897. * byte.
  898. *
  899. * Convert a string from a 32-bit fixed width representation as UCS-4.
  900. * to UTF-8. The result will be terminated with a 0 byte.
  901. *
  902. * Return value: a pointer to a newly allocated UTF-8 string.
  903. * This value must be deallocated by the caller.
  904. * If an error occurs, %NULL will be returned.
  905. **/
  906. char *
  907. stringprep_ucs4_to_utf8 (const uint32_t * str, ssize_t len,
  908. size_t *items_read, size_t *items_written)
  909. {
  910. return g_ucs4_to_utf8 (str, len, (glong *) items_read,
  911. (glong *) items_written);
  912. }
  913. /**
  914. * stringprep_utf8_nfkc_normalize:
  915. * @str: a UTF-8 encoded string.
  916. * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
  917. *
  918. * Converts a string into canonical form, standardizing
  919. * such issues as whether a character with an accent
  920. * is represented as a base character and combining
  921. * accent or as a single precomposed character.
  922. *
  923. * The normalization mode is NFKC (ALL COMPOSE). It standardizes
  924. * differences that do not affect the text content, such as the
  925. * above-mentioned accent representation. It standardizes the
  926. * "compatibility" characters in Unicode, such as SUPERSCRIPT THREE to
  927. * the standard forms (in this case DIGIT THREE). Formatting
  928. * information may be lost but for most text operations such
  929. * characters should be considered the same. It returns a result with
  930. * composed forms rather than a maximally decomposed form.
  931. *
  932. * Return value: a newly allocated string, that is the
  933. * NFKC normalized form of @str.
  934. **/
  935. char *
  936. stringprep_utf8_nfkc_normalize (const char *str, ssize_t len)
  937. {
  938. size_t n;
  939. if (len < 0)
  940. n = strlen (str);
  941. else
  942. n = len;
  943. if (u8_check ((const uint8_t *) str, n))
  944. return NULL;
  945. return g_utf8_normalize (str, len, G_NORMALIZE_NFKC);
  946. }
  947. #include <stdio.h>
  948. /**
  949. * stringprep_ucs4_nfkc_normalize:
  950. * @str: a Unicode string.
  951. * @len: length of @str array, or -1 if @str is nul-terminated.
  952. *
  953. * Converts a UCS4 string into canonical form, see
  954. * stringprep_utf8_nfkc_normalize() for more information.
  955. *
  956. * Return value: a newly allocated Unicode string, that is the NFKC
  957. * normalized form of @str.
  958. **/
  959. uint32_t *
  960. stringprep_ucs4_nfkc_normalize (const uint32_t * str, ssize_t len)
  961. {
  962. char *p;
  963. uint32_t *result_wc;
  964. p = stringprep_ucs4_to_utf8 (str, len, 0, 0);
  965. if (!p)
  966. return NULL;
  967. result_wc = _g_utf8_normalize_wc (p, -1, G_NORMALIZE_NFKC);
  968. free (p);
  969. return result_wc;
  970. }