punycode.c 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456
  1. /* punycode.c --- Implementation of punycode used to ASCII encode IDN's.
  2. * Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007 Simon Josefsson
  3. *
  4. * This file is part of GNU Libidn.
  5. *
  6. * GNU Libidn is free software; you can redistribute it and/or
  7. * modify it under the terms of the GNU Lesser General Public
  8. * License as published by the Free Software Foundation; either
  9. * version 2.1 of the License, or (at your option) any later version.
  10. *
  11. * GNU Libidn is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  14. * Lesser General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU Lesser General Public
  17. * License along with GNU Libidn; if not, write to the Free Software
  18. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
  19. *
  20. */
  21. /*
  22. * This file is derived from RFC 3492bis written by Adam M. Costello.
  23. *
  24. * Disclaimer and license: Regarding this entire document or any
  25. * portion of it (including the pseudocode and C code), the author
  26. * makes no guarantees and is not responsible for any damage resulting
  27. * from its use. The author grants irrevocable permission to anyone
  28. * to use, modify, and distribute it in any way that does not diminish
  29. * the rights of anyone else to use, modify, and distribute it,
  30. * provided that redistributed derivative works do not contain
  31. * misleading author or version information. Derivative works need
  32. * not be licensed under similar terms.
  33. *
  34. * Copyright (C) The Internet Society (2003). All Rights Reserved.
  35. *
  36. * This document and translations of it may be copied and furnished to
  37. * others, and derivative works that comment on or otherwise explain it
  38. * or assist in its implementation may be prepared, copied, published
  39. * and distributed, in whole or in part, without restriction of any
  40. * kind, provided that the above copyright notice and this paragraph are
  41. * included on all such copies and derivative works. However, this
  42. * document itself may not be modified in any way, such as by removing
  43. * the copyright notice or references to the Internet Society or other
  44. * Internet organizations, except as needed for the purpose of
  45. * developing Internet standards in which case the procedures for
  46. * copyrights defined in the Internet Standards process must be
  47. * followed, or as required to translate it into languages other than
  48. * English.
  49. *
  50. * The limited permissions granted above are perpetual and will not be
  51. * revoked by the Internet Society or its successors or assigns.
  52. *
  53. * This document and the information contained herein is provided on an
  54. * "AS IS" basis and THE INTERNET SOCIETY AND THE INTERNET ENGINEERING
  55. * TASK FORCE DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, INCLUDING
  56. * BUT NOT LIMITED TO ANY WARRANTY THAT THE USE OF THE INFORMATION
  57. * HEREIN WILL NOT INFRINGE ANY RIGHTS OR ANY IMPLIED WARRANTIES OF
  58. * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
  59. */
  60. #include <string.h>
  61. #include "punycode.h"
  62. /*** Bootstring parameters for Punycode ***/
  63. enum
  64. { base = 36, tmin = 1, tmax = 26, skew = 38, damp = 700,
  65. initial_bias = 72, initial_n = 0x80, delimiter = 0x2D
  66. };
  67. /* basic(cp) tests whether cp is a basic code point: */
  68. #define basic(cp) ((punycode_uint)(cp) < 0x80)
  69. /* delim(cp) tests whether cp is a delimiter: */
  70. #define delim(cp) ((cp) == delimiter)
  71. /* decode_digit(cp) returns the numeric value of a basic code */
  72. /* point (for use in representing integers) in the range 0 to */
  73. /* base-1, or base if cp does not represent a value. */
  74. static punycode_uint
  75. decode_digit (punycode_uint cp)
  76. {
  77. return cp - 48 < 10 ? cp - 22 : cp - 65 < 26 ? cp - 65 :
  78. cp - 97 < 26 ? cp - 97 : base;
  79. }
  80. /* encode_digit(d,flag) returns the basic code point whose value */
  81. /* (when used for representing integers) is d, which needs to be in */
  82. /* the range 0 to base-1. The lowercase form is used unless flag is */
  83. /* nonzero, in which case the uppercase form is used. The behavior */
  84. /* is undefined if flag is nonzero and digit d has no uppercase form. */
  85. static char
  86. encode_digit (punycode_uint d, int flag)
  87. {
  88. return d + 22 + 75 * (d < 26) - ((flag != 0) << 5);
  89. /* 0..25 map to ASCII a..z or A..Z */
  90. /* 26..35 map to ASCII 0..9 */
  91. }
  92. /* flagged(bcp) tests whether a basic code point is flagged */
  93. /* (uppercase). The behavior is undefined if bcp is not a */
  94. /* basic code point. */
  95. #define flagged(bcp) ((punycode_uint)(bcp) - 65 < 26)
  96. /* encode_basic(bcp,flag) forces a basic code point to lowercase */
  97. /* if flag is zero, uppercase if flag is nonzero, and returns */
  98. /* the resulting code point. The code point is unchanged if it */
  99. /* is caseless. The behavior is undefined if bcp is not a basic */
  100. /* code point. */
  101. static char
  102. encode_basic (punycode_uint bcp, int flag)
  103. {
  104. bcp -= (bcp - 97 < 26) << 5;
  105. return bcp + ((!flag && (bcp - 65 < 26)) << 5);
  106. }
  107. /*** Platform-specific constants ***/
  108. /* maxint is the maximum value of a punycode_uint variable: */
  109. static const punycode_uint maxint = -1;
  110. /* Because maxint is unsigned, -1 becomes the maximum value. */
  111. /*** Bias adaptation function ***/
  112. static punycode_uint
  113. adapt (punycode_uint delta, punycode_uint numpoints, int firsttime)
  114. {
  115. punycode_uint k;
  116. delta = firsttime ? delta / damp : delta >> 1;
  117. /* delta >> 1 is a faster way of doing delta / 2 */
  118. delta += delta / numpoints;
  119. for (k = 0; delta > ((base - tmin) * tmax) / 2; k += base)
  120. {
  121. delta /= base - tmin;
  122. }
  123. return k + (base - tmin + 1) * delta / (delta + skew);
  124. }
  125. /*** Main encode function ***/
  126. /**
  127. * punycode_encode - encode Unicode to Punycode
  128. * @input_length: The number of code points in the @input array and
  129. * the number of flags in the @case_flags array.
  130. * @input: An array of code points. They are presumed to be Unicode
  131. * code points, but that is not strictly REQUIRED. The array
  132. * contains code points, not code units. UTF-16 uses code units
  133. * D800 through DFFF to refer to code points 10000..10FFFF. The
  134. * code points D800..DFFF do not occur in any valid Unicode string.
  135. * The code points that can occur in Unicode strings (0..D7FF and
  136. * E000..10FFFF) are also called Unicode scalar values.
  137. * @case_flags: A %NULL pointer or an array of boolean values parallel
  138. * to the @input array. Nonzero (true, flagged) suggests that the
  139. * corresponding Unicode character be forced to uppercase after
  140. * being decoded (if possible), and zero (false, unflagged) suggests
  141. * that it be forced to lowercase (if possible). ASCII code points
  142. * (0..7F) are encoded literally, except that ASCII letters are
  143. * forced to uppercase or lowercase according to the corresponding
  144. * case flags. If @case_flags is a %NULL pointer then ASCII letters
  145. * are left as they are, and other code points are treated as
  146. * unflagged.
  147. * @output_length: The caller passes in the maximum number of ASCII
  148. * code points that it can receive. On successful return it will
  149. * contain the number of ASCII code points actually output.
  150. * @output: An array of ASCII code points. It is *not*
  151. * null-terminated; it will contain zeros if and only if the @input
  152. * contains zeros. (Of course the caller can leave room for a
  153. * terminator and add one if needed.)
  154. *
  155. * Converts a sequence of code points (presumed to be Unicode code
  156. * points) to Punycode.
  157. *
  158. * Return value: The return value can be any of the #Punycode_status
  159. * values defined above except %PUNYCODE_BAD_INPUT. If not
  160. * %PUNYCODE_SUCCESS, then @output_size and @output might contain
  161. * garbage.
  162. **/
  163. int
  164. punycode_encode (size_t input_length,
  165. const punycode_uint input[],
  166. const unsigned char case_flags[],
  167. size_t * output_length, char output[])
  168. {
  169. punycode_uint input_len, n, delta, h, b, bias, j, m, q, k, t;
  170. size_t out, max_out;
  171. /* The Punycode spec assumes that the input length is the same type */
  172. /* of integer as a code point, so we need to convert the size_t to */
  173. /* a punycode_uint, which could overflow. */
  174. if (input_length > maxint)
  175. return punycode_overflow;
  176. input_len = (punycode_uint) input_length;
  177. /* Initialize the state: */
  178. n = initial_n;
  179. delta = 0;
  180. out = 0;
  181. max_out = *output_length;
  182. bias = initial_bias;
  183. /* Handle the basic code points: */
  184. for (j = 0; j < input_len; ++j)
  185. {
  186. if (basic (input[j]))
  187. {
  188. if (max_out - out < 2)
  189. return punycode_big_output;
  190. output[out++] = case_flags ?
  191. encode_basic (input[j], case_flags[j]) : (char) input[j];
  192. }
  193. /* else if (input[j] < n) return punycode_bad_input; */
  194. /* (not needed for Punycode with unsigned code points) */
  195. }
  196. h = b = (punycode_uint) out;
  197. /* cannot overflow because out <= input_len <= maxint */
  198. /* h is the number of code points that have been handled, b is the */
  199. /* number of basic code points, and out is the number of ASCII code */
  200. /* points that have been output. */
  201. if (b > 0)
  202. output[out++] = delimiter;
  203. /* Main encoding loop: */
  204. while (h < input_len)
  205. {
  206. /* All non-basic code points < n have been */
  207. /* handled already. Find the next larger one: */
  208. for (m = maxint, j = 0; j < input_len; ++j)
  209. {
  210. /* if (basic(input[j])) continue; */
  211. /* (not needed for Punycode) */
  212. if (input[j] >= n && input[j] < m)
  213. m = input[j];
  214. }
  215. /* Increase delta enough to advance the decoder's */
  216. /* <n,i> state to <m,0>, but guard against overflow: */
  217. if (m - n > (maxint - delta) / (h + 1))
  218. return punycode_overflow;
  219. delta += (m - n) * (h + 1);
  220. n = m;
  221. for (j = 0; j < input_len; ++j)
  222. {
  223. /* Punycode does not need to check whether input[j] is basic: */
  224. if (input[j] < n /* || basic(input[j]) */ )
  225. {
  226. if (++delta == 0)
  227. return punycode_overflow;
  228. }
  229. if (input[j] == n)
  230. {
  231. /* Represent delta as a generalized variable-length integer: */
  232. for (q = delta, k = base;; k += base)
  233. {
  234. if (out >= max_out)
  235. return punycode_big_output;
  236. t = k <= bias /* + tmin */ ? tmin : /* +tmin not needed */
  237. k >= bias + tmax ? tmax : k - bias;
  238. if (q < t)
  239. break;
  240. output[out++] = encode_digit (t + (q - t) % (base - t), 0);
  241. q = (q - t) / (base - t);
  242. }
  243. output[out++] = encode_digit (q, case_flags && case_flags[j]);
  244. bias = adapt (delta, h + 1, h == b);
  245. delta = 0;
  246. ++h;
  247. }
  248. }
  249. ++delta, ++n;
  250. }
  251. *output_length = out;
  252. return punycode_success;
  253. }
  254. /*** Main decode function ***/
  255. /**
  256. * punycode_decode - decode Punycode to Unicode
  257. * @input_length: The number of ASCII code points in the @input array.
  258. * @input: An array of ASCII code points (0..7F).
  259. * @output_length: The caller passes in the maximum number of code
  260. * points that it can receive into the @output array (which is also
  261. * the maximum number of flags that it can receive into the
  262. * @case_flags array, if @case_flags is not a %NULL pointer). On
  263. * successful return it will contain the number of code points
  264. * actually output (which is also the number of flags actually
  265. * output, if case_flags is not a null pointer). The decoder will
  266. * never need to output more code points than the number of ASCII
  267. * code points in the input, because of the way the encoding is
  268. * defined. The number of code points output cannot exceed the
  269. * maximum possible value of a punycode_uint, even if the supplied
  270. * @output_length is greater than that.
  271. * @output: An array of code points like the input argument of
  272. * punycode_encode() (see above).
  273. * @case_flags: A %NULL pointer (if the flags are not needed by the
  274. * caller) or an array of boolean values parallel to the @output
  275. * array. Nonzero (true, flagged) suggests that the corresponding
  276. * Unicode character be forced to uppercase by the caller (if
  277. * possible), and zero (false, unflagged) suggests that it be forced
  278. * to lowercase (if possible). ASCII code points (0..7F) are output
  279. * already in the proper case, but their flags will be set
  280. * appropriately so that applying the flags would be harmless.
  281. *
  282. * Converts Punycode to a sequence of code points (presumed to be
  283. * Unicode code points).
  284. *
  285. * Return value: The return value can be any of the #Punycode_status
  286. * values defined above. If not %PUNYCODE_SUCCESS, then
  287. * @output_length, @output, and @case_flags might contain garbage.
  288. *
  289. **/
  290. int
  291. punycode_decode (size_t input_length,
  292. const char input[],
  293. size_t * output_length,
  294. punycode_uint output[], unsigned char case_flags[])
  295. {
  296. punycode_uint n, out, i, max_out, bias, oldi, w, k, digit, t;
  297. size_t b, j, in;
  298. /* Initialize the state: */
  299. n = initial_n;
  300. out = i = 0;
  301. max_out = *output_length > maxint ? maxint
  302. : (punycode_uint) * output_length;
  303. bias = initial_bias;
  304. /* Handle the basic code points: Let b be the number of input code */
  305. /* points before the last delimiter, or 0 if there is none, then */
  306. /* copy the first b code points to the output. */
  307. for (b = j = 0; j < input_length; ++j)
  308. if (delim (input[j]))
  309. b = j;
  310. if (b > max_out)
  311. return punycode_big_output;
  312. for (j = 0; j < b; ++j)
  313. {
  314. if (case_flags)
  315. case_flags[out] = flagged (input[j]);
  316. if (!basic (input[j]))
  317. return punycode_bad_input;
  318. output[out++] = input[j];
  319. }
  320. /* Main decoding loop: Start just after the last delimiter if any */
  321. /* basic code points were copied; start at the beginning otherwise. */
  322. for (in = b > 0 ? b + 1 : 0; in < input_length; ++out)
  323. {
  324. /* in is the index of the next ASCII code point to be consumed, */
  325. /* and out is the number of code points in the output array. */
  326. /* Decode a generalized variable-length integer into delta, */
  327. /* which gets added to i. The overflow checking is easier */
  328. /* if we increase i as we go, then subtract off its starting */
  329. /* value at the end to obtain delta. */
  330. for (oldi = i, w = 1, k = base;; k += base)
  331. {
  332. if (in >= input_length)
  333. return punycode_bad_input;
  334. digit = decode_digit (input[in++]);
  335. if (digit >= base)
  336. return punycode_bad_input;
  337. if (digit > (maxint - i) / w)
  338. return punycode_overflow;
  339. i += digit * w;
  340. t = k <= bias /* + tmin */ ? tmin : /* +tmin not needed */
  341. k >= bias + tmax ? tmax : k - bias;
  342. if (digit < t)
  343. break;
  344. if (w > maxint / (base - t))
  345. return punycode_overflow;
  346. w *= (base - t);
  347. }
  348. bias = adapt (i - oldi, out + 1, oldi == 0);
  349. /* i was supposed to wrap around from out+1 to 0, */
  350. /* incrementing n each time, so we'll fix that now: */
  351. if (i / (out + 1) > maxint - n)
  352. return punycode_overflow;
  353. n += i / (out + 1);
  354. i %= (out + 1);
  355. /* Insert n at position i of the output: */
  356. /* not needed for Punycode: */
  357. /* if (basic(n)) return punycode_invalid_input; */
  358. if (out >= max_out)
  359. return punycode_big_output;
  360. if (case_flags)
  361. {
  362. memmove (case_flags + i + 1, case_flags + i, out - i);
  363. /* Case of last ASCII code point determines case flag: */
  364. case_flags[i] = flagged (input[in - 1]);
  365. }
  366. memmove (output + i + 1, output + i, (out - i) * sizeof *output);
  367. output[i++] = n;
  368. }
  369. *output_length = (size_t) out;
  370. /* cannot overflow because out <= old value of *output_length */
  371. return punycode_success;
  372. }
  373. /**
  374. * punycode_uint
  375. *
  376. * Unicode code point data type, this is always a 32 bit unsigned
  377. * integer.
  378. */
  379. /**
  380. * Punycode_status
  381. * @PUNYCODE_SUCCESS: Successful operation. This value is guaranteed
  382. * to always be zero, the remaining ones are only guaranteed to hold
  383. * non-zero values, for logical comparison purposes.
  384. * @PUNYCODE_BAD_INPUT: Input is invalid.
  385. * @PUNYCODE_BIG_OUTPUT: Output would exceed the space provided.
  386. * @PUNYCODE_OVERFLOW: Input needs wider integers to process.
  387. *
  388. * Enumerated return codes of punycode_encode() and punycode_decode().
  389. * The value 0 is guaranteed to always correspond to success.
  390. */