punycode.cpp 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. *******************************************************************************
  5. *
  6. * Copyright (C) 2002-2011, International Business Machines
  7. * Corporation and others. All Rights Reserved.
  8. *
  9. *******************************************************************************
  10. * file name: punycode.cpp
  11. * encoding: UTF-8
  12. * tab size: 8 (not used)
  13. * indentation:4
  14. *
  15. * created on: 2002jan31
  16. * created by: Markus W. Scherer
  17. */
  18. /* This ICU code derived from: */
  19. /*
  20. punycode.c 0.4.0 (2001-Nov-17-Sat)
  21. http://www.cs.berkeley.edu/~amc/idn/
  22. Adam M. Costello
  23. http://www.nicemice.net/amc/
  24. Disclaimer and license
  25. Regarding this entire document or any portion of it (including
  26. the pseudocode and C code), the author makes no guarantees and
  27. is not responsible for any damage resulting from its use. The
  28. author grants irrevocable permission to anyone to use, modify,
  29. and distribute it in any way that does not diminish the rights
  30. of anyone else to use, modify, and distribute it, provided that
  31. redistributed derivative works do not contain misleading author or
  32. version information. Derivative works need not be licensed under
  33. similar terms.
  34. */
  35. /*
  36. * ICU modifications:
  37. * - ICU data types and coding conventions
  38. * - ICU string buffer handling with implicit source lengths
  39. * and destination preflighting
  40. * - UTF-16 handling
  41. */
  42. #include "unicode/utypes.h"
  43. #if !UCONFIG_NO_IDNA
  44. #include "unicode/ustring.h"
  45. #include "unicode/utf.h"
  46. #include "unicode/utf16.h"
  47. #include "ustr_imp.h"
  48. #include "cstring.h"
  49. #include "cmemory.h"
  50. #include "punycode.h"
  51. #include "uassert.h"
  52. /* Punycode ----------------------------------------------------------------- */
  53. /* Punycode parameters for Bootstring */
  54. #define BASE 36
  55. #define TMIN 1
  56. #define TMAX 26
  57. #define SKEW 38
  58. #define DAMP 700
  59. #define INITIAL_BIAS 72
  60. #define INITIAL_N 0x80
  61. /* "Basic" Unicode/ASCII code points */
  62. #define _HYPHEN 0X2d
  63. #define DELIMITER _HYPHEN
  64. #define _ZERO_ 0X30
  65. #define _NINE 0x39
  66. #define _SMALL_A 0X61
  67. #define _SMALL_Z 0X7a
  68. #define _CAPITAL_A 0X41
  69. #define _CAPITAL_Z 0X5a
  70. #define IS_BASIC(c) ((c)<0x80)
  71. #define IS_BASIC_UPPERCASE(c) (_CAPITAL_A<=(c) && (c)<=_CAPITAL_Z)
  72. /**
  73. * digitToBasic() returns the basic code point whose value
  74. * (when used for representing integers) is d, which must be in the
  75. * range 0 to BASE-1. The lowercase form is used unless the uppercase flag is
  76. * nonzero, in which case the uppercase form is used.
  77. */
  78. static inline char
  79. digitToBasic(int32_t digit, UBool uppercase) {
  80. /* 0..25 map to ASCII a..z or A..Z */
  81. /* 26..35 map to ASCII 0..9 */
  82. if(digit<26) {
  83. if(uppercase) {
  84. return static_cast<char>(_CAPITAL_A + digit);
  85. } else {
  86. return static_cast<char>(_SMALL_A + digit);
  87. }
  88. } else {
  89. return static_cast<char>((_ZERO_ - 26) + digit);
  90. }
  91. }
  92. /**
  93. * @return the numeric value of a basic code point (for use in representing integers)
  94. * in the range 0 to BASE-1, or a negative value if cp is invalid.
  95. */
  96. static int32_t decodeDigit(int32_t cp) {
  97. if(cp<=u'Z') {
  98. if(cp<=u'9') {
  99. if(cp<u'0') {
  100. return -1;
  101. } else {
  102. return cp-u'0'+26; // 0..9 -> 26..35
  103. }
  104. } else {
  105. return cp-u'A'; // A-Z -> 0..25
  106. }
  107. } else if(cp<=u'z') {
  108. return cp-'a'; // a..z -> 0..25
  109. } else {
  110. return -1;
  111. }
  112. }
  113. static inline char
  114. asciiCaseMap(char b, UBool uppercase) {
  115. if(uppercase) {
  116. if(_SMALL_A<=b && b<=_SMALL_Z) {
  117. b-=(_SMALL_A-_CAPITAL_A);
  118. }
  119. } else {
  120. if(_CAPITAL_A<=b && b<=_CAPITAL_Z) {
  121. b+=(_SMALL_A-_CAPITAL_A);
  122. }
  123. }
  124. return b;
  125. }
  126. /* Punycode-specific Bootstring code ---------------------------------------- */
  127. /*
  128. * The following code omits the {parts} of the pseudo-algorithm in the spec
  129. * that are not used with the Punycode parameter set.
  130. */
  131. /* Bias adaptation function. */
  132. static int32_t
  133. adaptBias(int32_t delta, int32_t length, UBool firstTime) {
  134. int32_t count;
  135. if(firstTime) {
  136. delta/=DAMP;
  137. } else {
  138. delta/=2;
  139. }
  140. delta+=delta/length;
  141. for(count=0; delta>((BASE-TMIN)*TMAX)/2; count+=BASE) {
  142. delta/=(BASE-TMIN);
  143. }
  144. return count+(((BASE-TMIN+1)*delta)/(delta+SKEW));
  145. }
  146. namespace {
  147. // ICU-13727: Limit input length for n^2 algorithm
  148. // where well-formed strings are at most 59 characters long.
  149. constexpr int32_t ENCODE_MAX_CODE_UNITS=1000;
  150. constexpr int32_t DECODE_MAX_CHARS=2000;
  151. } // namespace
  152. // encode
  153. U_CAPI int32_t
  154. u_strToPunycode(const char16_t *src, int32_t srcLength,
  155. char16_t *dest, int32_t destCapacity,
  156. const UBool *caseFlags,
  157. UErrorCode *pErrorCode) {
  158. int32_t cpBuffer[ENCODE_MAX_CODE_UNITS];
  159. int32_t n, delta, handledCPCount, basicLength, destLength, bias, j, m, q, k, t, srcCPCount;
  160. char16_t c, c2;
  161. /* argument checking */
  162. if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) {
  163. return 0;
  164. }
  165. if(src==nullptr || srcLength<-1 || (dest==nullptr && destCapacity!=0)) {
  166. *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
  167. return 0;
  168. }
  169. if (srcLength>ENCODE_MAX_CODE_UNITS) {
  170. *pErrorCode=U_INPUT_TOO_LONG_ERROR;
  171. return 0;
  172. }
  173. /*
  174. * Handle the basic code points and
  175. * convert extended ones to UTF-32 in cpBuffer (caseFlag in sign bit):
  176. */
  177. srcCPCount=destLength=0;
  178. if(srcLength==-1) {
  179. /* NUL-terminated input */
  180. for(j=0; /* no condition */; ++j) {
  181. if((c=src[j])==0) {
  182. break;
  183. }
  184. if(j>=ENCODE_MAX_CODE_UNITS) {
  185. *pErrorCode=U_INPUT_TOO_LONG_ERROR;
  186. return 0;
  187. }
  188. if(IS_BASIC(c)) {
  189. cpBuffer[srcCPCount++]=0;
  190. if(destLength<destCapacity) {
  191. dest[destLength]=
  192. caseFlags!=nullptr ?
  193. asciiCaseMap((char)c, caseFlags[j]) :
  194. (char)c;
  195. }
  196. ++destLength;
  197. } else {
  198. n=(caseFlags!=nullptr && caseFlags[j])<<31L;
  199. if(U16_IS_SINGLE(c)) {
  200. n|=c;
  201. } else if(U16_IS_LEAD(c) && U16_IS_TRAIL(c2=src[j+1])) {
  202. ++j;
  203. n|=(int32_t)U16_GET_SUPPLEMENTARY(c, c2);
  204. } else {
  205. /* error: unmatched surrogate */
  206. *pErrorCode=U_INVALID_CHAR_FOUND;
  207. return 0;
  208. }
  209. cpBuffer[srcCPCount++]=n;
  210. }
  211. }
  212. } else {
  213. /* length-specified input */
  214. for(j=0; j<srcLength; ++j) {
  215. c=src[j];
  216. if(IS_BASIC(c)) {
  217. cpBuffer[srcCPCount++]=0;
  218. if(destLength<destCapacity) {
  219. dest[destLength]=
  220. caseFlags!=nullptr ?
  221. asciiCaseMap((char)c, caseFlags[j]) :
  222. (char)c;
  223. }
  224. ++destLength;
  225. } else {
  226. n=(caseFlags!=nullptr && caseFlags[j])<<31L;
  227. if(U16_IS_SINGLE(c)) {
  228. n|=c;
  229. } else if(U16_IS_LEAD(c) && (j+1)<srcLength && U16_IS_TRAIL(c2=src[j+1])) {
  230. ++j;
  231. n|=(int32_t)U16_GET_SUPPLEMENTARY(c, c2);
  232. } else {
  233. /* error: unmatched surrogate */
  234. *pErrorCode=U_INVALID_CHAR_FOUND;
  235. return 0;
  236. }
  237. cpBuffer[srcCPCount++]=n;
  238. }
  239. }
  240. }
  241. /* Finish the basic string - if it is not empty - with a delimiter. */
  242. basicLength=destLength;
  243. if(basicLength>0) {
  244. if(destLength<destCapacity) {
  245. dest[destLength]=DELIMITER;
  246. }
  247. ++destLength;
  248. }
  249. /*
  250. * handledCPCount is the number of code points that have been handled
  251. * basicLength is the number of basic code points
  252. * destLength is the number of chars that have been output
  253. */
  254. /* Initialize the state: */
  255. n=INITIAL_N;
  256. delta=0;
  257. bias=INITIAL_BIAS;
  258. /* Main encoding loop: */
  259. for(handledCPCount=basicLength; handledCPCount<srcCPCount; /* no op */) {
  260. /*
  261. * All non-basic code points < n have been handled already.
  262. * Find the next larger one:
  263. */
  264. for(m=0x7fffffff, j=0; j<srcCPCount; ++j) {
  265. q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
  266. if(n<=q && q<m) {
  267. m=q;
  268. }
  269. }
  270. /*
  271. * Increase delta enough to advance the decoder's
  272. * <n,i> state to <m,0>, but guard against overflow:
  273. */
  274. if(m-n>(0x7fffffff-handledCPCount-delta)/(handledCPCount+1)) {
  275. *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
  276. return 0;
  277. }
  278. delta+=(m-n)*(handledCPCount+1);
  279. n=m;
  280. /* Encode a sequence of same code points n */
  281. for(j=0; j<srcCPCount; ++j) {
  282. q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
  283. if(q<n) {
  284. ++delta;
  285. } else if(q==n) {
  286. /* Represent delta as a generalized variable-length integer: */
  287. for(q=delta, k=BASE; /* no condition */; k+=BASE) {
  288. /** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt
  289. t=k-bias;
  290. if(t<TMIN) {
  291. t=TMIN;
  292. } else if(t>TMAX) {
  293. t=TMAX;
  294. }
  295. */
  296. t=k-bias;
  297. if(t<TMIN) {
  298. t=TMIN;
  299. } else if(k>=(bias+TMAX)) {
  300. t=TMAX;
  301. }
  302. if(q<t) {
  303. break;
  304. }
  305. if(destLength<destCapacity) {
  306. dest[destLength]=digitToBasic(t+(q-t)%(BASE-t), 0);
  307. }
  308. ++destLength;
  309. q=(q-t)/(BASE-t);
  310. }
  311. if(destLength<destCapacity) {
  312. dest[destLength] = digitToBasic(q, cpBuffer[j] < 0);
  313. }
  314. ++destLength;
  315. bias = adaptBias(delta, handledCPCount + 1, handledCPCount == basicLength);
  316. delta=0;
  317. ++handledCPCount;
  318. }
  319. }
  320. ++delta;
  321. ++n;
  322. }
  323. return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
  324. }
  325. // decode
  326. U_CAPI int32_t
  327. u_strFromPunycode(const char16_t *src, int32_t srcLength,
  328. char16_t *dest, int32_t destCapacity,
  329. UBool *caseFlags,
  330. UErrorCode *pErrorCode) {
  331. int32_t n, destLength, i, bias, basicLength, j, in, oldi, w, k, digit, t,
  332. destCPCount, firstSupplementaryIndex, cpLength;
  333. char16_t b;
  334. /* argument checking */
  335. if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) {
  336. return 0;
  337. }
  338. if(src==nullptr || srcLength<-1 || (dest==nullptr && destCapacity!=0)) {
  339. *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
  340. return 0;
  341. }
  342. if(srcLength==-1) {
  343. srcLength=u_strlen(src);
  344. }
  345. if (srcLength>DECODE_MAX_CHARS) {
  346. *pErrorCode=U_INPUT_TOO_LONG_ERROR;
  347. return 0;
  348. }
  349. /*
  350. * Handle the basic code points:
  351. * Let basicLength be the number of input code points
  352. * before the last delimiter, or 0 if there is none,
  353. * then copy the first basicLength code points to the output.
  354. *
  355. * The two following loops iterate backward.
  356. */
  357. for(j=srcLength; j>0;) {
  358. if(src[--j]==DELIMITER) {
  359. break;
  360. }
  361. }
  362. destLength=basicLength=destCPCount=j;
  363. U_ASSERT(destLength>=0);
  364. while(j>0) {
  365. b=src[--j];
  366. if(!IS_BASIC(b)) {
  367. *pErrorCode=U_INVALID_CHAR_FOUND;
  368. return 0;
  369. }
  370. if(j<destCapacity) {
  371. dest[j] = b;
  372. if(caseFlags!=nullptr) {
  373. caseFlags[j]=IS_BASIC_UPPERCASE(b);
  374. }
  375. }
  376. }
  377. /* Initialize the state: */
  378. n=INITIAL_N;
  379. i=0;
  380. bias=INITIAL_BIAS;
  381. firstSupplementaryIndex=1000000000;
  382. /*
  383. * Main decoding loop:
  384. * Start just after the last delimiter if any
  385. * basic code points were copied; start at the beginning otherwise.
  386. */
  387. for(in=basicLength>0 ? basicLength+1 : 0; in<srcLength; /* no op */) {
  388. /*
  389. * in is the index of the next character to be consumed, and
  390. * destCPCount is the number of code points in the output array.
  391. *
  392. * Decode a generalized variable-length integer into delta,
  393. * which gets added to i. The overflow checking is easier
  394. * if we increase i as we go, then subtract off its starting
  395. * value at the end to obtain delta.
  396. */
  397. for(oldi=i, w=1, k=BASE; /* no condition */; k+=BASE) {
  398. if(in>=srcLength) {
  399. *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  400. return 0;
  401. }
  402. digit=decodeDigit(src[in++]);
  403. if(digit<0) {
  404. *pErrorCode=U_INVALID_CHAR_FOUND;
  405. return 0;
  406. }
  407. if(digit>(0x7fffffff-i)/w) {
  408. /* integer overflow */
  409. *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  410. return 0;
  411. }
  412. i+=digit*w;
  413. /** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt
  414. t=k-bias;
  415. if(t<TMIN) {
  416. t=TMIN;
  417. } else if(t>TMAX) {
  418. t=TMAX;
  419. }
  420. */
  421. t=k-bias;
  422. if(t<TMIN) {
  423. t=TMIN;
  424. } else if(k>=(bias+TMAX)) {
  425. t=TMAX;
  426. }
  427. if(digit<t) {
  428. break;
  429. }
  430. if(w>0x7fffffff/(BASE-t)) {
  431. /* integer overflow */
  432. *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  433. return 0;
  434. }
  435. w*=BASE-t;
  436. }
  437. /*
  438. * Modification from sample code:
  439. * Increments destCPCount here,
  440. * where needed instead of in for() loop tail.
  441. */
  442. ++destCPCount;
  443. bias = adaptBias(i - oldi, destCPCount, oldi == 0);
  444. /*
  445. * i was supposed to wrap around from (incremented) destCPCount to 0,
  446. * incrementing n each time, so we'll fix that now:
  447. */
  448. if(i/destCPCount>(0x7fffffff-n)) {
  449. /* integer overflow */
  450. *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  451. return 0;
  452. }
  453. n+=i/destCPCount;
  454. i%=destCPCount;
  455. /* not needed for Punycode: */
  456. /* if (decode_digit(n) <= BASE) return punycode_invalid_input; */
  457. if(n>0x10ffff || U_IS_SURROGATE(n)) {
  458. /* Unicode code point overflow */
  459. *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  460. return 0;
  461. }
  462. /* Insert n at position i of the output: */
  463. cpLength=U16_LENGTH(n);
  464. if(dest!=nullptr && ((destLength+cpLength)<=destCapacity)) {
  465. int32_t codeUnitIndex;
  466. /*
  467. * Handle indexes when supplementary code points are present.
  468. *
  469. * In almost all cases, there will be only BMP code points before i
  470. * and even in the entire string.
  471. * This is handled with the same efficiency as with UTF-32.
  472. *
  473. * Only the rare cases with supplementary code points are handled
  474. * more slowly - but not too bad since this is an insertion anyway.
  475. */
  476. if(i<=firstSupplementaryIndex) {
  477. codeUnitIndex=i;
  478. if(cpLength>1) {
  479. firstSupplementaryIndex=codeUnitIndex;
  480. } else {
  481. ++firstSupplementaryIndex;
  482. }
  483. } else {
  484. codeUnitIndex=firstSupplementaryIndex;
  485. U16_FWD_N(dest, codeUnitIndex, destLength, i-codeUnitIndex);
  486. }
  487. /* use the char16_t index codeUnitIndex instead of the code point index i */
  488. if(codeUnitIndex<destLength) {
  489. uprv_memmove(dest+codeUnitIndex+cpLength,
  490. dest+codeUnitIndex,
  491. (destLength-codeUnitIndex)*U_SIZEOF_UCHAR);
  492. if(caseFlags!=nullptr) {
  493. uprv_memmove(caseFlags+codeUnitIndex+cpLength,
  494. caseFlags+codeUnitIndex,
  495. destLength-codeUnitIndex);
  496. }
  497. }
  498. if(cpLength==1) {
  499. /* BMP, insert one code unit */
  500. dest[codeUnitIndex]=(char16_t)n;
  501. } else {
  502. /* supplementary character, insert two code units */
  503. dest[codeUnitIndex]=U16_LEAD(n);
  504. dest[codeUnitIndex+1]=U16_TRAIL(n);
  505. }
  506. if(caseFlags!=nullptr) {
  507. /* Case of last character determines uppercase flag: */
  508. caseFlags[codeUnitIndex]=IS_BASIC_UPPERCASE(src[in-1]);
  509. if(cpLength==2) {
  510. caseFlags[codeUnitIndex+1]=false;
  511. }
  512. }
  513. }
  514. destLength+=cpLength;
  515. U_ASSERT(destLength>=0);
  516. ++i;
  517. }
  518. return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
  519. }
  520. /* ### check notes on overflow handling - only necessary if not IDNA? are these Punycode functions to be public? */
  521. #endif /* #if !UCONFIG_NO_IDNA */