test_idna_uts46.py 8.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220
  1. """Tests for TR46 code."""
  2. import os.path
  3. import re
  4. import unittest
  5. import idna
  6. _RE_UNICODE = re.compile("\\\\u([0-9a-fA-F]{4})")
  7. _RE_SURROGATE = re.compile("[\ud800-\udbff][\udc00-\udfff]")
  8. _SKIP_TESTS = [
  9. # These are strings that are illegal in IDNA 2008. Older versions of the UTS-46 test suite
  10. # had these denoted with the 'NV8' marker but this has been removed, so we need to manually
  11. # review exceptions and add them here to skip them as text vectors if they are invalid.
  12. "\U000102f7\u3002\u200d",
  13. "\U0001d7f5\u9681\u2bee\uff0e\u180d\u200c",
  14. "9\u9681\u2bee.\u180d\u200c",
  15. "\u00df\u200c\uaaf6\u18a5.\u22b6\u2d21\u2d16",
  16. "ss\u200c\uaaf6\u18a5.\u22b6\u2d21\u2d16",
  17. "\u00df\u200c\uaaf6\u18a5\uff0e\u22b6\u2d21\u2d16",
  18. "ss\u200c\uaaf6\u18a5\uff0e\u22b6\u2d21\u2d16",
  19. "\U00010a57\u200d\u3002\u2d09\u2d15",
  20. "\U00010a57\u200d\uff61\u2d09\u2d15",
  21. "\U0001d7cf\U0001da19\u2e16.\u200d",
  22. "1\U0001da19\u2e16.\u200d",
  23. "\U0001d7e04\U000e01d7\U0001d23b\uff0e\u200d\U000102f5\u26e7\u200d",
  24. "84\U000e01d7\U0001d23b.\u200d\U000102f5\u26e7\u200d",
  25. "\u00a1",
  26. "xn--7a",
  27. "\u19da",
  28. "xn--pkf",
  29. "\u2615",
  30. "xn--53h",
  31. "\U0001e937.\U00010b90\U0001e881\U00010e60\u0624",
  32. "\U0001e937.\U00010b90\U0001e881\U00010e60\u0648\u0654",
  33. "\U0001e915.\U00010b90\U0001e881\U00010e60\u0648\u0654",
  34. "\U0001e915.\U00010b90\U0001e881\U00010e60\u0624",
  35. "xn--ve6h.xn--jgb1694kz0b2176a",
  36. "\u00df\u3002\U000102f3\u2d0c\u0fb8",
  37. "ss\u3002\U000102f3\u2d0c\u0fb8",
  38. "ss.xn--lgd921mvv0m",
  39. "ss.\U000102f3\u2d0c\u0fb8",
  40. "xn--zca.xn--lgd921mvv0m",
  41. "\u00df.\U000102f3\u2d0c\u0fb8",
  42. "\u00df\uff61\U000102f3\u2d0c\u0fb8",
  43. "ss\uff61\U000102f3\u2d0c\u0fb8",
  44. "\u16ad\uff61\U0001d320\u00df\U00016af1",
  45. "\u16ad\u3002\U0001d320\u00df\U00016af1",
  46. "\u16ad\u3002\U0001d320SS\U00016af1",
  47. "\u16ad\u3002\U0001d320ss\U00016af1",
  48. "\u16ad\u3002\U0001d320Ss\U00016af1",
  49. "xn--hwe.xn--ss-ci1ub261a",
  50. "\u16ad.\U0001d320ss\U00016af1",
  51. "\u16ad.\U0001d320SS\U00016af1",
  52. "\u16ad.\U0001d320Ss\U00016af1",
  53. "xn--hwe.xn--zca4946pblnc",
  54. "\u16ad.\U0001d320\u00df\U00016af1",
  55. "\u16ad\uff61\U0001d320SS\U00016af1",
  56. "\u16ad\uff61\U0001d320ss\U00016af1",
  57. "\u16ad\uff61\U0001d320Ss\U00016af1",
  58. "\u2d1a\U000102f8\U000e0104\u30025\ud7f6\u103a",
  59. "xn--ilj2659d.xn--5-dug9054m",
  60. "\u2d1a\U000102f8.5\ud7f6\u103a",
  61. "\u2d1a\U000102f8\U000e0104\u3002\U0001d7dd\ud7f6\u103a",
  62. "xn--9-mfs8024b.",
  63. "9\u9681\u2bee.",
  64. "xn--ss-4epx629f.xn--ifh802b6a",
  65. "ss\uaaf6\u18a5.\u22b6\u2d21\u2d16",
  66. "xn--pt9c.xn--0kjya",
  67. "\U00010a57.\u2d09\u2d15",
  68. "\ua5f7\U00011180.\u075d\U00010a52",
  69. "xn--ju8a625r.xn--hpb0073k",
  70. "\u03c2.\u0641\u0645\u064a\U0001f79b1.",
  71. "\u03a3.\u0641\u0645\u064a\U0001f79b1.",
  72. "\u03c3.\u0641\u0645\u064a\U0001f79b1.",
  73. "xn--4xa.xn--1-gocmu97674d.",
  74. "xn--3xa.xn--1-gocmu97674d.",
  75. "xn--1-5bt6845n.",
  76. "1\U0001da19\u2e16.",
  77. "xn--84-s850a.xn--59h6326e",
  78. "84\U0001d23b.\U000102f5\u26e7",
  79. "xn--r97c.",
  80. "\U000102f7.",
  81. # These appear to be errors in the test vectors. All relate to incorrectly applying
  82. # bidi rules across label boundaries. Appears independently confirmed
  83. # at http://www.alvestrand.no/pipermail/idna-update/2017-January/007946.html
  84. "0\u00e0.\u05d0",
  85. "0a\u0300.\u05d0",
  86. "0A\u0300.\u05d0",
  87. "0\u00c0.\u05d0",
  88. "xn--0-sfa.xn--4db",
  89. "\u00e0\u02c7.\u05d0",
  90. "a\u0300\u02c7.\u05d0",
  91. "A\u0300\u02c7.\u05d0",
  92. "\u00c0\u02c7.\u05d0",
  93. "xn--0ca88g.xn--4db",
  94. "0A.\u05d0",
  95. "0a.\u05d0",
  96. "0a.xn--4db",
  97. "c.xn--0-eha.xn--4db",
  98. "c.0\u00fc.\u05d0",
  99. "c.0u\u0308.\u05d0",
  100. "C.0U\u0308.\u05d0",
  101. "C.0\u00dc.\u05d0",
  102. "C.0\u00fc.\u05d0",
  103. "C.0\u0075\u0308.\u05d0",
  104. "\u06b6\u06df\u3002\u2087\ua806",
  105. "\u06b6\u06df\u30027\ua806",
  106. "xn--pkb6f.xn--7-x93e",
  107. "\u06b6\u06df.7\ua806",
  108. "1.\uac7e6.\U00010c41\u06d0",
  109. "1.\u1100\u1165\u11b56.\U00010c41\u06d0",
  110. "1.xn--6-945e.xn--glb1794k",
  111. ]
  112. def unicode_fixup(string):
  113. """Replace backslash-u-XXXX with appropriate unicode characters."""
  114. return _RE_SURROGATE.sub(
  115. lambda match: chr((ord(match.group(0)[0]) - 0xD800) * 0x400 + ord(match.group(0)[1]) - 0xDC00 + 0x10000),
  116. _RE_UNICODE.sub(lambda match: chr(int(match.group(1), 16)), string),
  117. )
  118. def parse_idna_test_table(inputstream):
  119. """Parse IdnaTestV2.txt and return a list of tuples."""
  120. for lineno, line in enumerate(inputstream):
  121. line = line.decode("utf-8").strip()
  122. if "#" in line:
  123. line = line.split("#", 1)[0]
  124. if not line:
  125. continue
  126. yield ((lineno + 1, tuple(field.strip() for field in line.split(";"))))
  127. class TestIdnaTest(unittest.TestCase):
  128. """Run one of the IdnaTestV2.txt test lines."""
  129. def __init__(self, lineno=None, fields=None):
  130. super().__init__()
  131. self.lineno = lineno
  132. self.fields = fields
  133. def id(self):
  134. return "{}.{}".format(super().id(), self.lineno)
  135. def shortDescription(self):
  136. if not self.fields:
  137. return ""
  138. return "IdnaTestV2.txt line {}: {}".format(self.lineno, "; ".join(self.fields))
  139. def runTest(self):
  140. if not self.fields:
  141. return
  142. (
  143. source,
  144. to_unicode,
  145. to_unicode_status,
  146. to_ascii,
  147. to_ascii_status,
  148. to_ascii_t,
  149. to_ascii_t_status,
  150. ) = self.fields
  151. if source in _SKIP_TESTS:
  152. return
  153. if not to_unicode:
  154. to_unicode = source
  155. if not to_unicode_status:
  156. to_unicode_status = "[]"
  157. if not to_ascii:
  158. to_ascii = to_unicode
  159. if not to_ascii_status:
  160. to_ascii_status = to_unicode_status
  161. if not to_ascii_t:
  162. to_ascii_t = to_ascii
  163. if not to_ascii_t_status:
  164. to_ascii_t_status = to_ascii_status
  165. try:
  166. output = idna.decode(source, uts46=True, strict=True)
  167. if to_unicode_status != "[]":
  168. self.fail("decode() did not emit required error {} for {}".format(to_unicode, repr(source)))
  169. self.assertEqual(output, to_unicode, "unexpected decode() output")
  170. except (idna.IDNAError, UnicodeError, ValueError) as exc:
  171. if str(exc).startswith("Unknown"):
  172. raise unittest.SkipTest("Test requires support for a newer" " version of Unicode than this Python supports")
  173. if to_unicode_status == "[]":
  174. raise
  175. try:
  176. output = idna.encode(source, uts46=True, strict=True).decode("ascii")
  177. if to_ascii_status != "[]":
  178. self.fail("encode() did not emit required error {} for {}".format(to_ascii_status, repr(source)))
  179. self.assertEqual(output, to_ascii, "unexpected encode() output")
  180. except (idna.IDNAError, UnicodeError, ValueError) as exc:
  181. if str(exc).startswith("Unknown"):
  182. raise unittest.SkipTest("Test requires support for a newer" " version of Unicode than this Python supports")
  183. if to_ascii_status == "[]":
  184. raise
  185. try:
  186. output = idna.encode(source, uts46=True, strict=True, transitional=True).decode("ascii")
  187. if to_ascii_t_status != "[]":
  188. self.fail(
  189. "encode(transitional=True) did not emit required error {} for {}".format(to_ascii_t_status, repr(source))
  190. )
  191. self.assertEqual(output, to_ascii_t, "unexpected encode() output")
  192. except (idna.IDNAError, UnicodeError, ValueError) as exc:
  193. if str(exc).startswith("Unknown"):
  194. raise unittest.SkipTest("Test requires support for a newer" " version of Unicode than this Python supports")
  195. if to_ascii_t_status == "[]":
  196. raise
  197. def load_tests(loader, tests, pattern):
  198. """Create a suite of all the individual tests."""
  199. suite = unittest.TestSuite()
  200. with open(os.path.join(os.path.dirname(__file__), "IdnaTestV2.txt"), "rb") as tests_file:
  201. suite.addTests(TestIdnaTest(lineno, fields) for lineno, fields in parse_idna_test_table(tests_file))
  202. return suite