# test_idna_uts46.py (5.9 KB)
"""Tests for TR46 code."""
import gzip
import os.path
import re
import sys
import unittest

import idna
  8. if sys.version_info[0] >= 3:
  9. unichr = chr
  10. unicode = str
  11. _RE_UNICODE = re.compile(u"\\\\u([0-9a-fA-F]{4})")
  12. _RE_SURROGATE = re.compile(u"[\uD800-\uDBFF][\uDC00-\uDFFF]")
  13. _SKIP_TESTS = [
  14. # These appear to be errors in the test vectors. All relate to incorrectly applying
  15. # bidi rules across label boundaries. Appears independently confirmed
  16. # at http://www.alvestrand.no/pipermail/idna-update/2017-January/007946.html
  17. u'0\u00E0.\u05D0', u'0a\u0300.\u05D0', u'0A\u0300.\u05D0', u'0\u00C0.\u05D0', 'xn--0-sfa.xn--4db',
  18. u'\u00E0\u02c7.\u05D0', u'a\u0300\u02c7.\u05D0', u'A\u0300\u02c7.\u05D0', u'\u00C0\u02c7.\u05D0',
  19. 'xn--0ca88g.xn--4db', u'0A.\u05D0', u'0a.\u05D0', '0a.xn--4db', 'c.xn--0-eha.xn--4db',
  20. u'c.0\u00FC.\u05D0', u'c.0u\u0308.\u05D0', u'C.0U\u0308.\u05D0', u'C.0\u00DC.\u05D0',
  21. u'\u06B6\u06DF\u3002\u2087\uA806', u'\u06B6\u06DF\u30027\uA806', 'xn--pkb6f.xn--7-x93e',
  22. u'\u06B6\u06DF.7\uA806', u'1.\uAC7E6.\U00010C41\u06D0', u'1.\u1100\u1165\u11B56.\U00010C41\u06D0',
  23. '1.xn--6-945e.xn--glb1794k',
  24. # These are transitional strings that compute to NV8 and thus are not supported
  25. # in IDNA 2008.
  26. u'\U000102F7\u3002\u200D',
  27. u'\U0001D7F5\u9681\u2BEE\uFF0E\u180D\u200C',
  28. u'9\u9681\u2BEE.\u180D\u200C',
  29. u'\u00DF\u200C\uAAF6\u18A5.\u22B6\u2D21\u2D16',
  30. u'ss\u200C\uAAF6\u18A5.\u22B6\u2D21\u2D16',
  31. u'\u00DF\u200C\uAAF6\u18A5\uFF0E\u22B6\u2D21\u2D16',
  32. u'ss\u200C\uAAF6\u18A5\uFF0E\u22B6\u2D21\u2D16',
  33. u'\U00010A57\u200D\u3002\u2D09\u2D15',
  34. u'\U00010A57\u200D\uFF61\u2D09\u2D15',
  35. u'\U0001D7CF\U0001DA19\u2E16.\u200D',
  36. u'1\U0001DA19\u2E16.\u200D',
  37. u'\U0001D7E04\U000E01D7\U0001D23B\uFF0E\u200D\U000102F5\u26E7\u200D',
  38. u'84\U000E01D7\U0001D23B.\u200D\U000102F5\u26E7\u200D',
  39. ]
  40. def unicode_fixup(string):
  41. """Replace backslash-u-XXXX with appropriate unicode characters."""
  42. return _RE_SURROGATE.sub(lambda match: unichr(
  43. (ord(match.group(0)[0]) - 0xd800) * 0x400 +
  44. ord(match.group(0)[1]) - 0xdc00 + 0x10000),
  45. _RE_UNICODE.sub(lambda match: unichr(int(match.group(1), 16)), string))
  46. def parse_idna_test_table(inputstream):
  47. """Parse IdnaTest.txt and return a list of tuples."""
  48. tests = []
  49. for lineno, line in enumerate(inputstream):
  50. line = line.decode("utf-8").strip()
  51. if "#" in line:
  52. line = line.split("#", 1)[0]
  53. if not line:
  54. continue
  55. tests.append((lineno + 1, tuple(field.strip()
  56. for field in line.split(u";"))))
  57. return tests
  58. class TestIdnaTest(unittest.TestCase):
  59. """Run one of the IdnaTest.txt test lines."""
  60. def __init__(self, lineno=None, fields=None):
  61. super(TestIdnaTest, self).__init__()
  62. self.lineno = lineno
  63. self.fields = fields
  64. def id(self):
  65. return "%s.%d" % (super(TestIdnaTest, self).id(), self.lineno)
  66. def shortDescription(self):
  67. if not self.fields:
  68. return ""
  69. return "IdnaTest.txt line %d: %r" % (self.lineno,
  70. u"; ".join(self.fields))
  71. def runTest(self):
  72. if not self.fields:
  73. return
  74. try:
  75. types, source, to_unicode, to_ascii = (unicode_fixup(field)
  76. for field in self.fields[:4])
  77. if (unicode_fixup(u"\\uD804\\uDC39") in source and
  78. sys.version_info[0] < 3):
  79. raise unittest.SkipTest(
  80. "Python 2's Unicode support is too old for this test")
  81. except ValueError:
  82. raise unittest.SkipTest(
  83. "Test requires Python wide Unicode support")
  84. if source in _SKIP_TESTS:
  85. return
  86. if not to_unicode:
  87. to_unicode = source
  88. if not to_ascii:
  89. to_ascii = to_unicode
  90. nv8 = (len(self.fields) > 4 and self.fields[4])
  91. try:
  92. output = idna.decode(source, uts46=True, strict=True)
  93. if to_unicode[0] == u"[":
  94. self.fail("decode() did not emit required error {0} for {1}".format(to_unicode, repr(source)))
  95. self.assertEqual(output, to_unicode, "unexpected decode() output")
  96. except (idna.IDNAError, UnicodeError, ValueError) as exc:
  97. if unicode(exc).startswith(u"Unknown"):
  98. raise unittest.SkipTest("Test requires support for a newer"
  99. " version of Unicode than this Python supports")
  100. if to_unicode[0] != u"[" and not nv8:
  101. raise
  102. for transitional in {
  103. u"B": (True, False),
  104. u"T": (True,),
  105. u"N": (False,),
  106. }[types]:
  107. try:
  108. output = idna.encode(source, uts46=True, strict=True,
  109. transitional=transitional).decode("ascii")
  110. if to_ascii[0] == u"[":
  111. self.fail(
  112. "encode(transitional={0}) did not emit required error {1} for {2}".
  113. format(transitional, to_ascii, repr(source)))
  114. self.assertEqual(output, to_ascii,
  115. "unexpected encode(transitional={0}) output".
  116. format(transitional))
  117. except (idna.IDNAError, UnicodeError, ValueError) as exc:
  118. if unicode(exc).startswith(u"Unknown"):
  119. raise unittest.SkipTest("Test requires support for a newer"
  120. " version of Unicode than this Python supports")
  121. if to_ascii[0] != u"[" and not nv8:
  122. raise
  123. def load_tests(loader, tests, pattern):
  124. """Create a suite of all the individual tests."""
  125. suite = unittest.TestSuite()
  126. with gzip.open(os.path.join(os.path.dirname(__file__),
  127. "IdnaTest.txt.gz"), "rb") as tests_file:
  128. suite.addTests(TestIdnaTest(lineno, fields)
  129. for lineno, fields in parse_idna_test_table(tests_file))
  130. return suite