# This module implements the RFCs 3490 (IDNA) and 3491 (Nameprep)
import codecs
import re
import stringprep
from unicodedata import ucd_3_2_0 as unicodedata
  4. # IDNA section 3.1
  5. dots = re.compile("[\u002E\u3002\uFF0E\uFF61]")
  6. # IDNA section 5
  7. ace_prefix = b"xn--"
  8. sace_prefix = "xn--"
  9. # This assumes query strings, so AllowUnassigned is true
  10. def nameprep(label):
  11. # Map
  12. newlabel = []
  13. for c in label:
  14. if stringprep.in_table_b1(c):
  15. # Map to nothing
  16. continue
  17. newlabel.append(stringprep.map_table_b2(c))
  18. label = "".join(newlabel)
  19. # Normalize
  20. label = unicodedata.normalize("NFKC", label)
  21. # Prohibit
  22. for c in label:
  23. if stringprep.in_table_c12(c) or \
  24. stringprep.in_table_c22(c) or \
  25. stringprep.in_table_c3(c) or \
  26. stringprep.in_table_c4(c) or \
  27. stringprep.in_table_c5(c) or \
  28. stringprep.in_table_c6(c) or \
  29. stringprep.in_table_c7(c) or \
  30. stringprep.in_table_c8(c) or \
  31. stringprep.in_table_c9(c):
  32. raise UnicodeError("Invalid character %r" % c)
  33. # Check bidi
  34. RandAL = [stringprep.in_table_d1(x) for x in label]
  35. if any(RandAL):
  36. # There is a RandAL char in the string. Must perform further
  37. # tests:
  38. # 1) The characters in section 5.8 MUST be prohibited.
  39. # This is table C.8, which was already checked
  40. # 2) If a string contains any RandALCat character, the string
  41. # MUST NOT contain any LCat character.
  42. if any(stringprep.in_table_d2(x) for x in label):
  43. raise UnicodeError("Violation of BIDI requirement 2")
  44. # 3) If a string contains any RandALCat character, a
  45. # RandALCat character MUST be the first character of the
  46. # string, and a RandALCat character MUST be the last
  47. # character of the string.
  48. if not RandAL[0] or not RandAL[-1]:
  49. raise UnicodeError("Violation of BIDI requirement 3")
  50. return label
  51. def ToASCII(label):
  52. try:
  53. # Step 1: try ASCII
  54. label = label.encode("ascii")
  55. except UnicodeError:
  56. pass
  57. else:
  58. # Skip to step 3: UseSTD3ASCIIRules is false, so
  59. # Skip to step 8.
  60. if 0 < len(label) < 64:
  61. return label
  62. raise UnicodeError("label empty or too long")
  63. # Step 2: nameprep
  64. label = nameprep(label)
  65. # Step 3: UseSTD3ASCIIRules is false
  66. # Step 4: try ASCII
  67. try:
  68. label = label.encode("ascii")
  69. except UnicodeError:
  70. pass
  71. else:
  72. # Skip to step 8.
  73. if 0 < len(label) < 64:
  74. return label
  75. raise UnicodeError("label empty or too long")
  76. # Step 5: Check ACE prefix
  77. if label.startswith(sace_prefix):
  78. raise UnicodeError("Label starts with ACE prefix")
  79. # Step 6: Encode with PUNYCODE
  80. label = label.encode("punycode")
  81. # Step 7: Prepend ACE prefix
  82. label = ace_prefix + label
  83. # Step 8: Check size
  84. if 0 < len(label) < 64:
  85. return label
  86. raise UnicodeError("label empty or too long")
  87. def ToUnicode(label):
  88. if len(label) > 1024:
  89. # Protection from https://github.com/python/cpython/issues/98433.
  90. # https://datatracker.ietf.org/doc/html/rfc5894#section-6
  91. # doesn't specify a label size limit prior to NAMEPREP. But having
  92. # one makes practical sense.
  93. # This leaves ample room for nameprep() to remove Nothing characters
  94. # per https://www.rfc-editor.org/rfc/rfc3454#section-3.1 while still
  95. # preventing us from wasting time decoding a big thing that'll just
  96. # hit the actual <= 63 length limit in Step 6.
  97. raise UnicodeError("label way too long")
  98. # Step 1: Check for ASCII
  99. if isinstance(label, bytes):
  100. pure_ascii = True
  101. else:
  102. try:
  103. label = label.encode("ascii")
  104. pure_ascii = True
  105. except UnicodeError:
  106. pure_ascii = False
  107. if not pure_ascii:
  108. # Step 2: Perform nameprep
  109. label = nameprep(label)
  110. # It doesn't say this, but apparently, it should be ASCII now
  111. try:
  112. label = label.encode("ascii")
  113. except UnicodeError:
  114. raise UnicodeError("Invalid character in IDN label")
  115. # Step 3: Check for ACE prefix
  116. if not label.startswith(ace_prefix):
  117. return str(label, "ascii")
  118. # Step 4: Remove ACE prefix
  119. label1 = label[len(ace_prefix):]
  120. # Step 5: Decode using PUNYCODE
  121. result = label1.decode("punycode")
  122. # Step 6: Apply ToASCII
  123. label2 = ToASCII(result)
  124. # Step 7: Compare the result of step 6 with the one of step 3
  125. # label2 will already be in lower case.
  126. if str(label, "ascii").lower() != str(label2, "ascii"):
  127. raise UnicodeError("IDNA does not round-trip", label, label2)
  128. # Step 8: return the result of step 5
  129. return result
  130. ### Codec APIs
  131. class Codec(codecs.Codec):
  132. def encode(self, input, errors='strict'):
  133. if errors != 'strict':
  134. # IDNA is quite clear that implementations must be strict
  135. raise UnicodeError("unsupported error handling "+errors)
  136. if not input:
  137. return b'', 0
  138. try:
  139. result = input.encode('ascii')
  140. except UnicodeEncodeError:
  141. pass
  142. else:
  143. # ASCII name: fast path
  144. labels = result.split(b'.')
  145. for label in labels[:-1]:
  146. if not (0 < len(label) < 64):
  147. raise UnicodeError("label empty or too long")
  148. if len(labels[-1]) >= 64:
  149. raise UnicodeError("label too long")
  150. return result, len(input)
  151. result = bytearray()
  152. labels = dots.split(input)
  153. if labels and not labels[-1]:
  154. trailing_dot = b'.'
  155. del labels[-1]
  156. else:
  157. trailing_dot = b''
  158. for label in labels:
  159. if result:
  160. # Join with U+002E
  161. result.extend(b'.')
  162. result.extend(ToASCII(label))
  163. return bytes(result+trailing_dot), len(input)
  164. def decode(self, input, errors='strict'):
  165. if errors != 'strict':
  166. raise UnicodeError("Unsupported error handling "+errors)
  167. if not input:
  168. return "", 0
  169. # IDNA allows decoding to operate on Unicode strings, too.
  170. if not isinstance(input, bytes):
  171. # XXX obviously wrong, see #3232
  172. input = bytes(input)
  173. if ace_prefix not in input:
  174. # Fast path
  175. try:
  176. return input.decode('ascii'), len(input)
  177. except UnicodeDecodeError:
  178. pass
  179. labels = input.split(b".")
  180. if labels and len(labels[-1]) == 0:
  181. trailing_dot = '.'
  182. del labels[-1]
  183. else:
  184. trailing_dot = ''
  185. result = []
  186. for label in labels:
  187. result.append(ToUnicode(label))
  188. return ".".join(result)+trailing_dot, len(input)
  189. class IncrementalEncoder(codecs.BufferedIncrementalEncoder):
  190. def _buffer_encode(self, input, errors, final):
  191. if errors != 'strict':
  192. # IDNA is quite clear that implementations must be strict
  193. raise UnicodeError("unsupported error handling "+errors)
  194. if not input:
  195. return (b'', 0)
  196. labels = dots.split(input)
  197. trailing_dot = b''
  198. if labels:
  199. if not labels[-1]:
  200. trailing_dot = b'.'
  201. del labels[-1]
  202. elif not final:
  203. # Keep potentially unfinished label until the next call
  204. del labels[-1]
  205. if labels:
  206. trailing_dot = b'.'
  207. result = bytearray()
  208. size = 0
  209. for label in labels:
  210. if size:
  211. # Join with U+002E
  212. result.extend(b'.')
  213. size += 1
  214. result.extend(ToASCII(label))
  215. size += len(label)
  216. result += trailing_dot
  217. size += len(trailing_dot)
  218. return (bytes(result), size)
  219. class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
  220. def _buffer_decode(self, input, errors, final):
  221. if errors != 'strict':
  222. raise UnicodeError("Unsupported error handling "+errors)
  223. if not input:
  224. return ("", 0)
  225. # IDNA allows decoding to operate on Unicode strings, too.
  226. if isinstance(input, str):
  227. labels = dots.split(input)
  228. else:
  229. # Must be ASCII string
  230. input = str(input, "ascii")
  231. labels = input.split(".")
  232. trailing_dot = ''
  233. if labels:
  234. if not labels[-1]:
  235. trailing_dot = '.'
  236. del labels[-1]
  237. elif not final:
  238. # Keep potentially unfinished label until the next call
  239. del labels[-1]
  240. if labels:
  241. trailing_dot = '.'
  242. result = []
  243. size = 0
  244. for label in labels:
  245. result.append(ToUnicode(label))
  246. if size:
  247. size += 1
  248. size += len(label)
  249. result = ".".join(result) + trailing_dot
  250. size += len(trailing_dot)
  251. return (result, size)
  252. class StreamWriter(Codec,codecs.StreamWriter):
  253. pass
  254. class StreamReader(Codec,codecs.StreamReader):
  255. pass
  256. ### encodings module API
  257. def getregentry():
  258. return codecs.CodecInfo(
  259. name='idna',
  260. encode=Codec().encode,
  261. decode=Codec().decode,
  262. incrementalencoder=IncrementalEncoder,
  263. incrementaldecoder=IncrementalDecoder,
  264. streamwriter=StreamWriter,
  265. streamreader=StreamReader,
  266. )