hypothesis.py 9.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324
  1. # -*- coding: utf-8 -*-
  2. """
  3. Hypothesis strategies.
  4. """
  5. from __future__ import absolute_import
  6. try:
  7. import hypothesis
  8. del hypothesis
  9. except ImportError:
  10. from typing import Tuple
  11. __all__ = () # type: Tuple[str, ...]
  12. else:
  13. import io
  14. import pkgutil
  15. from csv import reader as csv_reader
  16. from os.path import dirname, join
  17. from string import ascii_letters, digits
  18. from sys import maxunicode
  19. from typing import (
  20. Callable,
  21. Iterable,
  22. List,
  23. Optional,
  24. Sequence,
  25. Text,
  26. TypeVar,
  27. cast,
  28. )
  29. from gzip import open as open_gzip
  30. from . import DecodedURL, EncodedURL
  31. from hypothesis import assume
  32. from hypothesis.strategies import (
  33. composite,
  34. integers,
  35. lists,
  36. sampled_from,
  37. text,
  38. )
  39. from idna import IDNAError, check_label, encode as idna_encode
  # Public strategies exported by this module when hypothesis is available.
  __all__ = (
      "decoded_urls",
      "encoded_urls",
      "hostname_labels",
      "hostnames",
      "idna_text",
      "paths",
      "port_numbers",
  )

  # Result type of whatever strategy is handed to ``draw``.
  T = TypeVar("T")
  # Alias used in the type comments below for the ``draw`` argument of
  # composite strategies.
  DrawCallable = Callable[[Callable[..., T]], T]

  # Py2/Py3 shim: keep the ``unichr`` name if it exists, otherwise alias it
  # to ``chr``.
  try:
      unichr
  except NameError:  # Py3
      unichr = chr  # type: Callable[[int], Text]
  def idna_characters():
      # type: () -> Text
      """
      Returns a string containing IDNA characters.

      The string is built on the first call from the packaged
      C{idna-tables-properties.csv.gz} table and cached in the module-level
      C{_idnaCharacters} variable; later calls return the cached value.
      Only code points whose property is C{PVALID} are included.
      """
      global _idnaCharacters

      if not _idnaCharacters:
          result = []

          # Data source "IDNA Derived Properties":
          # https://www.iana.org/assignments/idna-tables-6.3.0/
          # idna-tables-6.3.0.xhtml#idna-tables-properties
          data = io.BytesIO(
              pkgutil.get_data(__name__, "idna-tables-properties.csv.gz")
          )
          with open_gzip(data) as dataFile:
              reader = csv_reader(
                  (line.decode("utf-8") for line in dataFile),
                  delimiter=",",
              )
              next(reader)  # Skip header row
              for row in reader:
                  # Unpacking into exactly three names also rejects rows
                  # that do not have three columns.
                  codes, prop, _description = row

                  if prop != "PVALID":
                      # CONTEXTO or CONTEXTJ are also allowed, but they come
                      # with rules, so we're punting on those here.
                      # See: https://tools.ietf.org/html/rfc5892
                      continue

                  # ``codes`` is either a single hex code point or a
                  # "start-end" range of hex code points.
                  startEnd = codes.split("-", 1)
                  if len(startEnd) == 1:
                      # No end of range given; use start
                      startEnd.append(startEnd[0])
                  start, end = (int(i, 16) for i in startEnd)

                  for i in range(start, end + 1):
                      if i > maxunicode:  # Happens using Py2 on Windows
                          break
                      result.append(unichr(i))

          _idnaCharacters = u"".join(result)

      return _idnaCharacters

  # Cache for idna_characters(); an empty string means "not loaded yet".
  _idnaCharacters = ""  # type: Text
  @composite
  def idna_text(draw, min_size=1, max_size=None):
      # type: (DrawCallable, int, Optional[int]) -> Text
      """
      A strategy which generates IDNA-encodable text.

      Text is drawn from the alphabet returned by L{idna_characters}; any
      draw that C{idna.encode} rejects is discarded via C{assume(False)}.

      @param min_size: The minimum number of characters in the text.
          Must be at least C{1}.

      @param max_size: The maximum number of characters in the text.
          Use C{None} for an unbounded size; otherwise it must be at least
          C{1}.
      """
      alphabet = idna_characters()

      # Empty text is never requested, so both bounds must be at least 1.
      assert min_size >= 1

      if max_size is not None:
          assert max_size >= 1

      result = cast(
          Text,
          draw(text(min_size=min_size, max_size=max_size, alphabet=alphabet)),
      )

      # FIXME: There should be a more efficient way to ensure we produce
      # valid IDNA text.
      try:
          idna_encode(result)
      except IDNAError:
          assume(False)

      return result
  @composite
  def port_numbers(draw, allow_zero=False):
      # type: (DrawCallable, bool) -> int
      """
      A strategy which generates port numbers.

      Values are integers up to and including C{65535}.

      @param allow_zero: Whether to allow port C{0} as a possible value.
          When false, the smallest value generated is C{1}.
      """
      lowest = 0 if allow_zero else 1
      ports = integers(min_value=lowest, max_value=65535)
      return cast(int, draw(ports))
  @composite
  def hostname_labels(draw, allow_idn=True):
      # type: (DrawCallable, bool) -> Text
      """
      A strategy which generates host name labels.

      Labels are between 1 and 63 characters long when drawn. Any label
      rejected by C{idna.check_label} is discarded via C{assume(False)}.

      @param allow_idn: Whether to allow non-ASCII characters as allowed by
          internationalized domain names (IDNs).
      """
      if not allow_idn:
          ascii_alphabet = Text(ascii_letters + digits + u"-")
          candidate = cast(
              Text,
              draw(text(min_size=1, max_size=63, alphabet=ascii_alphabet)),
          )
      else:
          candidate = cast(Text, draw(idna_text(min_size=1, max_size=63)))

          try:
              candidate.encode("ascii")
          except UnicodeEncodeError:
              # A label that is not pure ASCII is measured in its punycode
              # form, leaving room for the "xn--" prefix.
              budget = 63 - len("xn--")
              # Trim from the end instead of rejecting the draw, so
              # hypothesis doesn't have to generate new data.
              while len(candidate.encode("punycode")) > budget:
                  candidate = candidate[:-1]

      # Filter invalid labels.
      # Ideally bogus labels would never be generated in the first place,
      # but that is hard to do reliably.
      try:
          check_label(candidate)
      except UnicodeError:  # pragma: no cover (not always drawn)
          assume(False)

      return candidate
  @composite
  def hostnames(draw, allow_leading_digit=True, allow_idn=True):
      # type: (DrawCallable, bool, bool) -> Text
      """
      A strategy which generates host names.

      A host name is one leading label followed by one to four further
      labels, joined with dots. Trailing labels are dropped until the
      joined length is at most 252 characters.

      @param allow_leading_digit: Whether to allow a leading digit in host
          names; they were not allowed prior to RFC 1123.

      @param allow_idn: Whether to allow non-ASCII characters as allowed by
          internationalized domain names (IDNs).
      """

      def acceptable_first_label(candidate):
          # type: (Text) -> bool
          # Only the first label is subject to the leading-digit rule.
          if allow_leading_digit:
              return True
          return candidate[0] not in digits

      # Draw the first label, filtering out leading digits if needed.
      first_label = cast(
          Text,
          draw(
              hostname_labels(allow_idn=allow_idn).filter(
                  acceptable_first_label
              )
          ),
      )

      # Draw the remaining labels.
      remaining_labels = cast(
          List[Text],
          draw(
              lists(
                  hostname_labels(allow_idn=allow_idn),
                  min_size=1,
                  max_size=4,
              )
          ),
      )

      parts = [first_label]
      parts.extend(remaining_labels)

      # Drop labels from the end until the joined name (labels plus the
      # separating dots) fits in 252 characters. This avoids having to
      # filter the data.
      while sum(map(len, parts)) + len(parts) - 1 > 252:
          del parts[-1]

      return u".".join(parts)
  def path_characters():
      # type: () -> str
      """
      Returns a string containing valid URL path characters.

      The string is computed on the first call and cached in the
      module-level C{_path_characters} variable. It holds every character
      for code points in C{range(maxunicode)} except the reserved
      characters C{#}, C{/} and C{?} and anything that cannot be encoded
      as UTF-8.
      """
      global _path_characters

      if _path_characters is not None:
          return _path_characters

      allowed = []  # type: List[Text]
      # NOTE(review): range(maxunicode) stops one short of maxunicode
      # itself; confirm whether that last code point is meant to be skipped.
      for codepoint in range(maxunicode):
          char = unichr(codepoint)

          # Exclude reserved characters
          if char in "#/?":
              continue

          # Exclude anything not UTF-8 compatible
          try:
              char.encode("utf-8")
          except UnicodeEncodeError:
              continue

          allowed.append(char)

      _path_characters = "".join(allowed)
      return _path_characters

  # Cache for path_characters(); None means "not computed yet".
  _path_characters = None  # type: Optional[str]
  @composite
  def paths(draw):
      # type: (DrawCallable) -> Sequence[Text]
      """
      A strategy which generates URL paths as lists of text segments.

      Each path has at most 10 segments, and each segment is non-empty text
      drawn from the alphabet returned by L{path_characters}.
      """
      segment = text(min_size=1, alphabet=path_characters())
      segments = lists(segment, max_size=10)
      return cast(List[Text], draw(segments))
  @composite
  def encoded_urls(draw):
      # type: (DrawCallable) -> EncodedURL
      """
      A strategy which generates L{EncodedURL}s.

      Each URL has an C{http} or C{https} scheme, a host from L{hostnames},
      a path from L{paths}, and a port from L{port_numbers}; a drawn port
      of C{0} is passed on as C{None}.

      Call the L{EncodedURL.to_uri} method on each URL to get an HTTP
      protocol-friendly URI.
      """
      # The draws happen in this order: port, host, path, then scheme.
      drawn_port = cast(int, draw(port_numbers(allow_zero=True)))
      host = cast(Text, draw(hostnames()))
      path = cast(Sequence[Text], draw(paths()))
      scheme = cast(Text, draw(sampled_from((u"http", u"https"))))

      # Port 0 stands for "no explicit port".
      port = None if drawn_port == 0 else drawn_port  # type: Optional[int]

      return EncodedURL(scheme=scheme, host=host, port=port, path=path)
  @composite
  def decoded_urls(draw):
      # type: (DrawCallable) -> DecodedURL
      """
      A strategy which generates L{DecodedURL}s.

      Each value is a L{DecodedURL} constructed from a URL drawn from
      L{encoded_urls}.

      Call the L{EncodedURL.to_uri} method on each URL to get an HTTP
      protocol-friendly URI.
      """
      encoded = draw(encoded_urls())
      return DecodedURL(encoded)