url.py 6.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231
  1. # SPDX-License-Identifier: MIT
  2. from __future__ import absolute_import
  3. from collections import namedtuple
  4. from ..exceptions import LocationParseError
  5. url_attrs = ['scheme', 'auth', 'host', 'port', 'path', 'query', 'fragment']
  6. # We only want to normalize urls with an HTTP(S) scheme.
  7. # urllib3 infers URLs without a scheme (None) to be http.
  8. NORMALIZABLE_SCHEMES = ('http', 'https', None)
  9. class Url(namedtuple('Url', url_attrs)):
  10. """
  11. Datastructure for representing an HTTP URL. Used as a return value for
  12. :func:`parse_url`. Both the scheme and host are normalized as they are
  13. both case-insensitive according to RFC 3986.
  14. """
  15. __slots__ = ()
  16. def __new__(cls, scheme=None, auth=None, host=None, port=None, path=None,
  17. query=None, fragment=None):
  18. if path and not path.startswith('/'):
  19. path = '/' + path
  20. if scheme:
  21. scheme = scheme.lower()
  22. if host and scheme in NORMALIZABLE_SCHEMES:
  23. host = host.lower()
  24. return super(Url, cls).__new__(cls, scheme, auth, host, port, path,
  25. query, fragment)
  26. @property
  27. def hostname(self):
  28. """For backwards-compatibility with urlparse. We're nice like that."""
  29. return self.host
  30. @property
  31. def request_uri(self):
  32. """Absolute path including the query string."""
  33. uri = self.path or '/'
  34. if self.query is not None:
  35. uri += '?' + self.query
  36. return uri
  37. @property
  38. def netloc(self):
  39. """Network location including host and port"""
  40. if self.port:
  41. return '%s:%d' % (self.host, self.port)
  42. return self.host
  43. @property
  44. def url(self):
  45. """
  46. Convert self into a url
  47. This function should more or less round-trip with :func:`.parse_url`. The
  48. returned url may not be exactly the same as the url inputted to
  49. :func:`.parse_url`, but it should be equivalent by the RFC (e.g., urls
  50. with a blank port will have : removed).
  51. Example: ::
  52. >>> U = parse_url('http://google.com/mail/')
  53. >>> U.url
  54. 'http://google.com/mail/'
  55. >>> Url('http', 'username:password', 'host.com', 80,
  56. ... '/path', 'query', 'fragment').url
  57. 'http://username:password@host.com:80/path?query#fragment'
  58. """
  59. scheme, auth, host, port, path, query, fragment = self
  60. url = ''
  61. # We use "is not None" we want things to happen with empty strings (or 0 port)
  62. if scheme is not None:
  63. url += scheme + '://'
  64. if auth is not None:
  65. url += auth + '@'
  66. if host is not None:
  67. url += host
  68. if port is not None:
  69. url += ':' + str(port)
  70. if path is not None:
  71. url += path
  72. if query is not None:
  73. url += '?' + query
  74. if fragment is not None:
  75. url += '#' + fragment
  76. return url
  77. def __str__(self):
  78. return self.url
  79. def split_first(s, delims):
  80. """
  81. Given a string and an iterable of delimiters, split on the first found
  82. delimiter. Return two split parts and the matched delimiter.
  83. If not found, then the first part is the full input string.
  84. Example::
  85. >>> split_first('foo/bar?baz', '?/=')
  86. ('foo', 'bar?baz', '/')
  87. >>> split_first('foo/bar?baz', '123')
  88. ('foo/bar?baz', '', None)
  89. Scales linearly with number of delims. Not ideal for large number of delims.
  90. """
  91. min_idx = None
  92. min_delim = None
  93. for d in delims:
  94. idx = s.find(d)
  95. if idx < 0:
  96. continue
  97. if min_idx is None or idx < min_idx:
  98. min_idx = idx
  99. min_delim = d
  100. if min_idx is None or min_idx < 0:
  101. return s, '', None
  102. return s[:min_idx], s[min_idx + 1:], min_delim
  103. def parse_url(url):
  104. """
  105. Given a url, return a parsed :class:`.Url` namedtuple. Best-effort is
  106. performed to parse incomplete urls. Fields not provided will be None.
  107. Partly backwards-compatible with :mod:`urlparse`.
  108. Example::
  109. >>> parse_url('http://google.com/mail/')
  110. Url(scheme='http', host='google.com', port=None, path='/mail/', ...)
  111. >>> parse_url('google.com:80')
  112. Url(scheme=None, host='google.com', port=80, path=None, ...)
  113. >>> parse_url('/foo?bar')
  114. Url(scheme=None, host=None, port=None, path='/foo', query='bar', ...)
  115. """
  116. # While this code has overlap with stdlib's urlparse, it is much
  117. # simplified for our needs and less annoying.
  118. # Additionally, this implementations does silly things to be optimal
  119. # on CPython.
  120. if not url:
  121. # Empty
  122. return Url()
  123. scheme = None
  124. auth = None
  125. host = None
  126. port = None
  127. path = None
  128. fragment = None
  129. query = None
  130. # Scheme
  131. if '://' in url:
  132. scheme, url = url.split('://', 1)
  133. # Find the earliest Authority Terminator
  134. # (http://tools.ietf.org/html/rfc3986#section-3.2)
  135. url, path_, delim = split_first(url, ['/', '?', '#'])
  136. if delim:
  137. # Reassemble the path
  138. path = delim + path_
  139. # Auth
  140. if '@' in url:
  141. # Last '@' denotes end of auth part
  142. auth, url = url.rsplit('@', 1)
  143. # IPv6
  144. if url and url[0] == '[':
  145. host, url = url.split(']', 1)
  146. host += ']'
  147. # Port
  148. if ':' in url:
  149. _host, port = url.split(':', 1)
  150. if not host:
  151. host = _host
  152. if port:
  153. # If given, ports must be integers. No whitespace, no plus or
  154. # minus prefixes, no non-integer digits such as ^2 (superscript).
  155. if not port.isdigit():
  156. raise LocationParseError(url)
  157. try:
  158. port = int(port)
  159. except ValueError:
  160. raise LocationParseError(url)
  161. else:
  162. # Blank ports are cool, too. (rfc3986#section-3.2.3)
  163. port = None
  164. elif not host and url:
  165. host = url
  166. if not path:
  167. return Url(scheme, auth, host, port, path, query, fragment)
  168. # Fragment
  169. if '#' in path:
  170. path, fragment = path.split('#', 1)
  171. # Query
  172. if '?' in path:
  173. path, query = path.split('?', 1)
  174. return Url(scheme, auth, host, port, path, query, fragment)
  175. def get_host(url):
  176. """
  177. Deprecated. Use :func:`parse_url` instead.
  178. """
  179. p = parse_url(url)
  180. return p.scheme or 'http', p.hostname, p.port