# _urllib.py
  1. from __future__ import annotations
  2. import functools
  3. import http.client
  4. import io
  5. import ssl
  6. import urllib.error
  7. import urllib.parse
  8. import urllib.request
  9. import urllib.response
  10. import zlib
  11. from urllib.request import (
  12. DataHandler,
  13. FileHandler,
  14. FTPHandler,
  15. HTTPCookieProcessor,
  16. HTTPDefaultErrorHandler,
  17. HTTPErrorProcessor,
  18. UnknownHandler,
  19. )
  20. from ._helper import (
  21. InstanceStoreMixin,
  22. add_accept_encoding_header,
  23. create_connection,
  24. create_socks_proxy_socket,
  25. get_redirect_method,
  26. make_socks_proxy_opts,
  27. select_proxy,
  28. )
  29. from .common import Features, RequestHandler, Response, register_rh
  30. from .exceptions import (
  31. CertificateVerifyError,
  32. HTTPError,
  33. IncompleteRead,
  34. ProxyError,
  35. RequestError,
  36. SSLError,
  37. TransportError,
  38. )
  39. from ..dependencies import brotli
  40. from ..socks import ProxyError as SocksProxyError
  41. from ..utils import update_url_query
  42. from ..utils.networking import normalize_url
# Content encodings we advertise (Accept-Encoding) and can decode in HTTPHandler.
SUPPORTED_ENCODINGS = ['gzip', 'deflate']
# Exception types that indicate a failed content decode; consumed by
# handle_response_read_exceptions() below.
CONTENT_DECODE_ERRORS = [zlib.error, OSError]

if brotli:
    # brotli is an optional dependency; only advertise/decode 'br' when present.
    SUPPORTED_ENCODINGS.append('br')
    CONTENT_DECODE_ERRORS.append(brotli.error)
  48. def _create_http_connection(http_class, source_address, *args, **kwargs):
  49. hc = http_class(*args, **kwargs)
  50. if hasattr(hc, '_create_connection'):
  51. hc._create_connection = create_connection
  52. if source_address is not None:
  53. hc.source_address = (source_address, 0)
  54. return hc
class HTTPHandler(urllib.request.AbstractHTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped, deflated and
    brotli responses from web servers.

    Part of this code was copied from:
    http://techknack.net/python-urllib2-handlers/
    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, context=None, source_address=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Local address to bind outgoing sockets to (see _create_http_connection).
        self._source_address = source_address
        # SSL context used for HTTPS connections.
        self._context = context

    @staticmethod
    def _make_conn_class(base, req):
        # Swap in a SOCKS-capable connection class when ProxyHandler flagged
        # the request via the internal Ytdl-socks-proxy header.
        conn_class = base
        socks_proxy = req.headers.pop('Ytdl-socks-proxy', None)
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
        return conn_class

    def http_open(self, req):
        # Open a plain-HTTP connection (possibly tunneled through SOCKS).
        conn_class = self._make_conn_class(http.client.HTTPConnection, req)
        return self.do_open(functools.partial(
            _create_http_connection, conn_class, self._source_address), req)

    def https_open(self, req):
        # Same as http_open, but passes our SSL context to the connection.
        conn_class = self._make_conn_class(http.client.HTTPSConnection, req)
        return self.do_open(
            functools.partial(
                _create_http_connection, conn_class, self._source_address),
            req, context=self._context)

    @staticmethod
    def deflate(data):
        # Decode "deflate" content: try raw deflate first, fall back to zlib-wrapped.
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        # Decode "br" content (requires the optional brotli module to be importable).
        if not data:
            return data
        return brotli.decompress(data)

    @staticmethod
    def gz(data):
        # There may be junk added the end of the file
        # We ignore it by only ever decoding a single gzip payload
        # (wbits=MAX_WBITS|16 selects gzip-format decoding in zlib).
        if not data:
            return data
        return zlib.decompress(data, wbits=zlib.MAX_WBITS | 16)

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = normalize_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        return super().do_request_(req)

    def http_response(self, req, resp):
        old_resp = resp

        # Content-Encoding header lists the encodings in order that they were applied [1].
        # To decompress, we simply do the reverse.
        # [1]: https://datatracker.ietf.org/doc/html/rfc9110#name-content-encoding
        decoded_response = None
        for encoding in (e.strip() for e in reversed(resp.headers.get('Content-encoding', '').split(','))):
            # First round reads the raw body; later rounds chain on the previous decode.
            if encoding == 'gzip':
                decoded_response = self.gz(decoded_response or resp.read())
            elif encoding == 'deflate':
                decoded_response = self.deflate(decoded_response or resp.read())
            elif encoding == 'br' and brotli:
                decoded_response = self.brotli(decoded_response or resp.read())

        if decoded_response is not None:
            # Re-wrap the decoded bytes so callers see an ordinary response object.
            resp = urllib.request.addinfourl(io.BytesIO(decoded_response), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg

        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by Python 3
                location = location.encode('iso-8859-1').decode()
                location_escaped = normalize_url(location)
                if location != location_escaped:
                    # Delete first: assigning to an existing header key appends
                    # rather than replaces.
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response
  151. def make_socks_conn_class(base_class, socks_proxy):
  152. assert issubclass(base_class, (
  153. http.client.HTTPConnection, http.client.HTTPSConnection))
  154. proxy_args = make_socks_proxy_opts(socks_proxy)
  155. class SocksConnection(base_class):
  156. _create_connection = create_connection
  157. def connect(self):
  158. self.sock = create_connection(
  159. (proxy_args['addr'], proxy_args['port']),
  160. timeout=self.timeout,
  161. source_address=self.source_address,
  162. _create_socket_func=functools.partial(
  163. create_socks_proxy_socket, (self.host, self.port), proxy_args))
  164. if isinstance(self, http.client.HTTPSConnection):
  165. self.sock = self._context.wrap_socket(self.sock, server_hostname=self.host)
  166. return SocksConnection
  167. class RedirectHandler(urllib.request.HTTPRedirectHandler):
  168. """YoutubeDL redirect handler
  169. The code is based on HTTPRedirectHandler implementation from CPython [1].
  170. This redirect handler fixes and improves the logic to better align with RFC7261
  171. and what browsers tend to do [2][3]
  172. 1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
  173. 2. https://datatracker.ietf.org/doc/html/rfc7231
  174. 3. https://github.com/python/cpython/issues/91306
  175. """
  176. http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302
  177. def redirect_request(self, req, fp, code, msg, headers, newurl):
  178. if code not in (301, 302, 303, 307, 308):
  179. raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)
  180. new_data = req.data
  181. # Technically the Cookie header should be in unredirected_hdrs,
  182. # however in practice some may set it in normal headers anyway.
  183. # We will remove it here to prevent any leaks.
  184. remove_headers = ['Cookie']
  185. new_method = get_redirect_method(req.get_method(), code)
  186. # only remove payload if method changed (e.g. POST to GET)
  187. if new_method != req.get_method():
  188. new_data = None
  189. remove_headers.extend(['Content-Length', 'Content-Type'])
  190. new_headers = {k: v for k, v in req.headers.items() if k.title() not in remove_headers}
  191. return urllib.request.Request(
  192. newurl, headers=new_headers, origin_req_host=req.origin_req_host,
  193. unverifiable=True, method=new_method, data=new_data)
  194. class ProxyHandler(urllib.request.BaseHandler):
  195. handler_order = 100
  196. def __init__(self, proxies=None):
  197. self.proxies = proxies
  198. # Set default handlers
  199. for scheme in ('http', 'https', 'ftp'):
  200. setattr(self, f'{scheme}_open', lambda r, meth=self.proxy_open: meth(r))
  201. def proxy_open(self, req):
  202. proxy = select_proxy(req.get_full_url(), self.proxies)
  203. if proxy is None:
  204. return
  205. if urllib.parse.urlparse(proxy).scheme.lower() in ('socks4', 'socks4a', 'socks5', 'socks5h'):
  206. req.add_header('Ytdl-socks-proxy', proxy)
  207. # yt-dlp's http/https handlers do wrapping the socket with socks
  208. return None
  209. return urllib.request.ProxyHandler.proxy_open(
  210. self, req, proxy, None)
  211. class PUTRequest(urllib.request.Request):
  212. def get_method(self):
  213. return 'PUT'
  214. class HEADRequest(urllib.request.Request):
  215. def get_method(self):
  216. return 'HEAD'
  217. def update_Request(req, url=None, data=None, headers=None, query=None):
  218. req_headers = req.headers.copy()
  219. req_headers.update(headers or {})
  220. req_data = data if data is not None else req.data
  221. req_url = update_url_query(url or req.get_full_url(), query)
  222. req_get_method = req.get_method()
  223. if req_get_method == 'HEAD':
  224. req_type = HEADRequest
  225. elif req_get_method == 'PUT':
  226. req_type = PUTRequest
  227. else:
  228. req_type = urllib.request.Request
  229. new_req = req_type(
  230. req_url, data=req_data, headers=req_headers,
  231. origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
  232. if hasattr(req, 'timeout'):
  233. new_req.timeout = req.timeout
  234. return new_req
  235. class UrllibResponseAdapter(Response):
  236. """
  237. HTTP Response adapter class for urllib addinfourl and http.client.HTTPResponse
  238. """
  239. def __init__(self, res: http.client.HTTPResponse | urllib.response.addinfourl):
  240. # addinfourl: In Python 3.9+, .status was introduced and .getcode() was deprecated [1]
  241. # HTTPResponse: .getcode() was deprecated, .status always existed [2]
  242. # 1. https://docs.python.org/3/library/urllib.request.html#urllib.response.addinfourl.getcode
  243. # 2. https://docs.python.org/3.10/library/http.client.html#http.client.HTTPResponse.status
  244. super().__init__(
  245. fp=res, headers=res.headers, url=res.url,
  246. status=getattr(res, 'status', None) or res.getcode(), reason=getattr(res, 'reason', None))
  247. def read(self, amt=None):
  248. try:
  249. return self.fp.read(amt)
  250. except Exception as e:
  251. handle_response_read_exceptions(e)
  252. raise e
  253. def handle_sslerror(e: ssl.SSLError):
  254. if not isinstance(e, ssl.SSLError):
  255. return
  256. if isinstance(e, ssl.SSLCertVerificationError):
  257. raise CertificateVerifyError(cause=e) from e
  258. raise SSLError(cause=e) from e
def handle_response_read_exceptions(e):
    # Re-raise *e* as the matching project networking exception when it is a
    # recognized read/transport failure; unrecognized exceptions fall through
    # (the function returns None and the caller decides what to do).
    # Branch order matters: IncompleteRead is an HTTPException and SSLError
    # is an OSError, so the more specific checks must come first.
    if isinstance(e, http.client.IncompleteRead):
        raise IncompleteRead(partial=len(e.partial), cause=e, expected=e.expected) from e
    elif isinstance(e, ssl.SSLError):
        # handle_sslerror() raises for every ssl.SSLError it is given.
        handle_sslerror(e)
    elif isinstance(e, (OSError, EOFError, http.client.HTTPException, *CONTENT_DECODE_ERRORS)):
        # OSErrors raised here should mostly be network related
        raise TransportError(cause=e) from e
@register_rh
class UrllibRH(RequestHandler, InstanceStoreMixin):
    """RequestHandler built on the stdlib urllib opener stack."""
    _SUPPORTED_URL_SCHEMES = ('http', 'https', 'data', 'ftp')
    _SUPPORTED_PROXY_SCHEMES = ('http', 'socks4', 'socks4a', 'socks5', 'socks5h')
    _SUPPORTED_FEATURES = (Features.NO_PROXY, Features.ALL_PROXY)
    RH_NAME = 'urllib'

    def __init__(self, *, enable_file_urls: bool = False, **kwargs):
        super().__init__(**kwargs)
        self.enable_file_urls = enable_file_urls
        if self.enable_file_urls:
            # file:// support is opt-in; only advertise it when explicitly enabled.
            self._SUPPORTED_URL_SCHEMES = (*self._SUPPORTED_URL_SCHEMES, 'file')

    def _check_extensions(self, extensions):
        # Consume the extensions this handler understands; presumably the base
        # class treats anything left in `extensions` as unsupported — see
        # RequestHandler._check_extensions.
        super()._check_extensions(extensions)
        extensions.pop('cookiejar', None)
        extensions.pop('timeout', None)
        extensions.pop('legacy_ssl', None)

    def _create_instance(self, proxies, cookiejar, legacy_ssl_support=None):
        # Build a fresh OpenerDirector with our custom handler chain.
        # (Cached per (proxies, cookiejar, legacy_ssl) via InstanceStoreMixin.)
        opener = urllib.request.OpenerDirector()
        handlers = [
            ProxyHandler(proxies),
            HTTPHandler(
                debuglevel=int(bool(self.verbose)),
                context=self._make_sslcontext(legacy_ssl_support=legacy_ssl_support),
                source_address=self.source_address),
            HTTPCookieProcessor(cookiejar),
            DataHandler(),
            UnknownHandler(),
            HTTPDefaultErrorHandler(),
            FTPHandler(),
            HTTPErrorProcessor(),
            RedirectHandler(),
        ]

        if self.enable_file_urls:
            handlers.append(FileHandler())

        for handler in handlers:
            opener.add_handler(handler)

        # Delete the default user-agent header, which would otherwise apply in
        # cases where our custom HTTP handler doesn't come into play
        # (See https://github.com/ytdl-org/youtube-dl/issues/1309 for details)
        opener.addheaders = []
        return opener

    def _send(self, request):
        headers = self._merge_headers(request.headers)
        add_accept_encoding_header(headers, SUPPORTED_ENCODINGS)
        urllib_req = urllib.request.Request(
            url=request.url,
            data=request.data,
            headers=dict(headers),
            method=request.method,
        )

        opener = self._get_instance(
            proxies=self._get_proxies(request),
            cookiejar=self._get_cookiejar(request),
            legacy_ssl_support=request.extensions.get('legacy_ssl'),
        )
        try:
            res = opener.open(urllib_req, timeout=self._calculate_timeout(request))
        except urllib.error.HTTPError as e:
            if isinstance(e.fp, (http.client.HTTPResponse, urllib.response.addinfourl)):
                # Prevent file object from being closed when urllib.error.HTTPError is destroyed.
                e._closer.close_called = True
                raise HTTPError(UrllibResponseAdapter(e.fp), redirect_loop='redirect error' in str(e)) from e
            raise  # unexpected
        except urllib.error.URLError as e:
            cause = e.reason  # NOTE: cause may be a string
            # proxy errors
            if 'tunnel connection failed' in str(cause).lower() or isinstance(cause, SocksProxyError):
                raise ProxyError(cause=e) from e
            handle_response_read_exceptions(cause)
            raise TransportError(cause=e) from e
        except (http.client.InvalidURL, ValueError) as e:
            # Validation errors
            # http.client.HTTPConnection raises ValueError in some validation cases
            # such as if request method contains illegal control characters [1]
            # 1. https://github.com/python/cpython/blob/987b712b4aeeece336eed24fcc87a950a756c3e2/Lib/http/client.py#L1256
            raise RequestError(cause=e) from e
        except Exception as e:
            handle_response_read_exceptions(e)
            raise  # unexpected
        return UrllibResponseAdapter(res)