_requests.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418
  1. from __future__ import annotations
  2. import contextlib
  3. import functools
  4. import http.client
  5. import logging
  6. import re
  7. import socket
  8. import warnings
  9. from ..dependencies import brotli, requests, urllib3
  10. from ..utils import bug_reports_message, int_or_none, variadic
  11. from ..utils.networking import normalize_url
  12. if requests is None:
  13. raise ImportError('requests module is not installed')
  14. if urllib3 is None:
  15. raise ImportError('urllib3 module is not installed')
  16. urllib3_version = tuple(int_or_none(x, default=0) for x in urllib3.__version__.split('.'))
  17. if urllib3_version < (1, 26, 17):
  18. raise ImportError('Only urllib3 >= 1.26.17 is supported')
  19. if requests.__build__ < 0x023202:
  20. raise ImportError('Only requests >= 2.32.2 is supported')
  21. import requests.adapters
  22. import requests.utils
  23. import urllib3.connection
  24. import urllib3.exceptions
  25. import urllib3.util
  26. from ._helper import (
  27. InstanceStoreMixin,
  28. add_accept_encoding_header,
  29. create_connection,
  30. create_socks_proxy_socket,
  31. get_redirect_method,
  32. make_socks_proxy_opts,
  33. select_proxy,
  34. )
  35. from .common import (
  36. Features,
  37. RequestHandler,
  38. Response,
  39. register_preference,
  40. register_rh,
  41. )
  42. from .exceptions import (
  43. CertificateVerifyError,
  44. HTTPError,
  45. IncompleteRead,
  46. ProxyError,
  47. RequestError,
  48. SSLError,
  49. TransportError,
  50. )
  51. from ..socks import ProxyError as SocksProxyError
  52. SUPPORTED_ENCODINGS = [
  53. 'gzip', 'deflate',
  54. ]
  55. if brotli is not None:
  56. SUPPORTED_ENCODINGS.append('br')
'''
Override urllib3's behavior to not convert lower-case percent-encoded characters
to upper-case during the url normalization process.

RFC3986 defines that lower and upper case percent-encoded hexadecimal characters are equivalent
and that normalizers should convert them to uppercase for consistency [1].

However, some sites may have an incorrect implementation where they provide
a percent-encoded url that is then compared case-sensitively [2].

While this is a very rare case, since urllib does not do this normalization step, it
is best to avoid it in requests too for compatibility reasons.

1: https://tools.ietf.org/html/rfc3986#section-2.1
2: https://github.com/streamlink/streamlink/pull/4003
'''
  69. class Urllib3PercentREOverride:
  70. def __init__(self, r: re.Pattern):
  71. self.re = r
  72. # pass through all other attribute calls to the original re
  73. def __getattr__(self, item):
  74. return self.re.__getattribute__(item)
  75. def subn(self, repl, string, *args, **kwargs):
  76. return string, self.re.subn(repl, string, *args, **kwargs)[1]
  77. # urllib3 >= 1.25.8 uses subn:
  78. # https://github.com/urllib3/urllib3/commit/a2697e7c6b275f05879b60f593c5854a816489f0
  79. import urllib3.util.url
  80. if hasattr(urllib3.util.url, 'PERCENT_RE'):
  81. urllib3.util.url.PERCENT_RE = Urllib3PercentREOverride(urllib3.util.url.PERCENT_RE)
  82. elif hasattr(urllib3.util.url, '_PERCENT_RE'): # urllib3 >= 2.0.0
  83. urllib3.util.url._PERCENT_RE = Urllib3PercentREOverride(urllib3.util.url._PERCENT_RE)
  84. else:
  85. warnings.warn('Failed to patch PERCENT_RE in urllib3 (does the attribute exist?)' + bug_reports_message())
'''
Workaround for an issue in urllib3.util.ssl_: ssl_wrap_socket does not pass
server_hostname to SSLContext.wrap_socket if server_hostname is an IP address.
This is a problem because we set check_hostname to True in our SSLContext.

Monkey-patching IS_SECURETRANSPORT forces ssl_wrap_socket to pass server_hostname regardless.

This has been fixed in urllib3 2.0+.
See: https://github.com/urllib3/urllib3/issues/517
'''
  94. if urllib3_version < (2, 0, 0):
  95. with contextlib.suppress(Exception):
  96. urllib3.util.IS_SECURETRANSPORT = urllib3.util.ssl_.IS_SECURETRANSPORT = True
  97. # Requests will not automatically handle no_proxy by default
  98. # due to buggy no_proxy handling with proxy dict [1].
  99. # 1. https://github.com/psf/requests/issues/5000
  100. requests.adapters.select_proxy = select_proxy
  101. class RequestsResponseAdapter(Response):
  102. def __init__(self, res: requests.models.Response):
  103. super().__init__(
  104. fp=res.raw, headers=res.headers, url=res.url,
  105. status=res.status_code, reason=res.reason)
  106. self._requests_response = res
  107. def read(self, amt: int | None = None):
  108. try:
  109. # Interact with urllib3 response directly.
  110. return self.fp.read(amt, decode_content=True)
  111. # See urllib3.response.HTTPResponse.read() for exceptions raised on read
  112. except urllib3.exceptions.SSLError as e:
  113. raise SSLError(cause=e) from e
  114. except urllib3.exceptions.ProtocolError as e:
  115. # IncompleteRead is always contained within ProtocolError
  116. # See urllib3.response.HTTPResponse._error_catcher()
  117. ir_err = next(
  118. (err for err in (e.__context__, e.__cause__, *variadic(e.args))
  119. if isinstance(err, http.client.IncompleteRead)), None)
  120. if ir_err is not None:
  121. # `urllib3.exceptions.IncompleteRead` is subclass of `http.client.IncompleteRead`
  122. # but uses an `int` for its `partial` property.
  123. partial = ir_err.partial if isinstance(ir_err.partial, int) else len(ir_err.partial)
  124. raise IncompleteRead(partial=partial, expected=ir_err.expected) from e
  125. raise TransportError(cause=e) from e
  126. except urllib3.exceptions.HTTPError as e:
  127. # catch-all for any other urllib3 response exceptions
  128. raise TransportError(cause=e) from e
  129. class RequestsHTTPAdapter(requests.adapters.HTTPAdapter):
  130. def __init__(self, ssl_context=None, proxy_ssl_context=None, source_address=None, **kwargs):
  131. self._pm_args = {}
  132. if ssl_context:
  133. self._pm_args['ssl_context'] = ssl_context
  134. if source_address:
  135. self._pm_args['source_address'] = (source_address, 0)
  136. self._proxy_ssl_context = proxy_ssl_context or ssl_context
  137. super().__init__(**kwargs)
  138. def init_poolmanager(self, *args, **kwargs):
  139. return super().init_poolmanager(*args, **kwargs, **self._pm_args)
  140. def proxy_manager_for(self, proxy, **proxy_kwargs):
  141. extra_kwargs = {}
  142. if not proxy.lower().startswith('socks') and self._proxy_ssl_context:
  143. extra_kwargs['proxy_ssl_context'] = self._proxy_ssl_context
  144. return super().proxy_manager_for(proxy, **proxy_kwargs, **self._pm_args, **extra_kwargs)
  145. # Skip `requests` internal verification; we use our own SSLContext
  146. def cert_verify(*args, **kwargs):
  147. pass
  148. # requests 2.32.2+: Reimplementation without `_urllib3_request_context`
  149. def get_connection_with_tls_context(self, request, verify, proxies=None, cert=None):
  150. url = urllib3.util.parse_url(request.url).url
  151. manager = self.poolmanager
  152. if proxy := select_proxy(url, proxies):
  153. manager = self.proxy_manager_for(proxy)
  154. return manager.connection_from_url(url)
  155. class RequestsSession(requests.sessions.Session):
  156. """
  157. Ensure unified redirect method handling with our urllib redirect handler.
  158. """
  159. def rebuild_method(self, prepared_request, response):
  160. new_method = get_redirect_method(prepared_request.method, response.status_code)
  161. # HACK: requests removes headers/body on redirect unless code was a 307/308.
  162. if new_method == prepared_request.method:
  163. response._real_status_code = response.status_code
  164. response.status_code = 308
  165. prepared_request.method = new_method
  166. # Requests fails to resolve dot segments on absolute redirect locations
  167. # See: https://github.com/yt-dlp/yt-dlp/issues/9020
  168. prepared_request.url = normalize_url(prepared_request.url)
  169. def rebuild_auth(self, prepared_request, response):
  170. # HACK: undo status code change from rebuild_method, if applicable.
  171. # rebuild_auth runs after requests would remove headers/body based on status code
  172. if hasattr(response, '_real_status_code'):
  173. response.status_code = response._real_status_code
  174. del response._real_status_code
  175. return super().rebuild_auth(prepared_request, response)
  176. class Urllib3LoggingFilter(logging.Filter):
  177. def filter(self, record):
  178. # Ignore HTTP request messages since HTTPConnection prints those
  179. return record.msg != '%s://%s:%s "%s %s %s" %s %s'
  180. class Urllib3LoggingHandler(logging.Handler):
  181. """Redirect urllib3 logs to our logger"""
  182. def __init__(self, logger, *args, **kwargs):
  183. super().__init__(*args, **kwargs)
  184. self._logger = logger
  185. def emit(self, record):
  186. try:
  187. msg = self.format(record)
  188. if record.levelno >= logging.ERROR:
  189. self._logger.error(msg)
  190. else:
  191. self._logger.stdout(msg)
  192. except Exception:
  193. self.handleError(record)
  194. @register_rh
  195. class RequestsRH(RequestHandler, InstanceStoreMixin):
  196. """Requests RequestHandler
  197. https://github.com/psf/requests
  198. """
  199. _SUPPORTED_URL_SCHEMES = ('http', 'https')
  200. _SUPPORTED_ENCODINGS = tuple(SUPPORTED_ENCODINGS)
  201. _SUPPORTED_PROXY_SCHEMES = ('http', 'https', 'socks4', 'socks4a', 'socks5', 'socks5h')
  202. _SUPPORTED_FEATURES = (Features.NO_PROXY, Features.ALL_PROXY)
  203. RH_NAME = 'requests'
  204. def __init__(self, *args, **kwargs):
  205. super().__init__(*args, **kwargs)
  206. # Forward urllib3 debug messages to our logger
  207. logger = logging.getLogger('urllib3')
  208. self.__logging_handler = Urllib3LoggingHandler(logger=self._logger)
  209. self.__logging_handler.setFormatter(logging.Formatter('requests: %(message)s'))
  210. self.__logging_handler.addFilter(Urllib3LoggingFilter())
  211. logger.addHandler(self.__logging_handler)
  212. # TODO: Use a logger filter to suppress pool reuse warning instead
  213. logger.setLevel(logging.ERROR)
  214. if self.verbose:
  215. # Setting this globally is not ideal, but is easier than hacking with urllib3.
  216. # It could technically be problematic for scripts embedding yt-dlp.
  217. # However, it is unlikely debug traffic is used in that context in a way this will cause problems.
  218. urllib3.connection.HTTPConnection.debuglevel = 1
  219. logger.setLevel(logging.DEBUG)
  220. # this is expected if we are using --no-check-certificate
  221. urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
  222. def close(self):
  223. self._clear_instances()
  224. # Remove the logging handler that contains a reference to our logger
  225. # See: https://github.com/yt-dlp/yt-dlp/issues/8922
  226. logging.getLogger('urllib3').removeHandler(self.__logging_handler)
  227. def _check_extensions(self, extensions):
  228. super()._check_extensions(extensions)
  229. extensions.pop('cookiejar', None)
  230. extensions.pop('timeout', None)
  231. def _create_instance(self, cookiejar):
  232. session = RequestsSession()
  233. http_adapter = RequestsHTTPAdapter(
  234. ssl_context=self._make_sslcontext(),
  235. source_address=self.source_address,
  236. max_retries=urllib3.util.retry.Retry(False),
  237. )
  238. session.adapters.clear()
  239. session.headers = requests.models.CaseInsensitiveDict({'Connection': 'keep-alive'})
  240. session.mount('https://', http_adapter)
  241. session.mount('http://', http_adapter)
  242. session.cookies = cookiejar
  243. session.trust_env = False # no need, we already load proxies from env
  244. return session
  245. def _send(self, request):
  246. headers = self._merge_headers(request.headers)
  247. add_accept_encoding_header(headers, SUPPORTED_ENCODINGS)
  248. max_redirects_exceeded = False
  249. session = self._get_instance(cookiejar=self._get_cookiejar(request))
  250. try:
  251. requests_res = session.request(
  252. method=request.method,
  253. url=request.url,
  254. data=request.data,
  255. headers=headers,
  256. timeout=self._calculate_timeout(request),
  257. proxies=self._get_proxies(request),
  258. allow_redirects=True,
  259. stream=True,
  260. )
  261. except requests.exceptions.TooManyRedirects as e:
  262. max_redirects_exceeded = True
  263. requests_res = e.response
  264. except requests.exceptions.SSLError as e:
  265. if 'CERTIFICATE_VERIFY_FAILED' in str(e):
  266. raise CertificateVerifyError(cause=e) from e
  267. raise SSLError(cause=e) from e
  268. except requests.exceptions.ProxyError as e:
  269. raise ProxyError(cause=e) from e
  270. except (requests.exceptions.ConnectionError, requests.exceptions.Timeout) as e:
  271. raise TransportError(cause=e) from e
  272. except urllib3.exceptions.HTTPError as e:
  273. # Catch any urllib3 exceptions that may leak through
  274. raise TransportError(cause=e) from e
  275. except requests.exceptions.RequestException as e:
  276. # Miscellaneous Requests exceptions. May not necessary be network related e.g. InvalidURL
  277. raise RequestError(cause=e) from e
  278. res = RequestsResponseAdapter(requests_res)
  279. if not 200 <= res.status < 300:
  280. raise HTTPError(res, redirect_loop=max_redirects_exceeded)
  281. return res
  282. @register_preference(RequestsRH)
  283. def requests_preference(rh, request):
  284. return 100
  285. # Use our socks proxy implementation with requests to avoid an extra dependency.
  286. class SocksHTTPConnection(urllib3.connection.HTTPConnection):
  287. def __init__(self, _socks_options, *args, **kwargs): # must use _socks_options to pass PoolKey checks
  288. self._proxy_args = _socks_options
  289. super().__init__(*args, **kwargs)
  290. def _new_conn(self):
  291. try:
  292. return create_connection(
  293. address=(self._proxy_args['addr'], self._proxy_args['port']),
  294. timeout=self.timeout,
  295. source_address=self.source_address,
  296. _create_socket_func=functools.partial(
  297. create_socks_proxy_socket, (self.host, self.port), self._proxy_args))
  298. except (socket.timeout, TimeoutError) as e:
  299. raise urllib3.exceptions.ConnectTimeoutError(
  300. self, f'Connection to {self.host} timed out. (connect timeout={self.timeout})') from e
  301. except SocksProxyError as e:
  302. raise urllib3.exceptions.ProxyError(str(e), e) from e
  303. except OSError as e:
  304. raise urllib3.exceptions.NewConnectionError(
  305. self, f'Failed to establish a new connection: {e}') from e
  306. class SocksHTTPSConnection(SocksHTTPConnection, urllib3.connection.HTTPSConnection):
  307. pass
  308. class SocksHTTPConnectionPool(urllib3.HTTPConnectionPool):
  309. ConnectionCls = SocksHTTPConnection
  310. class SocksHTTPSConnectionPool(urllib3.HTTPSConnectionPool):
  311. ConnectionCls = SocksHTTPSConnection
  312. class SocksProxyManager(urllib3.PoolManager):
  313. def __init__(self, socks_proxy, username=None, password=None, num_pools=10, headers=None, **connection_pool_kw):
  314. connection_pool_kw['_socks_options'] = make_socks_proxy_opts(socks_proxy)
  315. super().__init__(num_pools, headers, **connection_pool_kw)
  316. self.pool_classes_by_scheme = {
  317. 'http': SocksHTTPConnectionPool,
  318. 'https': SocksHTTPSConnectionPool,
  319. }
  320. requests.adapters.SOCKSProxyManager = SocksProxyManager