_curlcffi.py 9.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241
  1. from __future__ import annotations
  2. import io
  3. import math
  4. import urllib.parse
  5. from ._helper import InstanceStoreMixin, select_proxy
  6. from .common import (
  7. Features,
  8. Request,
  9. Response,
  10. register_preference,
  11. register_rh,
  12. )
  13. from .exceptions import (
  14. CertificateVerifyError,
  15. HTTPError,
  16. IncompleteRead,
  17. ProxyError,
  18. SSLError,
  19. TransportError,
  20. )
  21. from .impersonate import ImpersonateRequestHandler, ImpersonateTarget
  22. from ..dependencies import curl_cffi, certifi
  23. from ..utils import int_or_none
# Abort importing this module when the optional curl_cffi dependency is unavailable.
if curl_cffi is None:
    raise ImportError('curl_cffi is not installed')

# Split the dotted version string into a tuple with one entry per component.
# NOTE(review): int_or_none(x, default=0) presumably turns non-numeric components into 0 - confirm.
curl_cffi_version = tuple(int_or_none(x, default=0) for x in curl_cffi.__version__.split('.'))

# Only the exact release 0.5.10 is accepted; anything else aborts the import.
if curl_cffi_version != (0, 5, 10):
    # Tag the dependency module with an "(unsupported)" version string before bailing out.
    # NOTE(review): presumably read elsewhere for version reporting - confirm against the consumer.
    curl_cffi._yt_dlp__version = f'{curl_cffi.__version__} (unsupported)'
    raise ImportError('Only curl_cffi 0.5.10 is supported')
  30. import curl_cffi.requests
  31. from curl_cffi.const import CurlECode, CurlOpt
  32. class CurlCFFIResponseReader(io.IOBase):
  33. def __init__(self, response: curl_cffi.requests.Response):
  34. self._response = response
  35. self._iterator = response.iter_content()
  36. self._buffer = b''
  37. self.bytes_read = 0
  38. def readable(self):
  39. return True
  40. def read(self, size=None):
  41. exception_raised = True
  42. try:
  43. while self._iterator and (size is None or len(self._buffer) < size):
  44. chunk = next(self._iterator, None)
  45. if chunk is None:
  46. self._iterator = None
  47. break
  48. self._buffer += chunk
  49. self.bytes_read += len(chunk)
  50. if size is None:
  51. size = len(self._buffer)
  52. data = self._buffer[:size]
  53. self._buffer = self._buffer[size:]
  54. # "free" the curl instance if the response is fully read.
  55. # curl_cffi doesn't do this automatically and only allows one open response per thread
  56. if not self._iterator and not self._buffer:
  57. self.close()
  58. exception_raised = False
  59. return data
  60. finally:
  61. if exception_raised:
  62. self.close()
  63. def close(self):
  64. if not self.closed:
  65. self._response.close()
  66. self._buffer = b''
  67. super().close()
  68. class CurlCFFIResponseAdapter(Response):
  69. fp: CurlCFFIResponseReader
  70. def __init__(self, response: curl_cffi.requests.Response):
  71. super().__init__(
  72. fp=CurlCFFIResponseReader(response),
  73. headers=response.headers,
  74. url=response.url,
  75. status=response.status_code)
  76. def read(self, amt=None):
  77. try:
  78. return self.fp.read(amt)
  79. except curl_cffi.requests.errors.RequestsError as e:
  80. if e.code == CurlECode.PARTIAL_FILE:
  81. content_length = int_or_none(e.response.headers.get('Content-Length'))
  82. raise IncompleteRead(
  83. partial=self.fp.bytes_read,
  84. expected=content_length - self.fp.bytes_read if content_length is not None else None,
  85. cause=e) from e
  86. raise TransportError(cause=e) from e
  87. @register_rh
  88. class CurlCFFIRH(ImpersonateRequestHandler, InstanceStoreMixin):
  89. RH_NAME = 'curl_cffi'
  90. _SUPPORTED_URL_SCHEMES = ('http', 'https')
  91. _SUPPORTED_FEATURES = (Features.NO_PROXY, Features.ALL_PROXY)
  92. _SUPPORTED_PROXY_SCHEMES = ('http', 'https', 'socks4', 'socks4a', 'socks5', 'socks5h')
  93. _SUPPORTED_IMPERSONATE_TARGET_MAP = {
  94. ImpersonateTarget('chrome', '110', 'windows', '10'): curl_cffi.requests.BrowserType.chrome110,
  95. ImpersonateTarget('chrome', '107', 'windows', '10'): curl_cffi.requests.BrowserType.chrome107,
  96. ImpersonateTarget('chrome', '104', 'windows', '10'): curl_cffi.requests.BrowserType.chrome104,
  97. ImpersonateTarget('chrome', '101', 'windows', '10'): curl_cffi.requests.BrowserType.chrome101,
  98. ImpersonateTarget('chrome', '100', 'windows', '10'): curl_cffi.requests.BrowserType.chrome100,
  99. ImpersonateTarget('chrome', '99', 'windows', '10'): curl_cffi.requests.BrowserType.chrome99,
  100. ImpersonateTarget('edge', '101', 'windows', '10'): curl_cffi.requests.BrowserType.edge101,
  101. ImpersonateTarget('edge', '99', 'windows', '10'): curl_cffi.requests.BrowserType.edge99,
  102. ImpersonateTarget('safari', '15.5', 'macos', '12'): curl_cffi.requests.BrowserType.safari15_5,
  103. ImpersonateTarget('safari', '15.3', 'macos', '11'): curl_cffi.requests.BrowserType.safari15_3,
  104. ImpersonateTarget('chrome', '99', 'android', '12'): curl_cffi.requests.BrowserType.chrome99_android,
  105. }
  106. def _create_instance(self, cookiejar=None):
  107. return curl_cffi.requests.Session(cookies=cookiejar)
  108. def _check_extensions(self, extensions):
  109. super()._check_extensions(extensions)
  110. extensions.pop('impersonate', None)
  111. extensions.pop('cookiejar', None)
  112. extensions.pop('timeout', None)
  113. def send(self, request: Request) -> Response:
  114. target = self._get_request_target(request)
  115. try:
  116. response = super().send(request)
  117. except HTTPError as e:
  118. e.response.extensions['impersonate'] = target
  119. raise
  120. response.extensions['impersonate'] = target
  121. return response
  122. def _send(self, request: Request):
  123. max_redirects_exceeded = False
  124. session: curl_cffi.requests.Session = self._get_instance(
  125. cookiejar=self._get_cookiejar(request) if 'cookie' not in request.headers else None)
  126. if self.verbose:
  127. session.curl.setopt(CurlOpt.VERBOSE, 1)
  128. proxies = self._get_proxies(request)
  129. if 'no' in proxies:
  130. session.curl.setopt(CurlOpt.NOPROXY, proxies['no'])
  131. proxies.pop('no', None)
  132. # curl doesn't support per protocol proxies, so we select the one that matches the request protocol
  133. proxy = select_proxy(request.url, proxies=proxies)
  134. if proxy:
  135. session.curl.setopt(CurlOpt.PROXY, proxy)
  136. scheme = urllib.parse.urlparse(request.url).scheme.lower()
  137. if scheme != 'http':
  138. # Enable HTTP CONNECT for HTTPS urls.
  139. # Don't use CONNECT for http for compatibility with urllib behaviour.
  140. # See: https://curl.se/libcurl/c/CURLOPT_HTTPPROXYTUNNEL.html
  141. session.curl.setopt(CurlOpt.HTTPPROXYTUNNEL, 1)
  142. # curl_cffi does not currently set these for proxies
  143. session.curl.setopt(CurlOpt.PROXY_CAINFO, certifi.where())
  144. if not self.verify:
  145. session.curl.setopt(CurlOpt.PROXY_SSL_VERIFYPEER, 0)
  146. session.curl.setopt(CurlOpt.PROXY_SSL_VERIFYHOST, 0)
  147. headers = self._get_impersonate_headers(request)
  148. if self._client_cert:
  149. session.curl.setopt(CurlOpt.SSLCERT, self._client_cert['client_certificate'])
  150. client_certificate_key = self._client_cert.get('client_certificate_key')
  151. client_certificate_password = self._client_cert.get('client_certificate_password')
  152. if client_certificate_key:
  153. session.curl.setopt(CurlOpt.SSLKEY, client_certificate_key)
  154. if client_certificate_password:
  155. session.curl.setopt(CurlOpt.KEYPASSWD, client_certificate_password)
  156. timeout = self._calculate_timeout(request)
  157. # set CURLOPT_LOW_SPEED_LIMIT and CURLOPT_LOW_SPEED_TIME to act as a read timeout. [1]
  158. # curl_cffi does not currently do this. [2]
  159. # Note: CURLOPT_LOW_SPEED_TIME is in seconds, so we need to round up to the nearest second. [3]
  160. # [1] https://unix.stackexchange.com/a/305311
  161. # [2] https://github.com/yifeikong/curl_cffi/issues/156
  162. # [3] https://curl.se/libcurl/c/CURLOPT_LOW_SPEED_TIME.html
  163. session.curl.setopt(CurlOpt.LOW_SPEED_LIMIT, 1) # 1 byte per second
  164. session.curl.setopt(CurlOpt.LOW_SPEED_TIME, math.ceil(timeout))
  165. try:
  166. curl_response = session.request(
  167. method=request.method,
  168. url=request.url,
  169. headers=headers,
  170. data=request.data,
  171. verify=self.verify,
  172. max_redirects=5,
  173. timeout=timeout,
  174. impersonate=self._SUPPORTED_IMPERSONATE_TARGET_MAP.get(
  175. self._get_request_target(request)),
  176. interface=self.source_address,
  177. stream=True,
  178. )
  179. except curl_cffi.requests.errors.RequestsError as e:
  180. if e.code == CurlECode.PEER_FAILED_VERIFICATION:
  181. raise CertificateVerifyError(cause=e) from e
  182. elif e.code == CurlECode.SSL_CONNECT_ERROR:
  183. raise SSLError(cause=e) from e
  184. elif e.code == CurlECode.TOO_MANY_REDIRECTS:
  185. max_redirects_exceeded = True
  186. curl_response = e.response
  187. elif (
  188. e.code == CurlECode.PROXY
  189. or (e.code == CurlECode.RECV_ERROR and 'Received HTTP code 407 from proxy after CONNECT' in str(e))
  190. ):
  191. raise ProxyError(cause=e) from e
  192. else:
  193. raise TransportError(cause=e) from e
  194. response = CurlCFFIResponseAdapter(curl_response)
  195. if not 200 <= response.status < 300:
  196. raise HTTPError(response, redirect_loop=max_redirects_exceeded)
  197. return response
@register_preference(CurlCFFIRH)
def curl_cffi_preference(rh, request):
    """Return the fixed preference value registered for ``CurlCFFIRH``.

    Neither *rh* nor *request* is consulted; the result is always ``-100``.
    NOTE(review): presumably a negative value ranks this handler below others -
    confirm against the contract of ``register_preference``.
    """
    return -100