_curlcffi.py 11 KB


  1. from __future__ import annotations
  2. import io
  3. import math
  4. import re
  5. import urllib.parse
  6. from ._helper import InstanceStoreMixin, select_proxy
  7. from .common import (
  8. Features,
  9. Request,
  10. Response,
  11. register_preference,
  12. register_rh,
  13. )
  14. from .exceptions import (
  15. CertificateVerifyError,
  16. HTTPError,
  17. IncompleteRead,
  18. ProxyError,
  19. SSLError,
  20. TransportError,
  21. )
  22. from .impersonate import ImpersonateRequestHandler, ImpersonateTarget
  23. from ..dependencies import curl_cffi, certifi
  24. from ..utils import int_or_none
  25. if curl_cffi is None:
  26. raise ImportError('curl_cffi is not installed')
  27. curl_cffi_version = tuple(map(int, re.split(r'[^\d]+', curl_cffi.__version__)[:3]))
  28. if curl_cffi_version != (0, 5, 10) and not ((0, 7, 0) <= curl_cffi_version < (0, 7, 2)):
  29. curl_cffi._yt_dlp__version = f'{curl_cffi.__version__} (unsupported)'
  30. raise ImportError('Only curl_cffi versions 0.5.10, 0.7.0 and 0.7.1 are supported')
  31. import curl_cffi.requests
  32. from curl_cffi.const import CurlECode, CurlOpt
  33. class CurlCFFIResponseReader(io.IOBase):
  34. def __init__(self, response: curl_cffi.requests.Response):
  35. self._response = response
  36. self._iterator = response.iter_content()
  37. self._buffer = b''
  38. self.bytes_read = 0
  39. def readable(self):
  40. return True
  41. def read(self, size=None):
  42. exception_raised = True
  43. try:
  44. while self._iterator and (size is None or len(self._buffer) < size):
  45. chunk = next(self._iterator, None)
  46. if chunk is None:
  47. self._iterator = None
  48. break
  49. self._buffer += chunk
  50. self.bytes_read += len(chunk)
  51. if size is None:
  52. size = len(self._buffer)
  53. data = self._buffer[:size]
  54. self._buffer = self._buffer[size:]
  55. # "free" the curl instance if the response is fully read.
  56. # curl_cffi doesn't do this automatically and only allows one open response per thread
  57. if not self._iterator and not self._buffer:
  58. self.close()
  59. exception_raised = False
  60. return data
  61. finally:
  62. if exception_raised:
  63. self.close()
  64. def close(self):
  65. if not self.closed:
  66. self._response.close()
  67. self._buffer = b''
  68. super().close()
  69. class CurlCFFIResponseAdapter(Response):
  70. fp: CurlCFFIResponseReader
  71. def __init__(self, response: curl_cffi.requests.Response):
  72. super().__init__(
  73. fp=CurlCFFIResponseReader(response),
  74. headers=response.headers,
  75. url=response.url,
  76. status=response.status_code)
  77. def read(self, amt=None):
  78. try:
  79. return self.fp.read(amt)
  80. except curl_cffi.requests.errors.RequestsError as e:
  81. if e.code == CurlECode.PARTIAL_FILE:
  82. content_length = int_or_none(e.response.headers.get('Content-Length'))
  83. raise IncompleteRead(
  84. partial=self.fp.bytes_read,
  85. expected=content_length - self.fp.bytes_read if content_length is not None else None,
  86. cause=e) from e
  87. raise TransportError(cause=e) from e
  88. @register_rh
  89. class CurlCFFIRH(ImpersonateRequestHandler, InstanceStoreMixin):
  90. RH_NAME = 'curl_cffi'
  91. _SUPPORTED_URL_SCHEMES = ('http', 'https')
  92. _SUPPORTED_FEATURES = (Features.NO_PROXY, Features.ALL_PROXY)
  93. _SUPPORTED_PROXY_SCHEMES = ('http', 'https', 'socks4', 'socks4a', 'socks5', 'socks5h')
  94. _SUPPORTED_IMPERSONATE_TARGET_MAP = {
  95. **({
  96. ImpersonateTarget('chrome', '124', 'macos', '14'): curl_cffi.requests.BrowserType.chrome124,
  97. ImpersonateTarget('chrome', '123', 'macos', '14'): curl_cffi.requests.BrowserType.chrome123,
  98. ImpersonateTarget('chrome', '120', 'macos', '14'): curl_cffi.requests.BrowserType.chrome120,
  99. ImpersonateTarget('chrome', '119', 'macos', '14'): curl_cffi.requests.BrowserType.chrome119,
  100. ImpersonateTarget('chrome', '116', 'windows', '10'): curl_cffi.requests.BrowserType.chrome116,
  101. } if curl_cffi_version >= (0, 7, 0) else {}),
  102. ImpersonateTarget('chrome', '110', 'windows', '10'): curl_cffi.requests.BrowserType.chrome110,
  103. ImpersonateTarget('chrome', '107', 'windows', '10'): curl_cffi.requests.BrowserType.chrome107,
  104. ImpersonateTarget('chrome', '104', 'windows', '10'): curl_cffi.requests.BrowserType.chrome104,
  105. ImpersonateTarget('chrome', '101', 'windows', '10'): curl_cffi.requests.BrowserType.chrome101,
  106. ImpersonateTarget('chrome', '100', 'windows', '10'): curl_cffi.requests.BrowserType.chrome100,
  107. ImpersonateTarget('chrome', '99', 'windows', '10'): curl_cffi.requests.BrowserType.chrome99,
  108. ImpersonateTarget('edge', '101', 'windows', '10'): curl_cffi.requests.BrowserType.edge101,
  109. ImpersonateTarget('edge', '99', 'windows', '10'): curl_cffi.requests.BrowserType.edge99,
  110. **({
  111. ImpersonateTarget('safari', '17.0', 'macos', '14'): curl_cffi.requests.BrowserType.safari17_0,
  112. } if curl_cffi_version >= (0, 7, 0) else {}),
  113. ImpersonateTarget('safari', '15.5', 'macos', '12'): curl_cffi.requests.BrowserType.safari15_5,
  114. ImpersonateTarget('safari', '15.3', 'macos', '11'): curl_cffi.requests.BrowserType.safari15_3,
  115. ImpersonateTarget('chrome', '99', 'android', '12'): curl_cffi.requests.BrowserType.chrome99_android,
  116. **({
  117. ImpersonateTarget('safari', '17.2', 'ios', '17.2'): curl_cffi.requests.BrowserType.safari17_2_ios,
  118. } if curl_cffi_version >= (0, 7, 0) else {}),
  119. }
  120. def _create_instance(self, cookiejar=None):
  121. return curl_cffi.requests.Session(cookies=cookiejar)
  122. def _check_extensions(self, extensions):
  123. super()._check_extensions(extensions)
  124. extensions.pop('impersonate', None)
  125. extensions.pop('cookiejar', None)
  126. extensions.pop('timeout', None)
  127. # CurlCFFIRH ignores legacy ssl options currently.
  128. # Impersonation generally uses a looser SSL configuration than urllib/requests.
  129. extensions.pop('legacy_ssl', None)
  130. def send(self, request: Request) -> Response:
  131. target = self._get_request_target(request)
  132. try:
  133. response = super().send(request)
  134. except HTTPError as e:
  135. e.response.extensions['impersonate'] = target
  136. raise
  137. response.extensions['impersonate'] = target
  138. return response
  139. def _send(self, request: Request):
  140. max_redirects_exceeded = False
  141. session: curl_cffi.requests.Session = self._get_instance(
  142. cookiejar=self._get_cookiejar(request) if 'cookie' not in request.headers else None)
  143. if self.verbose:
  144. session.curl.setopt(CurlOpt.VERBOSE, 1)
  145. proxies = self._get_proxies(request)
  146. if 'no' in proxies:
  147. session.curl.setopt(CurlOpt.NOPROXY, proxies['no'])
  148. proxies.pop('no', None)
  149. # curl doesn't support per protocol proxies, so we select the one that matches the request protocol
  150. proxy = select_proxy(request.url, proxies=proxies)
  151. if proxy:
  152. session.curl.setopt(CurlOpt.PROXY, proxy)
  153. scheme = urllib.parse.urlparse(request.url).scheme.lower()
  154. if scheme != 'http':
  155. # Enable HTTP CONNECT for HTTPS urls.
  156. # Don't use CONNECT for http for compatibility with urllib behaviour.
  157. # See: https://curl.se/libcurl/c/CURLOPT_HTTPPROXYTUNNEL.html
  158. session.curl.setopt(CurlOpt.HTTPPROXYTUNNEL, 1)
  159. # curl_cffi does not currently set these for proxies
  160. session.curl.setopt(CurlOpt.PROXY_CAINFO, certifi.where())
  161. if not self.verify:
  162. session.curl.setopt(CurlOpt.PROXY_SSL_VERIFYPEER, 0)
  163. session.curl.setopt(CurlOpt.PROXY_SSL_VERIFYHOST, 0)
  164. headers = self._get_impersonate_headers(request)
  165. if self._client_cert:
  166. session.curl.setopt(CurlOpt.SSLCERT, self._client_cert['client_certificate'])
  167. client_certificate_key = self._client_cert.get('client_certificate_key')
  168. client_certificate_password = self._client_cert.get('client_certificate_password')
  169. if client_certificate_key:
  170. session.curl.setopt(CurlOpt.SSLKEY, client_certificate_key)
  171. if client_certificate_password:
  172. session.curl.setopt(CurlOpt.KEYPASSWD, client_certificate_password)
  173. timeout = self._calculate_timeout(request)
  174. # set CURLOPT_LOW_SPEED_LIMIT and CURLOPT_LOW_SPEED_TIME to act as a read timeout. [1]
  175. # This is required only for 0.5.10 [2]
  176. # Note: CURLOPT_LOW_SPEED_TIME is in seconds, so we need to round up to the nearest second. [3]
  177. # [1] https://unix.stackexchange.com/a/305311
  178. # [2] https://github.com/yifeikong/curl_cffi/issues/156
  179. # [3] https://curl.se/libcurl/c/CURLOPT_LOW_SPEED_TIME.html
  180. session.curl.setopt(CurlOpt.LOW_SPEED_LIMIT, 1) # 1 byte per second
  181. session.curl.setopt(CurlOpt.LOW_SPEED_TIME, math.ceil(timeout))
  182. try:
  183. curl_response = session.request(
  184. method=request.method,
  185. url=request.url,
  186. headers=headers,
  187. data=request.data,
  188. verify=self.verify,
  189. max_redirects=5,
  190. timeout=(timeout, timeout),
  191. impersonate=self._SUPPORTED_IMPERSONATE_TARGET_MAP.get(
  192. self._get_request_target(request)),
  193. interface=self.source_address,
  194. stream=True,
  195. )
  196. except curl_cffi.requests.errors.RequestsError as e:
  197. if e.code == CurlECode.PEER_FAILED_VERIFICATION:
  198. raise CertificateVerifyError(cause=e) from e
  199. elif e.code == CurlECode.SSL_CONNECT_ERROR:
  200. raise SSLError(cause=e) from e
  201. elif e.code == CurlECode.TOO_MANY_REDIRECTS:
  202. max_redirects_exceeded = True
  203. curl_response = e.response
  204. elif (
  205. e.code == CurlECode.PROXY
  206. or (e.code == CurlECode.RECV_ERROR and 'CONNECT' in str(e))
  207. ):
  208. raise ProxyError(cause=e) from e
  209. else:
  210. raise TransportError(cause=e) from e
  211. response = CurlCFFIResponseAdapter(curl_response)
  212. if not 200 <= response.status < 300:
  213. raise HTTPError(response, redirect_loop=max_redirects_exceeded)
  214. return response
@register_preference(CurlCFFIRH)
def curl_cffi_preference(rh, request):
    """Preference callback registered for CurlCFFIRH; always returns -100.

    The result does not depend on `rh` or `request`.
    NOTE(review): presumably the negative value ranks this handler below
    others by default - confirm against register_preference's semantics.
    """
    return -100