@@ -0,0 +1,315 @@
+import functools
+import gzip
+import http.client
+import io
+import socket
+import ssl
+import urllib.error
+import urllib.parse
+import urllib.request
+import urllib.response
+import zlib
+
+from ._helper import (
+    add_accept_encoding_header,
+    get_redirect_method,
+    make_socks_proxy_opts,
+)
+from ..dependencies import brotli
+from ..socks import sockssocket
+from ..utils import escape_url, update_url_query
+from ..utils.networking import clean_headers, std_headers
+
+SUPPORTED_ENCODINGS = ['gzip', 'deflate']
+
+if brotli:
+    SUPPORTED_ENCODINGS.append('br')
+
+
+def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
+    hc = http_class(*args, **kwargs)
+    source_address = ydl_handler._params.get('source_address')
+
+    if source_address is not None:
+        # This is a workaround for _create_connection() from socket, which will try all
+        # address data from getaddrinfo(), including IPv6. This filters the result of
+        # getaddrinfo() based on the source_address value.
+        # It is based on the CPython socket.create_connection() function.
+        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
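+        # Illustrative note (not from the original source): with a source
+        # address such as '192.0.2.10', the '.' check below selects
+        # socket.AF_INET, so IPv6 results from getaddrinfo() are dropped and
+        # an IPv6-only host fails fast with the OSError raised below.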
+        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
+            host, port = address
+            err = None
+            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
+            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
+            ip_addrs = [addr for addr in addrs if addr[0] == af]
+            if addrs and not ip_addrs:
+                ip_version = 'v4' if af == socket.AF_INET else 'v6'
+                raise OSError(
+                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
+                    % (ip_version, source_address[0]))
+            for res in ip_addrs:
+                af, socktype, proto, canonname, sa = res
+                sock = None
+                try:
+                    sock = socket.socket(af, socktype, proto)
+                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
+                        sock.settimeout(timeout)
+                    sock.bind(source_address)
+                    sock.connect(sa)
+                    err = None  # Explicitly break reference cycle
+                    return sock
+                except OSError as _:
+                    err = _
+                    if sock is not None:
+                        sock.close()
+            if err is not None:
+                raise err
+            else:
+                raise OSError('getaddrinfo returns an empty list')
+        if hasattr(hc, '_create_connection'):
+            hc._create_connection = _create_connection
+        hc.source_address = (source_address, 0)
+
+    return hc
+
+
+class HTTPHandler(urllib.request.HTTPHandler):
+    """Handler for HTTP requests and responses.
+
+    This class, when installed with an OpenerDirector, automatically adds
+    the standard headers to every HTTP request and handles gzipped, deflated and
+    brotli responses from web servers.
+
+    Part of this code was copied from:
+
+    http://techknack.net/python-urllib2-handlers/
+
+    Andrew Rowls, the author of that code, agreed to release it to the
+    public domain.
+    """
+
+    def __init__(self, params, *args, **kwargs):
+        urllib.request.HTTPHandler.__init__(self, *args, **kwargs)
+        self._params = params
+
+    def http_open(self, req):
+        conn_class = http.client.HTTPConnection
+
+        socks_proxy = req.headers.get('Ytdl-socks-proxy')
+        if socks_proxy:
+            conn_class = make_socks_conn_class(conn_class, socks_proxy)
+            del req.headers['Ytdl-socks-proxy']
+
+        return self.do_open(functools.partial(
+            _create_http_connection, self, conn_class, False),
+            req)
+
+    @staticmethod
+    def deflate(data):
+        if not data:
+            return data
+        try:
+            return zlib.decompress(data, -zlib.MAX_WBITS)
+        except zlib.error:
+            return zlib.decompress(data)
+
+    @staticmethod
+    def brotli(data):
+        if not data:
+            return data
+        return brotli.decompress(data)
+
+    @staticmethod
+    def gz(data):
+        gz = gzip.GzipFile(fileobj=io.BytesIO(data), mode='rb')
+        try:
+            return gz.read()
+        except OSError as original_oserror:
+            # There may be junk at the end of the file
+            # See http://stackoverflow.com/q/4928560/35070 for details
+            for i in range(1, 1024):
+                try:
+                    gz = gzip.GzipFile(fileobj=io.BytesIO(data[:-i]), mode='rb')
+                    return gz.read()
+                except OSError:
+                    continue
+            else:
+                raise original_oserror
+
+    def http_request(self, req):
+        # According to RFC 3986, URLs cannot contain non-ASCII characters; however, this is
+        # not always respected by websites: some tend to give out URLs with non-percent-encoded
+        # non-ASCII characters (see telemb.py, ard.py [#3412]).
+        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991),
+        # so to work around this issue we replace the request's original URL with a
+        # percent-encoded one.
+        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
+        # the code of this workaround has been moved here from YoutubeDL.urlopen()
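+        # e.g. (illustrative, not from the original comments):
+        #   'http://example.com/é'  ->  'http://example.com/%C3%A9'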
+        url = req.get_full_url()
+        url_escaped = escape_url(url)
+
+        # Substitute the URL if escaping changed it
+        if url != url_escaped:
+            req = update_Request(req, url=url_escaped)
+
+        for h, v in self._params.get('http_headers', std_headers).items():
+            # capitalize() is needed here because urllib capitalizes header names
+            # due to Python bug 2275: http://bugs.python.org/issue2275
+            if h.capitalize() not in req.headers:
+                req.add_header(h, v)
+
+        clean_headers(req.headers)
+        add_accept_encoding_header(req.headers, SUPPORTED_ENCODINGS)
+        return super().do_request_(req)
+
+    def http_response(self, req, resp):
+        old_resp = resp
+
+        # The Content-Encoding header lists the encodings in the order in which
+        # they were applied [1]. To decompress, we simply do the reverse.
+        # [1]: https://datatracker.ietf.org/doc/html/rfc9110#name-content-encoding
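+        # For example (illustrative): for 'Content-Encoding: gzip, br' the
+        # server applied gzip first and brotli last, so the loop below
+        # decodes brotli first, then gzip.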
+        decoded_response = None
+        for encoding in (e.strip() for e in reversed(resp.headers.get('Content-encoding', '').split(','))):
+            if encoding == 'gzip':
+                decoded_response = self.gz(decoded_response or resp.read())
+            elif encoding == 'deflate':
+                decoded_response = self.deflate(decoded_response or resp.read())
+            elif encoding == 'br' and brotli:
+                decoded_response = self.brotli(decoded_response or resp.read())
+
+        if decoded_response is not None:
+            resp = urllib.request.addinfourl(io.BytesIO(decoded_response), old_resp.headers, old_resp.url, old_resp.code)
+            resp.msg = old_resp.msg
+        # Percent-encode the redirect URL of the Location HTTP header to satisfy RFC 3986 (see
+        # https://github.com/ytdl-org/youtube-dl/issues/6457).
+        if 300 <= resp.code < 400:
+            location = resp.headers.get('Location')
+            if location:
+                # Per RFC 2616 the default charset is iso-8859-1, which Python 3 respects
+                location = location.encode('iso-8859-1').decode()
+                location_escaped = escape_url(location)
+                if location != location_escaped:
+                    del resp.headers['Location']
+                    resp.headers['Location'] = location_escaped
+        return resp
+
+    https_request = http_request
+    https_response = http_response
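+
+
+# Hedged usage sketch (not part of this module): these handlers are meant to
+# be installed on an OpenerDirector. `params` is assumed to be a yt-dlp
+# options dict, and handler precedence follows urllib.request.build_opener()
+# semantics; ProxyHandler and RedirectHandler are defined further below.
+#
+#   opener = urllib.request.build_opener(
+#       ProxyHandler(), HTTPHandler(params), RedirectHandler())
+#   response = opener.open('http://example.com')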
+
+
+def make_socks_conn_class(base_class, socks_proxy):
+    assert issubclass(base_class, (
+        http.client.HTTPConnection, http.client.HTTPSConnection))
+
+    proxy_args = make_socks_proxy_opts(socks_proxy)
+
+    class SocksConnection(base_class):
+        def connect(self):
+            self.sock = sockssocket()
+            self.sock.setproxy(**proxy_args)
+            if isinstance(self.timeout, (int, float)):
+                self.sock.settimeout(self.timeout)
+            self.sock.connect((self.host, self.port))
+
+            if isinstance(self, http.client.HTTPSConnection):
+                if hasattr(self, '_context'):  # Python > 2.6
+                    self.sock = self._context.wrap_socket(
+                        self.sock, server_hostname=self.host)
+                else:
+                    self.sock = ssl.wrap_socket(self.sock)
+
+    return SocksConnection
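+
+
+# Illustrative use (hypothetical values; the proxy URL format is whatever
+# make_socks_proxy_opts accepts, e.g. a socks5:// URL):
+#
+#   conn_class = make_socks_conn_class(
+#       http.client.HTTPConnection, 'socks5://127.0.0.1:1080')
+#   conn = conn_class('example.com', port=80)
+#   conn.request('GET', '/')  # the socket is tunneled through the proxy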
+
+
+class RedirectHandler(urllib.request.HTTPRedirectHandler):
+    """YoutubeDL redirect handler
+
+    The code is based on the HTTPRedirectHandler implementation from CPython [1].
+
+    This redirect handler fixes and improves the logic to better align with RFC 7231
+    and what browsers tend to do [2][3]
+
+    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
+    2. https://datatracker.ietf.org/doc/html/rfc7231
+    3. https://github.com/python/cpython/issues/91306
+    """
+
+    http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302
+
+    def redirect_request(self, req, fp, code, msg, headers, newurl):
+        if code not in (301, 302, 303, 307, 308):
+            raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)
+
+        new_data = req.data
+
+        # Technically the Cookie header should be in unredirected_hdrs;
+        # however, in practice some may set it in normal headers anyway.
+        # We will remove it here to prevent any leaks.
+        remove_headers = ['Cookie']
+
+        new_method = get_redirect_method(req.get_method(), code)
+        # only remove the payload if the method changed (e.g. POST to GET)
+        if new_method != req.get_method():
+            new_data = None
+            remove_headers.extend(['Content-Length', 'Content-Type'])
+
+        new_headers = {k: v for k, v in req.headers.items() if k.title() not in remove_headers}
+
+        return urllib.request.Request(
+            newurl, headers=new_headers, origin_req_host=req.origin_req_host,
+            unverifiable=True, method=new_method, data=new_data)
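+
+
+# Illustrative mapping (the exact rules live in get_redirect_method): a POST
+# redirected with 301, 302 or 303 is retried as a GET with the payload and the
+# Content-Length/Content-Type headers dropped, while 307 and 308 preserve both
+# the method and the body.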
+
+
+class ProxyHandler(urllib.request.ProxyHandler):
+    def __init__(self, proxies=None):
+        # Set default handlers
+        for type in ('http', 'https'):
+            setattr(self, '%s_open' % type,
+                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
+                        meth(r, proxy, type))
+        urllib.request.ProxyHandler.__init__(self, proxies)
+
+    def proxy_open(self, req, proxy, type):
+        req_proxy = req.headers.get('Ytdl-request-proxy')
+        if req_proxy is not None:
+            proxy = req_proxy
+            del req.headers['Ytdl-request-proxy']
+
+        if proxy == '__noproxy__':
+            return None  # No Proxy
+        if urllib.parse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
+            req.add_header('Ytdl-socks-proxy', proxy)
+            # yt-dlp's http/https handlers do the wrapping of the socket with socks
+            return None
+        return urllib.request.ProxyHandler.proxy_open(
+            self, req, proxy, type)
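+
+
+# Hedged sketch: a per-request proxy can be chosen through the internal
+# 'Ytdl-request-proxy' header, which proxy_open() consumes and removes:
+#
+#   req = urllib.request.Request('http://example.com')
+#   req.add_header('Ytdl-request-proxy', 'socks5://127.0.0.1:1080')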
+
+
+class PUTRequest(urllib.request.Request):
+    def get_method(self):
+        return 'PUT'
+
+
+class HEADRequest(urllib.request.Request):
+    def get_method(self):
+        return 'HEAD'
+
+
+def update_Request(req, url=None, data=None, headers=None, query=None):
+    req_headers = req.headers.copy()
+    req_headers.update(headers or {})
+    req_data = data or req.data
+    req_url = update_url_query(url or req.get_full_url(), query)
+    req_get_method = req.get_method()
+    if req_get_method == 'HEAD':
+        req_type = HEADRequest
+    elif req_get_method == 'PUT':
+        req_type = PUTRequest
+    else:
+        req_type = urllib.request.Request
+    new_req = req_type(
+        req_url, data=req_data, headers=req_headers,
+        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
+    if hasattr(req, 'timeout'):
+        new_req.timeout = req.timeout
+    return new_req
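+
+
+# Illustrative call (hypothetical values): clone a request onto a new URL and
+# query string while preserving its method, headers, body and timeout:
+#
+#   new_req = update_Request(req, url='http://example.com/v2', query={'page': '2'})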