Browse Source

[networking] Add module (#2861)

No actual changes - code is only moved around
pukkandan 1 year ago

+ 1 - 1

@@ -74,7 +74,7 @@ offlinetest: codetest
 	$(PYTHON) -m pytest -k "not download"
 # XXX: This is hard to maintain
-CODE_FOLDERS = yt_dlp yt_dlp/downloader yt_dlp/extractor yt_dlp/postprocessor yt_dlp/compat yt_dlp/compat/urllib yt_dlp/utils yt_dlp/dependencies
+CODE_FOLDERS = yt_dlp yt_dlp/downloader yt_dlp/extractor yt_dlp/postprocessor yt_dlp/compat yt_dlp/compat/urllib yt_dlp/utils yt_dlp/dependencies yt_dlp/networking
 yt-dlp: yt_dlp/*.py yt_dlp/*/*.py
 	mkdir -p zip
 	for d in $(CODE_FOLDERS) ; do \

+ 1 - 0

@@ -54,6 +54,7 @@ class CommitGroup(enum.Enum):
+                    'networking',

+ 0 - 0
test/ → test/

+ 9 - 9

@@ -258,15 +258,6 @@ class TestUtil(unittest.TestCase):
         self.assertEqual(sanitize_url(''), '')
         self.assertEqual(sanitize_url('foo bar'), 'foo bar')
-    def test_extract_basic_auth(self):
-        auth_header = lambda url: sanitized_Request(url).get_header('Authorization')
-        self.assertFalse(auth_header(''))
-        self.assertFalse(auth_header(''))
-        self.assertEqual(auth_header(''), 'Basic Og==')
-        self.assertEqual(auth_header(''), 'Basic OnBhc3M=')
-        self.assertEqual(auth_header(''), 'Basic dXNlcjo=')
-        self.assertEqual(auth_header(''), 'Basic dXNlcjpwYXNz')
     def test_expand_path(self):
         def env(var):
             return f'%{var}%' if sys.platform == 'win32' else f'${var}'
@@ -2324,6 +2315,15 @@ Line 1
         self.assertEqual(traverse_obj(mobj, lambda k, _: k in (0, 'group')), ['0123', '3'],
                          msg='function on a `re.Match` should give group name as well')
+    def test_extract_basic_auth(self):
+        auth_header = lambda url: sanitized_Request(url).get_header('Authorization')
+        self.assertFalse(auth_header(''))
+        self.assertFalse(auth_header(''))
+        self.assertEqual(auth_header(''), 'Basic Og==')
+        self.assertEqual(auth_header(''), 'Basic OnBhc3M=')
+        self.assertEqual(auth_header(''), 'Basic dXNlcjo=')
+        self.assertEqual(auth_header(''), 'Basic dXNlcjpwYXNz')
 if __name__ == '__main__':

+ 20 - 20

@@ -151,6 +151,7 @@ from .utils import (
+from .utils.networking import clean_headers
 from .version import CHANNEL, RELEASE_GIT_HEAD, VARIANT, __version__
 if compat_os_name == 'nt':
@@ -672,6 +673,7 @@ class YoutubeDL:
         self.params['compat_opts'] = set(self.params.get('compat_opts', ()))
+        self.params['http_headers'] = merge_headers(std_headers, self.params.get('http_headers', {}))
         if auto_init and auto_init != 'no_verbose_header':
@@ -745,9 +747,6 @@ class YoutubeDL:
             else self.params['format'] if callable(self.params['format'])
             else self.build_format_selector(self.params['format']))
-        # Set http_headers defaults according to std_headers
-        self.params['http_headers'] = merge_headers(std_headers, self.params.get('http_headers', {}))
         hooks = {
             'post_hooks': self.add_post_hook,
             'progress_hooks': self.add_progress_hook,
@@ -941,12 +940,14 @@ class YoutubeDL:
         return self
-    def __exit__(self, *args):
-        self.restore_console_title()
+    def save_cookies(self):
         if self.params.get('cookiefile') is not None:
   , ignore_expires=True)
+    def __exit__(self, *args):
+        self.restore_console_title()
+        self.save_cookies()
     def trouble(self, message=None, tb=None, is_error=True):
         """Determine action to take when a download problem appears.
@@ -2468,9 +2469,7 @@ class YoutubeDL:
     def _calc_headers(self, info_dict):
         res = merge_headers(self.params['http_headers'], info_dict.get('http_headers') or {})
-        if 'Youtubedl-No-Compression' in res:  # deprecated
-            res.pop('Youtubedl-No-Compression', None)
-            res['Accept-Encoding'] = 'identity'
+        clean_headers(res)
         cookies = self.cookiejar.get_cookies_for_url(info_dict['url'])
         if cookies:
             encoder = LenientSimpleCookie()
@@ -3856,12 +3855,6 @@ class YoutubeDL:
     def list_subtitles(self, video_id, subtitles, name='subtitles'):
         self.__list_table(video_id, name, self.render_subtitles_table, video_id, subtitles)
-    def urlopen(self, req):
-        """ Start an HTTP download """
-        if isinstance(req, str):
-            req = sanitized_Request(req)
-        return, timeout=self._socket_timeout)
     def print_debug_header(self):
         if not self.params.get('verbose'):
@@ -3989,13 +3982,8 @@ class YoutubeDL:
         timeout_val = self.params.get('socket_timeout')
         self._socket_timeout = 20 if timeout_val is None else float(timeout_val)
-        opts_cookiesfrombrowser = self.params.get('cookiesfrombrowser')
-        opts_cookiefile = self.params.get('cookiefile')
         opts_proxy = self.params.get('proxy')
-        self.cookiejar = load_cookies(opts_cookiefile, opts_cookiesfrombrowser, self)
         cookie_processor = YoutubeDLCookieProcessor(self.cookiejar)
         if opts_proxy is not None:
             if opts_proxy == '':
@@ -4037,6 +4025,18 @@ class YoutubeDL:
         opener.addheaders = []
         self._opener = opener
+    @functools.cached_property
+    def cookiejar(self):
+        """Global cookiejar instance, built from the cookie-related params on first access"""
+        cookie_file = self.params.get('cookiefile')
+        browser_spec = self.params.get('cookiesfrombrowser')
+        return load_cookies(cookie_file, browser_spec, self)
+    def urlopen(self, req):
+        """Start an HTTP download.
+        `req` may be a URL string, which is first wrapped with sanitized_Request,
+        or an already-built request object. The request is opened through
+        self._opener using self._socket_timeout as the timeout.
+        """
+        if isinstance(req, str):
+            req = sanitized_Request(req)
+        return self._opener.open(req, timeout=self._socket_timeout)
     def encode(self, s):
         if isinstance(s, bytes):
             return s  # Already encoded

+ 0 - 0

+ 139 - 0

@@ -0,0 +1,139 @@
+from __future__ import annotations
+import contextlib
+import ssl
+import sys
+import urllib.parse
+from ..dependencies import certifi
+from ..socks import ProxyType
+from ..utils import YoutubeDLError
+def ssl_load_certs(context: ssl.SSLContext, use_certifi=True):
+    """Load trusted CA certificates into `context`.
+    The certifi bundle is used when certifi is available and `use_certifi` is set;
+    otherwise the system default certificates are loaded.
+    """
+    if use_certifi and certifi:
+        context.load_verify_locations(cafile=certifi.where())
+        return
+    try:
+        context.load_default_certs()
+    except ssl.SSLError:
+        # load_default_certs can fail when the store contains bad certificates;
+        # fall back to loading what we can ourselves.
+        # enum_certificates is not present in mingw python, hence the hasattr check
+        windows_store_available = sys.platform == 'win32' and hasattr(ssl, 'enum_certificates')
+        if windows_store_available:
+            for store in ('CA', 'ROOT'):
+                _ssl_load_windows_store_certs(context, store)
+        context.set_default_verify_paths()
+def _ssl_load_windows_store_certs(ssl_context, storename):
+    """Load usable certificates from the Windows store `storename` into `ssl_context`.
+    Only 'x509_asn' encoded certificates trusted for server authentication are loaded;
+    certificates that fail to load are skipped. Returns silently on PermissionError.
+    """
+    try:
+        usable = []
+        for der_data, enc, trust in ssl.enum_certificates(storename):
+            if enc != 'x509_asn':
+                continue
+            if trust is True or ssl.Purpose.SERVER_AUTH.oid in trust:
+                usable.append(der_data)
+    except PermissionError:
+        return
+    for der_data in usable:
+        # A single bad certificate must not prevent loading the rest
+        with contextlib.suppress(ssl.SSLError):
+            ssl_context.load_verify_locations(cadata=der_data)
+def make_socks_proxy_opts(socks_proxy):
+    """Parse a SOCKS proxy URL into a dict of proxy options.
+    The returned dict has the keys 'proxytype', 'addr', 'port' (1080 when the URL
+    has none), 'rdns', 'username' and 'password'. Credentials are percent-decoded
+    when present.
+    Raises ValueError if the URL scheme is not socks, socks4, socks4a or socks5.
+    """
+    url_components = urllib.parse.urlparse(socks_proxy)
+    scheme = url_components.scheme.lower()
+    if scheme == 'socks5':
+        socks_type = ProxyType.SOCKS5
+    elif scheme in ('socks', 'socks4'):
+        socks_type = ProxyType.SOCKS4
+    elif scheme == 'socks4a':
+        socks_type = ProxyType.SOCKS4A
+    else:
+        # Without this branch socks_type stays unbound and the failure surfaces
+        # later as an unhelpful UnboundLocalError
+        raise ValueError(f'Unknown SOCKS proxy version: {scheme}')
+    def unquote_if_non_empty(s):
+        # Keep None/'' untouched so absent credentials stay absent
+        if not s:
+            return s
+        return urllib.parse.unquote_plus(s)
+    return {
+        'proxytype': socks_type,
+        'addr': url_components.hostname,
+        'port': url_components.port or 1080,
+        'rdns': True,
+        'username': unquote_if_non_empty(url_components.username),
+        'password': unquote_if_non_empty(url_components.password),
+    }
+def get_redirect_method(method, status):
+    """Unified redirect method handling: return the method to use after a redirect"""
+    # A 303 must use either GET or HEAD for the subsequent request
+    # (RFC 7231, section 6.4.4)
+    if status == 303:
+        return method if method == 'HEAD' else 'GET'
+    # Browsers commonly turn a POST into a GET on 301 and 302 redirects,
+    # so we do the same (RFC 7231, sections 6.4.2 and 6.4.3)
+    if method == 'POST' and status in (301, 302):
+        return 'GET'
+    return method
+# Build a client-side ssl.SSLContext: sets hostname checking and verify mode from
+# `verify`, advertises http/1.1 via ALPN, loads CA certificates when verifying,
+# applies legacy or modern cipher settings, and optionally loads a client certificate.
+# Raises YoutubeDLError when the client certificate cannot be loaded.
+# NOTE(review): the closing `):` of the parameter list appears to have been lost
+# when this diff was extracted - confirm against the real file.
+def make_ssl_context(
+    verify=True,
+    client_certificate=None,
+    client_certificate_key=None,
+    client_certificate_password=None,
+    legacy_support=False,
+    use_certifi=True,
+    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
+    context.check_hostname = verify
+    context.verify_mode = ssl.CERT_REQUIRED if verify else ssl.CERT_NONE
+    # Some servers may reject requests if ALPN extension is not sent. See:
+    #
+    #
+    with contextlib.suppress(NotImplementedError):
+        context.set_alpn_protocols(['http/1.1'])
+    if verify:
+        ssl_load_certs(context, use_certifi)
+    if legacy_support:
+        context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
+        context.set_ciphers('DEFAULT')  # compat
+    elif ssl.OPENSSL_VERSION_INFO >= (1, 1, 1) and not ssl.OPENSSL_VERSION.startswith('LibreSSL'):
+        # Use the default SSL ciphers and minimum TLS version settings from Python 3.10 [1].
+        # This is to ensure consistent behavior across Python versions and libraries, and help avoid fingerprinting
+        # in some situations [2][3].
+        # Python 3.10 only supports OpenSSL 1.1.1+ [4]. Because this change is likely
+        # untested on older versions, we only apply this to OpenSSL 1.1.1+ to be safe.
+        # LibreSSL is excluded until further investigation due to cipher support issues [5][6].
+        # 1.
+        # 2.
+        # 3.
+        # 4.
+        # 5.
+        # 6.
+        # NOTE(review): the cipher-string argument of set_ciphers below is missing
+        # from this extraction - restore it from the real file, do not guess.
+        context.set_ciphers(
+        context.minimum_version = ssl.TLSVersion.TLSv1_2
+    if client_certificate:
+        try:
+            context.load_cert_chain(
+                client_certificate, keyfile=client_certificate_key,
+                password=client_certificate_password)
+        except ssl.SSLError:
+            raise YoutubeDLError('Unable to load client certificate')
+    return context
+def add_accept_encoding_header(headers, supported_encodings):
+    """Set a default Accept-Encoding header in `headers` (modified in place).
+    An existing Accept-Encoding value is never overwritten. Otherwise the header
+    becomes the comma-joined `supported_encodings`, or 'identity' when that is empty.
+    """
+    if 'Accept-Encoding' in headers:
+        return
+    headers['Accept-Encoding'] = ', '.join(supported_encodings) if supported_encodings else 'identity'

+ 315 - 0

@@ -0,0 +1,315 @@
+import functools
+import gzip
+import http.client
+import io
+import socket
+import ssl
+import urllib.error
+import urllib.parse
+import urllib.request
+import urllib.response
+import zlib
+from ._helper import (
+    add_accept_encoding_header,
+    get_redirect_method,
+    make_socks_proxy_opts,
+from ..dependencies import brotli
+from ..socks import sockssocket
+from ..utils import escape_url, update_url_query
+from ..utils.networking import clean_headers, std_headers
+# Content encodings passed to add_accept_encoding_header by HTTPHandler.http_request
+SUPPORTED_ENCODINGS = ['gzip', 'deflate']
+if brotli:
+    # 'br' is only offered when the brotli dependency is available (truthy)
+    SUPPORTED_ENCODINGS.append('br')
+def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
+    """Instantiate `http_class(*args, **kwargs)` and apply the 'source_address' param.
+    When `ydl_handler._params` has a non-None 'source_address', the connection's
+    `_create_connection` hook (if it has one) is replaced by a variant that only
+    connects to remote addresses of the same IP family as the source address, and
+    `source_address` is set on the connection with port 0.
+    `is_https` is accepted but not used in this function body.
+    Returns the connection object.
+    """
+    hc = http_class(*args, **kwargs)
+    source_address = ydl_handler._params.get('source_address')
+    if source_address is not None:
+        # This is to workaround _create_connection() from socket where it will try all
+        # address data from getaddrinfo() including IPv6. This filters the result from
+        # getaddrinfo() based on the source_address value.
+        # This is based on the cpython socket.create_connection() function.
+        #
+        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
+            # NOTE: the `source_address` parameter shadows the outer variable; the body
+            # indexes it with [0] and passes it to bind(), so it must be a tuple here
+            host, port = address
+            err = None
+            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
+            # A '.' in the source IP is taken to mean IPv4, anything else IPv6
+            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
+            ip_addrs = [addr for addr in addrs if addr[0] == af]
+            if addrs and not ip_addrs:
+                # The host resolved, but to no address of the required family
+                ip_version = 'v4' if af == socket.AF_INET else 'v6'
+                raise OSError(
+                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
+                    % (ip_version, source_address[0]))
+            # Try each candidate in turn; the first successful connect wins
+            for res in ip_addrs:
+                af, socktype, proto, canonname, sa = res
+                sock = None
+                try:
+                    sock = socket.socket(af, socktype, proto)
+                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
+                        sock.settimeout(timeout)
+                    sock.bind(source_address)
+                    sock.connect(sa)
+                    err = None  # Explicitly break reference cycle
+                    return sock
+                except OSError as _:
+                    # Remember the failure and close the socket before trying the next address
+                    err = _
+                    if sock is not None:
+                        sock.close()
+            # Every candidate failed (re-raise the last error) or there were none at all
+            if err is not None:
+                raise err
+            else:
+                raise OSError('getaddrinfo returns an empty list')
+        if hasattr(hc, '_create_connection'):
+            hc._create_connection = _create_connection
+        hc.source_address = (source_address, 0)
+    return hc
+class HTTPHandler(urllib.request.HTTPHandler):
+    """Handler for HTTP requests and responses.
+    This class, when installed with an OpenerDirector, automatically adds
+    the standard headers to every HTTP request and handles gzipped, deflated and
+    brotli responses from web servers.
+    Part of this code was copied from:
+    Andrew Rowls, the author of that code, agreed to release it to the
+    public domain.
+    """
+    def __init__(self, params, *args, **kwargs):
+        """Store `params` (read for 'source_address' and 'http_headers') on the handler"""
+        urllib.request.HTTPHandler.__init__(self, *args, **kwargs)
+        self._params = params
+    def http_open(self, req):
+        """Open `req`, routing through a SOCKS connection class if the request asks for one"""
+        conn_class = http.client.HTTPConnection
+        # Internal marker header set by ProxyHandler.proxy_open; it must not be sent out
+        socks_proxy = req.headers.get('Ytdl-socks-proxy')
+        if socks_proxy:
+            conn_class = make_socks_conn_class(conn_class, socks_proxy)
+            del req.headers['Ytdl-socks-proxy']
+        return self.do_open(functools.partial(
+            _create_http_connection, self, conn_class, False),
+            req)
+    @staticmethod
+    def deflate(data):
+        """Decompress deflate data, accepting both raw and zlib-wrapped streams"""
+        if not data:
+            return data
+        try:
+            return zlib.decompress(data, -zlib.MAX_WBITS)
+        except zlib.error:
+            return zlib.decompress(data)
+    @staticmethod
+    def brotli(data):
+        """Decompress brotli data; empty input is returned unchanged"""
+        if not data:
+            return data
+        return brotli.decompress(data)
+    @staticmethod
+    def gz(data):
+        """Decompress gzip data, tolerating up to 1023 bytes of trailing junk.
+        Raises the original OSError if no truncation makes the data readable.
+        """
+        gz = gzip.GzipFile(fileobj=io.BytesIO(data), mode='rb')
+        try:
+            return gz.read()
+        except OSError as original_oserror:
+            # There may be junk at the end of the file;
+            # retry with progressively more trailing bytes removed
+            for i in range(1, 1024):
+                try:
+                    gz = gzip.GzipFile(fileobj=io.BytesIO(data[:-i]), mode='rb')
+                    return gz.read()
+                except OSError:
+                    continue
+            else:
+                raise original_oserror
+    def http_request(self, req):
+        """Prepare an outgoing request: escape the URL and add default headers"""
+        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
+        # always respected by websites, some tend to give out URLs with non percent-encoded
+        # non-ASCII characters, and urllib chokes on such URLs.
+        # To work around this we replace the request's original URL with a
+        # percent-encoded one.
+        # Since redirects are also affected, the code of this workaround
+        # has been moved here from YoutubeDL.urlopen()
+        url = req.get_full_url()
+        url_escaped = escape_url(url)
+        # Substitute URL if any change after escaping
+        if url != url_escaped:
+            req = update_Request(req, url=url_escaped)
+        for h, v in self._params.get('http_headers', std_headers).items():
+            # Capitalize is needed because of Python bug 2275:
+            # The dict keys are capitalized because of this bug by urllib
+            if h.capitalize() not in req.headers:
+                req.add_header(h, v)
+        clean_headers(req.headers)
+        add_accept_encoding_header(req.headers, SUPPORTED_ENCODINGS)
+        return super().do_request_(req)
+    def http_response(self, req, resp):
+        """Decode a compressed response body and percent-encode redirect locations"""
+        old_resp = resp
+        # Content-Encoding header lists the encodings in order that they were applied
+        # (RFC 9110, section 8.4.1). To decompress, we simply do the reverse.
+        decoded_response = None
+        for encoding in (e.strip() for e in reversed(resp.headers.get('Content-encoding', '').split(','))):
+            # The first decoder reads the raw body; later ones work on the previous output
+            if encoding == 'gzip':
+                decoded_response = self.gz(decoded_response or resp.read())
+            elif encoding == 'deflate':
+                decoded_response = self.deflate(decoded_response or resp.read())
+            elif encoding == 'br' and brotli:
+                decoded_response = self.brotli(decoded_response or resp.read())
+        if decoded_response is not None:
+            resp = urllib.request.addinfourl(io.BytesIO(decoded_response), old_resp.headers, old_resp.url, old_resp.code)
+            resp.msg = old_resp.msg
+        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986
+        if 300 <= resp.code < 400:
+            location = resp.headers.get('Location')
+            if location:
+                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
+                location = location.encode('iso-8859-1').decode()
+                location_escaped = escape_url(location)
+                if location != location_escaped:
+                    del resp.headers['Location']
+                    resp.headers['Location'] = location_escaped
+        return resp
+    https_request = http_request
+    https_response = http_response
+def make_socks_conn_class(base_class, socks_proxy):
+    """Return a subclass of `base_class` whose connections go through `socks_proxy`.
+    `base_class` must derive from http.client.HTTPConnection or HTTPSConnection.
+    The proxy URL is parsed once, with make_socks_proxy_opts, when the class is created.
+    """
+    assert issubclass(base_class, (
+        http.client.HTTPConnection, http.client.HTTPSConnection))
+    proxy_args = make_socks_proxy_opts(socks_proxy)
+    class SocksConnection(base_class):
+        def connect(self):
+            # Replace the plain socket with a SOCKS-aware one before connecting
+            self.sock = sockssocket()
+            self.sock.setproxy(**proxy_args)
+            if isinstance(self.timeout, (int, float)):
+                self.sock.settimeout(self.timeout)
+            self.sock.connect((self.host, self.port))
+            # For HTTPS, TLS is layered on top of the established SOCKS tunnel
+            if isinstance(self, http.client.HTTPSConnection):
+                if hasattr(self, '_context'):  # Python > 2.6
+                    self.sock = self._context.wrap_socket(
+                        self.sock, server_hostname=self.host)
+                else:
+                    self.sock = ssl.wrap_socket(self.sock)
+    return SocksConnection
+class RedirectHandler(urllib.request.HTTPRedirectHandler):
+    """YoutubeDL redirect handler
+    The code is based on the HTTPRedirectHandler implementation from CPython.
+    This redirect handler fixes and improves the logic to better align with RFC 7231
+    and what browsers tend to do.
+    """
+    # Route every supported redirect status through the stock 302 handler
+    http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302
+    def redirect_request(self, req, fp, code, msg, headers, newurl):
+        """Build the follow-up request for a redirect to `newurl`.
+        Raises urllib.error.HTTPError for status codes that are not redirects.
+        The Cookie header is always dropped; the payload and its Content-* headers
+        are dropped only when the method changes.
+        """
+        if code not in (301, 302, 303, 307, 308):
+            raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)
+        new_data = req.data
+        # Technically the Cookie header should be in unredirected_hdrs,
+        # however in practice some may set it in normal headers anyway.
+        # We will remove it here to prevent any leaks.
+        remove_headers = ['Cookie']
+        new_method = get_redirect_method(req.get_method(), code)
+        # only remove payload if method changed (e.g. POST to GET)
+        if new_method != req.get_method():
+            new_data = None
+            remove_headers.extend(['Content-Length', 'Content-Type'])
+        # urllib stores header names capitalized; title() normalizes them for comparison
+        new_headers = {k: v for k, v in req.headers.items() if k.title() not in remove_headers}
+        return urllib.request.Request(
+            newurl, headers=new_headers, origin_req_host=req.origin_req_host,
+            unverifiable=True, method=new_method, data=new_data)
+class ProxyHandler(urllib.request.ProxyHandler):
+    """Proxy handler with per-request proxy overrides and SOCKS hand-off"""
+    def __init__(self, proxies=None):
+        # Install a default '<scheme>_open' handler that uses the '__noproxy__'
+        # sentinel, so proxy_open runs even when no proxy is configured
+        for scheme in ('http', 'https'):
+            setattr(self, f'{scheme}_open',
+                    lambda r, proxy='__noproxy__', type=scheme, meth=self.proxy_open:
+                        meth(r, proxy, type))
+        super().__init__(proxies)
+    def proxy_open(self, req, proxy, type):
+        # A 'Ytdl-request-proxy' header overrides the configured proxy for this request
+        override = req.headers.get('Ytdl-request-proxy')
+        if override is not None:
+            proxy = override
+            del req.headers['Ytdl-request-proxy']
+        if proxy == '__noproxy__':
+            return None  # No Proxy
+        proxy_scheme = urllib.parse.urlparse(proxy).scheme.lower()
+        if proxy_scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
+            # yt-dlp's http/https handlers wrap the socket with socks themselves
+            req.add_header('Ytdl-socks-proxy', proxy)
+            return None
+        return super().proxy_open(req, proxy, type)
+class PUTRequest(urllib.request.Request):
+    """Request whose get_method() always returns 'PUT'"""
+    def get_method(self):
+        return 'PUT'
+class HEADRequest(urllib.request.Request):
+    """Request whose get_method() always returns 'HEAD'"""
+    def get_method(self):
+        return 'HEAD'
+def update_Request(req, url=None, data=None, headers=None, query=None):
+    """Return a copy of `req` with the given parts replaced.
+    `headers` are merged over the existing ones; `url` and `data` fall back to the
+    values of `req` when not given; `query` is applied with update_url_query.
+    HEAD and PUT requests keep their method through HEADRequest / PUTRequest.
+    A `timeout` attribute on `req`, if present, is copied to the new request.
+    """
+    req_headers = req.headers.copy()
+    req_headers.update(headers or {})
+    req_data = data or req.data
+    req_url = update_url_query(url or req.get_full_url(), query)
+    req_get_method = req.get_method()
+    # Pick the request class that reports the same method as the original
+    if req_get_method == 'HEAD':
+        req_type = HEADRequest
+    elif req_get_method == 'PUT':
+        req_type = PUTRequest
+    else:
+        req_type = urllib.request.Request
+    new_req = req_type(
+        req_url, data=req_data, headers=req_headers,
+        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
+    if hasattr(req, 'timeout'):
+        new_req.timeout = req.timeout
+    return new_req

+ 9 - 0

@@ -0,0 +1,9 @@
+import http.client
+import socket
+import ssl
+import urllib.error
+# Exception types treated as network errors; ssl.CertificateError is included
+# only when the ssl module provides it
+network_exceptions = (
+    urllib.error.URLError,
+    http.client.HTTPException,
+    socket.error,
+    *((ssl.CertificateError,) if hasattr(ssl, 'CertificateError') else ()),
+)

+ 1 - 4

@@ -3,13 +3,10 @@ import warnings
 from ..compat.compat_utils import passthrough_module
-# XXX: Implement this the same way as other DeprecationWarnings without circular import
-passthrough_module(__name__, '._legacy', callback=lambda attr: warnings.warn(
-    DeprecationWarning(f'{__name__}.{attr} is deprecated'), stacklevel=5))
+passthrough_module(__name__, '._deprecated')
 del passthrough_module
 # isort: off
 from .traversal import *
 from ._utils import *
 from ._utils import _configuration_args, _get_exe_version_output
-from ._deprecated import *

Some files were not shown because too many files changed in this diff