networking.py

import collections
import random
import urllib.parse
import urllib.request

from ._utils import remove_start


def random_user_agent():
    _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
    _CHROME_VERSIONS = (
        '90.0.4430.212',
        '90.0.4430.24',
        '90.0.4430.70',
        '90.0.4430.72',
        '90.0.4430.85',
        '90.0.4430.93',
        '91.0.4472.101',
        '91.0.4472.106',
        '91.0.4472.114',
        '91.0.4472.124',
        '91.0.4472.164',
        '91.0.4472.19',
        '91.0.4472.77',
        '92.0.4515.107',
        '92.0.4515.115',
        '92.0.4515.131',
        '92.0.4515.159',
        '92.0.4515.43',
        '93.0.4556.0',
        '93.0.4577.15',
        '93.0.4577.63',
        '93.0.4577.82',
        '94.0.4606.41',
        '94.0.4606.54',
        '94.0.4606.61',
        '94.0.4606.71',
        '94.0.4606.81',
        '94.0.4606.85',
        '95.0.4638.17',
        '95.0.4638.50',
        '95.0.4638.54',
        '95.0.4638.69',
        '95.0.4638.74',
        '96.0.4664.18',
        '96.0.4664.45',
        '96.0.4664.55',
        '96.0.4664.93',
        '97.0.4692.20',
    )
    return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)
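

# Illustrative usage (not part of the original module): each call fills the
# template with one of the Chrome versions above, chosen at random, e.g.:
#
#     >>> random_user_agent()  # doctest: +SKIP
#     'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.20 Safari/537.36'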


class HTTPHeaderDict(collections.UserDict, dict):
    """
    Store and access keys case-insensitively.
    The constructor can take multiple dicts, in which case keys from later dicts take priority.
    """

    def __init__(self, *args, **kwargs):
        super().__init__()
        for dct in args:
            if dct is not None:
                self.update(dct)
        self.update(kwargs)

    def __setitem__(self, key, value):
        if isinstance(value, bytes):
            value = value.decode('latin-1')
        super().__setitem__(key.title(), str(value).strip())

    def __getitem__(self, key):
        return super().__getitem__(key.title())

    def __delitem__(self, key):
        super().__delitem__(key.title())

    def __contains__(self, key):
        return super().__contains__(key.title() if isinstance(key, str) else key)
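

# Illustrative usage (not part of the original module): keys are normalized via
# str.title() on every access, so lookups are case-insensitive, and bytes
# values are decoded as latin-1:
#
#     >>> headers = HTTPHeaderDict({'content-type': b'text/html'})
#     >>> headers['CONTENT-TYPE']
#     'text/html'
#     >>> 'Content-Type' in headers
#     True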


std_headers = HTTPHeaderDict({
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-us,en;q=0.5',
    'Sec-Fetch-Mode': 'navigate',
})


def clean_proxies(proxies: dict, headers: HTTPHeaderDict):
    req_proxy = headers.pop('Ytdl-Request-Proxy', None)
    if req_proxy:
        proxies.clear()  # XXX: compat: Ytdl-Request-Proxy takes preference over everything, including NO_PROXY
        proxies['all'] = req_proxy
    for proxy_key, proxy_url in proxies.items():
        if proxy_url == '__noproxy__':
            proxies[proxy_key] = None
            continue
        if proxy_key == 'no':  # special case
            continue
        if proxy_url is not None:
            # Ensure proxies without a scheme are http.
            try:
                proxy_scheme = urllib.request._parse_proxy(proxy_url)[0]
            except ValueError:
                # Ignore invalid proxy URLs. Sometimes these may be introduced through environment
                # variables unrelated to proxy settings - e.g. Colab `COLAB_LANGUAGE_SERVER_PROXY`.
                # If the proxy is going to be used, the Request Handler proxy validation will handle it.
                continue
            if proxy_scheme is None:
                proxies[proxy_key] = 'http://' + remove_start(proxy_url, '//')

            replace_scheme = {
                'socks5': 'socks5h',  # compat: socks5 was treated as socks5h
                'socks': 'socks4',  # compat: non-standard
            }
            if proxy_scheme in replace_scheme:
                proxies[proxy_key] = urllib.parse.urlunparse(
                    urllib.parse.urlparse(proxy_url)._replace(scheme=replace_scheme[proxy_scheme]))
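

# Illustrative usage (not part of the original module): the dict is mutated in
# place; scheme-less proxies gain an http:// prefix and compat schemes are
# rewritten:
#
#     >>> proxies = {'http': '127.0.0.1:8080', 'all': 'socks5://127.0.0.1:1080'}
#     >>> clean_proxies(proxies, HTTPHeaderDict())
#     >>> proxies
#     {'http': 'http://127.0.0.1:8080', 'all': 'socks5h://127.0.0.1:1080'}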


def clean_headers(headers: HTTPHeaderDict):
    if 'Youtubedl-No-Compression' in headers:  # compat
        del headers['Youtubedl-No-Compression']
        headers['Accept-Encoding'] = 'identity'
    headers.pop('Ytdl-socks-proxy', None)
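

# Illustrative usage (not part of the original module): the deprecated
# Youtubedl-No-Compression header is translated into Accept-Encoding:
#
#     >>> headers = HTTPHeaderDict({'Youtubedl-No-Compression': '1'})
#     >>> clean_headers(headers)
#     >>> headers['Accept-Encoding']
#     'identity'
#     >>> 'Youtubedl-No-Compression' in headers
#     False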


def remove_dot_segments(path):
    # Implements RFC 3986 5.2.4 remove_dot_segments
    # Pseudo-code: https://tools.ietf.org/html/rfc3986#section-5.2.4
    # https://github.com/urllib3/urllib3/blob/ba49f5c4e19e6bca6827282feb77a3c9f937e64b/src/urllib3/util/url.py#L263
    output = []
    segments = path.split('/')
    for s in segments:
        if s == '.':
            continue
        elif s == '..':
            if output:
                output.pop()
        else:
            output.append(s)
    if not segments[0] and (not output or output[0]):
        output.insert(0, '')
    if segments[-1] in ('.', '..'):
        output.append('')
    return '/'.join(output)
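

# Illustrative usage (not part of the original module): the two worked examples
# from RFC 3986 section 5.2.4:
#
#     >>> remove_dot_segments('/a/b/c/./../../g')
#     '/a/g'
#     >>> remove_dot_segments('mid/content=5/../6')
#     'mid/6'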


def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    return urllib.parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
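

# Illustrative usage (not part of the original module): characters in the safe
# set (including '%', so existing escapes survive) pass through, while spaces
# and non-ASCII characters are percent-encoded as UTF-8:
#
#     >>> escape_rfc3986('/watch?v=ä ö')
#     '/watch?v=%C3%A4%20%C3%B6'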


def normalize_url(url):
    """Normalize URL as suggested by RFC 3986"""
    url_parsed = urllib.parse.urlparse(url)
    return url_parsed._replace(
        netloc=url_parsed.netloc.encode('idna').decode('ascii'),
        path=escape_rfc3986(remove_dot_segments(url_parsed.path)),
        params=escape_rfc3986(url_parsed.params),
        query=escape_rfc3986(url_parsed.query),
        fragment=escape_rfc3986(url_parsed.fragment),
    ).geturl()
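

# Illustrative usage (not part of the original module): the netloc is
# IDNA-encoded, dot segments in the path are collapsed, and the remaining
# components are percent-escaped:
#
#     >>> normalize_url('http://müller.example/a/./b?q=ä')
#     'http://xn--mller-kva.example/a/b?q=%C3%A4'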