youtube.py 149 KB


  1. # coding: utf-8
  2. from __future__ import unicode_literals
  3. import itertools
  4. import json
  5. import os.path
  6. import random
  7. import re
  8. import time
  9. import traceback
  10. from .common import InfoExtractor, SearchInfoExtractor
  11. from ..compat import (
  12. compat_chr,
  13. compat_HTTPError,
  14. compat_kwargs,
  15. compat_parse_qs,
  16. compat_str,
  17. compat_urllib_parse_unquote_plus,
  18. compat_urllib_parse_urlencode,
  19. compat_urllib_parse_urlparse,
  20. compat_urlparse,
  21. )
  22. from ..jsinterp import JSInterpreter
  23. from ..utils import (
  24. clean_html,
  25. ExtractorError,
  26. format_field,
  27. float_or_none,
  28. int_or_none,
  29. mimetype2ext,
  30. parse_codecs,
  31. parse_duration,
  32. # qualities, # TODO: Enable this after fixing formatSort
  33. remove_start,
  34. smuggle_url,
  35. str_or_none,
  36. str_to_int,
  37. try_get,
  38. unescapeHTML,
  39. unified_strdate,
  40. unsmuggle_url,
  41. update_url_query,
  42. url_or_none,
  43. urlencode_postdata,
  44. urljoin,
  45. )
  46. class YoutubeBaseInfoExtractor(InfoExtractor):
  47. """Provide base functions for Youtube extractors"""
  48. _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
  49. _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'
  50. _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup'
  51. _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge'
  52. _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}'
  53. _RESERVED_NAMES = (
  54. r'embed|e|watch_popup|channel|c|user|playlist|watch|w|v|movies|results|shared|'
  55. r'storefront|oops|index|account|reporthistory|t/terms|about|upload|signin|logout|'
  56. r'feed/(?:watch_later|history|subscriptions|library|trending|recommended)')
  57. _NETRC_MACHINE = 'youtube'
  58. # If True it will raise an error if no login info is provided
  59. _LOGIN_REQUIRED = False
  60. _PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}|RDMM|WL|LL|LM)'
  61. def _ids_to_results(self, ids):
  62. return [
  63. self.url_result(vid_id, 'Youtube', video_id=vid_id)
  64. for vid_id in ids]
  65. def _login(self):
  66. """
  67. Attempt to log in to YouTube.
  68. True is returned if successful or skipped.
  69. False is returned if login failed.
  70. If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
  71. """
  72. username, password = self._get_login_info()
  73. # No authentication to be performed
  74. if username is None:
  75. if self._LOGIN_REQUIRED and self._downloader.params.get('cookiefile') is None:
  76. raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
  77. # if self._downloader.params.get('cookiefile'): # TODO remove 'and False' later - too many people using outdated cookies and open issues, remind them.
  78. # self.to_screen('[Cookies] Reminder - Make sure to always use up to date cookies!')
  79. return True
  80. login_page = self._download_webpage(
  81. self._LOGIN_URL, None,
  82. note='Downloading login page',
  83. errnote='unable to fetch login page', fatal=False)
  84. if login_page is False:
  85. return
  86. login_form = self._hidden_inputs(login_page)
  87. def req(url, f_req, note, errnote):
  88. data = login_form.copy()
  89. data.update({
  90. 'pstMsg': 1,
  91. 'checkConnection': 'youtube',
  92. 'checkedDomains': 'youtube',
  93. 'hl': 'en',
  94. 'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]',
  95. 'f.req': json.dumps(f_req),
  96. 'flowName': 'GlifWebSignIn',
  97. 'flowEntry': 'ServiceLogin',
  98. # TODO: reverse actual botguard identifier generation algo
  99. 'bgRequest': '["identifier",""]',
  100. })
  101. return self._download_json(
  102. url, None, note=note, errnote=errnote,
  103. transform_source=lambda s: re.sub(r'^[^[]*', '', s),
  104. fatal=False,
  105. data=urlencode_postdata(data), headers={
  106. 'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8',
  107. 'Google-Accounts-XSRF': 1,
  108. })
  109. def warn(message):
  110. self._downloader.report_warning(message)
  111. lookup_req = [
  112. username,
  113. None, [], None, 'US', None, None, 2, False, True,
  114. [
  115. None, None,
  116. [2, 1, None, 1,
  117. 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn',
  118. None, [], 4],
  119. 1, [None, None, []], None, None, None, True
  120. ],
  121. username,
  122. ]
  123. lookup_results = req(
  124. self._LOOKUP_URL, lookup_req,
  125. 'Looking up account info', 'Unable to look up account info')
  126. if lookup_results is False:
  127. return False
  128. user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str)
  129. if not user_hash:
  130. warn('Unable to extract user hash')
  131. return False
  132. challenge_req = [
  133. user_hash,
  134. None, 1, None, [1, None, None, None, [password, None, True]],
  135. [
  136. None, None, [2, 1, None, 1, 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', None, [], 4],
  137. 1, [None, None, []], None, None, None, True
  138. ]]
  139. challenge_results = req(
  140. self._CHALLENGE_URL, challenge_req,
  141. 'Logging in', 'Unable to log in')
  142. if challenge_results is False:
  143. return
  144. login_res = try_get(challenge_results, lambda x: x[0][5], list)
  145. if login_res:
  146. login_msg = try_get(login_res, lambda x: x[5], compat_str)
  147. warn(
  148. 'Unable to login: %s' % 'Invalid password'
  149. if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg)
  150. return False
  151. res = try_get(challenge_results, lambda x: x[0][-1], list)
  152. if not res:
  153. warn('Unable to extract result entry')
  154. return False
  155. login_challenge = try_get(res, lambda x: x[0][0], list)
  156. if login_challenge:
  157. challenge_str = try_get(login_challenge, lambda x: x[2], compat_str)
  158. if challenge_str == 'TWO_STEP_VERIFICATION':
  159. # SEND_SUCCESS - TFA code has been successfully sent to phone
  160. # QUOTA_EXCEEDED - reached the limit of TFA codes
  161. status = try_get(login_challenge, lambda x: x[5], compat_str)
  162. if status == 'QUOTA_EXCEEDED':
  163. warn('Exceeded the limit of TFA codes, try later')
  164. return False
  165. tl = try_get(challenge_results, lambda x: x[1][2], compat_str)
  166. if not tl:
  167. warn('Unable to extract TL')
  168. return False
  169. tfa_code = self._get_tfa_info('2-step verification code')
  170. if not tfa_code:
  171. warn(
  172. 'Two-factor authentication required. Provide it either interactively or with --twofactor <code>'
  173. '(Note that only TOTP (Google Authenticator App) codes work at this time.)')
  174. return False
  175. tfa_code = remove_start(tfa_code, 'G-')
  176. tfa_req = [
  177. user_hash, None, 2, None,
  178. [
  179. 9, None, None, None, None, None, None, None,
  180. [None, tfa_code, True, 2]
  181. ]]
  182. tfa_results = req(
  183. self._TFA_URL.format(tl), tfa_req,
  184. 'Submitting TFA code', 'Unable to submit TFA code')
  185. if tfa_results is False:
  186. return False
  187. tfa_res = try_get(tfa_results, lambda x: x[0][5], list)
  188. if tfa_res:
  189. tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str)
  190. warn(
  191. 'Unable to finish TFA: %s' % 'Invalid TFA code'
  192. if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg)
  193. return False
  194. check_cookie_url = try_get(
  195. tfa_results, lambda x: x[0][-1][2], compat_str)
  196. else:
  197. CHALLENGES = {
  198. 'LOGIN_CHALLENGE': "This device isn't recognized. For your security, Google wants to make sure it's really you.",
  199. 'USERNAME_RECOVERY': 'Please provide additional information to aid in the recovery process.',
  200. 'REAUTH': "There is something unusual about your activity. For your security, Google wants to make sure it's really you.",
  201. }
  202. challenge = CHALLENGES.get(
  203. challenge_str,
  204. '%s returned error %s.' % (self.IE_NAME, challenge_str))
  205. warn('%s\nGo to https://accounts.google.com/, login and solve a challenge.' % challenge)
  206. return False
  207. else:
  208. check_cookie_url = try_get(res, lambda x: x[2], compat_str)
  209. if not check_cookie_url:
  210. warn('Unable to extract CheckCookie URL')
  211. return False
  212. check_cookie_results = self._download_webpage(
  213. check_cookie_url, None, 'Checking cookie', fatal=False)
  214. if check_cookie_results is False:
  215. return False
  216. if 'https://myaccount.google.com/' not in check_cookie_results:
  217. warn('Unable to log in')
  218. return False
  219. return True
  220. def _download_webpage_handle(self, *args, **kwargs):
  221. query = kwargs.get('query', {}).copy()
  222. kwargs['query'] = query
  223. return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle(
  224. *args, **compat_kwargs(kwargs))
  225. def _real_initialize(self):
  226. if self._downloader is None:
  227. return
  228. if not self._login():
  229. return
  230. _DEFAULT_API_DATA = {
  231. 'context': {
  232. 'client': {
  233. 'clientName': 'WEB',
  234. 'clientVersion': '2.20201021.03.00',
  235. }
  236. },
  237. }
  238. _YT_INITIAL_DATA_RE = r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;'
  239. _YT_INITIAL_PLAYER_RESPONSE_RE = r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;'
  240. _YT_INITIAL_BOUNDARY_RE = r'(?:var\s+meta|</script|\n)'
  241. def _call_api(self, ep, query, video_id, fatal=True):
  242. data = self._DEFAULT_API_DATA.copy()
  243. data.update(query)
  244. return self._download_json(
  245. 'https://www.youtube.com/youtubei/v1/%s' % ep, video_id=video_id,
  246. note='Downloading API JSON', errnote='Unable to download API page',
  247. data=json.dumps(data).encode('utf8'), fatal=fatal,
  248. headers={'content-type': 'application/json'},
  249. query={'key': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8'})
  250. def _extract_yt_initial_data(self, video_id, webpage):
  251. return self._parse_json(
  252. self._search_regex(
  253. (r'%s\s*%s' % (self._YT_INITIAL_DATA_RE, self._YT_INITIAL_BOUNDARY_RE),
  254. self._YT_INITIAL_DATA_RE), webpage, 'yt initial data'),
  255. video_id)
  256. def _extract_ytcfg(self, video_id, webpage):
  257. return self._parse_json(
  258. self._search_regex(
  259. r'ytcfg\.set\s*\(\s*({.+?})\s*\)\s*;', webpage, 'ytcfg',
  260. default='{}'), video_id, fatal=False)
  261. def _extract_video(self, renderer):
  262. video_id = renderer.get('videoId')
  263. title = try_get(
  264. renderer,
  265. (lambda x: x['title']['runs'][0]['text'],
  266. lambda x: x['title']['simpleText']), compat_str)
  267. description = try_get(
  268. renderer, lambda x: x['descriptionSnippet']['runs'][0]['text'],
  269. compat_str)
  270. duration = parse_duration(try_get(
  271. renderer, lambda x: x['lengthText']['simpleText'], compat_str))
  272. view_count_text = try_get(
  273. renderer, lambda x: x['viewCountText']['simpleText'], compat_str) or ''
  274. view_count = str_to_int(self._search_regex(
  275. r'^([\d,]+)', re.sub(r'\s', '', view_count_text),
  276. 'view count', default=None))
  277. uploader = try_get(
  278. renderer, lambda x: x['ownerText']['runs'][0]['text'], compat_str)
  279. return {
  280. '_type': 'url_transparent',
  281. 'ie_key': YoutubeIE.ie_key(),
  282. 'id': video_id,
  283. 'url': video_id,
  284. 'title': title,
  285. 'description': description,
  286. 'duration': duration,
  287. 'view_count': view_count,
  288. 'uploader': uploader,
  289. }
  290. class YoutubeIE(YoutubeBaseInfoExtractor):
  291. IE_DESC = 'YouTube.com'
  292. _VALID_URL = r"""(?x)^
  293. (
  294. (?:https?://|//) # http(s):// or protocol-independent URL
  295. (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie|kids)?\.com/|
  296. (?:www\.)?deturl\.com/www\.youtube\.com/|
  297. (?:www\.)?pwnyoutube\.com/|
  298. (?:www\.)?hooktube\.com/|
  299. (?:www\.)?yourepeat\.com/|
  300. tube\.majestyc\.net/|
  301. # Invidious instances taken from https://github.com/omarroth/invidious/wiki/Invidious-Instances
  302. (?:(?:www|dev)\.)?invidio\.us/|
  303. (?:(?:www|no)\.)?invidiou\.sh/|
  304. (?:(?:www|fi)\.)?invidious\.snopyta\.org/|
  305. (?:www\.)?invidious\.kabi\.tk/|
  306. (?:www\.)?invidious\.13ad\.de/|
  307. (?:www\.)?invidious\.mastodon\.host/|
  308. (?:www\.)?invidious\.zapashcanon\.fr/|
  309. (?:www\.)?invidious\.kavin\.rocks/|
  310. (?:www\.)?invidious\.tube/|
  311. (?:www\.)?invidiou\.site/|
  312. (?:www\.)?invidious\.site/|
  313. (?:www\.)?invidious\.xyz/|
  314. (?:www\.)?invidious\.nixnet\.xyz/|
  315. (?:www\.)?invidious\.drycat\.fr/|
  316. (?:www\.)?tube\.poal\.co/|
  317. (?:www\.)?tube\.connect\.cafe/|
  318. (?:www\.)?vid\.wxzm\.sx/|
  319. (?:www\.)?vid\.mint\.lgbt/|
  320. (?:www\.)?yewtu\.be/|
  321. (?:www\.)?yt\.elukerio\.org/|
  322. (?:www\.)?yt\.lelux\.fi/|
  323. (?:www\.)?invidious\.ggc-project\.de/|
  324. (?:www\.)?yt\.maisputain\.ovh/|
  325. (?:www\.)?invidious\.13ad\.de/|
  326. (?:www\.)?invidious\.toot\.koeln/|
  327. (?:www\.)?invidious\.fdn\.fr/|
  328. (?:www\.)?watch\.nettohikari\.com/|
  329. (?:www\.)?kgg2m7yk5aybusll\.onion/|
  330. (?:www\.)?qklhadlycap4cnod\.onion/|
  331. (?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion/|
  332. (?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion/|
  333. (?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion/|
  334. (?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion/|
  335. (?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p/|
  336. (?:www\.)?4l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd\.onion/|
  337. youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
  338. (?:.*?\#/)? # handle anchor (#/) redirect urls
  339. (?: # the various things that can precede the ID:
  340. (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
  341. |(?: # or the v= param in all its forms
  342. (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
  343. (?:\?|\#!?) # the params delimiter ? or # or #!
  344. (?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&amp;v=V36LpHqtcDY)
  345. v=
  346. )
  347. ))
  348. |(?:
  349. youtu\.be| # just youtu.be/xxxx
  350. vid\.plus| # or vid.plus/xxxx
  351. zwearz\.com/watch| # or zwearz.com/watch/xxxx
  352. )/
  353. |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
  354. )
  355. )? # all until now is optional -> you can pass the naked ID
  356. (?P<id>[0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
  357. (?!.*?\blist=
  358. (?:
  359. %(playlist_id)s| # combined list/video URLs are handled by the playlist IE
  360. WL # WL are handled by the watch later IE
  361. )
  362. )
  363. (?(1).+)? # if we found the ID, everything can follow
  364. $""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
  365. _PLAYER_INFO_RE = (
  366. r'/s/player/(?P<id>[a-zA-Z0-9_-]{8,})/player',
  367. r'/(?P<id>[a-zA-Z0-9_-]{8,})/player(?:_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?|-plasma-ias-(?:phone|tablet)-[a-z]{2}_[A-Z]{2}\.vflset)/base\.js$',
  368. r'\b(?P<id>vfl[a-zA-Z0-9_-]+)\b.*?\.js$',
  369. )
  370. _formats = {
  371. '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
  372. '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
  373. '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},
  374. '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'},
  375. '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'},
  376. '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
  377. '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
  378. '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
  379. # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well
  380. '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},
  381. '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
  382. '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
  383. '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
  384. '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
  385. '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
  386. '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
  387. '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
  388. '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
  389. # 3D videos
  390. '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
  391. '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
  392. '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
  393. '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
  394. '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20},
  395. '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
  396. '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
  397. # Apple HTTP Live Streaming
  398. '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
  399. '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
  400. '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
  401. '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
  402. '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
  403. '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
  404. '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
  405. '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10},
  406. # DASH mp4 video
  407. '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'},
  408. '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'},
  409. '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
  410. '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'},
  411. '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'},
  412. '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559)
  413. '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'},
  414. '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
  415. '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'},
  416. '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
  417. '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
  418. '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'},
  419. # Dash mp4 audio
  420. '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'},
  421. '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'},
  422. '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'},
  423. '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
  424. '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
  425. '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'},
  426. '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'},
  427. # Dash webm
  428. '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
  429. '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
  430. '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
  431. '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
  432. '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
  433. '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
  434. '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'},
  435. '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'},
  436. '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'},
  437. '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
  438. '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
  439. '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
  440. '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'},
  441. '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'},
  442. '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'},
  443. # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
  444. '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
  445. '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
  446. '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
  447. '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
  448. '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
  449. '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
  450. # Dash webm audio
  451. '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128},
  452. '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256},
  453. # Dash webm audio with opus inside
  454. '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50},
  455. '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70},
  456. '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160},
  457. # RTMP (unnamed)
  458. '_rtmp': {'protocol': 'rtmp'},
  459. # av01 video only formats sometimes served with "unknown" codecs
  460. '394': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
  461. '395': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
  462. '396': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
  463. '397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
  464. }
  465. _SUBTITLE_FORMATS = ('json3', 'srv1', 'srv2', 'srv3', 'ttml', 'vtt')
  466. _GEO_BYPASS = False
  467. IE_NAME = 'youtube'
  468. _TESTS = [
  469. {
  470. 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9',
  471. 'info_dict': {
  472. 'id': 'BaW_jenozKc',
  473. 'ext': 'mp4',
  474. 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
  475. 'uploader': 'Philipp Hagemeister',
  476. 'uploader_id': 'phihag',
  477. 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
  478. 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q',
  479. 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q',
  480. 'upload_date': '20121002',
  481. 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
  482. 'categories': ['Science & Technology'],
  483. 'tags': ['youtube-dl'],
  484. 'duration': 10,
  485. 'view_count': int,
  486. 'like_count': int,
  487. 'dislike_count': int,
  488. 'start_time': 1,
  489. 'end_time': 9,
  490. }
  491. },
  492. {
  493. 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
  494. 'note': 'Embed-only video (#1746)',
  495. 'info_dict': {
  496. 'id': 'yZIXLfi8CZQ',
  497. 'ext': 'mp4',
  498. 'upload_date': '20120608',
  499. 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
  500. 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
  501. 'uploader': 'SET India',
  502. 'uploader_id': 'setindia',
  503. 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia',
  504. 'age_limit': 18,
  505. },
  506. 'skip': 'Private video',
  507. },
  508. {
  509. 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=yZIXLfi8CZQ',
  510. 'note': 'Use the first video ID in the URL',
  511. 'info_dict': {
  512. 'id': 'BaW_jenozKc',
  513. 'ext': 'mp4',
  514. 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
  515. 'uploader': 'Philipp Hagemeister',
  516. 'uploader_id': 'phihag',
  517. 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
  518. 'upload_date': '20121002',
  519. 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
  520. 'categories': ['Science & Technology'],
  521. 'tags': ['youtube-dl'],
  522. 'duration': 10,
  523. 'view_count': int,
  524. 'like_count': int,
  525. 'dislike_count': int,
  526. },
  527. 'params': {
  528. 'skip_download': True,
  529. },
  530. },
  531. {
  532. 'url': 'https://www.youtube.com/watch?v=a9LDPn-MO4I',
  533. 'note': '256k DASH audio (format 141) via DASH manifest',
  534. 'info_dict': {
  535. 'id': 'a9LDPn-MO4I',
  536. 'ext': 'm4a',
  537. 'upload_date': '20121002',
  538. 'uploader_id': '8KVIDEO',
  539. 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO',
  540. 'description': '',
  541. 'uploader': '8KVIDEO',
  542. 'title': 'UHDTV TEST 8K VIDEO.mp4'
  543. },
  544. 'params': {
  545. 'youtube_include_dash_manifest': True,
  546. 'format': '141',
  547. },
  548. 'skip': 'format 141 not served anymore',
  549. },
  550. # DASH manifest with encrypted signature
  551. {
  552. 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
  553. 'info_dict': {
  554. 'id': 'IB3lcPjvWLA',
  555. 'ext': 'm4a',
  556. 'title': 'Afrojack, Spree Wilson - The Spark (Official Music Video) ft. Spree Wilson',
  557. 'description': 'md5:8f5e2b82460520b619ccac1f509d43bf',
  558. 'duration': 244,
  559. 'uploader': 'AfrojackVEVO',
  560. 'uploader_id': 'AfrojackVEVO',
  561. 'upload_date': '20131011',
  562. 'abr': 129.495,
  563. },
  564. 'params': {
  565. 'youtube_include_dash_manifest': True,
  566. 'format': '141/bestaudio[ext=m4a]',
  567. },
  568. },
  569. # Controversy video
  570. {
  571. 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
  572. 'info_dict': {
  573. 'id': 'T4XJQO3qol8',
  574. 'ext': 'mp4',
  575. 'duration': 219,
  576. 'upload_date': '20100909',
  577. 'uploader': 'Amazing Atheist',
  578. 'uploader_id': 'TheAmazingAtheist',
  579. 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist',
  580. 'title': 'Burning Everyone\'s Koran',
  581. 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms \r\n\r\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
  582. }
  583. },
  584. # Normal age-gate video (embed allowed)
  585. {
  586. 'url': 'https://youtube.com/watch?v=HtVdAasjOgU',
  587. 'info_dict': {
  588. 'id': 'HtVdAasjOgU',
  589. 'ext': 'mp4',
  590. 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
  591. 'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
  592. 'duration': 142,
  593. 'uploader': 'The Witcher',
  594. 'uploader_id': 'WitcherGame',
  595. 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame',
  596. 'upload_date': '20140605',
  597. 'age_limit': 18,
  598. },
  599. },
  600. # video_info is None (https://github.com/ytdl-org/youtube-dl/issues/4421)
  601. # YouTube Red ad is not captured for creator
  602. {
  603. 'url': '__2ABJjxzNo',
  604. 'info_dict': {
  605. 'id': '__2ABJjxzNo',
  606. 'ext': 'mp4',
  607. 'duration': 266,
  608. 'upload_date': '20100430',
  609. 'uploader_id': 'deadmau5',
  610. 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5',
  611. 'creator': 'deadmau5',
  612. 'description': 'md5:6cbcd3a92ce1bc676fc4d6ab4ace2336',
  613. 'uploader': 'deadmau5',
  614. 'title': 'Deadmau5 - Some Chords (HD)',
  615. 'alt_title': 'Some Chords',
  616. },
  617. 'expected_warnings': [
  618. 'DASH manifest missing',
  619. ]
  620. },
  621. # Olympics (https://github.com/ytdl-org/youtube-dl/issues/4431)
  622. {
  623. 'url': 'lqQg6PlCWgI',
  624. 'info_dict': {
  625. 'id': 'lqQg6PlCWgI',
  626. 'ext': 'mp4',
  627. 'duration': 6085,
  628. 'upload_date': '20150827',
  629. 'uploader_id': 'olympic',
  630. 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic',
  631. 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
  632. 'uploader': 'Olympic',
  633. 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
  634. },
  635. 'params': {
  636. 'skip_download': 'requires avconv',
  637. }
  638. },
  639. # Non-square pixels
  640. {
  641. 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
  642. 'info_dict': {
  643. 'id': '_b-2C3KPAM0',
  644. 'ext': 'mp4',
  645. 'stretched_ratio': 16 / 9.,
  646. 'duration': 85,
  647. 'upload_date': '20110310',
  648. 'uploader_id': 'AllenMeow',
  649. 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow',
  650. 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
  651. 'uploader': '孫ᄋᄅ',
  652. 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
  653. },
  654. },
  655. # url_encoded_fmt_stream_map is empty string
  656. {
  657. 'url': 'qEJwOuvDf7I',
  658. 'info_dict': {
  659. 'id': 'qEJwOuvDf7I',
  660. 'ext': 'webm',
  661. 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',
  662. 'description': '',
  663. 'upload_date': '20150404',
  664. 'uploader_id': 'spbelect',
  665. 'uploader': 'Наблюдатели Петербурга',
  666. },
  667. 'params': {
  668. 'skip_download': 'requires avconv',
  669. },
  670. 'skip': 'This live event has ended.',
  671. },
  672. # Extraction from multiple DASH manifests (https://github.com/ytdl-org/youtube-dl/pull/6097)
  673. {
  674. 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',
  675. 'info_dict': {
  676. 'id': 'FIl7x6_3R5Y',
  677. 'ext': 'webm',
  678. 'title': 'md5:7b81415841e02ecd4313668cde88737a',
  679. 'description': 'md5:116377fd2963b81ec4ce64b542173306',
  680. 'duration': 220,
  681. 'upload_date': '20150625',
  682. 'uploader_id': 'dorappi2000',
  683. 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000',
  684. 'uploader': 'dorappi2000',
  685. 'formats': 'mincount:31',
  686. },
  687. 'skip': 'not actual anymore',
  688. },
  689. # DASH manifest with segment_list
  690. {
  691. 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8',
  692. 'md5': '8ce563a1d667b599d21064e982ab9e31',
  693. 'info_dict': {
  694. 'id': 'CsmdDsKjzN8',
  695. 'ext': 'mp4',
  696. 'upload_date': '20150501', # According to '<meta itemprop="datePublished"', but in other places it's 20150510
  697. 'uploader': 'Airtek',
  698. 'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',
  699. 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',
  700. 'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',
  701. },
  702. 'params': {
  703. 'youtube_include_dash_manifest': True,
  704. 'format': '135', # bestvideo
  705. },
  706. 'skip': 'This live event has ended.',
  707. },
  708. {
  709. # Multifeed videos (multiple cameras), URL is for Main Camera
  710. 'url': 'https://www.youtube.com/watch?v=jvGDaLqkpTg',
  711. 'info_dict': {
  712. 'id': 'jvGDaLqkpTg',
  713. 'title': 'Tom Clancy Free Weekend Rainbow Whatever',
  714. 'description': 'md5:e03b909557865076822aa169218d6a5d',
  715. },
  716. 'playlist': [{
  717. 'info_dict': {
  718. 'id': 'jvGDaLqkpTg',
  719. 'ext': 'mp4',
  720. 'title': 'Tom Clancy Free Weekend Rainbow Whatever (Main Camera)',
  721. 'description': 'md5:e03b909557865076822aa169218d6a5d',
  722. 'duration': 10643,
  723. 'upload_date': '20161111',
  724. 'uploader': 'Team PGP',
  725. 'uploader_id': 'UChORY56LMMETTuGjXaJXvLg',
  726. 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UChORY56LMMETTuGjXaJXvLg',
  727. },
  728. }, {
  729. 'info_dict': {
  730. 'id': '3AKt1R1aDnw',
  731. 'ext': 'mp4',
  732. 'title': 'Tom Clancy Free Weekend Rainbow Whatever (Camera 2)',
  733. 'description': 'md5:e03b909557865076822aa169218d6a5d',
  734. 'duration': 10991,
  735. 'upload_date': '20161111',
  736. 'uploader': 'Team PGP',
  737. 'uploader_id': 'UChORY56LMMETTuGjXaJXvLg',
  738. 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UChORY56LMMETTuGjXaJXvLg',
  739. },
  740. }, {
  741. 'info_dict': {
  742. 'id': 'RtAMM00gpVc',
  743. 'ext': 'mp4',
  744. 'title': 'Tom Clancy Free Weekend Rainbow Whatever (Camera 3)',
  745. 'description': 'md5:e03b909557865076822aa169218d6a5d',
  746. 'duration': 10995,
  747. 'upload_date': '20161111',
  748. 'uploader': 'Team PGP',
  749. 'uploader_id': 'UChORY56LMMETTuGjXaJXvLg',
  750. 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UChORY56LMMETTuGjXaJXvLg',
  751. },
  752. }, {
  753. 'info_dict': {
  754. 'id': '6N2fdlP3C5U',
  755. 'ext': 'mp4',
  756. 'title': 'Tom Clancy Free Weekend Rainbow Whatever (Camera 4)',
  757. 'description': 'md5:e03b909557865076822aa169218d6a5d',
  758. 'duration': 10990,
  759. 'upload_date': '20161111',
  760. 'uploader': 'Team PGP',
  761. 'uploader_id': 'UChORY56LMMETTuGjXaJXvLg',
  762. 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UChORY56LMMETTuGjXaJXvLg',
  763. },
  764. }],
  765. 'params': {
  766. 'skip_download': True,
  767. },
  768. },
  769. {
  770. # Multifeed video with comma in title (see https://github.com/ytdl-org/youtube-dl/issues/8536)
  771. 'url': 'https://www.youtube.com/watch?v=gVfLd0zydlo',
  772. 'info_dict': {
  773. 'id': 'gVfLd0zydlo',
  774. 'title': 'DevConf.cz 2016 Day 2 Workshops 1 14:00 - 15:30',
  775. },
  776. 'playlist_count': 2,
  777. 'skip': 'Not multifeed anymore',
  778. },
  779. {
  780. 'url': 'https://vid.plus/FlRa-iH7PGw',
  781. 'only_matching': True,
  782. },
  783. {
  784. 'url': 'https://zwearz.com/watch/9lWxNJF-ufM/electra-woman-dyna-girl-official-trailer-grace-helbig.html',
  785. 'only_matching': True,
  786. },
  787. {
  788. # Title with JS-like syntax "};" (see https://github.com/ytdl-org/youtube-dl/issues/7468)
  789. # Also tests cut-off URL expansion in video description (see
  790. # https://github.com/ytdl-org/youtube-dl/issues/1892,
  791. # https://github.com/ytdl-org/youtube-dl/issues/8164)
  792. 'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg',
  793. 'info_dict': {
  794. 'id': 'lsguqyKfVQg',
  795. 'ext': 'mp4',
  796. 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21',
  797. 'alt_title': 'Dark Walk - Position Music',
  798. 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a',
  799. 'duration': 133,
  800. 'upload_date': '20151119',
  801. 'uploader_id': 'IronSoulElf',
  802. 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf',
  803. 'uploader': 'IronSoulElf',
  804. 'creator': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
  805. 'track': 'Dark Walk - Position Music',
  806. 'artist': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
  807. 'album': 'Position Music - Production Music Vol. 143 - Dark Walk',
  808. },
  809. 'params': {
  810. 'skip_download': True,
  811. },
  812. },
  813. {
  814. # Tags with '};' (see https://github.com/ytdl-org/youtube-dl/issues/7468)
  815. 'url': 'https://www.youtube.com/watch?v=Ms7iBXnlUO8',
  816. 'only_matching': True,
  817. },
  818. {
  819. # Video with yt:stretch=17:0
  820. 'url': 'https://www.youtube.com/watch?v=Q39EVAstoRM',
  821. 'info_dict': {
  822. 'id': 'Q39EVAstoRM',
  823. 'ext': 'mp4',
  824. 'title': 'Clash Of Clans#14 Dicas De Ataque Para CV 4',
  825. 'description': 'md5:ee18a25c350637c8faff806845bddee9',
  826. 'upload_date': '20151107',
  827. 'uploader_id': 'UCCr7TALkRbo3EtFzETQF1LA',
  828. 'uploader': 'CH GAMER DROID',
  829. },
  830. 'params': {
  831. 'skip_download': True,
  832. },
  833. 'skip': 'This video does not exist.',
  834. },
  835. {
  836. # Video licensed under Creative Commons
  837. 'url': 'https://www.youtube.com/watch?v=M4gD1WSo5mA',
  838. 'info_dict': {
  839. 'id': 'M4gD1WSo5mA',
  840. 'ext': 'mp4',
  841. 'title': 'md5:e41008789470fc2533a3252216f1c1d1',
  842. 'description': 'md5:a677553cf0840649b731a3024aeff4cc',
  843. 'duration': 721,
  844. 'upload_date': '20150127',
  845. 'uploader_id': 'BerkmanCenter',
  846. 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter',
  847. 'uploader': 'The Berkman Klein Center for Internet & Society',
  848. 'license': 'Creative Commons Attribution license (reuse allowed)',
  849. },
  850. 'params': {
  851. 'skip_download': True,
  852. },
  853. },
  854. {
  855. # Channel-like uploader_url
  856. 'url': 'https://www.youtube.com/watch?v=eQcmzGIKrzg',
  857. 'info_dict': {
  858. 'id': 'eQcmzGIKrzg',
  859. 'ext': 'mp4',
  860. 'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders',
  861. 'description': 'md5:13a2503d7b5904ef4b223aa101628f39',
  862. 'duration': 4060,
  863. 'upload_date': '20151119',
  864. 'uploader': 'Bernie Sanders',
  865. 'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg',
  866. 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg',
  867. 'license': 'Creative Commons Attribution license (reuse allowed)',
  868. },
  869. 'params': {
  870. 'skip_download': True,
  871. },
  872. },
  873. {
  874. 'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;amp;v=V36LpHqtcDY',
  875. 'only_matching': True,
  876. },
  877. {
  878. # YouTube Red paid video (https://github.com/ytdl-org/youtube-dl/issues/10059)
  879. 'url': 'https://www.youtube.com/watch?v=i1Ko8UG-Tdo',
  880. 'only_matching': True,
  881. },
  882. {
  883. # Rental video preview
  884. 'url': 'https://www.youtube.com/watch?v=yYr8q0y5Jfg',
  885. 'info_dict': {
  886. 'id': 'uGpuVWrhIzE',
  887. 'ext': 'mp4',
  888. 'title': 'Piku - Trailer',
  889. 'description': 'md5:c36bd60c3fd6f1954086c083c72092eb',
  890. 'upload_date': '20150811',
  891. 'uploader': 'FlixMatrix',
  892. 'uploader_id': 'FlixMatrixKaravan',
  893. 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/FlixMatrixKaravan',
  894. 'license': 'Standard YouTube License',
  895. },
  896. 'params': {
  897. 'skip_download': True,
  898. },
  899. 'skip': 'This video is not available.',
  900. },
  901. {
  902. # YouTube Red video with episode data
  903. 'url': 'https://www.youtube.com/watch?v=iqKdEhx-dD4',
  904. 'info_dict': {
  905. 'id': 'iqKdEhx-dD4',
  906. 'ext': 'mp4',
  907. 'title': 'Isolation - Mind Field (Ep 1)',
  908. 'description': 'md5:f540112edec5d09fc8cc752d3d4ba3cd',
  909. 'duration': 2085,
  910. 'upload_date': '20170118',
  911. 'uploader': 'Vsauce',
  912. 'uploader_id': 'Vsauce',
  913. 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Vsauce',
  914. 'series': 'Mind Field',
  915. 'season_number': 1,
  916. 'episode_number': 1,
  917. },
  918. 'params': {
  919. 'skip_download': True,
  920. },
  921. 'expected_warnings': [
  922. 'Skipping DASH manifest',
  923. ],
  924. },
  925. {
  926. # The following content has been identified by the YouTube community
  927. # as inappropriate or offensive to some audiences.
  928. 'url': 'https://www.youtube.com/watch?v=6SJNVb0GnPI',
  929. 'info_dict': {
  930. 'id': '6SJNVb0GnPI',
  931. 'ext': 'mp4',
  932. 'title': 'Race Differences in Intelligence',
  933. 'description': 'md5:5d161533167390427a1f8ee89a1fc6f1',
  934. 'duration': 965,
  935. 'upload_date': '20140124',
  936. 'uploader': 'New Century Foundation',
  937. 'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg',
  938. 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg',
  939. },
  940. 'params': {
  941. 'skip_download': True,
  942. },
  943. 'skip': 'This video has been removed for violating YouTube\'s policy on hate speech.',
  944. },
  945. {
  946. # itag 212
  947. 'url': '1t24XAntNCY',
  948. 'only_matching': True,
  949. },
  950. {
  951. # geo restricted to JP
  952. 'url': 'sJL6WA-aGkQ',
  953. 'only_matching': True,
  954. },
  955. {
  956. 'url': 'https://invidio.us/watch?v=BaW_jenozKc',
  957. 'only_matching': True,
  958. },
  959. {
  960. # DRM protected
  961. 'url': 'https://www.youtube.com/watch?v=s7_qI6_mIXc',
  962. 'only_matching': True,
  963. },
  964. {
  965. # Video with unsupported adaptive stream type formats
  966. 'url': 'https://www.youtube.com/watch?v=Z4Vy8R84T1U',
  967. 'info_dict': {
  968. 'id': 'Z4Vy8R84T1U',
  969. 'ext': 'mp4',
  970. 'title': 'saman SMAN 53 Jakarta(Sancety) opening COFFEE4th at SMAN 53 Jakarta',
  971. 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
  972. 'duration': 433,
  973. 'upload_date': '20130923',
  974. 'uploader': 'Amelia Putri Harwita',
  975. 'uploader_id': 'UCpOxM49HJxmC1qCalXyB3_Q',
  976. 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCpOxM49HJxmC1qCalXyB3_Q',
  977. 'formats': 'maxcount:10',
  978. },
  979. 'params': {
  980. 'skip_download': True,
  981. 'youtube_include_dash_manifest': False,
  982. },
  983. 'skip': 'not actual anymore',
  984. },
  985. {
  986. # Youtube Music Auto-generated description
  987. 'url': 'https://music.youtube.com/watch?v=MgNrAu2pzNs',
  988. 'info_dict': {
  989. 'id': 'MgNrAu2pzNs',
  990. 'ext': 'mp4',
  991. 'title': 'Voyeur Girl',
  992. 'description': 'md5:7ae382a65843d6df2685993e90a8628f',
  993. 'upload_date': '20190312',
  994. 'uploader': 'Stephen - Topic',
  995. 'uploader_id': 'UC-pWHpBjdGG69N9mM2auIAA',
  996. 'artist': 'Stephen',
  997. 'track': 'Voyeur Girl',
  998. 'album': 'it\'s too much love to know my dear',
  999. 'release_date': '20190313',
  1000. 'release_year': 2019,
  1001. },
  1002. 'params': {
  1003. 'skip_download': True,
  1004. },
  1005. },
  1006. {
  1007. 'url': 'https://www.youtubekids.com/watch?v=3b8nCWDgZ6Q',
  1008. 'only_matching': True,
  1009. },
  1010. {
  1011. # invalid -> valid video id redirection
  1012. 'url': 'DJztXj2GPfl',
  1013. 'info_dict': {
  1014. 'id': 'DJztXj2GPfk',
  1015. 'ext': 'mp4',
  1016. 'title': 'Panjabi MC - Mundian To Bach Ke (The Dictator Soundtrack)',
  1017. 'description': 'md5:bf577a41da97918e94fa9798d9228825',
  1018. 'upload_date': '20090125',
  1019. 'uploader': 'Prochorowka',
  1020. 'uploader_id': 'Prochorowka',
  1021. 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Prochorowka',
  1022. 'artist': 'Panjabi MC',
  1023. 'track': 'Beware of the Boys (Mundian to Bach Ke) - Motivo Hi-Lectro Remix',
  1024. 'album': 'Beware of the Boys (Mundian To Bach Ke)',
  1025. },
  1026. 'params': {
  1027. 'skip_download': True,
  1028. },
  1029. 'skip': 'Video unavailable',
  1030. },
  1031. {
  1032. # empty description results in an empty string
  1033. 'url': 'https://www.youtube.com/watch?v=x41yOUIvK2k',
  1034. 'info_dict': {
  1035. 'id': 'x41yOUIvK2k',
  1036. 'ext': 'mp4',
  1037. 'title': 'IMG 3456',
  1038. 'description': '',
  1039. 'upload_date': '20170613',
  1040. 'uploader_id': 'ElevageOrVert',
  1041. 'uploader': 'ElevageOrVert',
  1042. },
  1043. 'params': {
  1044. 'skip_download': True,
  1045. },
  1046. },
  1047. {
  1048. # with '};' inside yt initial data (see [1])
  1049. # see [2] for an example with '};' inside ytInitialPlayerResponse
  1050. # 1. https://github.com/ytdl-org/youtube-dl/issues/27093
  1051. # 2. https://github.com/ytdl-org/youtube-dl/issues/27216
  1052. 'url': 'https://www.youtube.com/watch?v=CHqg6qOn4no',
  1053. 'info_dict': {
  1054. 'id': 'CHqg6qOn4no',
  1055. 'ext': 'mp4',
  1056. 'title': 'Part 77 Sort a list of simple types in c#',
  1057. 'description': 'md5:b8746fa52e10cdbf47997903f13b20dc',
  1058. 'upload_date': '20130831',
  1059. 'uploader_id': 'kudvenkat',
  1060. 'uploader': 'kudvenkat',
  1061. },
  1062. 'params': {
  1063. 'skip_download': True,
  1064. },
  1065. },
  1066. {
  1067. # another example of '};' in ytInitialData
  1068. 'url': 'https://www.youtube.com/watch?v=gVfgbahppCY',
  1069. 'only_matching': True,
  1070. },
  1071. {
  1072. 'url': 'https://www.youtube.com/watch_popup?v=63RmMXCd_bQ',
  1073. 'only_matching': True,
  1074. },
  1075. {
  1076. # https://github.com/ytdl-org/youtube-dl/pull/28094
  1077. 'url': 'OtqTfy26tG0',
  1078. 'info_dict': {
  1079. 'id': 'OtqTfy26tG0',
  1080. 'ext': 'mp4',
  1081. 'title': 'Burn Out',
  1082. 'description': 'md5:8d07b84dcbcbfb34bc12a56d968b6131',
  1083. 'upload_date': '20141120',
  1084. 'uploader': 'The Cinematic Orchestra - Topic',
  1085. 'uploader_id': 'UCIzsJBIyo8hhpFm1NK0uLgw',
  1086. 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCIzsJBIyo8hhpFm1NK0uLgw',
  1087. 'artist': 'The Cinematic Orchestra',
  1088. 'track': 'Burn Out',
  1089. 'album': 'Every Day',
  1090. 'release_data': None,
  1091. 'release_year': None,
  1092. },
  1093. 'params': {
  1094. 'skip_download': True,
  1095. },
  1096. },
  1097. ]
  1098. def __init__(self, *args, **kwargs):
  1099. super(YoutubeIE, self).__init__(*args, **kwargs)
  1100. self._code_cache = {}
  1101. self._player_cache = {}
  1102. def _signature_cache_id(self, example_sig):
  1103. """ Return a string representation of a signature """
  1104. return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
  1105. @classmethod
  1106. def _extract_player_info(cls, player_url):
  1107. for player_re in cls._PLAYER_INFO_RE:
  1108. id_m = re.search(player_re, player_url)
  1109. if id_m:
  1110. break
  1111. else:
  1112. raise ExtractorError('Cannot identify player %r' % player_url)
  1113. return id_m.group('id')
  1114. def _extract_signature_function(self, video_id, player_url, example_sig):
  1115. player_id = self._extract_player_info(player_url)
  1116. # Read from filesystem cache
  1117. func_id = 'js_%s_%s' % (
  1118. player_id, self._signature_cache_id(example_sig))
  1119. assert os.path.basename(func_id) == func_id
  1120. cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
  1121. if cache_spec is not None:
  1122. return lambda s: ''.join(s[i] for i in cache_spec)
  1123. if player_id not in self._code_cache:
  1124. self._code_cache[player_id] = self._download_webpage(
  1125. player_url, video_id,
  1126. note='Downloading player ' + player_id,
  1127. errnote='Download of %s failed' % player_url)
  1128. code = self._code_cache[player_id]
  1129. res = self._parse_sig_js(code)
  1130. test_string = ''.join(map(compat_chr, range(len(example_sig))))
  1131. cache_res = res(test_string)
  1132. cache_spec = [ord(c) for c in cache_res]
  1133. self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
  1134. return res
  1135. def _print_sig_code(self, func, example_sig):
  1136. def gen_sig_code(idxs):
  1137. def _genslice(start, end, step):
  1138. starts = '' if start == 0 else str(start)
  1139. ends = (':%d' % (end + step)) if end + step >= 0 else ':'
  1140. steps = '' if step == 1 else (':%d' % step)
  1141. return 's[%s%s%s]' % (starts, ends, steps)
  1142. step = None
  1143. # Quelch pyflakes warnings - start will be set when step is set
  1144. start = '(Never used)'
  1145. for i, prev in zip(idxs[1:], idxs[:-1]):
  1146. if step is not None:
  1147. if i - prev == step:
  1148. continue
  1149. yield _genslice(start, prev, step)
  1150. step = None
  1151. continue
  1152. if i - prev in [-1, 1]:
  1153. step = i - prev
  1154. start = prev
  1155. continue
  1156. else:
  1157. yield 's[%d]' % prev
  1158. if step is None:
  1159. yield 's[%d]' % i
  1160. else:
  1161. yield _genslice(start, i, step)
  1162. test_string = ''.join(map(compat_chr, range(len(example_sig))))
  1163. cache_res = func(test_string)
  1164. cache_spec = [ord(c) for c in cache_res]
  1165. expr_code = ' + '.join(gen_sig_code(cache_spec))
  1166. signature_id_tuple = '(%s)' % (
  1167. ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
  1168. code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
  1169. ' return %s\n') % (signature_id_tuple, expr_code)
  1170. self.to_screen('Extracted signature function:\n' + code)
  1171. def _parse_sig_js(self, jscode):
  1172. funcname = self._search_regex(
  1173. (r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
  1174. r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
  1175. r'\bm=(?P<sig>[a-zA-Z0-9$]{2})\(decodeURIComponent\(h\.s\)\)',
  1176. r'\bc&&\(c=(?P<sig>[a-zA-Z0-9$]{2})\(decodeURIComponent\(c\)\)',
  1177. r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\);[a-zA-Z0-9$]{2}\.[a-zA-Z0-9$]{2}\(a,\d+\)',
  1178. r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
  1179. r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
  1180. # Obsolete patterns
  1181. r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
  1182. r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(',
  1183. r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[a-zA-Z0-9$]+)\(',
  1184. r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
  1185. r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
  1186. r'\bc\s*&&\s*a\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
  1187. r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
  1188. r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\('),
  1189. jscode, 'Initial JS player signature function name', group='sig')
  1190. jsi = JSInterpreter(jscode)
  1191. initial_function = jsi.extract_function(funcname)
  1192. return lambda s: initial_function([s])
  1193. def _decrypt_signature(self, s, video_id, player_url):
  1194. """Turn the encrypted s field into a working signature"""
  1195. if player_url is None:
  1196. raise ExtractorError('Cannot decrypt signature without player_url')
  1197. if player_url.startswith('//'):
  1198. player_url = 'https:' + player_url
  1199. elif not re.match(r'https?://', player_url):
  1200. player_url = compat_urlparse.urljoin(
  1201. 'https://www.youtube.com', player_url)
  1202. try:
  1203. player_id = (player_url, self._signature_cache_id(s))
  1204. if player_id not in self._player_cache:
  1205. func = self._extract_signature_function(
  1206. video_id, player_url, s
  1207. )
  1208. self._player_cache[player_id] = func
  1209. func = self._player_cache[player_id]
  1210. if self._downloader.params.get('youtube_print_sig_code'):
  1211. self._print_sig_code(func, s)
  1212. return func(s)
  1213. except Exception as e:
  1214. tb = traceback.format_exc()
  1215. raise ExtractorError(
  1216. 'Signature extraction failed: ' + tb, cause=e)
  1217. def _mark_watched(self, video_id, player_response):
  1218. playback_url = url_or_none(try_get(
  1219. player_response,
  1220. lambda x: x['playbackTracking']['videostatsPlaybackUrl']['baseUrl']))
  1221. if not playback_url:
  1222. return
  1223. parsed_playback_url = compat_urlparse.urlparse(playback_url)
  1224. qs = compat_urlparse.parse_qs(parsed_playback_url.query)
  1225. # cpn generation algorithm is reverse engineered from base.js.
  1226. # In fact it works even with dummy cpn.
  1227. CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_'
  1228. cpn = ''.join((CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16)))
  1229. qs.update({
  1230. 'ver': ['2'],
  1231. 'cpn': [cpn],
  1232. })
  1233. playback_url = compat_urlparse.urlunparse(
  1234. parsed_playback_url._replace(query=compat_urllib_parse_urlencode(qs, True)))
  1235. self._download_webpage(
  1236. playback_url, video_id, 'Marking watched',
  1237. 'Unable to mark watched', fatal=False)
  1238. @staticmethod
  1239. def _extract_urls(webpage):
  1240. # Embedded YouTube player
  1241. entries = [
  1242. unescapeHTML(mobj.group('url'))
  1243. for mobj in re.finditer(r'''(?x)
  1244. (?:
  1245. <iframe[^>]+?src=|
  1246. data-video-url=|
  1247. <embed[^>]+?src=|
  1248. embedSWF\(?:\s*|
  1249. <object[^>]+data=|
  1250. new\s+SWFObject\(
  1251. )
  1252. (["\'])
  1253. (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
  1254. (?:embed|v|p)/[0-9A-Za-z_-]{11}.*?)
  1255. \1''', webpage)]
  1256. # lazyYT YouTube embed
  1257. entries.extend(list(map(
  1258. unescapeHTML,
  1259. re.findall(r'class="lazyYT" data-youtube-id="([^"]+)"', webpage))))
  1260. # Wordpress "YouTube Video Importer" plugin
  1261. matches = re.findall(r'''(?x)<div[^>]+
  1262. class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+
  1263. data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage)
  1264. entries.extend(m[-1] for m in matches)
  1265. return entries
  1266. @staticmethod
  1267. def _extract_url(webpage):
  1268. urls = YoutubeIE._extract_urls(webpage)
  1269. return urls[0] if urls else None
  1270. @classmethod
  1271. def extract_id(cls, url):
  1272. mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
  1273. if mobj is None:
  1274. raise ExtractorError('Invalid URL: %s' % url)
  1275. video_id = mobj.group(2)
  1276. return video_id
  1277. def _extract_chapters_from_json(self, data, video_id, duration):
  1278. chapters_list = try_get(
  1279. data,
  1280. lambda x: x['playerOverlays']
  1281. ['playerOverlayRenderer']
  1282. ['decoratedPlayerBarRenderer']
  1283. ['decoratedPlayerBarRenderer']
  1284. ['playerBar']
  1285. ['chapteredPlayerBarRenderer']
  1286. ['chapters'],
  1287. list)
  1288. if not chapters_list:
  1289. return
  1290. def chapter_time(chapter):
  1291. return float_or_none(
  1292. try_get(
  1293. chapter,
  1294. lambda x: x['chapterRenderer']['timeRangeStartMillis'],
  1295. int),
  1296. scale=1000)
  1297. chapters = []
  1298. for next_num, chapter in enumerate(chapters_list, start=1):
  1299. start_time = chapter_time(chapter)
  1300. if start_time is None:
  1301. continue
  1302. end_time = (chapter_time(chapters_list[next_num])
  1303. if next_num < len(chapters_list) else duration)
  1304. if end_time is None:
  1305. continue
  1306. title = try_get(
  1307. chapter, lambda x: x['chapterRenderer']['title']['simpleText'],
  1308. compat_str)
  1309. chapters.append({
  1310. 'start_time': start_time,
  1311. 'end_time': end_time,
  1312. 'title': title,
  1313. })
  1314. return chapters
  1315. def _extract_yt_initial_variable(self, webpage, regex, video_id, name):
  1316. return self._parse_json(self._search_regex(
  1317. (r'%s\s*%s' % (regex, self._YT_INITIAL_BOUNDARY_RE),
  1318. regex), webpage, name, default='{}'), video_id, fatal=False)
  1319. def _real_extract(self, url):
  1320. url, smuggled_data = unsmuggle_url(url, {})
  1321. video_id = self._match_id(url)
  1322. base_url = self.http_scheme() + '//www.youtube.com/'
  1323. webpage_url = base_url + 'watch?v=' + video_id
  1324. webpage = self._download_webpage(webpage_url, video_id, fatal=False)
  1325. player_response = None
  1326. if webpage:
  1327. player_response = self._extract_yt_initial_variable(
  1328. webpage, self._YT_INITIAL_PLAYER_RESPONSE_RE,
  1329. video_id, 'initial player response')
  1330. if not player_response:
  1331. player_response = self._call_api(
  1332. 'player', {'videoId': video_id}, video_id)
  1333. playability_status = player_response.get('playabilityStatus') or {}
  1334. if playability_status.get('reason') == 'Sign in to confirm your age':
  1335. pr = self._parse_json(try_get(compat_parse_qs(
  1336. self._download_webpage(
  1337. base_url + 'get_video_info', video_id,
  1338. 'Refetching age-gated info webpage',
  1339. 'unable to download video info webpage', query={
  1340. 'video_id': video_id,
  1341. 'eurl': 'https://www.youtube.com/embed/' + video_id,
  1342. }, fatal=False)),
  1343. lambda x: x['player_response'][0],
  1344. compat_str) or '{}', video_id)
  1345. if pr:
  1346. player_response = pr
  1347. trailer_video_id = try_get(
  1348. playability_status,
  1349. lambda x: x['errorScreen']['playerLegacyDesktopYpcTrailerRenderer']['trailerVideoId'],
  1350. compat_str)
  1351. if trailer_video_id:
  1352. return self.url_result(
  1353. trailer_video_id, self.ie_key(), trailer_video_id)
  1354. def get_text(x):
  1355. if not x:
  1356. return
  1357. return x.get('simpleText') or ''.join([r['text'] for r in x['runs']])
  1358. search_meta = (
  1359. lambda x: self._html_search_meta(x, webpage, default=None)) \
  1360. if webpage else lambda x: None
  1361. video_details = player_response.get('videoDetails') or {}
  1362. microformat = try_get(
  1363. player_response,
  1364. lambda x: x['microformat']['playerMicroformatRenderer'],
  1365. dict) or {}
  1366. video_title = video_details.get('title') \
  1367. or get_text(microformat.get('title')) \
  1368. or search_meta(['og:title', 'twitter:title', 'title'])
  1369. video_description = video_details.get('shortDescription')
  1370. if not smuggled_data.get('force_singlefeed', False):
  1371. if not self._downloader.params.get('noplaylist'):
  1372. multifeed_metadata_list = try_get(
  1373. player_response,
  1374. lambda x: x['multicamera']['playerLegacyMulticameraRenderer']['metadataList'],
  1375. compat_str)
  1376. if multifeed_metadata_list:
  1377. entries = []
  1378. feed_ids = []
  1379. for feed in multifeed_metadata_list.split(','):
  1380. # Unquote should take place before split on comma (,) since textual
  1381. # fields may contain comma as well (see
  1382. # https://github.com/ytdl-org/youtube-dl/issues/8536)
  1383. feed_data = compat_parse_qs(
  1384. compat_urllib_parse_unquote_plus(feed))
  1385. def feed_entry(name):
  1386. return try_get(
  1387. feed_data, lambda x: x[name][0], compat_str)
  1388. feed_id = feed_entry('id')
  1389. if not feed_id:
  1390. continue
  1391. feed_title = feed_entry('title')
  1392. title = video_title
  1393. if feed_title:
  1394. title += ' (%s)' % feed_title
  1395. entries.append({
  1396. '_type': 'url_transparent',
  1397. 'ie_key': 'Youtube',
  1398. 'url': smuggle_url(
  1399. base_url + 'watch?v=' + feed_data['id'][0],
  1400. {'force_singlefeed': True}),
  1401. 'title': title,
  1402. })
  1403. feed_ids.append(feed_id)
  1404. self.to_screen(
  1405. 'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
  1406. % (', '.join(feed_ids), video_id))
  1407. return self.playlist_result(
  1408. entries, video_id, video_title, video_description)
  1409. else:
  1410. self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
  1411. formats = []
  1412. itags = []
  1413. itag_qualities = {}
  1414. player_url = None
  1415. # TODO: Enable this after fixing formatSort
  1416. # q = qualities(['tiny', 'small', 'medium', 'large', 'hd720', 'hd1080', 'hd1440', 'hd2160', 'hd2880', 'highres'])
  1417. streaming_data = player_response.get('streamingData') or {}
  1418. streaming_formats = streaming_data.get('formats') or []
  1419. streaming_formats.extend(streaming_data.get('adaptiveFormats') or [])
  1420. for fmt in streaming_formats:
  1421. if fmt.get('targetDurationSec') or fmt.get('drmFamilies'):
  1422. continue
  1423. itag = str_or_none(fmt.get('itag'))
  1424. quality = fmt.get('quality')
  1425. if itag and quality:
  1426. itag_qualities[itag] = quality
  1427. # FORMAT_STREAM_TYPE_OTF(otf=1) requires downloading the init fragment
  1428. # (adding `&sq=0` to the URL) and parsing emsg box to determine the
  1429. # number of fragment that would subsequently requested with (`&sq=N`)
  1430. if fmt.get('type') == 'FORMAT_STREAM_TYPE_OTF':
  1431. continue
  1432. fmt_url = fmt.get('url')
  1433. if not fmt_url:
  1434. sc = compat_parse_qs(fmt.get('signatureCipher'))
  1435. fmt_url = url_or_none(try_get(sc, lambda x: x['url'][0]))
  1436. encrypted_sig = try_get(sc, lambda x: x['s'][0])
  1437. if not (sc and fmt_url and encrypted_sig):
  1438. continue
  1439. if not player_url:
  1440. if not webpage:
  1441. continue
  1442. player_url = self._search_regex(
  1443. r'"(?:PLAYER_JS_URL|jsUrl)"\s*:\s*"([^"]+)"',
  1444. webpage, 'player URL', fatal=False)
  1445. if not player_url:
  1446. continue
  1447. signature = self._decrypt_signature(sc['s'][0], video_id, player_url)
  1448. sp = try_get(sc, lambda x: x['sp'][0]) or 'signature'
  1449. fmt_url += '&' + sp + '=' + signature
  1450. if itag:
  1451. itags.append(itag)
  1452. tbr = float_or_none(
  1453. fmt.get('averageBitrate') or fmt.get('bitrate'), 1000)
  1454. dct = {
  1455. 'asr': int_or_none(fmt.get('audioSampleRate')),
  1456. 'filesize': int_or_none(fmt.get('contentLength')),
  1457. 'format_id': itag,
  1458. 'format_note': fmt.get('qualityLabel') or quality,
  1459. 'fps': int_or_none(fmt.get('fps')),
  1460. 'height': int_or_none(fmt.get('height')),
  1461. # 'quality': q(quality), # TODO: Enable this after fixing formatSort
  1462. 'tbr': tbr,
  1463. 'url': fmt_url,
  1464. 'width': fmt.get('width'),
  1465. }
  1466. mimetype = fmt.get('mimeType')
  1467. if mimetype:
  1468. mobj = re.match(
  1469. r'((?:[^/]+)/(?:[^;]+))(?:;\s*codecs="([^"]+)")?', mimetype)
  1470. if mobj:
  1471. dct['ext'] = mimetype2ext(mobj.group(1))
  1472. dct.update(parse_codecs(mobj.group(2)))
  1473. no_audio = dct.get('acodec') == 'none'
  1474. no_video = dct.get('vcodec') == 'none'
  1475. if no_audio:
  1476. dct['vbr'] = tbr
  1477. if no_video:
  1478. dct['abr'] = tbr
  1479. if no_audio or no_video:
  1480. dct['downloader_options'] = {
  1481. # Youtube throttles chunks >~10M
  1482. 'http_chunk_size': 10485760,
  1483. }
  1484. formats.append(dct)
  1485. hls_manifest_url = streaming_data.get('hlsManifestUrl')
  1486. if hls_manifest_url:
  1487. for f in self._extract_m3u8_formats(
  1488. hls_manifest_url, video_id, 'mp4', fatal=False):
  1489. itag = self._search_regex(
  1490. r'/itag/(\d+)', f['url'], 'itag', default=None)
  1491. if itag:
  1492. f['format_id'] = itag
  1493. formats.append(f)
  1494. if self._downloader.params.get('youtube_include_dash_manifest'):
  1495. dash_manifest_url = streaming_data.get('dashManifestUrl')
  1496. if dash_manifest_url:
  1497. for f in self._extract_mpd_formats(
  1498. dash_manifest_url, video_id, fatal=False):
  1499. itag = f['format_id']
  1500. if itag in itags:
  1501. continue
  1502. # if itag in itag_qualities: # TODO: Enable this after fixing formatSort
  1503. # f['quality'] = q(itag_qualities[itag])
  1504. filesize = int_or_none(self._search_regex(
  1505. r'/clen/(\d+)', f.get('fragment_base_url')
  1506. or f['url'], 'file size', default=None))
  1507. if filesize:
  1508. f['filesize'] = filesize
  1509. formats.append(f)
  1510. if not formats:
  1511. if streaming_data.get('licenseInfos'):
  1512. raise ExtractorError(
  1513. 'This video is DRM protected.', expected=True)
  1514. pemr = try_get(
  1515. playability_status,
  1516. lambda x: x['errorScreen']['playerErrorMessageRenderer'],
  1517. dict) or {}
  1518. reason = get_text(pemr.get('reason')) or playability_status.get('reason')
  1519. subreason = pemr.get('subreason')
  1520. if subreason:
  1521. subreason = clean_html(get_text(subreason))
  1522. if subreason == 'The uploader has not made this video available in your country.':
  1523. countries = microformat.get('availableCountries')
  1524. if not countries:
  1525. regions_allowed = search_meta('regionsAllowed')
  1526. countries = regions_allowed.split(',') if regions_allowed else None
  1527. self.raise_geo_restricted(
  1528. subreason, countries)
  1529. reason += '\n' + subreason
  1530. if reason:
  1531. raise ExtractorError(reason, expected=True)
  1532. self._sort_formats(formats)
  1533. keywords = video_details.get('keywords') or []
  1534. if not keywords and webpage:
  1535. keywords = [
  1536. unescapeHTML(m.group('content'))
  1537. for m in re.finditer(self._meta_regex('og:video:tag'), webpage)]
  1538. for keyword in keywords:
  1539. if keyword.startswith('yt:stretch='):
  1540. w, h = keyword.split('=')[1].split(':')
  1541. w, h = int(w), int(h)
  1542. if w > 0 and h > 0:
  1543. ratio = w / h
  1544. for f in formats:
  1545. if f.get('vcodec') != 'none':
  1546. f['stretched_ratio'] = ratio
  1547. thumbnails = []
  1548. for container in (video_details, microformat):
  1549. for thumbnail in (try_get(
  1550. container,
  1551. lambda x: x['thumbnail']['thumbnails'], list) or []):
  1552. thumbnail_url = thumbnail.get('url')
  1553. if not thumbnail_url:
  1554. continue
  1555. thumbnails.append({
  1556. 'height': int_or_none(thumbnail.get('height')),
  1557. 'url': thumbnail_url,
  1558. 'width': int_or_none(thumbnail.get('width')),
  1559. })
  1560. if thumbnails:
  1561. break
  1562. else:
  1563. thumbnail = search_meta(['og:image', 'twitter:image'])
  1564. if thumbnail:
  1565. thumbnails = [{'url': thumbnail}]
  1566. category = microformat.get('category') or search_meta('genre')
  1567. channel_id = video_details.get('channelId') \
  1568. or microformat.get('externalChannelId') \
  1569. or search_meta('channelId')
  1570. duration = int_or_none(
  1571. video_details.get('lengthSeconds')
  1572. or microformat.get('lengthSeconds')) \
  1573. or parse_duration(search_meta('duration'))
  1574. is_live = video_details.get('isLive')
  1575. owner_profile_url = microformat.get('ownerProfileUrl')
  1576. info = {
  1577. 'id': video_id,
  1578. 'title': self._live_title(video_title) if is_live else video_title,
  1579. 'formats': formats,
  1580. 'thumbnails': thumbnails,
  1581. 'description': video_description,
  1582. 'upload_date': unified_strdate(
  1583. microformat.get('uploadDate')
  1584. or search_meta('uploadDate')),
  1585. 'uploader': video_details['author'],
  1586. 'uploader_id': self._search_regex(r'/(?:channel|user)/([^/?&#]+)', owner_profile_url, 'uploader id') if owner_profile_url else None,
  1587. 'uploader_url': owner_profile_url,
  1588. 'channel_id': channel_id,
  1589. 'channel_url': 'https://www.youtube.com/channel/' + channel_id if channel_id else None,
  1590. 'duration': duration,
  1591. 'view_count': int_or_none(
  1592. video_details.get('viewCount')
  1593. or microformat.get('viewCount')
  1594. or search_meta('interactionCount')),
  1595. 'average_rating': float_or_none(video_details.get('averageRating')),
  1596. 'age_limit': 18 if (
  1597. microformat.get('isFamilySafe') is False
  1598. or search_meta('isFamilyFriendly') == 'false'
  1599. or search_meta('og:restrictions:age') == '18+') else 0,
  1600. 'webpage_url': webpage_url,
  1601. 'categories': [category] if category else None,
  1602. 'tags': keywords,
  1603. 'is_live': is_live,
  1604. 'playable_in_embed': playability_status.get('playableInEmbed'),
  1605. }
  1606. pctr = try_get(
  1607. player_response,
  1608. lambda x: x['captions']['playerCaptionsTracklistRenderer'], dict)
  1609. subtitles = {}
  1610. if pctr:
  def process_language(container, base_url, lang_code, query):
      """Store one subtitle entry per supported format for a language.

      For every format in ``self._SUBTITLE_FORMATS`` (``self`` comes from
      the enclosing scope) a dict with the keys ``ext`` and ``url`` is
      built; the URL is ``base_url`` updated with ``query`` plus
      ``fmt=<format>``.  The resulting list is assigned to
      ``container[lang_code]``, replacing any previous value.

      container -- dict keyed by language code; modified in place
      base_url  -- URL the per-format URLs are derived from
      lang_code -- key under which the entries are stored
      query     -- extra query parameters; left unmodified
      """
      lang_subs = []
      for fmt in self._SUBTITLE_FORMATS:
          # Build the per-format query on a copy so that the dict passed in
          # by the caller is not mutated as a side effect.
          fmt_query = dict(query)
          fmt_query['fmt'] = fmt
          lang_subs.append({
              'ext': fmt,
              'url': update_url_query(base_url, fmt_query),
          })
      container[lang_code] = lang_subs
  1622. for caption_track in (pctr.get('captionTracks') or []):
  1623. base_url = caption_track.get('baseUrl')
  1624. if not base_url:
  1625. continue
  1626. if caption_track.get('kind') != 'asr':
  1627. lang_code = caption_track.get('languageCode')
  1628. if not lang_code:
  1629. continue
  1630. process_language(
  1631. subtitles, base_url, lang_code, {})
  1632. continue
  1633. automatic_captions = {}
  1634. for translation_language in (pctr.get('translationLanguages') or []):
  1635. translation_language_code = translation_language.get('languageCode')
  1636. if not translation_language_code:
  1637. continue
  1638. process_language(
  1639. automatic_captions, base_url, translation_language_code,
  1640. {'tlang': translation_language_code})
  1641. info['automatic_captions'] = automatic_captions
  1642. info['subtitles'] = subtitles
  1643. parsed_url = compat_urllib_parse_urlparse(url)
  1644. for component in [parsed_url.fragment, parsed_url.query]:
  1645. query = compat_parse_qs(component)
  1646. for k, v in query.items():
  1647. for d_k, s_ks in [('start', ('start', 't')), ('end', ('end',))]:
  1648. d_k += '_time'
  1649. if d_k not in info and k in s_ks:
  1650. info[d_k] = parse_duration(query[k][0])
  1651. # Youtube Music Auto-generated description
  1652. if video_description:
  1653. mobj = re.search(r'(?s)(?P<track>[^·\n]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?.+\nAuto-generated by YouTube\.\s*$', video_description)
  1654. if mobj:
  1655. release_year = mobj.group('release_year')
  1656. release_date = mobj.group('release_date')
  1657. if release_date:
  1658. release_date = release_date.replace('-', '')
  1659. if not release_year:
  1660. release_year = release_date[:4]
  1661. info.update({
  1662. 'album': mobj.group('album'.strip()),
  1663. 'artist': mobj.group('clean_artist') or ', '.join(a.strip() for a in mobj.group('artist').split('·')),
  1664. 'track': mobj.group('track').strip(),
  1665. 'release_date': release_date,
  1666. 'release_year': int_or_none(release_year),
  1667. })
  1668. initial_data = None
  1669. if webpage:
  1670. initial_data = self._extract_yt_initial_variable(
  1671. webpage, self._YT_INITIAL_DATA_RE, video_id,
  1672. 'yt initial data')
  1673. if not initial_data:
  1674. initial_data = self._call_api(
  1675. 'next', {'videoId': video_id}, video_id, fatal=False)
  1676. if not is_live:
  1677. try:
  1678. # This will error if there is no livechat
  1679. initial_data['contents']['twoColumnWatchNextResults']['conversationBar']['liveChatRenderer']['continuations'][0]['reloadContinuationData']['continuation']
  1680. info['subtitles']['live_chat'] = [{
  1681. 'video_id': video_id,
  1682. 'ext': 'json',
  1683. 'protocol': 'youtube_live_chat_replay',
  1684. }]
  1685. except (KeyError, IndexError, TypeError):
  1686. pass
  1687. if initial_data:
  1688. chapters = self._extract_chapters_from_json(
  1689. initial_data, video_id, duration)
  1690. if not chapters:
  1691. for engagment_pannel in (initial_data.get('engagementPanels') or []):
  1692. contents = try_get(
  1693. engagment_pannel, lambda x: x['engagementPanelSectionListRenderer']['content']['macroMarkersListRenderer']['contents'],
  1694. list)
  1695. if not contents:
  1696. continue
  def chapter_time(mmlir):
      """Return the time parsed from a marker item's 'timeDescription'.

      The text of ``mmlir['timeDescription']`` is extracted with get_text()
      and handed to parse_duration().

      Returns None when `mmlir` is not a dict.  The caller passes the result
      of a try_get() lookup for the following item, which may not be a dict
      when that item has no 'macroMarkersListItemRenderer'; returning None
      lets the caller's `is None` check skip the chapter instead of failing
      with AttributeError.
      """
      if not isinstance(mmlir, dict):
          return None
      return parse_duration(get_text(mmlir.get('timeDescription')))
  1700. chapters = []
  1701. for next_num, content in enumerate(contents, start=1):
  1702. mmlir = content.get('macroMarkersListItemRenderer') or {}
  1703. start_time = chapter_time(mmlir)
  1704. end_time = chapter_time(try_get(
  1705. contents, lambda x: x[next_num]['macroMarkersListItemRenderer'])) \
  1706. if next_num < len(contents) else duration
  1707. if start_time is None or end_time is None:
  1708. continue
  1709. chapters.append({
  1710. 'start_time': start_time,
  1711. 'end_time': end_time,
  1712. 'title': get_text(mmlir.get('title')),
  1713. })
  1714. if chapters:
  1715. break
  1716. if chapters:
  1717. info['chapters'] = chapters
  1718. contents = try_get(
  1719. initial_data,
  1720. lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents'],
  1721. list) or []
  1722. for content in contents:
  1723. vpir = content.get('videoPrimaryInfoRenderer')
  1724. if vpir:
  1725. stl = vpir.get('superTitleLink')
  1726. if stl:
  1727. stl = get_text(stl)
  1728. if try_get(
  1729. vpir,
  1730. lambda x: x['superTitleIcon']['iconType']) == 'LOCATION_PIN':
  1731. info['location'] = stl
  1732. else:
  1733. mobj = re.search(r'(.+?)\s*S(\d+)\s*•\s*E(\d+)', stl)
  1734. if mobj:
  1735. info.update({
  1736. 'series': mobj.group(1),
  1737. 'season_number': int(mobj.group(2)),
  1738. 'episode_number': int(mobj.group(3)),
  1739. })
  1740. for tlb in (try_get(
  1741. vpir,
  1742. lambda x: x['videoActions']['menuRenderer']['topLevelButtons'],
  1743. list) or []):
  1744. tbr = tlb.get('toggleButtonRenderer') or {}
  1745. for getter, regex in [(
  1746. lambda x: x['defaultText']['accessibility']['accessibilityData'],
  1747. r'(?P<count>[\d,]+)\s*(?P<type>(?:dis)?like)'), ([
  1748. lambda x: x['accessibility'],
  1749. lambda x: x['accessibilityData']['accessibilityData'],
  1750. ], r'(?P<type>(?:dis)?like) this video along with (?P<count>[\d,]+) other people')]:
  1751. label = (try_get(tbr, getter, dict) or {}).get('label')
  1752. if label:
  1753. mobj = re.match(regex, label)
  1754. if mobj:
  1755. info[mobj.group('type') + '_count'] = str_to_int(mobj.group('count'))
  1756. break
  1757. sbr_tooltip = try_get(
  1758. vpir, lambda x: x['sentimentBar']['sentimentBarRenderer']['tooltip'])
  1759. if sbr_tooltip:
  1760. like_count, dislike_count = sbr_tooltip.split(' / ')
  1761. info.update({
  1762. 'like_count': str_to_int(like_count),
  1763. 'dislike_count': str_to_int(dislike_count),
  1764. })
  1765. vsir = content.get('videoSecondaryInfoRenderer')
  1766. if vsir:
  1767. info['channel'] = get_text(try_get(
  1768. vsir,
  1769. lambda x: x['owner']['videoOwnerRenderer']['title'],
  1770. compat_str))
  1771. rows = try_get(
  1772. vsir,
  1773. lambda x: x['metadataRowContainer']['metadataRowContainerRenderer']['rows'],
  1774. list) or []
  1775. multiple_songs = False
  1776. for row in rows:
  1777. if try_get(row, lambda x: x['metadataRowRenderer']['hasDividerLine']) is True:
  1778. multiple_songs = True
  1779. break
  1780. for row in rows:
  1781. mrr = row.get('metadataRowRenderer') or {}
  1782. mrr_title = mrr.get('title')
  1783. if not mrr_title:
  1784. continue
  1785. mrr_title = get_text(mrr['title'])
  1786. mrr_contents_text = get_text(mrr['contents'][0])
  1787. if mrr_title == 'License':
  1788. info['license'] = mrr_contents_text
  1789. elif not multiple_songs:
  1790. if mrr_title == 'Album':
  1791. info['album'] = mrr_contents_text
  1792. elif mrr_title == 'Artist':
  1793. info['artist'] = mrr_contents_text
  1794. elif mrr_title == 'Song':
  1795. info['track'] = mrr_contents_text
  1796. fallbacks = {
  1797. 'channel': 'uploader',
  1798. 'channel_id': 'uploader_id',
  1799. 'channel_url': 'uploader_url',
  1800. }
  1801. for to, frm in fallbacks.items():
  1802. if not info.get(to):
  1803. info[to] = info.get(frm)
  1804. for s_k, d_k in [('artist', 'creator'), ('track', 'alt_title')]:
  1805. v = info.get(s_k)
  1806. if v:
  1807. info[d_k] = v
  1808. # get xsrf for annotations or comments
  1809. get_annotations = self._downloader.params.get('writeannotations', False)
  1810. get_comments = self._downloader.params.get('getcomments', False)
  1811. if get_annotations or get_comments:
  1812. xsrf_token = None
  1813. ytcfg = self._extract_ytcfg(video_id, webpage)
  1814. if ytcfg:
  1815. xsrf_token = try_get(ytcfg, lambda x: x['XSRF_TOKEN'], compat_str)
  1816. if not xsrf_token:
  1817. xsrf_token = self._search_regex(
  1818. r'([\'"])XSRF_TOKEN\1\s*:\s*([\'"])(?P<xsrf_token>(?:(?!\2).)+)\2',
  1819. webpage, 'xsrf token', group='xsrf_token', fatal=False)
  1820. # annotations
  1821. if get_annotations:
  1822. invideo_url = try_get(
  1823. player_response, lambda x: x['annotations'][0]['playerAnnotationsUrlsRenderer']['invideoUrl'], compat_str)
  1824. if xsrf_token and invideo_url:
  1825. xsrf_field_name = None
  1826. if ytcfg:
  1827. xsrf_field_name = try_get(ytcfg, lambda x: x['XSRF_FIELD_NAME'], compat_str)
  1828. if not xsrf_field_name:
  1829. xsrf_field_name = self._search_regex(
  1830. r'([\'"])XSRF_FIELD_NAME\1\s*:\s*([\'"])(?P<xsrf_field_name>\w+)\2',
  1831. webpage, 'xsrf field name',
  1832. group='xsrf_field_name', default='session_token')
  1833. info['annotations'] = self._download_webpage(
  1834. self._proto_relative_url(invideo_url),
  1835. video_id, note='Downloading annotations',
  1836. errnote='Unable to download video annotations', fatal=False,
  1837. data=urlencode_postdata({xsrf_field_name: xsrf_token}))
  1838. # Get comments
  1839. # TODO: Refactor and move to separate function
  1840. if get_comments:
  1841. expected_video_comment_count = 0
  1842. video_comments = []
  def find_value(html, key, num_chars=2, separator='"'):
      """Return the text that follows `key` in `html`, up to `separator`.

      The value starts `num_chars` characters after the end of the first
      occurrence of `key` and ends right before the next `separator`.

      html      -- string to search
      key       -- marker preceding the value
      num_chars -- number of characters to skip after `key`
      separator -- string terminating the value

      Returns an empty string when `key` does not occur in `html`, and the
      whole remainder of `html` when no `separator` follows the value.
      """
      key_pos = html.find(key)
      if key_pos == -1:
          # str.find() signals "not found" with -1; without this check the
          # slice below would start at a meaningless offset.
          return ''
      pos_begin = key_pos + len(key) + num_chars
      pos_end = html.find(separator, pos_begin)
      if pos_end == -1:
          # No terminator: using -1 as the slice end would silently drop
          # the last character, so take everything up to the end instead.
          return html[pos_begin:]
      return html[pos_begin:pos_end]
  def search_dict(partial, key):
      """Lazily yield every value stored under `key` inside `partial`.

      Dicts and lists are walked recursively, depth first, in iteration
      order.  A value found under `key` is yielded as is and not searched
      any further.  Anything that is neither a dict nor a list yields
      nothing.
      """
      if isinstance(partial, list):
          for element in partial:
              for found in search_dict(element, key):
                  yield found
          return
      if not isinstance(partial, dict):
          return
      for name, value in partial.items():
          if name != key:
              # Not a match: keep looking inside the value
              for found in search_dict(value, key):
                  yield found
              continue
          yield value
  1859. continuations = []
  1860. if initial_data:
  1861. try:
  1862. ncd = next(search_dict(initial_data, 'nextContinuationData'))
  1863. continuations = [ncd['continuation']]
  1864. # Handle videos where comments have been disabled entirely
  1865. except StopIteration:
  1866. pass
  def get_continuation(continuation, session_token, replies=False):
      """Request the comment data belonging to one continuation token.

      Sends `session_token` as POST data to the comment_service_ajax
      endpoint, with `continuation` passed as the `ctoken` query parameter.
      `replies` selects the `action_get_comment_replies` query flag instead
      of `action_get_comments`.  `self` and `video_id` come from the
      enclosing scope.

      Returns the parsed JSON response on HTTP 200 and None on HTTP 413.
      Raises ExtractorError for any other status code seen here.
      """
      if replies:
          action = 'action_get_comment_replies'
      else:
          action = 'action_get_comments'
      query = {
          'pbj': 1,
          'ctoken': continuation,
      }
      query[action] = 1

      post_data = urlencode_postdata({
          'session_token': session_token
      })
      request_headers = {
          'Accept': '*/*',
          'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:76.0) Gecko/20100101 Firefox/76.0',
          'X-YouTube-Client-Name': '1',
          'X-YouTube-Client-Version': '2.20201202.06.01'
      }
      # 413 is declared as an expected status so that it reaches the
      # status check below
      content, handle = self._download_webpage_handle(
          'https://www.youtube.com/comment_service_ajax',
          video_id,
          note=False,
          expected_status=[413],
          data=post_data,
          query=query,
          headers=request_headers)

      status = handle.getcode()
      if status == 413:
          return None
      if status != 200:
          raise ExtractorError('Unexpected HTTP error code: %s' % status)
      return self._parse_json(content, video_id)
  1899. first_continuation = True
  1900. chain_msg = ''
  1901. self.to_screen('Downloading comments')
  1902. while continuations:
  1903. continuation = continuations.pop()
  1904. comment_response = get_continuation(continuation, xsrf_token)
  1905. if not comment_response:
  1906. continue
  1907. if list(search_dict(comment_response, 'externalErrorMessage')):
  1908. raise ExtractorError('Error returned from server: ' + next(search_dict(comment_response, 'externalErrorMessage')))
  1909. if 'continuationContents' not in comment_response['response']:
  1910. # Something is wrong here. Youtube won't accept this continuation token for some reason and responds with a user satisfaction dialog (error?)
  1911. continue
  1912. # not sure if this actually helps
  1913. if 'xsrf_token' in comment_response:
  1914. xsrf_token = comment_response['xsrf_token']
  1915. item_section = comment_response['response']['continuationContents']['itemSectionContinuation']
  1916. if first_continuation:
  1917. expected_video_comment_count = int(item_section['header']['commentsHeaderRenderer']['countText']['runs'][0]['text'].replace(' Comments', '').replace('1 Comment', '1').replace(',', ''))
  1918. first_continuation = False
  1919. if 'contents' not in item_section:
  1920. # continuation returned no comments?
  1921. # set an empty array as to not break the for loop
  1922. item_section['contents'] = []
  1923. for meta_comment in item_section['contents']:
  1924. comment = meta_comment['commentThreadRenderer']['comment']['commentRenderer']
  1925. video_comments.append({
  1926. 'id': comment['commentId'],
  1927. 'text': ''.join([c['text'] for c in comment['contentText']['runs']]),
  1928. 'time_text': ''.join([c['text'] for c in comment['publishedTimeText']['runs']]),
  1929. 'author': comment.get('authorText', {}).get('simpleText', ''),
  1930. 'votes': comment.get('voteCount', {}).get('simpleText', '0'),
  1931. 'author_thumbnail': comment['authorThumbnail']['thumbnails'][-1]['url'],
  1932. 'parent': 'root'
  1933. })
  1934. if 'replies' not in meta_comment['commentThreadRenderer']:
  1935. continue
  1936. reply_continuations = [rcn['nextContinuationData']['continuation'] for rcn in meta_comment['commentThreadRenderer']['replies']['commentRepliesRenderer']['continuations']]
  1937. while reply_continuations:
  1938. time.sleep(1)
  1939. continuation = reply_continuations.pop()
  1940. replies_data = get_continuation(continuation, xsrf_token, True)
  1941. if not replies_data or 'continuationContents' not in replies_data[1]['response']:
  1942. continue
  1943. if self._downloader.params.get('verbose', False):
  1944. chain_msg = ' (chain %s)' % comment['commentId']
  1945. self.to_screen('Comments downloaded: %d of ~%d%s' % (len(video_comments), expected_video_comment_count, chain_msg))
  1946. reply_comment_meta = replies_data[1]['response']['continuationContents']['commentRepliesContinuation']
  1947. for reply_meta in reply_comment_meta.get('contents', {}):
  1948. reply_comment = reply_meta['commentRenderer']
  1949. video_comments.append({
  1950. 'id': reply_comment['commentId'],
  1951. 'text': ''.join([c['text'] for c in reply_comment['contentText']['runs']]),
  1952. 'time_text': ''.join([c['text'] for c in reply_comment['publishedTimeText']['runs']]),
  1953. 'author': reply_comment.get('authorText', {}).get('simpleText', ''),
  1954. 'votes': reply_comment.get('voteCount', {}).get('simpleText', '0'),
  1955. 'author_thumbnail': reply_comment['authorThumbnail']['thumbnails'][-1]['url'],
  1956. 'parent': comment['commentId']
  1957. })
  1958. if 'continuations' not in reply_comment_meta or len(reply_comment_meta['continuations']) == 0:
  1959. continue
  1960. reply_continuations += [rcn['nextContinuationData']['continuation'] for rcn in reply_comment_meta['continuations']]
  1961. self.to_screen('Comments downloaded: %d of ~%d' % (len(video_comments), expected_video_comment_count))
  1962. if 'continuations' in item_section:
  1963. continuations += [ncd['nextContinuationData']['continuation'] for ncd in item_section['continuations']]
  1964. time.sleep(1)
  1965. self.to_screen('Total comments downloaded: %d of ~%d' % (len(video_comments), expected_video_comment_count))
  1966. info.update({
  1967. 'comments': video_comments,
  1968. 'comment_count': expected_video_comment_count
  1969. })
  1970. self.mark_watched(video_id, player_response)
  1971. return info
  1972. class YoutubeTabIE(YoutubeBaseInfoExtractor):
  1973. IE_DESC = 'YouTube.com tab'
  1974. _VALID_URL = r'''(?x)
  1975. https?://
  1976. (?:\w+\.)?
  1977. (?:
  1978. youtube(?:kids)?\.com|
  1979. invidio\.us
  1980. )/
  1981. (?:
  1982. (?:channel|c|user)/|
  1983. (?P<not_channel>
  1984. feed/|
  1985. (?:playlist|watch)\?.*?\blist=
  1986. )|
  1987. (?!(?:%s)\b) # Direct URLs
  1988. )
  1989. (?P<id>[^/?\#&]+)
  1990. ''' % YoutubeBaseInfoExtractor._RESERVED_NAMES
  1991. IE_NAME = 'youtube:tab'
  1992. _TESTS = [{
  1993. # playlists, multipage
  1994. 'url': 'https://www.youtube.com/c/ИгорьКлейнер/playlists?view=1&flow=grid',
  1995. 'playlist_mincount': 94,
  1996. 'info_dict': {
  1997. 'id': 'UCqj7Cz7revf5maW9g5pgNcg',
  1998. 'title': 'Игорь Клейнер - Playlists',
  1999. 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2',
  2000. 'uploader': 'Игорь Клейнер',
  2001. 'uploader_id': 'UCqj7Cz7revf5maW9g5pgNcg',
  2002. },
  2003. }, {
  2004. # playlists, multipage, different order
  2005. 'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd',
  2006. 'playlist_mincount': 94,
  2007. 'info_dict': {
  2008. 'id': 'UCqj7Cz7revf5maW9g5pgNcg',
  2009. 'title': 'Игорь Клейнер - Playlists',
  2010. 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2',
  2011. 'uploader_id': 'UCqj7Cz7revf5maW9g5pgNcg',
  2012. 'uploader': 'Игорь Клейнер',
  2013. },
  2014. }, {
  2015. # playlists, singlepage
  2016. 'url': 'https://www.youtube.com/user/ThirstForScience/playlists',
  2017. 'playlist_mincount': 4,
  2018. 'info_dict': {
  2019. 'id': 'UCAEtajcuhQ6an9WEzY9LEMQ',
  2020. 'title': 'ThirstForScience - Playlists',
  2021. 'description': 'md5:609399d937ea957b0f53cbffb747a14c',
  2022. 'uploader': 'ThirstForScience',
  2023. 'uploader_id': 'UCAEtajcuhQ6an9WEzY9LEMQ',
  2024. }
  2025. }, {
  2026. 'url': 'https://www.youtube.com/c/ChristophLaimer/playlists',
  2027. 'only_matching': True,
  2028. }, {
  2029. # basic, single video playlist
  2030. 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
  2031. 'info_dict': {
  2032. 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
  2033. 'uploader': 'Sergey M.',
  2034. 'id': 'PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
  2035. 'title': 'youtube-dl public playlist',
  2036. },
  2037. 'playlist_count': 1,
  2038. }, {
  2039. # empty playlist
  2040. 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
  2041. 'info_dict': {
  2042. 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
  2043. 'uploader': 'Sergey M.',
  2044. 'id': 'PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
  2045. 'title': 'youtube-dl empty playlist',
  2046. },
  2047. 'playlist_count': 0,
  2048. }, {
  2049. # Home tab
  2050. 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/featured',
  2051. 'info_dict': {
  2052. 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
  2053. 'title': 'lex will - Home',
  2054. 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
  2055. 'uploader': 'lex will',
  2056. 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
  2057. },
  2058. 'playlist_mincount': 2,
  2059. }, {
  2060. # Videos tab
  2061. 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos',
  2062. 'info_dict': {
  2063. 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
  2064. 'title': 'lex will - Videos',
  2065. 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
  2066. 'uploader': 'lex will',
  2067. 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
  2068. },
  2069. 'playlist_mincount': 975,
  2070. }, {
  2071. # Videos tab, sorted by popular
  2072. 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos?view=0&sort=p&flow=grid',
  2073. 'info_dict': {
  2074. 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
  2075. 'title': 'lex will - Videos',
  2076. 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
  2077. 'uploader': 'lex will',
  2078. 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
  2079. },
  2080. 'playlist_mincount': 199,
  2081. }, {
  2082. # Playlists tab
  2083. 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/playlists',
  2084. 'info_dict': {
  2085. 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
  2086. 'title': 'lex will - Playlists',
  2087. 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
  2088. 'uploader': 'lex will',
  2089. 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
  2090. },
  2091. 'playlist_mincount': 17,
  2092. }, {
  2093. # Community tab
  2094. 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/community',
  2095. 'info_dict': {
  2096. 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
  2097. 'title': 'lex will - Community',
  2098. 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
  2099. 'uploader': 'lex will',
  2100. 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
  2101. },
  2102. 'playlist_mincount': 18,
  2103. }, {
  2104. # Channels tab
  2105. 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/channels',
  2106. 'info_dict': {
  2107. 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
  2108. 'title': 'lex will - Channels',
  2109. 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
  2110. 'uploader': 'lex will',
  2111. 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
  2112. },
  2113. 'playlist_mincount': 12,
  2114. }, {
  2115. 'url': 'https://invidio.us/channel/UCmlqkdCBesrv2Lak1mF_MxA',
  2116. 'only_matching': True,
  2117. }, {
  2118. 'url': 'https://www.youtubekids.com/channel/UCmlqkdCBesrv2Lak1mF_MxA',
  2119. 'only_matching': True,
  2120. }, {
  2121. 'url': 'https://music.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA',
  2122. 'only_matching': True,
  2123. }, {
  2124. 'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
  2125. 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
  2126. 'info_dict': {
  2127. 'title': '29C3: Not my department',
  2128. 'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
  2129. 'uploader': 'Christiaan008',
  2130. 'uploader_id': 'UCEPzS1rYsrkqzSLNp76nrcg',
  2131. 'description': 'md5:a14dc1a8ef8307a9807fe136a0660268',
  2132. },
  2133. 'playlist_count': 96,
  2134. }, {
  2135. 'note': 'Large playlist',
  2136. 'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
  2137. 'info_dict': {
  2138. 'title': 'Uploads from Cauchemar',
  2139. 'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
  2140. 'uploader': 'Cauchemar',
  2141. 'uploader_id': 'UCBABnxM4Ar9ten8Mdjj1j0Q',
  2142. },
  2143. 'playlist_mincount': 1123,
  2144. }, {
  2145. # even larger playlist, 8832 videos
  2146. 'url': 'http://www.youtube.com/user/NASAgovVideo/videos',
  2147. 'only_matching': True,
  2148. }, {
  2149. 'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
  2150. 'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
  2151. 'info_dict': {
  2152. 'title': 'Uploads from Interstellar Movie',
  2153. 'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
  2154. 'uploader': 'Interstellar Movie',
  2155. 'uploader_id': 'UCXw-G3eDE9trcvY2sBMM_aA',
  2156. },
  2157. 'playlist_mincount': 21,
  2158. }, {
  2159. # https://github.com/ytdl-org/youtube-dl/issues/21844
  2160. 'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
  2161. 'info_dict': {
  2162. 'title': 'Data Analysis with Dr Mike Pound',
  2163. 'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
  2164. 'uploader_id': 'UC9-y-6csu5WGm29I7JiwpnA',
  2165. 'uploader': 'Computerphile',
  2166. 'description': 'md5:7f567c574d13d3f8c0954d9ffee4e487',
  2167. },
  2168. 'playlist_mincount': 11,
  2169. }, {
  2170. 'url': 'https://invidio.us/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
  2171. 'only_matching': True,
  2172. }, {
  2173. # Playlist URL that does not actually serve a playlist
  2174. 'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4',
  2175. 'info_dict': {
  2176. 'id': 'FqZTN594JQw',
  2177. 'ext': 'webm',
  2178. 'title': "Smiley's People 01 detective, Adventure Series, Action",
  2179. 'uploader': 'STREEM',
  2180. 'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng',
  2181. 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng',
  2182. 'upload_date': '20150526',
  2183. 'license': 'Standard YouTube License',
  2184. 'description': 'md5:507cdcb5a49ac0da37a920ece610be80',
  2185. 'categories': ['People & Blogs'],
  2186. 'tags': list,
  2187. 'view_count': int,
  2188. 'like_count': int,
  2189. 'dislike_count': int,
  2190. },
  2191. 'params': {
  2192. 'skip_download': True,
  2193. },
  2194. 'skip': 'This video is not available.',
  2195. 'add_ie': [YoutubeIE.ie_key()],
  2196. }, {
  2197. 'url': 'https://www.youtubekids.com/watch?v=Agk7R8I8o5U&list=PUZ6jURNr1WQZCNHF0ao-c0g',
  2198. 'only_matching': True,
  2199. }, {
  2200. 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM',
  2201. 'only_matching': True,
  2202. }, {
  2203. 'url': 'https://www.youtube.com/channel/UCoMdktPbSTixAyNGwb-UYkQ/live',
  2204. 'info_dict': {
  2205. 'id': '9Auq9mYxFEE',
  2206. 'ext': 'mp4',
  2207. 'title': compat_str,
  2208. 'uploader': 'Sky News',
  2209. 'uploader_id': 'skynews',
  2210. 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/skynews',
  2211. 'upload_date': '20191102',
  2212. 'description': 'md5:85ddd75d888674631aaf9599a9a0b0ae',
  2213. 'categories': ['News & Politics'],
  2214. 'tags': list,
  2215. 'like_count': int,
  2216. 'dislike_count': int,
  2217. },
  2218. 'params': {
  2219. 'skip_download': True,
  2220. },
  2221. }, {
  2222. 'url': 'https://www.youtube.com/user/TheYoungTurks/live',
  2223. 'info_dict': {
  2224. 'id': 'a48o2S1cPoo',
  2225. 'ext': 'mp4',
  2226. 'title': 'The Young Turks - Live Main Show',
  2227. 'uploader': 'The Young Turks',
  2228. 'uploader_id': 'TheYoungTurks',
  2229. 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks',
  2230. 'upload_date': '20150715',
  2231. 'license': 'Standard YouTube License',
  2232. 'description': 'md5:438179573adcdff3c97ebb1ee632b891',
  2233. 'categories': ['News & Politics'],
  2234. 'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'],
  2235. 'like_count': int,
  2236. 'dislike_count': int,
  2237. },
  2238. 'params': {
  2239. 'skip_download': True,
  2240. },
  2241. 'only_matching': True,
  2242. }, {
  2243. 'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live',
  2244. 'only_matching': True,
  2245. }, {
  2246. 'url': 'https://www.youtube.com/c/CommanderVideoHq/live',
  2247. 'only_matching': True,
  2248. }, {
  2249. 'url': 'https://www.youtube.com/feed/trending',
  2250. 'only_matching': True,
  2251. }, {
  2252. # needs auth
  2253. 'url': 'https://www.youtube.com/feed/library',
  2254. 'only_matching': True,
  2255. }, {
  2256. # needs auth
  2257. 'url': 'https://www.youtube.com/feed/history',
  2258. 'only_matching': True,
  2259. }, {
  2260. # needs auth
  2261. 'url': 'https://www.youtube.com/feed/subscriptions',
  2262. 'only_matching': True,
  2263. }, {
  2264. # needs auth
  2265. 'url': 'https://www.youtube.com/feed/watch_later',
  2266. 'only_matching': True,
  2267. }, {
  2268. # no longer available?
  2269. 'url': 'https://www.youtube.com/feed/recommended',
  2270. 'only_matching': True,
  2271. }, {
  2272. # inline playlist with not always working continuations
  2273. 'url': 'https://www.youtube.com/watch?v=UC6u0Tct-Fo&list=PL36D642111D65BE7C',
  2274. 'only_matching': True,
  2275. }, {
  2276. 'url': 'https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8',
  2277. 'only_matching': True,
  2278. }, {
  2279. 'url': 'https://www.youtube.com/course',
  2280. 'only_matching': True,
  2281. }, {
  2282. 'url': 'https://www.youtube.com/zsecurity',
  2283. 'only_matching': True,
  2284. }, {
  2285. 'url': 'http://www.youtube.com/NASAgovVideo/videos',
  2286. 'only_matching': True,
  2287. }, {
  2288. 'url': 'https://www.youtube.com/TheYoungTurks/live',
  2289. 'only_matching': True,
  2290. }]
  2291. @classmethod
  2292. def suitable(cls, url):
  2293. return False if YoutubeIE.suitable(url) else super(
  2294. YoutubeTabIE, cls).suitable(url)
  2295. def _extract_channel_id(self, webpage):
  2296. channel_id = self._html_search_meta(
  2297. 'channelId', webpage, 'channel id', default=None)
  2298. if channel_id:
  2299. return channel_id
  2300. channel_url = self._html_search_meta(
  2301. ('og:url', 'al:ios:url', 'al:android:url', 'al:web:url',
  2302. 'twitter:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad',
  2303. 'twitter:app:url:googleplay'), webpage, 'channel url')
  2304. return self._search_regex(
  2305. r'https?://(?:www\.)?youtube\.com/channel/([^/?#&])+',
  2306. channel_url, 'channel id')
  2307. @staticmethod
  2308. def _extract_grid_item_renderer(item):
  2309. for item_kind in ('Playlist', 'Video', 'Channel'):
  2310. renderer = item.get('grid%sRenderer' % item_kind)
  2311. if renderer:
  2312. return renderer
  2313. def _grid_entries(self, grid_renderer):
  2314. for item in grid_renderer['items']:
  2315. if not isinstance(item, dict):
  2316. continue
  2317. renderer = self._extract_grid_item_renderer(item)
  2318. if not isinstance(renderer, dict):
  2319. continue
  2320. title = try_get(
  2321. renderer, lambda x: x['title']['runs'][0]['text'], compat_str)
  2322. # playlist
  2323. playlist_id = renderer.get('playlistId')
  2324. if playlist_id:
  2325. yield self.url_result(
  2326. 'https://www.youtube.com/playlist?list=%s' % playlist_id,
  2327. ie=YoutubeTabIE.ie_key(), video_id=playlist_id,
  2328. video_title=title)
  2329. # video
  2330. video_id = renderer.get('videoId')
  2331. if video_id:
  2332. yield self._extract_video(renderer)
  2333. # channel
  2334. channel_id = renderer.get('channelId')
  2335. if channel_id:
  2336. title = try_get(
  2337. renderer, lambda x: x['title']['simpleText'], compat_str)
  2338. yield self.url_result(
  2339. 'https://www.youtube.com/channel/%s' % channel_id,
  2340. ie=YoutubeTabIE.ie_key(), video_title=title)
  2341. def _shelf_entries_from_content(self, shelf_renderer):
  2342. content = shelf_renderer.get('content')
  2343. if not isinstance(content, dict):
  2344. return
  2345. renderer = content.get('gridRenderer')
  2346. if renderer:
  2347. # TODO: add support for nested playlists so each shelf is processed
  2348. # as separate playlist
  2349. # TODO: this includes only first N items
  2350. for entry in self._grid_entries(renderer):
  2351. yield entry
  2352. renderer = content.get('horizontalListRenderer')
  2353. if renderer:
  2354. # TODO
  2355. pass
  2356. def _shelf_entries(self, shelf_renderer, skip_channels=False):
  2357. ep = try_get(
  2358. shelf_renderer, lambda x: x['endpoint']['commandMetadata']['webCommandMetadata']['url'],
  2359. compat_str)
  2360. shelf_url = urljoin('https://www.youtube.com', ep)
  2361. if shelf_url:
  2362. # Skipping links to another channels, note that checking for
  2363. # endpoint.commandMetadata.webCommandMetadata.webPageTypwebPageType == WEB_PAGE_TYPE_CHANNEL
  2364. # will not work
  2365. if skip_channels and '/channels?' in shelf_url:
  2366. return
  2367. title = try_get(
  2368. shelf_renderer, lambda x: x['title']['runs'][0]['text'], compat_str)
  2369. yield self.url_result(shelf_url, video_title=title)
  2370. # Shelf may not contain shelf URL, fallback to extraction from content
  2371. for entry in self._shelf_entries_from_content(shelf_renderer):
  2372. yield entry
  2373. def _playlist_entries(self, video_list_renderer):
  2374. for content in video_list_renderer['contents']:
  2375. if not isinstance(content, dict):
  2376. continue
  2377. renderer = content.get('playlistVideoRenderer') or content.get('playlistPanelVideoRenderer')
  2378. if not isinstance(renderer, dict):
  2379. continue
  2380. video_id = renderer.get('videoId')
  2381. if not video_id:
  2382. continue
  2383. yield self._extract_video(renderer)
  2384. r""" # Not needed in the new implementation
  2385. def _itemSection_entries(self, item_sect_renderer):
  2386. for content in item_sect_renderer['contents']:
  2387. if not isinstance(content, dict):
  2388. continue
  2389. renderer = content.get('videoRenderer', {})
  2390. if not isinstance(renderer, dict):
  2391. continue
  2392. video_id = renderer.get('videoId')
  2393. if not video_id:
  2394. continue
  2395. yield self._extract_video(renderer)
  2396. """
  2397. def _rich_entries(self, rich_grid_renderer):
  2398. renderer = try_get(
  2399. rich_grid_renderer, lambda x: x['content']['videoRenderer'], dict) or {}
  2400. video_id = renderer.get('videoId')
  2401. if not video_id:
  2402. return
  2403. yield self._extract_video(renderer)
  2404. def _video_entry(self, video_renderer):
  2405. video_id = video_renderer.get('videoId')
  2406. if video_id:
  2407. return self._extract_video(video_renderer)
  2408. def _post_thread_entries(self, post_thread_renderer):
  2409. post_renderer = try_get(
  2410. post_thread_renderer, lambda x: x['post']['backstagePostRenderer'], dict)
  2411. if not post_renderer:
  2412. return
  2413. # video attachment
  2414. video_renderer = try_get(
  2415. post_renderer, lambda x: x['backstageAttachment']['videoRenderer'], dict)
  2416. video_id = None
  2417. if video_renderer:
  2418. entry = self._video_entry(video_renderer)
  2419. if entry:
  2420. yield entry
  2421. # inline video links
  2422. runs = try_get(post_renderer, lambda x: x['contentText']['runs'], list) or []
  2423. for run in runs:
  2424. if not isinstance(run, dict):
  2425. continue
  2426. ep_url = try_get(
  2427. run, lambda x: x['navigationEndpoint']['urlEndpoint']['url'], compat_str)
  2428. if not ep_url:
  2429. continue
  2430. if not YoutubeIE.suitable(ep_url):
  2431. continue
  2432. ep_video_id = YoutubeIE._match_id(ep_url)
  2433. if video_id == ep_video_id:
  2434. continue
  2435. yield self.url_result(ep_url, ie=YoutubeIE.ie_key(), video_id=video_id)
  2436. def _post_thread_continuation_entries(self, post_thread_continuation):
  2437. contents = post_thread_continuation.get('contents')
  2438. if not isinstance(contents, list):
  2439. return
  2440. for content in contents:
  2441. renderer = content.get('backstagePostThreadRenderer')
  2442. if not isinstance(renderer, dict):
  2443. continue
  2444. for entry in self._post_thread_entries(renderer):
  2445. yield entry
  2446. @staticmethod
  2447. def _build_continuation_query(continuation, ctp=None):
  2448. query = {
  2449. 'ctoken': continuation,
  2450. 'continuation': continuation,
  2451. }
  2452. if ctp:
  2453. query['itct'] = ctp
  2454. return query
  2455. @staticmethod
  2456. def _extract_next_continuation_data(renderer):
  2457. next_continuation = try_get(
  2458. renderer, lambda x: x['continuations'][0]['nextContinuationData'], dict)
  2459. if not next_continuation:
  2460. return
  2461. continuation = next_continuation.get('continuation')
  2462. if not continuation:
  2463. return
  2464. ctp = next_continuation.get('clickTrackingParams')
  2465. return YoutubeTabIE._build_continuation_query(continuation, ctp)
  2466. @classmethod
  2467. def _extract_continuation(cls, renderer):
  2468. next_continuation = cls._extract_next_continuation_data(renderer)
  2469. if next_continuation:
  2470. return next_continuation
  2471. contents = []
  2472. for key in ('contents', 'items'):
  2473. contents.extend(try_get(renderer, lambda x: x[key], list) or [])
  2474. for content in contents:
  2475. if not isinstance(content, dict):
  2476. continue
  2477. continuation_ep = try_get(
  2478. content, lambda x: x['continuationItemRenderer']['continuationEndpoint'],
  2479. dict)
  2480. if not continuation_ep:
  2481. continue
  2482. continuation = try_get(
  2483. continuation_ep, lambda x: x['continuationCommand']['token'], compat_str)
  2484. if not continuation:
  2485. continue
  2486. ctp = continuation_ep.get('clickTrackingParams')
  2487. return YoutubeTabIE._build_continuation_query(continuation, ctp)
    def _entries(self, tab, identity_token):
        """Yield every entry of *tab*, following continuation pages.

        The entries found in the tab's initial content are yielded first.
        After that, continuation pages are requested from ``browse_ajax``
        until no continuation is left or a page cannot be interpreted.

        *identity_token*, when truthy, is sent as the
        ``x-youtube-identity-token`` request header.
        """
        def extract_entries(parent_renderer):  # this needs to be called again for continuation to work with feeds
            # Yields the entries of parent_renderer['contents'] and stores
            # the continuation it finds in continuation_list[0].
            contents = try_get(parent_renderer, lambda x: x['contents'], list) or []
            for content in contents:
                if not isinstance(content, dict):
                    continue
                is_renderer = try_get(content, lambda x: x['itemSectionRenderer'], dict)
                if not is_renderer:
                    # Not an item section: only rich items are handled here
                    renderer = content.get('richItemRenderer')
                    if renderer:
                        for entry in self._rich_entries(renderer):
                            yield entry
                        continuation_list[0] = self._extract_continuation(parent_renderer)
                    continue
                isr_contents = try_get(is_renderer, lambda x: x['contents'], list) or []
                for isr_content in isr_contents:
                    if not isinstance(isr_content, dict):
                        continue
                    # Maps a renderer key to the callable producing its entries
                    known_renderers = {
                        'playlistVideoListRenderer': self._playlist_entries,
                        'gridRenderer': self._grid_entries,
                        'shelfRenderer': lambda x: self._shelf_entries(x, tab.get('title') != 'Channels'),
                        'backstagePostThreadRenderer': self._post_thread_entries,
                        'videoRenderer': lambda x: [self._video_entry(x)],
                    }
                    # Only the first known renderer of the item is processed
                    for key, renderer in isr_content.items():
                        if key not in known_renderers:
                            continue
                        for entry in known_renderers[key](renderer):
                            if entry:
                                yield entry
                        continuation_list[0] = self._extract_continuation(renderer)
                        break
                # Fall back to a continuation attached to the item section
                if not continuation_list[0]:
                    continuation_list[0] = self._extract_continuation(is_renderer)
            # Last resort: a continuation attached to the parent renderer
            if not continuation_list[0]:
                continuation_list[0] = self._extract_continuation(parent_renderer)
        # One-element list used as a mutable cell shared with extract_entries
        continuation_list = [None]  # Python 2 does not support nonlocal
        tab_content = try_get(tab, lambda x: x['content'], dict)
        if not tab_content:
            return
        parent_renderer = (
            try_get(tab_content, lambda x: x['sectionListRenderer'], dict)
            or try_get(tab_content, lambda x: x['richGridRenderer'], dict) or {})
        for entry in extract_entries(parent_renderer):
            yield entry
        continuation = continuation_list[0]
        headers = {
            'x-youtube-client-name': '1',
            'x-youtube-client-version': '2.20201112.04.01',
        }
        if identity_token:
            headers['x-youtube-identity-token'] = identity_token
        for page_num in itertools.count(1):
            if not continuation:
                break
            count = 0
            retries = 3
            while count <= retries:
                try:
                    # Downloading page may result in intermittent 5xx HTTP error
                    # that is usually worked around with a retry
                    browse = self._download_json(
                        'https://www.youtube.com/browse_ajax', None,
                        'Downloading page %d%s'
                        % (page_num, ' (retry #%d)' % count if count else ''),
                        headers=headers, query=continuation)
                    break
                except ExtractorError as e:
                    # Only HTTP 500/503 are retried; anything else, or running
                    # out of retries, re-raises the error
                    if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503):
                        count += 1
                        if count <= retries:
                            continue
                    raise
            if not browse:
                break
            response = try_get(browse, lambda x: x[1]['response'], dict)
            if not response:
                break
            # First response layout: entries under 'continuationContents'
            known_continuation_renderers = {
                'playlistVideoListContinuation': self._playlist_entries,
                'gridContinuation': self._grid_entries,
                'itemSectionContinuation': self._post_thread_continuation_entries,
                'sectionListContinuation': extract_entries,  # for feeds
            }
            continuation_contents = try_get(
                response, lambda x: x['continuationContents'], dict) or {}
            continuation_renderer = None
            for key, value in continuation_contents.items():
                if key not in known_continuation_renderers:
                    continue
                continuation_renderer = value
                # Fresh cell so extract_entries cannot leave behind the
                # continuation of the previous page
                continuation_list = [None]
                for entry in known_continuation_renderers[key](continuation_renderer):
                    yield entry
                continuation = continuation_list[0] or self._extract_continuation(continuation_renderer)
                break
            if continuation_renderer:
                continue
            # Second response layout: items appended through
            # 'onResponseReceivedActions'.  Each value pairs the entry
            # callable with the key it expects the item list under.
            known_renderers = {
                'gridPlaylistRenderer': (self._grid_entries, 'items'),
                'gridVideoRenderer': (self._grid_entries, 'items'),
                'playlistVideoRenderer': (self._playlist_entries, 'contents'),
                'itemSectionRenderer': (self._playlist_entries, 'contents'),
            }
            continuation_items = try_get(
                response, lambda x: x['onResponseReceivedActions'][0]['appendContinuationItemsAction']['continuationItems'], list)
            # The kind of the first item decides how the whole list is handled
            continuation_item = try_get(continuation_items, lambda x: x[0], dict) or {}
            video_items_renderer = None
            for key, value in continuation_item.items():
                if key not in known_renderers:
                    continue
                video_items_renderer = {known_renderers[key][1]: continuation_items}
                for entry in known_renderers[key][0](video_items_renderer):
                    yield entry
                continuation = self._extract_continuation(video_items_renderer)
                break
            if video_items_renderer:
                continue
            # Neither layout was recognized: stop paging
            break
  2608. @staticmethod
  2609. def _extract_selected_tab(tabs):
  2610. for tab in tabs:
  2611. if try_get(tab, lambda x: x['tabRenderer']['selected'], bool):
  2612. return tab['tabRenderer']
  2613. else:
  2614. raise ExtractorError('Unable to find selected tab')
  2615. @staticmethod
  2616. def _extract_uploader(data):
  2617. uploader = {}
  2618. sidebar_renderer = try_get(
  2619. data, lambda x: x['sidebar']['playlistSidebarRenderer']['items'], list)
  2620. if sidebar_renderer:
  2621. for item in sidebar_renderer:
  2622. if not isinstance(item, dict):
  2623. continue
  2624. renderer = item.get('playlistSidebarSecondaryInfoRenderer')
  2625. if not isinstance(renderer, dict):
  2626. continue
  2627. owner = try_get(
  2628. renderer, lambda x: x['videoOwner']['videoOwnerRenderer']['title']['runs'][0], dict)
  2629. if owner:
  2630. uploader['uploader'] = owner.get('text')
  2631. uploader['uploader_id'] = try_get(
  2632. owner, lambda x: x['navigationEndpoint']['browseEndpoint']['browseId'], compat_str)
  2633. uploader['uploader_url'] = urljoin(
  2634. 'https://www.youtube.com/',
  2635. try_get(owner, lambda x: x['navigationEndpoint']['browseEndpoint']['canonicalBaseUrl'], compat_str))
  2636. return {k: v for k, v in uploader.items() if v is not None}
  2637. def _extract_from_tabs(self, item_id, webpage, data, tabs, identity_token):
  2638. playlist_id = title = description = channel_url = channel_name = channel_id = None
  2639. thumbnails_list = tags = []
  2640. selected_tab = self._extract_selected_tab(tabs)
  2641. renderer = try_get(
  2642. data, lambda x: x['metadata']['channelMetadataRenderer'], dict)
  2643. if renderer:
  2644. channel_name = renderer.get('title')
  2645. channel_url = renderer.get('channelUrl')
  2646. channel_id = renderer.get('externalId')
  2647. if not renderer:
  2648. renderer = try_get(
  2649. data, lambda x: x['metadata']['playlistMetadataRenderer'], dict)
  2650. if renderer:
  2651. title = renderer.get('title')
  2652. description = renderer.get('description', '')
  2653. playlist_id = channel_id
  2654. tags = renderer.get('keywords', '').split()
  2655. thumbnails_list = (
  2656. try_get(renderer, lambda x: x['avatar']['thumbnails'], list)
  2657. or try_get(
  2658. data,
  2659. lambda x: x['sidebar']['playlistSidebarRenderer']['items'][0]['playlistSidebarPrimaryInfoRenderer']['thumbnailRenderer']['playlistVideoThumbnailRenderer']['thumbnail']['thumbnails'],
  2660. list)
  2661. or [])
  2662. thumbnails = []
  2663. for t in thumbnails_list:
  2664. if not isinstance(t, dict):
  2665. continue
  2666. thumbnail_url = url_or_none(t.get('url'))
  2667. if not thumbnail_url:
  2668. continue
  2669. thumbnails.append({
  2670. 'url': thumbnail_url,
  2671. 'width': int_or_none(t.get('width')),
  2672. 'height': int_or_none(t.get('height')),
  2673. })
  2674. if playlist_id is None:
  2675. playlist_id = item_id
  2676. if title is None:
  2677. title = playlist_id
  2678. title += format_field(selected_tab, 'title', ' - %s')
  2679. metadata = {
  2680. 'playlist_id': playlist_id,
  2681. 'playlist_title': title,
  2682. 'playlist_description': description,
  2683. 'uploader': channel_name,
  2684. 'uploader_id': channel_id,
  2685. 'uploader_url': channel_url,
  2686. 'thumbnails': thumbnails,
  2687. 'tags': tags,
  2688. }
  2689. if not channel_id:
  2690. metadata.update(self._extract_uploader(data))
  2691. metadata.update({
  2692. 'channel': metadata['uploader'],
  2693. 'channel_id': metadata['uploader_id'],
  2694. 'channel_url': metadata['uploader_url']})
  2695. return self.playlist_result(
  2696. self._entries(selected_tab, identity_token),
  2697. **metadata)
  2698. def _extract_from_playlist(self, item_id, url, data, playlist):
  2699. title = playlist.get('title') or try_get(
  2700. data, lambda x: x['titleText']['simpleText'], compat_str)
  2701. playlist_id = playlist.get('playlistId') or item_id
  2702. # Inline playlist rendition continuation does not always work
  2703. # at Youtube side, so delegating regular tab-based playlist URL
  2704. # processing whenever possible.
  2705. playlist_url = urljoin(url, try_get(
  2706. playlist, lambda x: x['endpoint']['commandMetadata']['webCommandMetadata']['url'],
  2707. compat_str))
  2708. if playlist_url and playlist_url != url:
  2709. return self.url_result(
  2710. playlist_url, ie=YoutubeTabIE.ie_key(), video_id=playlist_id,
  2711. video_title=title)
  2712. return self.playlist_result(
  2713. self._playlist_entries(playlist), playlist_id=playlist_id,
  2714. playlist_title=title)
  2715. @staticmethod
  2716. def _extract_alerts(data):
  2717. for alert_dict in try_get(data, lambda x: x['alerts'], list) or []:
  2718. if not isinstance(alert_dict, dict):
  2719. continue
  2720. for renderer in alert_dict:
  2721. alert = alert_dict[renderer]
  2722. alert_type = alert.get('type')
  2723. if not alert_type:
  2724. continue
  2725. message = try_get(alert, lambda x: x['text']['simpleText'], compat_str)
  2726. if message:
  2727. yield alert_type, message
  2728. for run in try_get(alert, lambda x: x['text']['runs'], list) or []:
  2729. message = try_get(run, lambda x: x['text'], compat_str)
  2730. if message:
  2731. yield alert_type, message
  2732. def _extract_identity_token(self, webpage, item_id):
  2733. ytcfg = self._extract_ytcfg(item_id, webpage)
  2734. if ytcfg:
  2735. token = try_get(ytcfg, lambda x: x['ID_TOKEN'], compat_str)
  2736. if token:
  2737. return token
  2738. return self._search_regex(
  2739. r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage,
  2740. 'identity token', default=None)
    def _real_extract(self, url):
        """Extract a tab page, an inline playlist or a single video from *url*.

        The URL is first normalized; the page is then downloaded and
        dispatched on the layout found in its initial data.  Raises
        ExtractorError when YouTube reports an error alert or when the page
        cannot be recognized.
        """
        item_id = self._match_id(url)
        # Force the canonical host, whatever host the URL was given with
        url = compat_urlparse.urlunparse(
            compat_urlparse.urlparse(url)._replace(netloc='www.youtube.com'))
        # NOTE(review): relies on a 'not_channel' group defined in _VALID_URL,
        # which is outside this section; 'post' must be empty or start a
        # query/fragment for the match to succeed
        is_home = re.match(r'(?P<pre>%s)(?P<post>/?(?![^#?]).*$)' % self._VALID_URL, url)
        if is_home is not None and is_home.group('not_channel') is None and item_id != 'feed':
            self._downloader.report_warning(
                'A channel/user page was given. All the channel\'s videos will be downloaded. '
                'To download only the videos in the home page, add a "/featured" to the URL')
            url = '%s/videos%s' % (is_home.group('pre'), is_home.group('post') or '')
        # Handle both video/playlist URLs
        qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        video_id = qs.get('v', [None])[0]
        playlist_id = qs.get('list', [None])[0]
        # A watch URL without a video id is only usable if it names a playlist
        if is_home is not None and is_home.group('not_channel') is not None and is_home.group('not_channel').startswith('watch') and not video_id:
            if playlist_id:
                self._downloader.report_warning('%s is not a valid Youtube URL. Trying to download playlist %s' % (url, playlist_id))
                url = 'https://www.youtube.com/playlist?list=%s' % playlist_id
                # return self.url_result(playlist_id, ie=YoutubePlaylistIE.ie_key())
            else:
                raise ExtractorError('Unable to recognize tab page')
        if video_id and playlist_id:
            if self._downloader.params.get('noplaylist'):
                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id)
            self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
        webpage = self._download_webpage(url, item_id)
        identity_token = self._extract_identity_token(webpage, item_id)
        data = self._extract_yt_initial_data(item_id, webpage)
        # The last error alert is raised below; every earlier error alert and
        # every non-error alert is reported as a warning
        err_msg = None
        for alert_type, alert_message in self._extract_alerts(data):
            if alert_type.lower() == 'error':
                if err_msg:
                    self._downloader.report_warning('YouTube said: %s - %s' % ('ERROR', err_msg))
                err_msg = alert_message
            else:
                self._downloader.report_warning('YouTube said: %s - %s' % (alert_type, alert_message))
        if err_msg:
            raise ExtractorError('YouTube said: %s' % err_msg, expected=True)
        # Tabbed layout
        tabs = try_get(
            data, lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'], list)
        if tabs:
            return self._extract_from_tabs(item_id, webpage, data, tabs, identity_token)
        # Watch page carrying an inline playlist
        playlist = try_get(
            data, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist'], dict)
        if playlist:
            return self._extract_from_playlist(item_id, url, data, playlist)
        # Fallback to video extraction if no playlist alike page is recognized.
        # First check for the current video then try the v attribute of URL query.
        video_id = try_get(
            data, lambda x: x['currentVideoEndpoint']['watchEndpoint']['videoId'],
            compat_str) or video_id
        if video_id:
            return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id)
        # Failed to recognize
        raise ExtractorError('Unable to recognize tab page')
  2797. class YoutubePlaylistIE(InfoExtractor):
  2798. IE_DESC = 'YouTube.com playlists'
  2799. _VALID_URL = r'''(?x)(?:
  2800. (?:https?://)?
  2801. (?:\w+\.)?
  2802. (?:
  2803. (?:
  2804. youtube(?:kids)?\.com|
  2805. invidio\.us
  2806. )
  2807. /.*?\?.*?\blist=
  2808. )?
  2809. (?P<id>%(playlist_id)s)
  2810. )''' % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
  2811. IE_NAME = 'youtube:playlist'
  2812. _TESTS = [{
  2813. 'note': 'issue #673',
  2814. 'url': 'PLBB231211A4F62143',
  2815. 'info_dict': {
  2816. 'title': '[OLD]Team Fortress 2 (Class-based LP)',
  2817. 'id': 'PLBB231211A4F62143',
  2818. 'uploader': 'Wickydoo',
  2819. 'uploader_id': 'UCKSpbfbl5kRQpTdL7kMc-1Q',
  2820. },
  2821. 'playlist_mincount': 29,
  2822. }, {
  2823. 'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
  2824. 'info_dict': {
  2825. 'title': 'YDL_safe_search',
  2826. 'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
  2827. },
  2828. 'playlist_count': 2,
  2829. 'skip': 'This playlist is private',
  2830. }, {
  2831. 'note': 'embedded',
  2832. 'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
  2833. 'playlist_count': 4,
  2834. 'info_dict': {
  2835. 'title': 'JODA15',
  2836. 'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
  2837. 'uploader': 'milan',
  2838. 'uploader_id': 'UCEI1-PVPcYXjB73Hfelbmaw',
  2839. }
  2840. }, {
  2841. 'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
  2842. 'playlist_mincount': 982,
  2843. 'info_dict': {
  2844. 'title': '2018 Chinese New Singles (11/6 updated)',
  2845. 'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
  2846. 'uploader': 'LBK',
  2847. 'uploader_id': 'UC21nz3_MesPLqtDqwdvnoxA',
  2848. }
  2849. }, {
  2850. 'url': 'TLGGrESM50VT6acwMjAyMjAxNw',
  2851. 'only_matching': True,
  2852. }, {
  2853. # music album playlist
  2854. 'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM',
  2855. 'only_matching': True,
  2856. }]
  2857. @classmethod
  2858. def suitable(cls, url):
  2859. return False if YoutubeTabIE.suitable(url) else super(
  2860. YoutubePlaylistIE, cls).suitable(url)
  2861. def _real_extract(self, url):
  2862. playlist_id = self._match_id(url)
  2863. qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
  2864. if not qs:
  2865. qs = {'list': playlist_id}
  2866. return self.url_result(
  2867. update_url_query('https://www.youtube.com/playlist', qs),
  2868. ie=YoutubeTabIE.ie_key(), video_id=playlist_id)
  2869. class YoutubeYtBeIE(InfoExtractor):
  2870. IE_DESC = 'youtu.be'
  2871. _VALID_URL = r'https?://youtu\.be/(?P<id>[0-9A-Za-z_-]{11})/*?.*?\blist=(?P<playlist_id>%(playlist_id)s)' % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
  2872. _TESTS = [{
  2873. 'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5',
  2874. 'info_dict': {
  2875. 'id': 'yeWKywCrFtk',
  2876. 'ext': 'mp4',
  2877. 'title': 'Small Scale Baler and Braiding Rugs',
  2878. 'uploader': 'Backus-Page House Museum',
  2879. 'uploader_id': 'backuspagemuseum',
  2880. 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum',
  2881. 'upload_date': '20161008',
  2882. 'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a',
  2883. 'categories': ['Nonprofits & Activism'],
  2884. 'tags': list,
  2885. 'like_count': int,
  2886. 'dislike_count': int,
  2887. },
  2888. 'params': {
  2889. 'noplaylist': True,
  2890. 'skip_download': True,
  2891. },
  2892. }, {
  2893. 'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21',
  2894. 'only_matching': True,
  2895. }]
  2896. def _real_extract(self, url):
  2897. mobj = re.match(self._VALID_URL, url)
  2898. video_id = mobj.group('id')
  2899. playlist_id = mobj.group('playlist_id')
  2900. return self.url_result(
  2901. update_url_query('https://www.youtube.com/watch', {
  2902. 'v': video_id,
  2903. 'list': playlist_id,
  2904. 'feature': 'youtu.be',
  2905. }), ie=YoutubeTabIE.ie_key(), video_id=playlist_id)
  2906. class YoutubeYtUserIE(InfoExtractor):
  2907. IE_DESC = 'YouTube.com user videos, URL or "ytuser" keyword'
  2908. _VALID_URL = r'ytuser:(?P<id>.+)'
  2909. _TESTS = [{
  2910. 'url': 'ytuser:phihag',
  2911. 'only_matching': True,
  2912. }]
  2913. def _real_extract(self, url):
  2914. user_id = self._match_id(url)
  2915. return self.url_result(
  2916. 'https://www.youtube.com/user/%s' % user_id,
  2917. ie=YoutubeTabIE.ie_key(), video_id=user_id)
  2918. class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
  2919. IE_NAME = 'youtube:favorites'
  2920. IE_DESC = 'YouTube.com liked videos, ":ytfav" for short (requires authentication)'
  2921. _VALID_URL = r':ytfav(?:ou?rite)?s?'
  2922. _LOGIN_REQUIRED = True
  2923. _TESTS = [{
  2924. 'url': ':ytfav',
  2925. 'only_matching': True,
  2926. }, {
  2927. 'url': ':ytfavorites',
  2928. 'only_matching': True,
  2929. }]
  2930. def _real_extract(self, url):
  2931. return self.url_result(
  2932. 'https://www.youtube.com/playlist?list=LL',
  2933. ie=YoutubeTabIE.ie_key())
  2934. class YoutubeSearchIE(SearchInfoExtractor, YoutubeBaseInfoExtractor):
  2935. IE_DESC = 'YouTube.com searches, "ytsearch" keyword'
  2936. # there doesn't appear to be a real limit, for example if you search for
  2937. # 'python' you get more than 8.000.000 results
  2938. _MAX_RESULTS = float('inf')
  2939. IE_NAME = 'youtube:search'
  2940. _SEARCH_KEY = 'ytsearch'
  2941. _SEARCH_PARAMS = None
  2942. _TESTS = []
  2943. def _entries(self, query, n):
  2944. data = {
  2945. 'context': {
  2946. 'client': {
  2947. 'clientName': 'WEB',
  2948. 'clientVersion': '2.20201021.03.00',
  2949. }
  2950. },
  2951. 'query': query,
  2952. }
  2953. if self._SEARCH_PARAMS:
  2954. data['params'] = self._SEARCH_PARAMS
  2955. total = 0
  2956. for page_num in itertools.count(1):
  2957. search = self._download_json(
  2958. 'https://www.youtube.com/youtubei/v1/search?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
  2959. video_id='query "%s"' % query,
  2960. note='Downloading page %s' % page_num,
  2961. errnote='Unable to download API page', fatal=False,
  2962. data=json.dumps(data).encode('utf8'),
  2963. headers={'content-type': 'application/json'})
  2964. if not search:
  2965. break
  2966. slr_contents = try_get(
  2967. search,
  2968. (lambda x: x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'],
  2969. lambda x: x['onResponseReceivedCommands'][0]['appendContinuationItemsAction']['continuationItems']),
  2970. list)
  2971. if not slr_contents:
  2972. break
  2973. # Youtube sometimes adds promoted content to searches,
  2974. # changing the index location of videos and token.
  2975. # So we search through all entries till we find them.
  2976. continuation_token = None
  2977. for slr_content in slr_contents:
  2978. if continuation_token is None:
  2979. continuation_token = try_get(
  2980. slr_content,
  2981. lambda x: x['continuationItemRenderer']['continuationEndpoint']['continuationCommand']['token'],
  2982. compat_str)
  2983. isr_contents = try_get(
  2984. slr_content,
  2985. lambda x: x['itemSectionRenderer']['contents'],
  2986. list)
  2987. if not isr_contents:
  2988. continue
  2989. for content in isr_contents:
  2990. if not isinstance(content, dict):
  2991. continue
  2992. video = content.get('videoRenderer')
  2993. if not isinstance(video, dict):
  2994. continue
  2995. video_id = video.get('videoId')
  2996. if not video_id:
  2997. continue
  2998. yield self._extract_video(video)
  2999. total += 1
  3000. if total == n:
  3001. return
  3002. if not continuation_token:
  3003. break
  3004. data['continuation'] = continuation_token
  3005. def _get_n_results(self, query, n):
  3006. """Get a specified number of results for a query"""
  3007. return self.playlist_result(self._entries(query, n), query)
  3008. class YoutubeSearchDateIE(YoutubeSearchIE):
  3009. IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
  3010. _SEARCH_KEY = 'ytsearchdate'
  3011. IE_DESC = 'YouTube.com searches, newest videos first, "ytsearchdate" keyword'
  3012. _SEARCH_PARAMS = 'CAI%3D'
  3013. class YoutubeSearchURLIE(YoutubeSearchIE):
  3014. IE_DESC = 'YouTube.com search URLs'
  3015. IE_NAME = YoutubeSearchIE.IE_NAME + '_url'
  3016. _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?:[^&]+)(?:[&]|$)'
  3017. # _MAX_RESULTS = 100
  3018. _TESTS = [{
  3019. 'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
  3020. 'playlist_mincount': 5,
  3021. 'info_dict': {
  3022. 'title': 'youtube-dl test video',
  3023. }
  3024. }, {
  3025. 'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB',
  3026. 'only_matching': True,
  3027. }]
  3028. @classmethod
  3029. def _make_valid_url(cls):
  3030. return cls._VALID_URL
  3031. def _real_extract(self, url):
  3032. qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
  3033. query = (qs.get('search_query') or qs.get('q'))[0]
  3034. self._SEARCH_PARAMS = qs.get('sp', ('',))[0]
  3035. return self._get_n_results(query, self._MAX_RESULTS)
  3036. class YoutubeFeedsInfoExtractor(YoutubeTabIE):
  3037. """
  3038. Base class for feed extractors
  3039. Subclasses must define the _FEED_NAME property.
  3040. """
  3041. _LOGIN_REQUIRED = True
  3042. # _MAX_PAGES = 5
  3043. _TESTS = []
  3044. @property
  3045. def IE_NAME(self):
  3046. return 'youtube:%s' % self._FEED_NAME
  3047. def _real_initialize(self):
  3048. self._login()
  3049. def _real_extract(self, url):
  3050. return self.url_result(
  3051. 'https://www.youtube.com/feed/%s' % self._FEED_NAME,
  3052. ie=YoutubeTabIE.ie_key())
  3053. class YoutubeWatchLaterIE(InfoExtractor):
  3054. IE_NAME = 'youtube:watchlater'
  3055. IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
  3056. _VALID_URL = r':ytwatchlater'
  3057. _TESTS = [{
  3058. 'url': ':ytwatchlater',
  3059. 'only_matching': True,
  3060. }]
  3061. def _real_extract(self, url):
  3062. return self.url_result(
  3063. 'https://www.youtube.com/playlist?list=WL', ie=YoutubeTabIE.ie_key())
  3064. class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
  3065. IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
  3066. _VALID_URL = r'https?://(?:www\.)?youtube\.com/?(?:[?#]|$)|:ytrec(?:ommended)?'
  3067. _FEED_NAME = 'recommended'
  3068. _TESTS = [{
  3069. 'url': ':ytrec',
  3070. 'only_matching': True,
  3071. }, {
  3072. 'url': ':ytrecommended',
  3073. 'only_matching': True,
  3074. }, {
  3075. 'url': 'https://youtube.com',
  3076. 'only_matching': True,
  3077. }]
  3078. class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
  3079. IE_DESC = 'YouTube.com subscriptions feed, ":ytsubs" for short (requires authentication)'
  3080. _VALID_URL = r':ytsub(?:scription)?s?'
  3081. _FEED_NAME = 'subscriptions'
  3082. _TESTS = [{
  3083. 'url': ':ytsubs',
  3084. 'only_matching': True,
  3085. }, {
  3086. 'url': ':ytsubscriptions',
  3087. 'only_matching': True,
  3088. }]
  3089. class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
  3090. IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
  3091. _VALID_URL = r':ythistory'
  3092. _FEED_NAME = 'history'
  3093. _TESTS = [{
  3094. 'url': ':ythistory',
  3095. 'only_matching': True,
  3096. }]
  3097. class YoutubeTruncatedURLIE(InfoExtractor):
  3098. IE_NAME = 'youtube:truncated_url'
  3099. IE_DESC = False # Do not list
  3100. _VALID_URL = r'''(?x)
  3101. (?:https?://)?
  3102. (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
  3103. (?:watch\?(?:
  3104. feature=[a-z_]+|
  3105. annotation_id=annotation_[^&]+|
  3106. x-yt-cl=[0-9]+|
  3107. hl=[^&]*|
  3108. t=[0-9]+
  3109. )?
  3110. |
  3111. attribution_link\?a=[^&]+
  3112. )
  3113. $
  3114. '''
  3115. _TESTS = [{
  3116. 'url': 'https://www.youtube.com/watch?annotation_id=annotation_3951667041',
  3117. 'only_matching': True,
  3118. }, {
  3119. 'url': 'https://www.youtube.com/watch?',
  3120. 'only_matching': True,
  3121. }, {
  3122. 'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
  3123. 'only_matching': True,
  3124. }, {
  3125. 'url': 'https://www.youtube.com/watch?feature=foo',
  3126. 'only_matching': True,
  3127. }, {
  3128. 'url': 'https://www.youtube.com/watch?hl=en-GB',
  3129. 'only_matching': True,
  3130. }, {
  3131. 'url': 'https://www.youtube.com/watch?t=2372',
  3132. 'only_matching': True,
  3133. }]
  3134. def _real_extract(self, url):
  3135. raise ExtractorError(
  3136. 'Did you forget to quote the URL? Remember that & is a meta '
  3137. 'character in most shells, so you want to put the URL in quotes, '
  3138. 'like youtube-dl '
  3139. '"https://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
  3140. ' or simply youtube-dl BaW_jenozKc .',
  3141. expected=True)
  3142. class YoutubeTruncatedIDIE(InfoExtractor):
  3143. IE_NAME = 'youtube:truncated_id'
  3144. IE_DESC = False # Do not list
  3145. _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'
  3146. _TESTS = [{
  3147. 'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
  3148. 'only_matching': True,
  3149. }]
  3150. def _real_extract(self, url):
  3151. video_id = self._match_id(url)
  3152. raise ExtractorError(
  3153. 'Incomplete YouTube ID %s. URL %s looks truncated.' % (video_id, url),
  3154. expected=True)