archiveorg.py 45 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950
  1. from __future__ import annotations
  2. import json
  3. import re
  4. import urllib.parse
  5. from .common import InfoExtractor
  6. from .youtube import YoutubeBaseInfoExtractor, YoutubeIE
  7. from ..networking import HEADRequest
  8. from ..networking.exceptions import HTTPError
  9. from ..utils import (
  10. KNOWN_EXTENSIONS,
  11. ExtractorError,
  12. bug_reports_message,
  13. clean_html,
  14. dict_get,
  15. extract_attributes,
  16. get_element_by_id,
  17. int_or_none,
  18. join_nonempty,
  19. js_to_json,
  20. merge_dicts,
  21. mimetype2ext,
  22. orderedSet,
  23. parse_duration,
  24. parse_qs,
  25. str_or_none,
  26. str_to_int,
  27. traverse_obj,
  28. try_get,
  29. unified_strdate,
  30. unified_timestamp,
  31. url_or_none,
  32. urlhandle_detect_ext,
  33. variadic,
  34. )
  35. class ArchiveOrgIE(InfoExtractor):
  # Extractor name and human-readable description for archive.org media items.
  36. IE_NAME = 'archive.org'
  37. IE_DESC = 'archive.org video and audio'
  # Matches archive.org /details/ and /embed/ URLs. The `id` group captures everything after
  # that prefix up to a `?` or `#`, so it may contain a slash followed by a file path
  # inside the item (see the '/'-partition in _real_extract).
  38. _VALID_URL = r'https?://(?:www\.)?archive\.org/(?:details|embed)/(?P<id>[^?#]+)(?:[?].*)?$'
  39. _TESTS = [{
  40. 'url': 'http://archive.org/details/XD300-23_68HighlightsAResearchCntAugHumanIntellect',
  41. 'md5': '8af1d4cf447933ed3c7f4871162602db',
  42. 'info_dict': {
  43. 'id': 'XD300-23_68HighlightsAResearchCntAugHumanIntellect',
  44. 'ext': 'ogv',
  45. 'title': '1968 Demo - FJCC Conference Presentation Reel #1',
  46. 'description': 'md5:da45c349df039f1cc8075268eb1b5c25',
  47. 'release_date': '19681210',
  48. 'timestamp': 1268695290,
  49. 'upload_date': '20100315',
  50. 'creators': ['SRI International'],
  51. 'uploader': 'laura@archive.org',
  52. 'thumbnail': r're:https://archive\.org/download/.*\.jpg',
  53. 'display_id': 'XD300-23_68HighlightsAResearchCntAugHumanIntellect.cdr',
  54. 'track': 'XD300-23 68HighlightsAResearchCntAugHumanIntellect',
  55. },
  56. }, {
  57. 'url': 'https://archive.org/details/Cops1922',
  58. 'md5': '0869000b4ce265e8ca62738b336b268a',
  59. 'info_dict': {
  60. 'id': 'Cops1922',
  61. 'ext': 'mp4',
  62. 'title': 'Buster Keaton\'s "Cops" (1922)',
  63. 'description': 'md5:cd6f9910c35aedd5fc237dbc3957e2ca',
  64. 'uploader': 'yorkmba99@hotmail.com',
  65. 'timestamp': 1387699629,
  66. 'upload_date': '20131222',
  67. 'display_id': 'Cops-v2.mp4',
  68. 'thumbnail': r're:https://archive\.org/download/.*\.jpg',
  69. 'duration': 1091.96,
  70. },
  71. }, {
  72. 'url': 'http://archive.org/embed/XD300-23_68HighlightsAResearchCntAugHumanIntellect',
  73. 'only_matching': True,
  74. }, {
  75. 'url': 'https://archive.org/details/Election_Ads',
  76. 'md5': 'eec5cddebd4793c6a653b69c3b11f2e6',
  77. 'info_dict': {
  78. 'id': 'Election_Ads/Commercial-JFK1960ElectionAdCampaignJingle.mpg',
  79. 'title': 'Commercial-JFK1960ElectionAdCampaignJingle.mpg',
  80. 'ext': 'mpg',
  81. 'thumbnail': r're:https://archive\.org/download/.*\.jpg',
  82. 'duration': 59.77,
  83. 'display_id': 'Commercial-JFK1960ElectionAdCampaignJingle.mpg',
  84. },
  85. }, {
  86. 'url': 'https://archive.org/details/Election_Ads/Commercial-Nixon1960ElectionAdToughonDefense.mpg',
  87. 'md5': 'ea1eed8234e7d4165f38c8c769edef38',
  88. 'info_dict': {
  89. 'id': 'Election_Ads/Commercial-Nixon1960ElectionAdToughonDefense.mpg',
  90. 'title': 'Commercial-Nixon1960ElectionAdToughonDefense.mpg',
  91. 'ext': 'mpg',
  92. 'timestamp': 1205588045,
  93. 'uploader': 'mikedavisstripmaster@yahoo.com',
  94. 'description': '1960 Presidential Campaign Election Commercials John F Kennedy, Richard M Nixon',
  95. 'upload_date': '20080315',
  96. 'display_id': 'Commercial-Nixon1960ElectionAdToughonDefense.mpg',
  97. 'duration': 59.51,
  98. 'license': 'http://creativecommons.org/licenses/publicdomain/',
  99. 'thumbnail': r're:https://archive\.org/download/.*\.jpg',
  100. },
  101. }, {
  102. 'url': 'https://archive.org/details/gd1977-05-08.shure57.stevenson.29303.flac16',
  103. 'md5': '7d07ffb42aba6537c28e053efa4b54c9',
  104. 'info_dict': {
  105. 'id': 'gd1977-05-08.shure57.stevenson.29303.flac16/gd1977-05-08d01t01.flac',
  106. 'title': 'Turning',
  107. 'ext': 'flac',
  108. 'track': 'Turning',
  109. 'creators': ['Grateful Dead'],
  110. 'display_id': 'gd1977-05-08d01t01.flac',
  111. 'track_number': 1,
  112. 'album': '1977-05-08 - Barton Hall - Cornell University',
  113. 'duration': 39.8,
  114. },
  115. }, {
  116. 'url': 'https://archive.org/details/gd1977-05-08.shure57.stevenson.29303.flac16/gd1977-05-08d01t07.flac',
  117. 'md5': 'a07cd8c6ab4ee1560f8a0021717130f3',
  118. 'info_dict': {
  119. 'id': 'gd1977-05-08.shure57.stevenson.29303.flac16/gd1977-05-08d01t07.flac',
  120. 'title': 'Deal',
  121. 'ext': 'flac',
  122. 'timestamp': 1205895624,
  123. 'uploader': 'mvernon54@yahoo.com',
  124. 'description': 'md5:6c921464414814720c6593810a5c7e3d',
  125. 'upload_date': '20080319',
  126. 'location': 'Barton Hall - Cornell University',
  127. 'duration': 438.68,
  128. 'track': 'Deal',
  129. 'creators': ['Grateful Dead'],
  130. 'album': '1977-05-08 - Barton Hall - Cornell University',
  131. 'release_date': '19770508',
  132. 'display_id': 'gd1977-05-08d01t07.flac',
  133. 'track_number': 7,
  134. },
  135. }, {
  136. # FIXME: give a better error message than just IndexError when all available formats are restricted
  137. 'url': 'https://archive.org/details/lp_the-music-of-russia_various-artists-a-askaryan-alexander-melik',
  138. 'md5': '7cb019baa9b332e82ea7c10403acd180',
  139. 'info_dict': {
  140. 'id': 'lp_the-music-of-russia_various-artists-a-askaryan-alexander-melik/disc1/01.01. Bells Of Rostov.mp3',
  141. 'title': 'Bells Of Rostov',
  142. 'ext': 'mp3',
  143. },
  144. 'skip': 'restricted',
  145. }, {
  146. 'url': 'https://archive.org/details/lp_the-music-of-russia_various-artists-a-askaryan-alexander-melik/disc1/02.02.+Song+And+Chorus+In+The+Polovetsian+Camp+From+%22Prince+Igor%22+(Act+2%2C+Scene+1).mp3',
  147. 'md5': '1d0aabe03edca83ca58d9ed3b493a3c3',
  148. 'info_dict': {
  149. 'id': 'lp_the-music-of-russia_various-artists-a-askaryan-alexander-melik/disc1/02.02. Song And Chorus In The Polovetsian Camp From "Prince Igor" (Act 2, Scene 1).mp3',
  150. 'title': 'Song And Chorus In The Polovetsian Camp From "Prince Igor" (Act 2, Scene 1)',
  151. 'ext': 'mp3',
  152. 'timestamp': 1569662587,
  153. 'uploader': 'associate-joygen-odiongan@archive.org',
  154. 'description': 'md5:012b2d668ae753be36896f343d12a236',
  155. 'upload_date': '20190928',
  156. },
  157. 'skip': 'restricted',
  158. }, {
  159. # Original formats are private
  160. 'url': 'https://archive.org/details/irelandthemakingofarepublic',
  161. 'info_dict': {
  162. 'id': 'irelandthemakingofarepublic',
  163. 'title': 'Ireland: The Making of a Republic',
  164. 'upload_date': '20160610',
  165. 'description': 'md5:f70956a156645a658a0dc9513d9e78b7',
  166. 'uploader': 'dimitrios@archive.org',
  167. 'creators': ['British Broadcasting Corporation', 'Time-Life Films'],
  168. 'timestamp': 1465594947,
  169. },
  170. 'playlist': [
  171. {
  172. 'md5': '0b211261b26590d49df968f71b90690d',
  173. 'info_dict': {
  174. 'id': 'irelandthemakingofarepublic/irelandthemakingofarepublicreel1_01.mov',
  175. 'ext': 'mp4',
  176. 'title': 'irelandthemakingofarepublicreel1_01.mov',
  177. 'duration': 130.46,
  178. 'thumbnail': 'https://archive.org/download/irelandthemakingofarepublic/irelandthemakingofarepublic.thumbs/irelandthemakingofarepublicreel1_01_000117.jpg',
  179. 'display_id': 'irelandthemakingofarepublicreel1_01.mov',
  180. },
  181. }, {
  182. 'md5': '67335ee3b23a0da930841981c1e79b02',
  183. 'info_dict': {
  184. 'id': 'irelandthemakingofarepublic/irelandthemakingofarepublicreel1_02.mov',
  185. 'ext': 'mp4',
  186. 'duration': 1395.13,
  187. 'title': 'irelandthemakingofarepublicreel1_02.mov',
  188. 'display_id': 'irelandthemakingofarepublicreel1_02.mov',
  189. 'thumbnail': 'https://archive.org/download/irelandthemakingofarepublic/irelandthemakingofarepublic.thumbs/irelandthemakingofarepublicreel1_02_001374.jpg',
  190. },
  191. }, {
  192. 'md5': 'e470e86787893603f4a341a16c281eb5',
  193. 'info_dict': {
  194. 'id': 'irelandthemakingofarepublic/irelandthemakingofarepublicreel2.mov',
  195. 'ext': 'mp4',
  196. 'duration': 1602.67,
  197. 'title': 'irelandthemakingofarepublicreel2.mov',
  198. 'thumbnail': 'https://archive.org/download/irelandthemakingofarepublic/irelandthemakingofarepublic.thumbs/irelandthemakingofarepublicreel2_001554.jpg',
  199. 'display_id': 'irelandthemakingofarepublicreel2.mov',
  200. },
  201. },
  202. ],
  203. }]
  @staticmethod
  def _playlist_data(webpage):
      """Return the decoded playlist embedded in an archive.org embed page.

      The playlist is stored as JSON in the ``value`` attribute of the
      ``<input ... class="js-play8-playlist" ... />`` element.

      Raises:
          ExtractorError: if the page contains no such element (this used to
              surface as a bare IndexError from ``re.findall(...)[0]``).
      """
      # All groups are non-capturing, so the whole match is the <input> tag.
      match = re.search(r'''(?xs)
          <input
          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
          \s+class=['"]?js-play8-playlist['"]?
          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
          \s*/>
          ''', webpage)
      if not match:
          raise ExtractorError('Unable to find playlist data in the embed page')
      return json.loads(extract_attributes(match.group(0))['value'])
  def _real_extract(self, url):
      """Extract one media file, or a playlist of files, from an archive.org item.

      The URL id has the form ``<identifier>[/<file path>]``. Playlist entries
      and subtitle tracks are read from the embed page; item-level and
      file-level details come from the metadata API.

      Returns a single-video info dict when exactly one entry is selected,
      otherwise a playlist info dict whose ``entries`` are the playlist items.
      Reviews, when present in the metadata, are attached as ``comments``.
      """
      video_id = urllib.parse.unquote_plus(self._match_id(url))
      identifier, _, entry_id = video_id.partition('/')

      # Archive.org metadata API doesn't clearly demarcate playlist entries
      # or subtitle tracks, so we get them from the embeddable player.
      embed_page = self._download_webpage(f'https://archive.org/embed/{identifier}', identifier)
      playlist = self._playlist_data(embed_page)

      entries = {}
      for p in playlist:
          # If the user specified a playlist entry in the URL, ignore the
          # rest of the playlist.
          if entry_id and p['orig'] != entry_id:
              continue

          # Collect subtitle tracks under the entry's 'subtitles' mapping
          # (label -> list of subtitle dicts) instead of writing them as
          # stray top-level keys of the entry.
          subtitles = {}
          for track in p.get('tracks', []):
              if track['kind'] != 'subtitles':
                  continue
              subtitles.setdefault(track['label'], []).append({
                  'url': 'https://archive.org/' + track['file'].lstrip('/'),
              })

          entries[p['orig']] = {
              'formats': [],
              'thumbnails': [],
              'artist': p.get('artist'),
              'track': p.get('title'),
              'subtitles': subtitles,
          }

      metadata = self._download_json('https://archive.org/metadata/' + identifier, identifier)
      m = metadata['metadata']
      # Use the identifier as reported by the API from here on.
      identifier = m['identifier']

      info = {
          'id': identifier,
          'title': m['title'],
          'description': clean_html(m.get('description')),
          'uploader': dict_get(m, ['uploader', 'adder']),
          'creators': traverse_obj(m, ('creator', {variadic}, {lambda x: x[0] and list(x)})),
          'license': m.get('licenseurl'),
          'release_date': unified_strdate(m.get('date')),
          'timestamp': unified_timestamp(dict_get(m, ['publicdate', 'addeddate'])),
          'webpage_url': f'https://archive.org/details/{identifier}',
          'location': m.get('venue'),
          'release_year': int_or_none(m.get('year')),
      }

      # We don't want to skip private formats if the user has access to them,
      # however without access to an account with such privileges we can't implement/test this.
      # For now to be safe, we will only skip them if there is no user logged in.
      # (Loop-invariant, so it is computed once rather than per file.)
      is_logged_in = bool(self._get_cookies('https://archive.org').get('logged-in-sig'))

      for f in metadata['files']:
          if f['name'] in entries:
              # The file is a playlist entry itself: add its own metadata.
              # merge_dicts is called with the existing entry first.
              entries[f['name']] = merge_dicts(entries[f['name']], {
                  'id': identifier + '/' + f['name'],
                  'title': f.get('title') or f['name'],
                  'display_id': f['name'],
                  'description': clean_html(f.get('description')),
                  'creators': traverse_obj(f, ('creator', {variadic}, {lambda x: x[0] and list(x)})),
                  'duration': parse_duration(f.get('length')),
                  'track_number': int_or_none(f.get('track')),
                  'album': f.get('album'),
                  'discnumber': int_or_none(f.get('disc')),
                  'release_year': int_or_none(f.get('year')),
              })
              entry = entries[f['name']]
          elif traverse_obj(f, 'original', expected_type=str) in entries:
              # The file is derived from a playlist entry (thumbnail, transcode, ...).
              entry = entries[f['original']]
          else:
              continue

          if f.get('format') == 'Thumbnail':
              entry['thumbnails'].append({
                  'id': f['name'],
                  'url': 'https://archive.org/download/' + identifier + '/' + f['name'],
                  'width': int_or_none(f.get('width')),
                  # Was read from 'width' by mistake.
                  'height': int_or_none(f.get('height')),
                  'filesize': int_or_none(f.get('size')),
              })

          _, has_ext, extension = f['name'].rpartition('.')
          if not has_ext:
              extension = None

          if extension in KNOWN_EXTENSIONS and (not f.get('private') or is_logged_in):
              entry['formats'].append({
                  'url': 'https://archive.org/download/' + identifier + '/' + urllib.parse.quote(f['name']),
                  'format': f.get('format'),
                  'width': int_or_none(f.get('width')),
                  'height': int_or_none(f.get('height')),
                  'filesize': int_or_none(f.get('size')),
                  'protocol': 'https',
                  'source_preference': 0 if f.get('source') == 'original' else -1,
                  'format_note': f.get('source'),
              })

      for entry in entries.values():
          entry['_format_sort_fields'] = ('source', )

      if len(entries) == 1:
          # If there's only one item, use it as the main info dict
          only_video = next(iter(entries.values()))
          if entry_id:
              # An explicitly requested file: its own fields are passed first.
              info = merge_dicts(only_video, info)
          else:
              info = merge_dicts(info, only_video)
      else:
          # Otherwise, we have a playlist.
          info['_type'] = 'playlist'
          info['entries'] = list(entries.values())

      if metadata.get('reviews'):
          info['comments'] = [{
              'id': review.get('review_id'),
              'author': review.get('reviewer'),
              # Either part may be missing; fall back to '' so the
              # concatenation cannot raise TypeError.
              'text': (str_or_none(review.get('reviewtitle'), '')
                       + '\n\n' + str_or_none(review.get('reviewbody'), '')),
              'timestamp': unified_timestamp(review.get('createdate')),
              'parent': 'root',
          } for review in metadata['reviews']]

      return info
  321. class YoutubeWebArchiveIE(InfoExtractor):
  # Extractor name and description; the description advertises the "ytarchive:" shorthand.
  322. IE_NAME = 'web.archive:youtube'
  323. IE_DESC = 'web.archive.org saved youtube videos, "ytarchive:" prefix'
  # Verbose-mode pattern accepting two entry forms:
  #   * the `ytarchive:<id>[:<14-digit date>]` shorthand (groups `prefix`, `id`, `date2`), or
  #   * a web.archive.org URL (scheme optional) with an optional 14-digit `date`, wrapping
  #     either a YouTube watch URL (plain or percent-encoded) or the wayback-fakeurl form.
  # `id` is always the 11-character video id. The trailing conditional requires end-of-string
  # after the shorthand form, and `%26`, `#`, `&` or end-of-string after the URL form.
  324. _VALID_URL = r'''(?x)(?:(?P<prefix>ytarchive:)|
  325. (?:https?://)?web\.archive\.org/
  326. (?:web/)?(?:(?P<date>[0-9]{14})?[0-9A-Za-z_*]*/)? # /web and the version index is optional
  327. (?:https?(?::|%3[Aa])//)?(?:
  328. (?:\w+\.)?youtube\.com(?::(?:80|443))?/watch(?:\.php)?(?:\?|%3[fF])(?:[^\#]+(?:&|%26))?v(?:=|%3[dD]) # Youtube URL
  329. |(?:wayback-fakeurl\.archive\.org/yt/) # Or the internal fake url
  330. )
  331. )(?P<id>[0-9A-Za-z_-]{11})
  332. (?(prefix)
  333. (?::(?P<date2>[0-9]{14}))?$|
  334. (?:%26|[#&]|$)
  335. )'''
  336. _TESTS = [
  337. {
  338. 'url': 'https://web.archive.org/web/20150415002341/https://www.youtube.com/watch?v=aYAGB11YrSs',
  339. 'info_dict': {
  340. 'id': 'aYAGB11YrSs',
  341. 'ext': 'webm',
  342. 'title': 'Team Fortress 2 - Sandviches!',
  343. 'description': 'md5:4984c0f9a07f349fc5d8e82ab7af4eaf',
  344. 'upload_date': '20110926',
  345. 'uploader': 'Zeurel',
  346. 'channel_id': 'UCukCyHaD-bK3in_pKpfH9Eg',
  347. 'duration': 32,
  348. 'uploader_id': 'Zeurel',
  349. 'uploader_url': 'https://www.youtube.com/user/Zeurel',
  350. 'thumbnail': r're:https?://.*\.(jpg|webp)',
  351. 'channel_url': 'https://www.youtube.com/channel/UCukCyHaD-bK3in_pKpfH9Eg',
  352. },
  353. }, {
  354. # Internal link
  355. 'url': 'https://web.archive.org/web/2oe/http://wayback-fakeurl.archive.org/yt/97t7Xj_iBv0',
  356. 'info_dict': {
  357. 'id': '97t7Xj_iBv0',
  358. 'ext': 'mp4',
  359. 'title': 'Why Machines That Bend Are Better',
  360. 'description': 'md5:00404df2c632d16a674ff8df1ecfbb6c',
  361. 'upload_date': '20190312',
  362. 'uploader': 'Veritasium',
  363. 'channel_id': 'UCHnyfMqiRRG1u-2MsSQLbXA',
  364. 'duration': 771,
  365. 'uploader_id': '1veritasium',
  366. 'uploader_url': 'https://www.youtube.com/user/1veritasium',
  367. 'thumbnail': r're:https?://.*\.(jpg|webp)',
  368. 'channel_url': 'https://www.youtube.com/channel/UCHnyfMqiRRG1u-2MsSQLbXA',
  369. },
  370. }, {
  371. # Video from 2012, webm format itag 45. Newest capture is deleted video, with an invalid description.
  372. # Should use the date in the link. Title ends with '- Youtube'. Capture has description in eow-description
  373. 'url': 'https://web.archive.org/web/20120712231619/http://www.youtube.com/watch?v=AkhihxRKcrs&gl=US&hl=en',
  374. 'info_dict': {
  375. 'id': 'AkhihxRKcrs',
  376. 'ext': 'webm',
  377. 'title': 'Limited Run: Mondo\'s Modern Classic 1 of 3 (SDCC 2012)',
  378. 'upload_date': '20120712',
  379. 'duration': 398,
  380. 'description': 'md5:ff4de6a7980cb65d951c2f6966a4f2f3',
  381. 'uploader_id': 'machinima',
  382. 'uploader_url': 'https://www.youtube.com/user/machinima',
  383. 'thumbnail': r're:https?://.*\.(jpg|webp)',
  384. 'uploader': 'machinima',
  385. },
  386. }, {
  387. # FLV video. Video file URL does not provide itag information
  388. 'url': 'https://web.archive.org/web/20081211103536/http://www.youtube.com/watch?v=jNQXAC9IVRw',
  389. 'info_dict': {
  390. 'id': 'jNQXAC9IVRw',
  391. 'ext': 'flv',
  392. 'title': 'Me at the zoo',
  393. 'upload_date': '20050423',
  394. 'channel_id': 'UC4QobU6STFB0P71PMvOGN5A',
  395. 'duration': 19,
  396. 'description': 'md5:10436b12e07ac43ff8df65287a56efb4',
  397. 'uploader_id': 'jawed',
  398. 'uploader_url': 'https://www.youtube.com/user/jawed',
  399. 'channel_url': 'https://www.youtube.com/channel/UC4QobU6STFB0P71PMvOGN5A',
  400. 'thumbnail': r're:https?://.*\.(jpg|webp)',
  401. 'uploader': 'jawed',
  402. },
  403. }, {
  404. 'url': 'https://web.archive.org/web/20110712231407/http://www.youtube.com/watch?v=lTx3G6h2xyA',
  405. 'info_dict': {
  406. 'id': 'lTx3G6h2xyA',
  407. 'ext': 'flv',
  408. 'title': 'Madeon - Pop Culture (live mashup)',
  409. 'upload_date': '20110711',
  410. 'uploader': 'Madeon',
  411. 'channel_id': 'UCqMDNf3Pn5L7pcNkuSEeO3w',
  412. 'duration': 204,
  413. 'description': 'md5:f7535343b6eda34a314eff8b85444680',
  414. 'uploader_id': 'itsmadeon',
  415. 'uploader_url': 'https://www.youtube.com/user/itsmadeon',
  416. 'channel_url': 'https://www.youtube.com/channel/UCqMDNf3Pn5L7pcNkuSEeO3w',
  417. 'thumbnail': r're:https?://.*\.(jpg|webp)',
  418. },
  419. }, {
  420. # First capture is of dead video, second is the oldest from CDX response.
  421. 'url': 'https://web.archive.org/https://www.youtube.com/watch?v=1JYutPM8O6E',
  422. 'info_dict': {
  423. 'id': '1JYutPM8O6E',
  424. 'ext': 'mp4',
  425. 'title': 'Fake Teen Doctor Strikes AGAIN! - Weekly Weird News',
  426. 'upload_date': '20160218',
  427. 'channel_id': 'UCdIaNUarhzLSXGoItz7BHVA',
  428. 'duration': 1235,
  429. 'description': 'md5:21032bae736421e89c2edf36d1936947',
  430. 'uploader_id': 'MachinimaETC',
  431. 'uploader_url': 'https://www.youtube.com/user/MachinimaETC',
  432. 'channel_url': 'https://www.youtube.com/channel/UCdIaNUarhzLSXGoItz7BHVA',
  433. 'thumbnail': r're:https?://.*\.(jpg|webp)',
  434. 'uploader': 'ETC News',
  435. },
  436. }, {
  437. # First capture of dead video, capture date in link links to dead capture.
  438. 'url': 'https://web.archive.org/web/20180803221945/https://www.youtube.com/watch?v=6FPhZJGvf4E',
  439. 'info_dict': {
  440. 'id': '6FPhZJGvf4E',
  441. 'ext': 'mp4',
  442. 'title': 'WTF: Video Games Still Launch BROKEN?! - T.U.G.S.',
  443. 'upload_date': '20160219',
  444. 'channel_id': 'UCdIaNUarhzLSXGoItz7BHVA',
  445. 'duration': 797,
  446. 'description': 'md5:a1dbf12d9a3bd7cb4c5e33b27d77ffe7',
  447. 'uploader_id': 'MachinimaETC',
  448. 'uploader_url': 'https://www.youtube.com/user/MachinimaETC',
  449. 'channel_url': 'https://www.youtube.com/channel/UCdIaNUarhzLSXGoItz7BHVA',
  450. 'thumbnail': r're:https?://.*\.(jpg|webp)',
  451. 'uploader': 'ETC News',
  452. },
  453. 'expected_warnings': [
  454. r'unable to download capture webpage \(it may not be archived\)',
  455. ],
  456. }, { # Very old YouTube page, has - YouTube in title.
  457. 'url': 'http://web.archive.org/web/20070302011044/http://youtube.com/watch?v=-06-KB9XTzg',
  458. 'info_dict': {
  459. 'id': '-06-KB9XTzg',
  460. 'ext': 'flv',
  461. 'title': 'New Coin Hack!! 100% Safe!!',
  462. },
  463. }, {
  464. 'url': 'web.archive.org/https://www.youtube.com/watch?v=dWW7qP423y8',
  465. 'info_dict': {
  466. 'id': 'dWW7qP423y8',
  467. 'ext': 'mp4',
  468. 'title': 'It\'s Bootleg AirPods Time.',
  469. 'upload_date': '20211021',
  470. 'channel_id': 'UC7Jwj9fkrf1adN4fMmTkpug',
  471. 'channel_url': 'https://www.youtube.com/channel/UC7Jwj9fkrf1adN4fMmTkpug',
  472. 'duration': 810,
  473. 'description': 'md5:7b567f898d8237b256f36c1a07d6d7bc',
  474. 'thumbnail': r're:https?://.*\.(jpg|webp)',
  475. 'uploader': 'DankPods',
  476. },
  477. }, {
  478. # player response contains '};' See: https://github.com/ytdl-org/youtube-dl/issues/27093
  479. 'url': 'https://web.archive.org/web/20200827003909if_/http://www.youtube.com/watch?v=6Dh-RL__uN4',
  480. 'info_dict': {
  481. 'id': '6Dh-RL__uN4',
  482. 'ext': 'mp4',
  483. 'title': 'bitch lasagna',
  484. 'upload_date': '20181005',
  485. 'channel_id': 'UC-lHJZR3Gqxm24_Vd_AJ5Yw',
  486. 'channel_url': 'https://www.youtube.com/channel/UC-lHJZR3Gqxm24_Vd_AJ5Yw',
  487. 'duration': 135,
  488. 'description': 'md5:2dbe4051feeff2dab5f41f82bb6d11d0',
  489. 'uploader': 'PewDiePie',
  490. 'uploader_id': 'PewDiePie',
  491. 'uploader_url': 'https://www.youtube.com/user/PewDiePie',
  492. 'thumbnail': r're:https?://.*\.(jpg|webp)',
  493. },
  494. }, {
  495. # ~June 2010 Capture. swfconfig
  496. 'url': 'https://web.archive.org/web/0/https://www.youtube.com/watch?v=8XeW5ilk-9Y',
  497. 'info_dict': {
  498. 'id': '8XeW5ilk-9Y',
  499. 'ext': 'flv',
  500. 'title': 'Story of Stuff, The Critique Part 4 of 4',
  501. 'duration': 541,
  502. 'description': 'md5:28157da06f2c5e94c97f7f3072509972',
  503. 'uploader': 'HowTheWorldWorks',
  504. 'uploader_id': 'HowTheWorldWorks',
  505. 'thumbnail': r're:https?://.*\.(jpg|webp)',
  506. 'uploader_url': 'https://www.youtube.com/user/HowTheWorldWorks',
  507. 'upload_date': '20090520',
  508. },
  509. }, {
  510. # Jan 2011: watch-video-date/eow-date surrounded by whitespace
  511. 'url': 'https://web.archive.org/web/20110126141719/http://www.youtube.com/watch?v=Q_yjX80U7Yc',
  512. 'info_dict': {
  513. 'id': 'Q_yjX80U7Yc',
  514. 'ext': 'flv',
  515. 'title': 'Spray Paint Art by Clay Butler: Purple Fantasy Forest',
  516. 'uploader_id': 'claybutlermusic',
  517. 'description': 'md5:4595264559e3d0a0ceb3f011f6334543',
  518. 'upload_date': '20090803',
  519. 'uploader': 'claybutlermusic',
  520. 'thumbnail': r're:https?://.*\.(jpg|webp)',
  521. 'duration': 132,
  522. 'uploader_url': 'https://www.youtube.com/user/claybutlermusic',
  523. },
  524. }, {
  525. # ~May 2009 swfArgs. ytcfg is spread out over various vars
  526. 'url': 'https://web.archive.org/web/0/https://www.youtube.com/watch?v=c5uJgG05xUY',
  527. 'info_dict': {
  528. 'id': 'c5uJgG05xUY',
  529. 'ext': 'webm',
  530. 'title': 'Story of Stuff, The Critique Part 1 of 4',
  531. 'uploader_id': 'HowTheWorldWorks',
  532. 'uploader': 'HowTheWorldWorks',
  533. 'uploader_url': 'https://www.youtube.com/user/HowTheWorldWorks',
  534. 'upload_date': '20090513',
  535. 'description': 'md5:4ca77d79538064e41e4cc464e93f44f0',
  536. 'thumbnail': r're:https?://.*\.(jpg|webp)',
  537. 'duration': 754,
  538. },
  539. }, {
  540. # ~June 2012. Upload date is in another lang so cannot extract.
  541. 'url': 'https://web.archive.org/web/20120607174520/http://www.youtube.com/watch?v=xWTLLl-dQaA',
  542. 'info_dict': {
  543. 'id': 'xWTLLl-dQaA',
  544. 'ext': 'mp4',
  545. 'title': 'Black Nerd eHarmony Video Bio Parody (SPOOF)',
  546. 'uploader_url': 'https://www.youtube.com/user/BlackNerdComedy',
  547. 'description': 'md5:e25f0133aaf9e6793fb81c18021d193e',
  548. 'uploader_id': 'BlackNerdComedy',
  549. 'uploader': 'BlackNerdComedy',
  550. 'duration': 182,
  551. 'thumbnail': r're:https?://.*\.(jpg|webp)',
  552. },
  553. }, {
  554. # ~July 2013
  555. 'url': 'https://web.archive.org/web/*/https://www.youtube.com/watch?v=9eO1aasHyTM',
  556. 'info_dict': {
  557. 'id': '9eO1aasHyTM',
  558. 'ext': 'mp4',
  559. 'title': 'Polar-oid',
  560. 'description': 'Cameras and bears are dangerous!',
  561. 'uploader_url': 'https://www.youtube.com/user/punkybird',
  562. 'uploader_id': 'punkybird',
  563. 'duration': 202,
  564. 'channel_id': 'UC62R2cBezNBOqxSerfb1nMQ',
  565. 'channel_url': 'https://www.youtube.com/channel/UC62R2cBezNBOqxSerfb1nMQ',
  566. 'upload_date': '20060428',
  567. 'uploader': 'punkybird',
  568. },
  569. }, {
  570. # April 2020: Player response in player config
  571. 'url': 'https://web.archive.org/web/20200416034815/https://www.youtube.com/watch?v=Cf7vS8jc7dY&gl=US&hl=en',
  572. 'info_dict': {
  573. 'id': 'Cf7vS8jc7dY',
  574. 'ext': 'mp4',
  575. 'title': 'A Dramatic Pool Story (by Jamie Spicer-Lewis) - Game Grumps Animated',
  576. 'duration': 64,
  577. 'upload_date': '20200408',
  578. 'uploader_id': 'GameGrumps',
  579. 'uploader': 'GameGrumps',
  580. 'channel_url': 'https://www.youtube.com/channel/UC9CuvdOVfMPvKCiwdGKL3cQ',
  581. 'channel_id': 'UC9CuvdOVfMPvKCiwdGKL3cQ',
  582. 'thumbnail': r're:https?://.*\.(jpg|webp)',
  583. 'description': 'md5:c625bb3c02c4f5fb4205971e468fa341',
  584. 'uploader_url': 'https://www.youtube.com/user/GameGrumps',
  585. },
  586. }, {
  587. # watch7-user-header with yt-user-info
  588. 'url': 'ytarchive:kbh4T_b4Ixw:20160307085057',
  589. 'info_dict': {
  590. 'id': 'kbh4T_b4Ixw',
  591. 'ext': 'mp4',
  592. 'title': 'Shovel Knight OST - Strike the Earth! Plains of Passage 16 bit SNES style remake / remix',
  593. 'channel_url': 'https://www.youtube.com/channel/UCnTaGvsHmMy792DWeT6HbGA',
  594. 'uploader': 'Nelward music',
  595. 'duration': 213,
  596. 'description': 'md5:804b4a9ce37b050a5fefdbb23aeba54d',
  597. 'thumbnail': r're:https?://.*\.(jpg|webp)',
  598. 'upload_date': '20150503',
  599. 'channel_id': 'UCnTaGvsHmMy792DWeT6HbGA',
  600. },
  601. }, {
  602. # April 2012
  603. 'url': 'https://web.archive.org/web/0/https://www.youtube.com/watch?v=SOm7mPoPskU',
  604. 'info_dict': {
  605. 'id': 'SOm7mPoPskU',
  606. 'ext': 'mp4',
  607. 'title': 'Boyfriend - Justin Bieber Parody',
  608. 'uploader_url': 'https://www.youtube.com/user/thecomputernerd01',
  609. 'uploader': 'thecomputernerd01',
  610. 'thumbnail': r're:https?://.*\.(jpg|webp)',
  611. 'description': 'md5:dd7fa635519c2a5b4d566beaecad7491',
  612. 'duration': 200,
  613. 'upload_date': '20120407',
  614. 'uploader_id': 'thecomputernerd01',
  615. },
  616. }, {
  617. 'url': 'https://web.archive.org/web/http://www.youtube.com/watch?v=kH-G_aIBlFw',
  618. 'only_matching': True,
  619. }, {
  620. 'url': 'https://web.archive.org/web/20050214000000_if/http://www.youtube.com/watch?v=0altSZ96U4M',
  621. 'only_matching': True,
  622. }, {
  623. # Video not archived, only capture is unavailable video page
  624. 'url': 'https://web.archive.org/web/20210530071008/https://www.youtube.com/watch?v=lHJTf93HL1s&spfreload=10',
  625. 'only_matching': True,
  626. }, { # Encoded url
  627. 'url': 'https://web.archive.org/web/20120712231619/http%3A//www.youtube.com/watch%3Fgl%3DUS%26v%3DAkhihxRKcrs%26hl%3Den',
  628. 'only_matching': True,
  629. }, {
  630. 'url': 'https://web.archive.org/web/20120712231619/http%3A//www.youtube.com/watch%3Fv%3DAkhihxRKcrs%26gl%3DUS%26hl%3Den',
  631. 'only_matching': True,
  632. }, {
  633. 'url': 'https://web.archive.org/web/20060527081937/http://www.youtube.com:80/watch.php?v=ELTFsLT73fA&amp;search=soccer',
  634. 'only_matching': True,
  635. }, {
  636. 'url': 'https://web.archive.org/http://www.youtube.com:80/watch?v=-05VVye-ffg',
  637. 'only_matching': True,
  638. }, {
  639. 'url': 'ytarchive:BaW_jenozKc:20050214000000',
  640. 'only_matching': True,
  641. }, {
  642. 'url': 'ytarchive:BaW_jenozKc',
  643. 'only_matching': True,
  644. },
  645. ]
  # Reuse the initial-data regex defined on YoutubeBaseInfoExtractor.
  646. _YT_INITIAL_DATA_RE = YoutubeBaseInfoExtractor._YT_INITIAL_DATA_RE
  # Verbose-mode alternation: a `window["ytInitialPlayerResponse"] =` /
  # `ytInitialPlayerResponse =` assignment prefix, or the YoutubeBaseInfoExtractor pattern.
  647. _YT_INITIAL_PLAYER_RESPONSE_RE = fr'''(?x:
  648. (?:window\s*\[\s*["\']ytInitialPlayerResponse["\']\s*\]|ytInitialPlayerResponse)\s*=[(\s]*|
  649. {YoutubeBaseInfoExtractor._YT_INITIAL_PLAYER_RESPONSE_RE}
  650. )'''
  651. _YT_DEFAULT_THUMB_SERVERS = ['i.ytimg.com'] # thumbnails most likely archived on these servers
  # Default servers first, then img.youtube.com, then {i,s}[N].ytimg.com for N in 0-4 and 9.
  # `n or ""` turns N == 0 into the unnumbered host (i.ytimg.com / s.ytimg.com).
  # NOTE(review): orderedSet presumably drops duplicates while keeping order - confirm in utils.
  652. _YT_ALL_THUMB_SERVERS = orderedSet(
  653. [*_YT_DEFAULT_THUMB_SERVERS, 'img.youtube.com', *[f'{c}{n or ""}.ytimg.com' for c in ('i', 's') for n in (*range(5), 9)]])
  # %-format template with one `%s` slot before the `if_` modifier.
  # NOTE(review): the slot is presumably filled with a capture timestamp - usage is not visible here.
  654. _WAYBACK_BASE_URL = 'https://web.archive.org/web/%sif_/'
  # 14-digit integers in the same shape as the `date` group of _VALID_URL.
  # NOTE(review): presumably YYYYMMDDhhmmss bounds for capture lookups - confirm against callers.
  655. _OLDEST_CAPTURE_DATE = 20050214000000
  656. _NEWEST_CAPTURE_DATE = 20500101000000
  def _call_cdx_api(self, item_id, url, filters: list | None = None, collapse: list | None = None, query: dict | None = None, note=None, fatal=False):
      """Query the Wayback CDX API for captures of *url*.

      Args:
          item_id: id passed to ``_download_json`` for the request.
          url: value of the CDX ``url`` parameter.
          filters: extra CDX ``filter`` values, appended after ``statuscode:200``.
          collapse: CDX ``collapse`` values.
          query: extra query parameters; they override the defaults built here.
          note: download note (defaults to 'Downloading CDX API JSON').
          fatal: forwarded to ``_download_json``.

      Returns:
          A list with one dict per result row, keyed by the names in the
          response's first (header) row, when the response is a list with at
          least two items. Otherwise ``None``; a warning is reported unless
          the response is an empty list.
      """
      # CDX docs: https://github.com/internetarchive/wayback/blob/master/wayback-cdx-server/README.md
      cdx_query = {
          'url': url,
          'output': 'json',
          'fl': 'original,mimetype,length,timestamp',
          'limit': 500,
          'filter': ['statuscode:200'] + (filters or []),
          'collapse': collapse or [],
      }
      # Caller-supplied parameters take precedence over the defaults above.
      cdx_query.update(query or {})

      response = self._download_json(
          'https://web.archive.org/cdx/search/cdx', item_id,
          note or 'Downloading CDX API JSON', query=cdx_query, fatal=fatal)

      if isinstance(response, list):
          if len(response) >= 2:
              # format response to make it easier to use
              header, *rows = response
              return [dict(zip(header, row)) for row in rows]
          if not response:
              # An empty list is not treated as an error.
              return None

      self.report_warning('Error while parsing CDX API response' + bug_reports_message())
      return None
  def _extract_webpage_title(self, webpage):
      """Return the video title taken from the page's <title>, or '' if absent.

      The 'YouTube -' prefix / '- YouTube' suffix is stripped from the title.
      """
      # YouTube video pages appear to always have either 'YouTube -' as prefix or '- YouTube' as suffix.
      title_pattern = r'(?:YouTube\s*-\s*(.*)$)|(?:(.*)\s*-\s*YouTube$)'
      raw_title = self._html_extract_title(webpage, default='')
      return self._html_search_regex(title_pattern, raw_title, 'title', default='')
  def _extract_metadata(self, video_id, webpage):
      """Extract video metadata from an archived YouTube watch page.

      `webpage` is the HTML of a capture (may be an empty string). Each field
      is resolved through a chain of fallbacks, from embedded JSON
      (player response / initial data / player config) to regex scraping of
      the markup; the first truthy source wins.

      Returns a dict with the keys 'title', 'description', 'upload_date',
      'uploader', 'channel_id', 'channel_url', 'duration', 'uploader_url'
      and 'uploader_id'. Values that could not be determined are falsy.
      """
      # With an empty webpage there are no meta tags to search, so always yield None
      search_meta = ((lambda x: self._html_search_meta(x, webpage, default=None)) if webpage else (lambda x: None))
      player_response = self._search_json(
          self._YT_INITIAL_PLAYER_RESPONSE_RE, webpage, 'initial player response',
          video_id, default={})
      initial_data = self._search_json(
          self._YT_INITIAL_DATA_RE, webpage, 'initial data', video_id, default={})

      # Merge every `yt.setConfig({...})` call into one dict; later calls override earlier keys
      ytcfg = {}
      for j in re.findall(r'yt\.setConfig\(\s*(?P<json>{\s*(?s:.+?)\s*})\s*\);', webpage):  # ~June 2010
          ytcfg.update(self._parse_json(j, video_id, fatal=False, ignore_extra=True, transform_source=js_to_json, errnote='') or {})

      # XXX: this also may contain a 'ptchn' key
      player_config = (
          self._search_json(
              r'(?:yt\.playerConfig|ytplayer\.config|swfConfig)\s*=',
              webpage, 'player config', video_id, default=None)
          or ytcfg.get('PLAYER_CONFIG') or {})

      # XXX: this may also contain a 'creator' key.
      swf_args = self._search_json(r'swfArgs\s*=', webpage, 'swf config', video_id, default={})
      # Only use the standalone swfArgs when the player config carries no args of its own
      if swf_args and not traverse_obj(player_config, ('args',)):
          player_config['args'] = swf_args

      if not player_response:
          # April 2020
          # Fall back to the player response stored as a JSON string in the player config args
          player_response = self._parse_json(
              traverse_obj(player_config, ('args', 'player_response')) or '{}', video_id, fatal=False)

      initial_data_video = traverse_obj(
          initial_data, ('contents', 'twoColumnWatchNextResults', 'results', 'results', 'contents', ..., 'videoPrimaryInfoRenderer'),
          expected_type=dict, get_all=False, default={})

      video_details = traverse_obj(
          player_response, 'videoDetails', expected_type=dict, get_all=False, default={})

      microformats = traverse_obj(
          player_response, ('microformat', 'playerMicroformatRenderer'), expected_type=dict, get_all=False, default={})

      # Title: structured data first, then the page <title>, then meta tags
      video_title = (
          video_details.get('title')
          or YoutubeBaseInfoExtractor._get_text(microformats, 'title')
          or YoutubeBaseInfoExtractor._get_text(initial_data_video, 'title')
          or traverse_obj(player_config, ('args', 'title'))
          or self._extract_webpage_title(webpage)
          or search_meta(['og:title', 'twitter:title', 'title']))

      def id_from_url(url, type_):
          # Return the path segment following `/<type_>/` in `url`, or None if absent
          return self._search_regex(
              rf'(?:{type_})/([^/#&?]+)', url or '', f'{type_} id', default=None)

      # XXX: would the get_elements_by_... functions be better suited here?
      _CHANNEL_URL_HREF_RE = r'href="[^"]*(?P<url>https?://www\.youtube\.com/(?:user|channel)/[^"]+)"'
      uploader_or_channel_url = self._search_regex(
          [fr'<(?:link\s*itemprop=\"url\"|a\s*id=\"watch-username\").*?\b{_CHANNEL_URL_HREF_RE}>',  # @fd05024
           fr'<div\s*id=\"(?:watch-channel-stats|watch-headline-user-info)\"[^>]*>\s*<a[^>]*\b{_CHANNEL_URL_HREF_RE}'],  # ~ May 2009, ~June 2012
          webpage, 'uploader or channel url', default=None)

      owner_profile_url = url_or_none(microformats.get('ownerProfileUrl'))  # @a6211d2

      # Uploader refers to the /user/ id ONLY
      uploader_id = (
          id_from_url(owner_profile_url, 'user')
          or id_from_url(uploader_or_channel_url, 'user')
          or ytcfg.get('VIDEO_USERNAME'))
      uploader_url = f'https://www.youtube.com/user/{uploader_id}' if uploader_id else None

      # XXX: do we want to differentiate uploader and channel?
      uploader = (
          self._search_regex(
              [r'<a\s*id="watch-username"[^>]*>\s*<strong>([^<]+)</strong>',  # June 2010
               r'var\s*watchUsername\s*=\s*\'(.+?)\';',  # ~May 2009
               r'<div\s*\bid=\"watch-channel-stats"[^>]*>\s*<a[^>]*>\s*(.+?)\s*</a',  # ~May 2009
               r'<a\s*id="watch-userbanner"[^>]*title="\s*(.+?)\s*"'],  # ~June 2012
              webpage, 'uploader', default=None)
          or self._html_search_regex(
              [r'(?s)<div\s*class="yt-user-info".*?<a[^>]*[^>]*>\s*(.*?)\s*</a',  # March 2016
               r'(?s)<a[^>]*yt-user-name[^>]*>\s*(.*?)\s*</a'],  # july 2013
              get_element_by_id('watch7-user-header', webpage), 'uploader', default=None)
          or self._html_search_regex(
              r'<button\s*href="/user/[^>]*>\s*<span[^>]*>\s*(.+?)\s*<',  # April 2012
              get_element_by_id('watch-headline-user-info', webpage), 'uploader', default=None)
          or traverse_obj(player_config, ('args', 'creator'))
          or video_details.get('author'))

      # Channel id: structured data, then meta tag / data attribute, then URLs, then player args
      channel_id = str_or_none(
          video_details.get('channelId')
          or microformats.get('externalChannelId')
          or search_meta('channelId')
          or self._search_regex(
              r'data-channel-external-id=(["\'])(?P<id>(?:(?!\1).)+)\1',  # @b45a9e6
              webpage, 'channel id', default=None, group='id')
          or id_from_url(owner_profile_url, 'channel')
          or id_from_url(uploader_or_channel_url, 'channel')
          or traverse_obj(player_config, ('args', 'ucid')))

      channel_url = f'https://www.youtube.com/channel/{channel_id}' if channel_id else None
      duration = int_or_none(
          video_details.get('lengthSeconds')
          or microformats.get('lengthSeconds')
          or traverse_obj(player_config, ('args', ('length_seconds', 'l')), get_all=False)
          or parse_duration(search_meta('duration')))
      description = (
          video_details.get('shortDescription')
          or YoutubeBaseInfoExtractor._get_text(microformats, 'description')
          or clean_html(get_element_by_id('eow-description', webpage))  # @9e6dd23
          or search_meta(['description', 'og:description', 'twitter:description']))

      upload_date = unified_strdate(
          dict_get(microformats, ('uploadDate', 'publishDate'))
          or search_meta(['uploadDate', 'datePublished'])
          or self._search_regex(
              [r'(?s)id="eow-date.*?>\s*(.*?)\s*</span>',
               r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']',  # @7998520
               r'class\s*=\s*"(?:watch-video-date|watch-video-added post-date)"[^>]*>\s*([^<]+?)\s*<'],  # ~June 2010, ~Jan 2009 (respectively)
              webpage, 'upload date', default=None))

      return {
          'title': video_title,
          'description': description,
          'upload_date': upload_date,
          'uploader': uploader,
          'channel_id': channel_id,
          'channel_url': channel_url,
          'duration': duration,
          'uploader_url': uploader_url,
          'uploader_id': uploader_id,
      }
  def _extract_thumbnails(self, video_id):
      """Look up archived thumbnails for `video_id` through the CDX API.

      By default only the default thumbnail server is queried for JPEGs and
      the search stops after the first URL that yields results. When
      'thumbnails' is listed in the `check_all` extractor argument, every
      known server is queried for both JPEG and WebP variants.

      Returns a list of thumbnail dicts ('url', 'filesize', 'preference').
      """
      try_all = 'thumbnails' in self._configuration_arg('check_all')
      servers = self._YT_ALL_THUMB_SERVERS if try_all else self._YT_DEFAULT_THUMB_SERVERS
      extensions = ('jpg', 'webp') if try_all else ('jpg',)

      # WebP thumbnails live under /vi_webp/, JPEG ones under /vi/
      thumbnail_base_urls = []
      for server in servers:
          for ext in extensions:
              thumbnail_base_urls.append('http://{server}/vi{webp}/{video_id}'.format(
                  webp='_webp' if ext == 'webp' else '', video_id=video_id, server=server))

      thumbnails = []
      for base_url in thumbnail_base_urls:
          captures = self._call_cdx_api(
              video_id, base_url, filters=['mimetype:image/(?:webp|jpeg)'],
              collapse=['urlkey'], query={'matchType': 'prefix'})
          if not captures:
              continue
          for capture in captures:
              # The archived size doubles as the preference: bigger file, better thumbnail
              size = int_or_none(capture.get('length'))
              timestamp = int_or_none(capture.get('timestamp')) or self._OLDEST_CAPTURE_DATE
              thumbnails.append({
                  'url': (self._WAYBACK_BASE_URL % timestamp) + capture.get('original'),
                  'filesize': size,
                  'preference': size,
              })
          if not try_all:
              break

      self._remove_duplicate_formats(thumbnails)
      return thumbnails
  def _get_capture_dates(self, video_id, url_date):
      """Build the ordered, de-duplicated list of capture timestamps to try.

      Order of preference: the earliest capture from July 2020 onwards, the
      date given in the URL (`url_date`, may be None), the earliest capture
      overall, then - only if 'captures' is listed in the `check_all`
      extractor argument - every known capture, and finally the
      oldest/newest fallback dates.
      """
      # Note: CDX API will not find watch pages with extra params in the url.
      cdx_rows = self._call_cdx_api(
          video_id, f'https://www.youtube.com/watch?v={video_id}',
          filters=['mimetype:text/html'], collapse=['timestamp:6', 'digest'],
          query={'matchType': 'prefix'}) or []

      parsed_timestamps = (int_or_none(row['timestamp']) for row in cdx_rows)
      all_captures = sorted(ts for ts in parsed_timestamps if ts is not None)

      # Prefer the new polymer UI captures as we support extracting more metadata from them
      # WBM captures seem to all switch to this layout ~July 2020
      modern_captures = [ts for ts in all_captures if ts >= 20200701000000]

      # Slicing with [:1] contributes the first element when there is one, nothing otherwise
      candidates = [*modern_captures[:1], url_date, *all_captures[:1]]

      if 'captures' in self._configuration_arg('check_all'):
          candidates += modern_captures + all_captures

      # Fallbacks if any of the above fail
      candidates += [self._OLDEST_CAPTURE_DATE, self._NEWEST_CAPTURE_DATE]
      return orderedSet(filter(None, candidates))
  def _real_extract(self, url):
      """Extract an archived YouTube video and its metadata from the Wayback Machine.

      Steps:
        1. Resolve the archived video file URL with a HEAD request (retried
           on errors other than HTTP 404, which means "not archived").
        2. Download candidate captures of the watch page and take metadata
           from the first one that yields a title (or merge all of them when
           'captures' is listed in the `check_all` extractor argument).
        3. Look up archived thumbnails.
        4. Derive format information (itag, ext, filesize, duration) from the
           resolved file URL and its response headers.

      Returns the info dict; 'title' falls back to the video id.
      """
      video_id, url_date, url_date_2 = self._match_valid_url(url).group('id', 'date', 'date2')
      url_date = url_date or url_date_2

      urlh = None
      retry_manager = self.RetryManager(fatal=False)
      for retry in retry_manager:
          try:
              urlh = self._request_webpage(
                  HEADRequest(f'https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/{video_id}'),
                  video_id, note='Fetching archived video file url', expected_status=True)
          except ExtractorError as e:
              # HTTP Error 404 is expected if the video is not saved.
              if isinstance(e.cause, HTTPError) and e.cause.status == 404:
                  self.raise_no_formats(
                      'The requested video is not archived, indexed, or there is an issue with web.archive.org (try again later)', expected=True)
              else:
                  retry.error = e

      if retry_manager.error:
          self.raise_no_formats(retry_manager.error, expected=True, video_id=video_id)

      capture_dates = self._get_capture_dates(video_id, int_or_none(url_date))
      self.write_debug('Captures to try: ' + join_nonempty(*capture_dates, delim=', '))

      # The extractor argument does not change between captures, so look it up once
      check_all_captures = 'captures' in self._configuration_arg('check_all')

      info = {'id': video_id}
      for capture in capture_dates:
          webpage = self._download_webpage(
              (self._WAYBACK_BASE_URL + 'http://www.youtube.com/watch?v=%s') % (capture, video_id),
              video_id=video_id, fatal=False, errnote='unable to download capture webpage (it may not be archived)',
              note='Downloading capture webpage')
          current_info = self._extract_metadata(video_id, webpage or '')
          # Try to avoid getting deleted video metadata: a capture without a title is skipped
          if not current_info.get('title'):
              continue
          info = merge_dicts(info, current_info)
          if not check_all_captures:
              break

      info['thumbnails'] = self._extract_thumbnails(video_id)

      if urlh:
          video_file_url = urllib.parse.unquote(urlh.url)
          video_file_url_qs = parse_qs(video_file_url)
          # Attempt to recover any ext & format info from playback url & response headers
          fmt = {'url': video_file_url, 'filesize': int_or_none(urlh.headers.get('x-archive-orig-content-length'))}
          itag = try_get(video_file_url_qs, lambda x: x['itag'][0])
          if itag and itag in YoutubeIE._formats:
              fmt.update(YoutubeIE._formats[itag])
              fmt.update({'format_id': itag})
          else:
              mime = try_get(video_file_url_qs, lambda x: x['mime'][0])
              ext = (mimetype2ext(mime)
                     or urlhandle_detect_ext(urlh)
                     or mimetype2ext(urlh.headers.get('x-archive-guessed-content-type')))
              fmt.update({'ext': ext})
          info['formats'] = [fmt]

          if not info.get('duration'):
              dur = try_get(video_file_url_qs, lambda x: x['dur'][0])
              try:
                  # `dur` may carry a fractional part (e.g. '212.091'); parse it as a
                  # float so the decimal point is honoured, then truncate to whole seconds
                  info['duration'] = int(float(dur))
              except (TypeError, ValueError, OverflowError):
                  # Missing or non-numeric value: defer to the relaxed integer parser
                  info['duration'] = str_to_int(dur)

      if not info.get('title'):
          info['title'] = video_id

      return info