nebula.py 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468
  1. import itertools
  2. import json
  3. from .art19 import Art19IE
  4. from .common import InfoExtractor
  5. from ..networking.exceptions import HTTPError
  6. from ..utils import (
  7. ExtractorError,
  8. int_or_none,
  9. make_archive_id,
  10. parse_iso8601,
  11. smuggle_url,
  12. try_call,
  13. unsmuggle_url,
  14. update_url_query,
  15. url_or_none,
  16. urljoin,
  17. )
  18. from ..utils.traversal import traverse_obj
  19. _BASE_URL_RE = r'https?://(?:www\.|beta\.)?(?:watchnebula\.com|nebula\.app|nebula\.tv)'
  20. class NebulaBaseIE(InfoExtractor):
  21. _NETRC_MACHINE = 'watchnebula'
  22. _token = _api_token = None
  23. def _perform_login(self, username, password):
  24. try:
  25. response = self._download_json(
  26. 'https://nebula.tv/auth/login/', None,
  27. 'Logging in to Nebula', 'Login failed',
  28. data=json.dumps({'email': username, 'password': password}).encode(),
  29. headers={'content-type': 'application/json'})
  30. except ExtractorError as e:
  31. if isinstance(e.cause, HTTPError) and e.cause.status == 400:
  32. raise ExtractorError('Login failed: Invalid username or password', expected=True)
  33. raise
  34. self._api_token = traverse_obj(response, ('key', {str}))
  35. if not self._api_token:
  36. raise ExtractorError('Login failed: No token')
  37. def _call_api(self, *args, **kwargs):
  38. if self._token:
  39. kwargs.setdefault('headers', {})['Authorization'] = f'Bearer {self._token}'
  40. try:
  41. return self._download_json(*args, **kwargs)
  42. except ExtractorError as e:
  43. if not isinstance(e.cause, HTTPError) or e.cause.status not in (401, 403):
  44. raise
  45. self.to_screen(
  46. f'Reauthorizing with Nebula and retrying, because last API call resulted in error {e.cause.status}')
  47. self._real_initialize()
  48. if self._token:
  49. kwargs.setdefault('headers', {})['Authorization'] = f'Bearer {self._token}'
  50. return self._download_json(*args, **kwargs)
  51. def _real_initialize(self):
  52. if not self._api_token:
  53. self._api_token = try_call(
  54. lambda: self._get_cookies('https://nebula.tv')['nebula_auth.apiToken'].value)
  55. self._token = self._download_json(
  56. 'https://users.api.nebula.app/api/v1/authorization/', None,
  57. headers={'Authorization': f'Token {self._api_token}'} if self._api_token else None,
  58. note='Authorizing to Nebula', data=b'')['token']
  59. def _extract_formats(self, content_id, slug):
  60. for retry in (False, True):
  61. try:
  62. fmts, subs = self._extract_m3u8_formats_and_subtitles(
  63. f'https://content.api.nebula.app/{content_id.split(":")[0]}s/{content_id}/manifest.m3u8',
  64. slug, 'mp4', query={
  65. 'token': self._token,
  66. 'app_version': '23.10.0',
  67. 'platform': 'ios',
  68. })
  69. return {'formats': fmts, 'subtitles': subs}
  70. except ExtractorError as e:
  71. if isinstance(e.cause, HTTPError) and e.cause.status == 401:
  72. self.raise_login_required()
  73. if not retry and isinstance(e.cause, HTTPError) and e.cause.status == 403:
  74. self.to_screen('Reauthorizing with Nebula and retrying, because fetching video resulted in error')
  75. self._real_initialize()
  76. continue
  77. raise
  78. def _extract_video_metadata(self, episode):
  79. channel_url = traverse_obj(
  80. episode, (('channel_slug', 'class_slug'), {lambda x: urljoin('https://nebula.tv/', x)}), get_all=False)
  81. return {
  82. 'id': episode['id'].partition(':')[2],
  83. **traverse_obj(episode, {
  84. 'display_id': 'slug',
  85. 'title': 'title',
  86. 'description': 'description',
  87. 'timestamp': ('published_at', {parse_iso8601}),
  88. 'duration': ('duration', {int_or_none}),
  89. 'channel_id': 'channel_slug',
  90. 'uploader_id': 'channel_slug',
  91. 'channel': 'channel_title',
  92. 'uploader': 'channel_title',
  93. 'series': 'channel_title',
  94. 'creator': 'channel_title',
  95. 'thumbnail': ('images', 'thumbnail', 'src', {url_or_none}),
  96. 'episode_number': ('order', {int_or_none}),
  97. # Old code was wrongly setting extractor_key from NebulaSubscriptionsIE
  98. '_old_archive_ids': ('zype_id', {lambda x: [
  99. make_archive_id(NebulaIE, x), make_archive_id(NebulaSubscriptionsIE, x)] if x else None}),
  100. }),
  101. 'channel_url': channel_url,
  102. 'uploader_url': channel_url,
  103. }
  104. class NebulaIE(NebulaBaseIE):
  105. IE_NAME = 'nebula:video'
  106. _VALID_URL = rf'{_BASE_URL_RE}/videos/(?P<id>[\w-]+)'
  107. _TESTS = [{
  108. 'url': 'https://nebula.tv/videos/that-time-disney-remade-beauty-and-the-beast',
  109. 'info_dict': {
  110. 'id': '84ed544d-4afd-4723-8cd5-2b95261f0abf',
  111. 'ext': 'mp4',
  112. 'title': 'That Time Disney Remade Beauty and the Beast',
  113. 'description': 'md5:2aae3c4cfc5ee09a1ecdff0909618cf4',
  114. 'upload_date': '20180731',
  115. 'timestamp': 1533009600,
  116. 'channel': 'Lindsay Ellis',
  117. 'channel_id': 'lindsayellis',
  118. 'uploader': 'Lindsay Ellis',
  119. 'uploader_id': 'lindsayellis',
  120. 'uploader_url': r're:https://nebula\.(tv|app)/lindsayellis',
  121. 'series': 'Lindsay Ellis',
  122. 'display_id': 'that-time-disney-remade-beauty-and-the-beast',
  123. 'channel_url': r're:https://nebula\.(tv|app)/lindsayellis',
  124. 'creator': 'Lindsay Ellis',
  125. 'duration': 2212,
  126. 'thumbnail': r're:https://\w+\.cloudfront\.net/[\w-]+',
  127. '_old_archive_ids': ['nebula 5c271b40b13fd613090034fd', 'nebulasubscriptions 5c271b40b13fd613090034fd'],
  128. },
  129. 'params': {'skip_download': 'm3u8'},
  130. }, {
  131. 'url': 'https://nebula.tv/videos/the-logistics-of-d-day-landing-craft-how-the-allies-got-ashore',
  132. 'md5': 'd05739cf6c38c09322422f696b569c23',
  133. 'info_dict': {
  134. 'id': '7e623145-1b44-4ca3-aa0b-ed25a247ea34',
  135. 'ext': 'mp4',
  136. 'title': 'Landing Craft - How The Allies Got Ashore',
  137. 'description': r're:^In this episode we explore the unsung heroes of D-Day, the landing craft.',
  138. 'upload_date': '20200327',
  139. 'timestamp': 1585348140,
  140. 'channel': 'Real Engineering — The Logistics of D-Day',
  141. 'channel_id': 'd-day',
  142. 'uploader': 'Real Engineering — The Logistics of D-Day',
  143. 'uploader_id': 'd-day',
  144. 'series': 'Real Engineering — The Logistics of D-Day',
  145. 'display_id': 'the-logistics-of-d-day-landing-craft-how-the-allies-got-ashore',
  146. 'creator': 'Real Engineering — The Logistics of D-Day',
  147. 'duration': 841,
  148. 'channel_url': 'https://nebula.tv/d-day',
  149. 'uploader_url': 'https://nebula.tv/d-day',
  150. 'thumbnail': r're:https://\w+\.cloudfront\.net/[\w-]+',
  151. '_old_archive_ids': ['nebula 5e7e78171aaf320001fbd6be', 'nebulasubscriptions 5e7e78171aaf320001fbd6be'],
  152. },
  153. 'params': {'skip_download': 'm3u8'},
  154. }, {
  155. 'url': 'https://nebula.tv/videos/money-episode-1-the-draw',
  156. 'md5': 'ebe28a7ad822b9ee172387d860487868',
  157. 'info_dict': {
  158. 'id': 'b96c5714-9e2b-4ec3-b3f1-20f6e89cc553',
  159. 'ext': 'mp4',
  160. 'title': 'Episode 1: The Draw',
  161. 'description': r'contains:There’s free money on offer… if the players can all work together.',
  162. 'upload_date': '20200323',
  163. 'timestamp': 1584980400,
  164. 'channel': 'Tom Scott Presents: Money',
  165. 'channel_id': 'tom-scott-presents-money',
  166. 'uploader': 'Tom Scott Presents: Money',
  167. 'uploader_id': 'tom-scott-presents-money',
  168. 'uploader_url': 'https://nebula.tv/tom-scott-presents-money',
  169. 'duration': 825,
  170. 'channel_url': 'https://nebula.tv/tom-scott-presents-money',
  171. 'series': 'Tom Scott Presents: Money',
  172. 'display_id': 'money-episode-1-the-draw',
  173. 'thumbnail': r're:https://\w+\.cloudfront\.net/[\w-]+',
  174. 'creator': 'Tom Scott Presents: Money',
  175. '_old_archive_ids': ['nebula 5e779ebdd157bc0001d1c75a', 'nebulasubscriptions 5e779ebdd157bc0001d1c75a'],
  176. },
  177. 'params': {'skip_download': 'm3u8'},
  178. }, {
  179. 'url': 'https://watchnebula.com/videos/money-episode-1-the-draw',
  180. 'only_matching': True,
  181. }, {
  182. 'url': 'https://nebula.tv/videos/tldrnewseu-did-the-us-really-blow-up-the-nordstream-pipelines',
  183. 'info_dict': {
  184. 'id': 'e389af9d-1dab-44f2-8788-ee24deb7ff0d',
  185. 'ext': 'mp4',
  186. 'display_id': 'tldrnewseu-did-the-us-really-blow-up-the-nordstream-pipelines',
  187. 'title': 'Did the US Really Blow Up the NordStream Pipelines?',
  188. 'description': 'md5:b4e2a14e3ff08f546a3209c75261e789',
  189. 'upload_date': '20230223',
  190. 'timestamp': 1677144070,
  191. 'channel': 'TLDR News EU',
  192. 'channel_id': 'tldrnewseu',
  193. 'uploader': 'TLDR News EU',
  194. 'uploader_id': 'tldrnewseu',
  195. 'uploader_url': r're:https://nebula\.(tv|app)/tldrnewseu',
  196. 'duration': 524,
  197. 'channel_url': r're:https://nebula\.(tv|app)/tldrnewseu',
  198. 'series': 'TLDR News EU',
  199. 'thumbnail': r're:https://\w+\.cloudfront\.net/[\w-]+',
  200. 'creator': 'TLDR News EU',
  201. '_old_archive_ids': ['nebula 63f64c74366fcd00017c1513', 'nebulasubscriptions 63f64c74366fcd00017c1513'],
  202. },
  203. 'params': {'skip_download': 'm3u8'},
  204. }, {
  205. 'url': 'https://beta.nebula.tv/videos/money-episode-1-the-draw',
  206. 'only_matching': True,
  207. }]
  208. def _real_extract(self, url):
  209. slug = self._match_id(url)
  210. url, smuggled_data = unsmuggle_url(url, {})
  211. if smuggled_data.get('id'):
  212. return {
  213. 'id': smuggled_data['id'],
  214. 'display_id': slug,
  215. 'title': '',
  216. **self._extract_formats(smuggled_data['id'], slug),
  217. }
  218. metadata = self._call_api(
  219. f'https://content.api.nebula.app/content/videos/{slug}',
  220. slug, note='Fetching video metadata')
  221. return {
  222. **self._extract_video_metadata(metadata),
  223. **self._extract_formats(metadata['id'], slug),
  224. }
  225. class NebulaClassIE(NebulaBaseIE):
  226. IE_NAME = 'nebula:media'
  227. _VALID_URL = rf'{_BASE_URL_RE}/(?!(?:myshows|library|videos)/)(?P<id>[\w-]+)/(?P<ep>[\w-]+)/?(?:$|[?#])'
  228. _TESTS = [{
  229. 'url': 'https://nebula.tv/copyright-for-fun-and-profit/14',
  230. 'info_dict': {
  231. 'id': 'd7432cdc-c608-474d-942c-f74345daed7b',
  232. 'ext': 'mp4',
  233. 'display_id': '14',
  234. 'channel_url': 'https://nebula.tv/copyright-for-fun-and-profit',
  235. 'episode_number': 14,
  236. 'thumbnail': 'https://dj423fildxgac.cloudfront.net/d533718d-9307-42d4-8fb0-e283285e99c9',
  237. 'uploader_url': 'https://nebula.tv/copyright-for-fun-and-profit',
  238. 'duration': 646,
  239. 'episode': 'Episode 14',
  240. 'title': 'Photos, Sculpture, and Video',
  241. },
  242. 'params': {'skip_download': 'm3u8'},
  243. }, {
  244. 'url': 'https://nebula.tv/extremitiespodcast/pyramiden-the-high-arctic-soviet-ghost-town',
  245. 'info_dict': {
  246. 'ext': 'mp3',
  247. 'id': '018f65f0-0033-4021-8f87-2d132beb19aa',
  248. 'description': 'md5:05d2b23ab780c955e2511a2b9127acff',
  249. 'series_id': '335e8159-d663-491a-888f-1732285706ac',
  250. 'modified_timestamp': 1599091504,
  251. 'episode_id': '018f65f0-0033-4021-8f87-2d132beb19aa',
  252. 'series': 'Extremities',
  253. 'modified_date': '20200903',
  254. 'upload_date': '20200902',
  255. 'title': 'Pyramiden: The High-Arctic Soviet Ghost Town',
  256. 'release_timestamp': 1571237958,
  257. 'thumbnail': r're:^https?://content\.production\.cdn\.art19\.com.*\.jpeg$',
  258. 'duration': 1546.05714,
  259. 'timestamp': 1599085608,
  260. 'release_date': '20191016',
  261. },
  262. }, {
  263. 'url': 'https://nebula.tv/thelayover/the-layover-episode-1',
  264. 'info_dict': {
  265. 'ext': 'mp3',
  266. 'id': '9d74a762-00bb-45a8-9e8d-9ed47c04a1d0',
  267. 'episode_number': 1,
  268. 'thumbnail': r're:^https?://content\.production\.cdn\.art19\.com.*\.jpeg$',
  269. 'release_date': '20230304',
  270. 'modified_date': '20230403',
  271. 'series': 'The Layover',
  272. 'episode_id': '9d74a762-00bb-45a8-9e8d-9ed47c04a1d0',
  273. 'modified_timestamp': 1680554566,
  274. 'duration': 3130.46401,
  275. 'release_timestamp': 1677943800,
  276. 'title': 'The Layover — Episode 1',
  277. 'series_id': '874303a5-4900-4626-a4b6-2aacac34466a',
  278. 'upload_date': '20230303',
  279. 'episode': 'Episode 1',
  280. 'timestamp': 1677883672,
  281. 'description': 'md5:002cca89258e3bc7c268d5b8c24ba482',
  282. },
  283. }]
  284. def _real_extract(self, url):
  285. slug, episode = self._match_valid_url(url).group('id', 'ep')
  286. url, smuggled_data = unsmuggle_url(url, {})
  287. if smuggled_data.get('id'):
  288. return {
  289. 'id': smuggled_data['id'],
  290. 'display_id': slug,
  291. 'title': '',
  292. **self._extract_formats(smuggled_data['id'], slug),
  293. }
  294. metadata = self._call_api(
  295. f'https://content.api.nebula.app/content/{slug}/{episode}/?include=lessons',
  296. slug, note='Fetching class/podcast metadata')
  297. content_type = metadata.get('type')
  298. if content_type == 'lesson':
  299. return {
  300. **self._extract_video_metadata(metadata),
  301. **self._extract_formats(metadata['id'], slug),
  302. }
  303. elif content_type == 'podcast_episode':
  304. episode_url = metadata['episode_url']
  305. if not episode_url and metadata.get('premium'):
  306. self.raise_login_required()
  307. if Art19IE.suitable(episode_url):
  308. return self.url_result(episode_url, Art19IE)
  309. return traverse_obj(metadata, {
  310. 'id': ('id', {str}),
  311. 'url': ('episode_url', {url_or_none}),
  312. 'title': ('title', {str}),
  313. 'description': ('description', {str}),
  314. 'timestamp': ('published_at', {parse_iso8601}),
  315. 'duration': ('duration', {int_or_none}),
  316. 'channel_id': ('channel_id', {str}),
  317. 'chnanel': ('channel_title', {str}),
  318. 'thumbnail': ('assets', 'regular', {url_or_none}),
  319. })
  320. raise ExtractorError(f'Unexpected content type {content_type!r}')
  321. class NebulaSubscriptionsIE(NebulaBaseIE):
  322. IE_NAME = 'nebula:subscriptions'
  323. _VALID_URL = rf'{_BASE_URL_RE}/(?P<id>myshows|library/latest-videos)/?(?:$|[?#])'
  324. _TESTS = [{
  325. 'url': 'https://nebula.tv/myshows',
  326. 'playlist_mincount': 1,
  327. 'info_dict': {
  328. 'id': 'myshows',
  329. },
  330. }]
  331. def _generate_playlist_entries(self):
  332. next_url = update_url_query('https://content.api.nebula.app/video_episodes/', {
  333. 'following': 'true',
  334. 'include': 'engagement',
  335. 'ordering': '-published_at',
  336. })
  337. for page_num in itertools.count(1):
  338. channel = self._call_api(
  339. next_url, 'myshows', note=f'Retrieving subscriptions page {page_num}')
  340. for episode in channel['results']:
  341. metadata = self._extract_video_metadata(episode)
  342. yield self.url_result(smuggle_url(
  343. f'https://nebula.tv/videos/{metadata["display_id"]}',
  344. {'id': episode['id']}), NebulaIE, url_transparent=True, **metadata)
  345. next_url = channel.get('next')
  346. if not next_url:
  347. return
  348. def _real_extract(self, url):
  349. return self.playlist_result(self._generate_playlist_entries(), 'myshows')
  350. class NebulaChannelIE(NebulaBaseIE):
  351. IE_NAME = 'nebula:channel'
  352. _VALID_URL = rf'{_BASE_URL_RE}/(?!myshows|library|videos)(?P<id>[\w-]+)/?(?:$|[?#])'
  353. _TESTS = [{
  354. 'url': 'https://nebula.tv/tom-scott-presents-money',
  355. 'info_dict': {
  356. 'id': 'tom-scott-presents-money',
  357. 'title': 'Tom Scott Presents: Money',
  358. 'description': 'Tom Scott hosts a series all about trust, negotiation and money.',
  359. },
  360. 'playlist_count': 5,
  361. }, {
  362. 'url': 'https://nebula.tv/lindsayellis',
  363. 'info_dict': {
  364. 'id': 'lindsayellis',
  365. 'title': 'Lindsay Ellis',
  366. 'description': 'Enjoy these hottest of takes on Disney, Transformers, and Musicals.',
  367. },
  368. 'playlist_mincount': 2,
  369. }, {
  370. 'url': 'https://nebula.tv/johnnyharris',
  371. 'info_dict': {
  372. 'id': 'johnnyharris',
  373. 'title': 'Johnny Harris',
  374. 'description': 'I make videos about maps and many other things.',
  375. },
  376. 'playlist_mincount': 90,
  377. }, {
  378. 'url': 'https://nebula.tv/copyright-for-fun-and-profit',
  379. 'info_dict': {
  380. 'id': 'copyright-for-fun-and-profit',
  381. 'title': 'Copyright for Fun and Profit',
  382. 'description': 'md5:6690248223eed044a9f11cd5a24f9742',
  383. },
  384. 'playlist_count': 23,
  385. }, {
  386. 'url': 'https://nebula.tv/trussissuespodcast',
  387. 'info_dict': {
  388. 'id': 'trussissuespodcast',
  389. 'title': 'The TLDR News Podcast',
  390. 'description': 'md5:a08c4483bc0b705881d3e0199e721385',
  391. },
  392. 'playlist_mincount': 80,
  393. }]
  394. def _generate_playlist_entries(self, collection_id, collection_slug):
  395. next_url = f'https://content.api.nebula.app/video_channels/{collection_id}/video_episodes/?ordering=-published_at'
  396. for page_num in itertools.count(1):
  397. episodes = self._call_api(next_url, collection_slug, note=f'Retrieving channel page {page_num}')
  398. for episode in episodes['results']:
  399. metadata = self._extract_video_metadata(episode)
  400. yield self.url_result(smuggle_url(
  401. episode.get('share_url') or f'https://nebula.tv/videos/{metadata["display_id"]}',
  402. {'id': episode['id']}), NebulaIE, url_transparent=True, **metadata)
  403. next_url = episodes.get('next')
  404. if not next_url:
  405. break
  406. def _generate_class_entries(self, channel):
  407. for lesson in channel['lessons']:
  408. metadata = self._extract_video_metadata(lesson)
  409. yield self.url_result(smuggle_url(
  410. lesson.get('share_url') or f'https://nebula.tv/{metadata["class_slug"]}/{metadata["slug"]}',
  411. {'id': lesson['id']}), NebulaClassIE, url_transparent=True, **metadata)
  412. def _generate_podcast_entries(self, collection_id, collection_slug):
  413. next_url = f'https://content.api.nebula.app/podcast_channels/{collection_id}/podcast_episodes/?ordering=-published_at&premium=true'
  414. for page_num in itertools.count(1):
  415. episodes = self._call_api(next_url, collection_slug, note=f'Retrieving podcast page {page_num}')
  416. for episode in traverse_obj(episodes, ('results', lambda _, v: url_or_none(v['share_url']))):
  417. yield self.url_result(episode['share_url'], NebulaClassIE)
  418. next_url = episodes.get('next')
  419. if not next_url:
  420. break
  421. def _real_extract(self, url):
  422. collection_slug = self._match_id(url)
  423. channel = self._call_api(
  424. f'https://content.api.nebula.app/content/{collection_slug}/?include=lessons',
  425. collection_slug, note='Retrieving channel')
  426. if channel.get('type') == 'class':
  427. entries = self._generate_class_entries(channel)
  428. elif channel.get('type') == 'podcast_channel':
  429. entries = self._generate_podcast_entries(channel['id'], collection_slug)
  430. else:
  431. entries = self._generate_playlist_entries(channel['id'], collection_slug)
  432. return self.playlist_result(
  433. entries=entries,
  434. playlist_id=collection_slug,
  435. playlist_title=channel.get('title'),
  436. playlist_description=channel.get('description'))