nbc.py 34 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850
  1. import base64
  2. import json
  3. import re
  4. import urllib.parse
  5. import xml.etree.ElementTree
  6. from .adobepass import AdobePassIE
  7. from .common import InfoExtractor
  8. from .theplatform import ThePlatformIE, default_ns
  9. from ..networking import HEADRequest
  10. from ..utils import (
  11. ExtractorError,
  12. RegexNotFoundError,
  13. UserNotLive,
  14. clean_html,
  15. determine_ext,
  16. float_or_none,
  17. int_or_none,
  18. join_nonempty,
  19. mimetype2ext,
  20. parse_age_limit,
  21. parse_duration,
  22. remove_end,
  23. smuggle_url,
  24. traverse_obj,
  25. try_get,
  26. unescapeHTML,
  27. unified_timestamp,
  28. update_url_query,
  29. url_basename,
  30. )
  31. class NBCIE(ThePlatformIE): # XXX: Do not subclass from concrete IE
  32. _VALID_URL = r'https?(?P<permalink>://(?:www\.)?nbc\.com/(?:classic-tv/)?[^/]+/video/[^/]+/(?P<id>(?:NBCE|n)?\d+))'
  33. _TESTS = [
  34. {
  35. 'url': 'http://www.nbc.com/the-tonight-show/video/jimmy-fallon-surprises-fans-at-ben-jerrys/2848237',
  36. 'info_dict': {
  37. 'id': '2848237',
  38. 'ext': 'mp4',
  39. 'title': 'Jimmy Fallon Surprises Fans at Ben & Jerry\'s',
  40. 'description': 'Jimmy gives out free scoops of his new "Tonight Dough" ice cream flavor by surprising customers at the Ben & Jerry\'s scoop shop.',
  41. 'timestamp': 1424246400,
  42. 'upload_date': '20150218',
  43. 'uploader': 'NBCU-COM',
  44. 'episode': 'Jimmy Fallon Surprises Fans at Ben & Jerry\'s',
  45. 'episode_number': 86,
  46. 'season': 'Season 2',
  47. 'season_number': 2,
  48. 'series': 'Tonight Show: Jimmy Fallon',
  49. 'duration': 237.0,
  50. 'chapters': 'count:1',
  51. 'tags': 'count:4',
  52. 'thumbnail': r're:https?://.+\.jpg',
  53. 'categories': ['Series/The Tonight Show Starring Jimmy Fallon'],
  54. 'media_type': 'Full Episode',
  55. },
  56. 'params': {
  57. 'skip_download': 'm3u8',
  58. },
  59. },
  60. {
  61. 'url': 'http://www.nbc.com/saturday-night-live/video/star-wars-teaser/2832821',
  62. 'info_dict': {
  63. 'id': '2832821',
  64. 'ext': 'mp4',
  65. 'title': 'Star Wars Teaser',
  66. 'description': 'md5:0b40f9cbde5b671a7ff62fceccc4f442',
  67. 'timestamp': 1417852800,
  68. 'upload_date': '20141206',
  69. 'uploader': 'NBCU-COM',
  70. },
  71. 'skip': 'page not found',
  72. },
  73. {
  74. # HLS streams requires the 'hdnea3' cookie
  75. 'url': 'http://www.nbc.com/Kings/video/goliath/n1806',
  76. 'info_dict': {
  77. 'id': '101528f5a9e8127b107e98c5e6ce4638',
  78. 'ext': 'mp4',
  79. 'title': 'Goliath',
  80. 'description': 'When an unknown soldier saves the life of the King\'s son in battle, he\'s thrust into the limelight and politics of the kingdom.',
  81. 'timestamp': 1237100400,
  82. 'upload_date': '20090315',
  83. 'uploader': 'NBCU-COM',
  84. },
  85. 'skip': 'page not found',
  86. },
  87. {
  88. # manifest url does not have extension
  89. 'url': 'https://www.nbc.com/the-golden-globe-awards/video/oprah-winfrey-receives-cecil-b-de-mille-award-at-the-2018-golden-globes/3646439',
  90. 'info_dict': {
  91. 'id': '3646439',
  92. 'ext': 'mp4',
  93. 'title': 'Oprah Winfrey Receives Cecil B. de Mille Award at the 2018 Golden Globes',
  94. 'episode': 'Oprah Winfrey Receives Cecil B. de Mille Award at the 2018 Golden Globes',
  95. 'episode_number': 1,
  96. 'season': 'Season 75',
  97. 'season_number': 75,
  98. 'series': 'The Golden Globe Awards',
  99. 'description': 'Oprah Winfrey receives the Cecil B. de Mille Award at the 75th Annual Golden Globe Awards.',
  100. 'uploader': 'NBCU-COM',
  101. 'upload_date': '20180107',
  102. 'timestamp': 1515312000,
  103. 'duration': 570.0,
  104. 'tags': 'count:8',
  105. 'thumbnail': r're:https?://.+\.jpg',
  106. 'chapters': 'count:1',
  107. },
  108. 'params': {
  109. 'skip_download': 'm3u8',
  110. },
  111. },
  112. {
  113. # new video_id format
  114. 'url': 'https://www.nbc.com/quantum-leap/video/bens-first-leap-nbcs-quantum-leap/NBCE125189978',
  115. 'info_dict': {
  116. 'id': 'NBCE125189978',
  117. 'ext': 'mp4',
  118. 'title': 'Ben\'s First Leap | NBC\'s Quantum Leap',
  119. 'description': 'md5:a82762449b7ec4bb83291a7b355ebf8e',
  120. 'uploader': 'NBCU-COM',
  121. 'series': 'Quantum Leap',
  122. 'season': 'Season 1',
  123. 'season_number': 1,
  124. 'episode': 'Ben\'s First Leap | NBC\'s Quantum Leap',
  125. 'episode_number': 1,
  126. 'duration': 170.171,
  127. 'chapters': [],
  128. 'timestamp': 1663956155,
  129. 'upload_date': '20220923',
  130. 'tags': 'count:10',
  131. 'age_limit': 0,
  132. 'thumbnail': r're:https?://.+\.jpg',
  133. 'categories': ['Series/Quantum Leap 2022'],
  134. 'media_type': 'Highlight',
  135. },
  136. 'params': {
  137. 'skip_download': 'm3u8',
  138. },
  139. },
  140. {
  141. 'url': 'https://www.nbc.com/classic-tv/charles-in-charge/video/charles-in-charge-pilot/n3310',
  142. 'only_matching': True,
  143. },
  144. {
  145. # Percent escaped url
  146. 'url': 'https://www.nbc.com/up-all-night/video/day-after-valentine%27s-day/n2189',
  147. 'only_matching': True,
  148. },
  149. ]
  150. def _real_extract(self, url):
  151. permalink, video_id = self._match_valid_url(url).groups()
  152. permalink = 'http' + urllib.parse.unquote(permalink)
  153. video_data = self._download_json(
  154. 'https://friendship.nbc.co/v2/graphql', video_id, query={
  155. 'query': '''query bonanzaPage(
  156. $app: NBCUBrands! = nbc
  157. $name: String!
  158. $oneApp: Boolean
  159. $platform: SupportedPlatforms! = web
  160. $type: EntityPageType! = VIDEO
  161. $userId: String!
  162. ) {
  163. bonanzaPage(
  164. app: $app
  165. name: $name
  166. oneApp: $oneApp
  167. platform: $platform
  168. type: $type
  169. userId: $userId
  170. ) {
  171. metadata {
  172. ... on VideoPageData {
  173. description
  174. episodeNumber
  175. keywords
  176. locked
  177. mpxAccountId
  178. mpxGuid
  179. rating
  180. resourceId
  181. seasonNumber
  182. secondaryTitle
  183. seriesShortTitle
  184. }
  185. }
  186. }
  187. }''',
  188. 'variables': json.dumps({
  189. 'name': permalink,
  190. 'oneApp': True,
  191. 'userId': '0',
  192. }),
  193. })['data']['bonanzaPage']['metadata']
  194. query = {
  195. 'mbr': 'true',
  196. 'manifest': 'm3u',
  197. 'switch': 'HLSServiceSecure',
  198. }
  199. video_id = video_data['mpxGuid']
  200. tp_path = 'NnzsPC/media/guid/{}/{}'.format(video_data.get('mpxAccountId') or '2410887629', video_id)
  201. tpm = self._download_theplatform_metadata(tp_path, video_id)
  202. title = tpm.get('title') or video_data.get('secondaryTitle')
  203. if video_data.get('locked'):
  204. resource = self._get_mvpd_resource(
  205. video_data.get('resourceId') or 'nbcentertainment',
  206. title, video_id, video_data.get('rating'))
  207. query['auth'] = self._extract_mvpd_auth(
  208. url, video_id, 'nbcentertainment', resource)
  209. theplatform_url = smuggle_url(update_url_query(
  210. 'http://link.theplatform.com/s/NnzsPC/media/guid/{}/{}'.format(video_data.get('mpxAccountId') or '2410887629', video_id),
  211. query), {'force_smil_url': True})
  212. # Empty string or 0 can be valid values for these. So the check must be `is None`
  213. description = video_data.get('description')
  214. if description is None:
  215. description = tpm.get('description')
  216. episode_number = int_or_none(video_data.get('episodeNumber'))
  217. if episode_number is None:
  218. episode_number = int_or_none(tpm.get('nbcu$airOrder'))
  219. rating = video_data.get('rating')
  220. if rating is None:
  221. try_get(tpm, lambda x: x['ratings'][0]['rating'])
  222. season_number = int_or_none(video_data.get('seasonNumber'))
  223. if season_number is None:
  224. season_number = int_or_none(tpm.get('nbcu$seasonNumber'))
  225. series = video_data.get('seriesShortTitle')
  226. if series is None:
  227. series = tpm.get('nbcu$seriesShortTitle')
  228. tags = video_data.get('keywords')
  229. if tags is None or len(tags) == 0:
  230. tags = tpm.get('keywords')
  231. return {
  232. '_type': 'url_transparent',
  233. 'age_limit': parse_age_limit(rating),
  234. 'description': description,
  235. 'episode': title,
  236. 'episode_number': episode_number,
  237. 'id': video_id,
  238. 'ie_key': 'ThePlatform',
  239. 'season_number': season_number,
  240. 'series': series,
  241. 'tags': tags,
  242. 'title': title,
  243. 'url': theplatform_url,
  244. }
  245. class NBCSportsVPlayerIE(InfoExtractor):
  246. _VALID_URL_BASE = r'https?://(?:vplayer\.nbcsports\.com|(?:www\.)?nbcsports\.com/vplayer)/'
  247. _VALID_URL = _VALID_URL_BASE + r'(?:[^/]+/)+(?P<id>[0-9a-zA-Z_]+)'
  248. _EMBED_REGEX = [rf'(?:iframe[^>]+|var video|div[^>]+data-(?:mpx-)?)[sS]rc\s?=\s?"(?P<url>{_VALID_URL_BASE}[^\"]+)']
  249. _TESTS = [{
  250. 'url': 'https://vplayer.nbcsports.com/p/BxmELC/nbcsports_embed/select/9CsDKds0kvHI',
  251. 'info_dict': {
  252. 'id': '9CsDKds0kvHI',
  253. 'ext': 'mp4',
  254. 'description': 'md5:df390f70a9ba7c95ff1daace988f0d8d',
  255. 'title': 'Tyler Kalinoski hits buzzer-beater to lift Davidson',
  256. 'timestamp': 1426270238,
  257. 'upload_date': '20150313',
  258. 'uploader': 'NBCU-SPORTS',
  259. 'duration': 72.818,
  260. 'chapters': [],
  261. 'thumbnail': r're:^https?://.*\.jpg$',
  262. },
  263. }, {
  264. 'url': 'https://vplayer.nbcsports.com/p/BxmELC/nbcsports_embed/select/media/PEgOtlNcC_y2',
  265. 'only_matching': True,
  266. }, {
  267. 'url': 'https://www.nbcsports.com/vplayer/p/BxmELC/nbcsports/select/PHJSaFWbrTY9?form=html&autoPlay=true',
  268. 'only_matching': True,
  269. }]
  270. def _real_extract(self, url):
  271. video_id = self._match_id(url)
  272. webpage = self._download_webpage(url, video_id)
  273. theplatform_url = self._html_search_regex(r'tp:releaseUrl="(.+?)"', webpage, 'url')
  274. return self.url_result(theplatform_url, 'ThePlatform')
  275. class NBCSportsIE(InfoExtractor):
  276. _VALID_URL = r'https?://(?:www\.)?nbcsports\.com//?(?!vplayer/)(?:[^/]+/)+(?P<id>[0-9a-z-]+)'
  277. _TESTS = [{
  278. # iframe src
  279. 'url': 'https://www.nbcsports.com/watch/nfl/profootballtalk/pft-pm/unpacking-addisons-reckless-driving-citation',
  280. 'info_dict': {
  281. 'id': 'PHJSaFWbrTY9',
  282. 'ext': 'mp4',
  283. 'title': 'Tom Izzo, Michigan St. has \'so much respect\' for Duke',
  284. 'description': 'md5:ecb459c9d59e0766ac9c7d5d0eda8113',
  285. 'uploader': 'NBCU-SPORTS',
  286. 'upload_date': '20150330',
  287. 'timestamp': 1427726529,
  288. 'chapters': [],
  289. 'thumbnail': 'https://hdliveextra-a.akamaihd.net/HD/image_sports/NBCU_Sports_Group_-_nbcsports/253/303/izzodps.jpg',
  290. 'duration': 528.395,
  291. },
  292. }, {
  293. # data-mpx-src
  294. 'url': 'https://www.nbcsports.com/philadelphia/philadelphia-phillies/bruce-bochy-hector-neris-hes-idiot',
  295. 'only_matching': True,
  296. }, {
  297. # data-src
  298. 'url': 'https://www.nbcsports.com/boston/video/report-card-pats-secondary-no-match-josh-allen',
  299. 'only_matching': True,
  300. }]
  301. def _real_extract(self, url):
  302. video_id = self._match_id(url)
  303. webpage = self._download_webpage(url, video_id)
  304. return self.url_result(
  305. NBCSportsVPlayerIE._extract_url(webpage), 'NBCSportsVPlayer')
  306. class NBCSportsStreamIE(AdobePassIE):
  307. _VALID_URL = r'https?://stream\.nbcsports\.com/.+?\bpid=(?P<id>\d+)'
  308. _TEST = {
  309. 'url': 'http://stream.nbcsports.com/nbcsn/generic?pid=206559',
  310. 'info_dict': {
  311. 'id': '206559',
  312. 'ext': 'mp4',
  313. 'title': 'Amgen Tour of California Women\'s Recap',
  314. 'description': 'md5:66520066b3b5281ada7698d0ea2aa894',
  315. },
  316. 'params': {
  317. # m3u8 download
  318. 'skip_download': True,
  319. },
  320. 'skip': 'Requires Adobe Pass Authentication',
  321. }
  322. def _real_extract(self, url):
  323. video_id = self._match_id(url)
  324. live_source = self._download_json(
  325. f'http://stream.nbcsports.com/data/live_sources_{video_id}.json',
  326. video_id)
  327. video_source = live_source['videoSources'][0]
  328. title = video_source['title']
  329. source_url = None
  330. for k in ('source', 'msl4source', 'iossource', 'hlsv4'):
  331. sk = k + 'Url'
  332. source_url = video_source.get(sk) or video_source.get(sk + 'Alt')
  333. if source_url:
  334. break
  335. else:
  336. source_url = video_source['ottStreamUrl']
  337. is_live = video_source.get('type') == 'live' or video_source.get('status') == 'Live'
  338. resource = self._get_mvpd_resource('nbcsports', title, video_id, '')
  339. token = self._extract_mvpd_auth(url, video_id, 'nbcsports', resource)
  340. tokenized_url = self._download_json(
  341. 'https://token.playmakerservices.com/cdn',
  342. video_id, data=json.dumps({
  343. 'requestorId': 'nbcsports',
  344. 'pid': video_id,
  345. 'application': 'NBCSports',
  346. 'version': 'v1',
  347. 'platform': 'desktop',
  348. 'cdn': 'akamai',
  349. 'url': video_source['sourceUrl'],
  350. 'token': base64.b64encode(token.encode()).decode(),
  351. 'resourceId': base64.b64encode(resource.encode()).decode(),
  352. }).encode())['tokenizedUrl']
  353. formats = self._extract_m3u8_formats(tokenized_url, video_id, 'mp4')
  354. return {
  355. 'id': video_id,
  356. 'title': title,
  357. 'description': live_source.get('description'),
  358. 'formats': formats,
  359. 'is_live': is_live,
  360. }
  361. class NBCNewsIE(ThePlatformIE): # XXX: Do not subclass from concrete IE
  362. _VALID_URL = r'(?x)https?://(?:www\.)?(?:nbcnews|today|msnbc)\.com/([^/]+/)*(?:.*-)?(?P<id>[^/?]+)'
  363. _EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//www\.nbcnews\.com/widget/video-embed/[^"\']+)\1']
  364. _TESTS = [
  365. {
  366. 'url': 'http://www.nbcnews.com/watch/nbcnews-com/how-twitter-reacted-to-the-snowden-interview-269389891880',
  367. 'md5': 'fb3dcd2d7b1dd9804305fa2fc95ab610', # md5 tends to fluctuate
  368. 'info_dict': {
  369. 'id': '269389891880',
  370. 'ext': 'mp4',
  371. 'title': 'How Twitter Reacted To The Snowden Interview',
  372. 'description': 'md5:65a0bd5d76fe114f3c2727aa3a81fe64',
  373. 'timestamp': 1401363060,
  374. 'upload_date': '20140529',
  375. 'duration': 46.0,
  376. 'thumbnail': 'https://media-cldnry.s-nbcnews.com/image/upload/MSNBC/Components/Video/140529/p_tweet_snow_140529.jpg',
  377. },
  378. },
  379. {
  380. 'url': 'http://www.nbcnews.com/feature/dateline-full-episodes/full-episode-family-business-n285156',
  381. 'md5': 'fdbf39ab73a72df5896b6234ff98518a',
  382. 'info_dict': {
  383. 'id': '529953347624',
  384. 'ext': 'mp4',
  385. 'title': 'FULL EPISODE: Family Business',
  386. 'description': 'md5:757988edbaae9d7be1d585eb5d55cc04',
  387. },
  388. 'skip': 'This page is unavailable.',
  389. },
  390. {
  391. 'url': 'http://www.nbcnews.com/nightly-news/video/nightly-news-with-brian-williams-full-broadcast-february-4-394064451844',
  392. 'md5': '40d0e48c68896359c80372306ece0fc3',
  393. 'info_dict': {
  394. 'id': '394064451844',
  395. 'ext': 'mp4',
  396. 'title': 'Nightly News with Brian Williams Full Broadcast (February 4)',
  397. 'description': 'md5:1c10c1eccbe84a26e5debb4381e2d3c5',
  398. 'timestamp': 1423104900,
  399. 'upload_date': '20150205',
  400. 'duration': 1236.0,
  401. 'thumbnail': 'https://media-cldnry.s-nbcnews.com/image/upload/MSNBC/Components/Video/__NEW/nn_netcast_150204.jpg',
  402. },
  403. },
  404. {
  405. 'url': 'http://www.nbcnews.com/business/autos/volkswagen-11-million-vehicles-could-have-suspect-software-emissions-scandal-n431456',
  406. 'md5': 'ffb59bcf0733dc3c7f0ace907f5e3939',
  407. 'info_dict': {
  408. 'id': 'n431456',
  409. 'ext': 'mp4',
  410. 'title': "Volkswagen U.S. Chief: We 'Totally Screwed Up'",
  411. 'description': 'md5:d22d1281a24f22ea0880741bb4dd6301',
  412. 'upload_date': '20150922',
  413. 'timestamp': 1442917800,
  414. 'duration': 37.0,
  415. 'thumbnail': 'https://media-cldnry.s-nbcnews.com/image/upload/MSNBC/Components/Video/__NEW/x_lon_vwhorn_150922.jpg',
  416. },
  417. },
  418. {
  419. 'url': 'http://www.today.com/video/see-the-aurora-borealis-from-space-in-stunning-new-nasa-video-669831235788',
  420. 'md5': '693d1fa21d23afcc9b04c66b227ed9ff',
  421. 'info_dict': {
  422. 'id': '669831235788',
  423. 'ext': 'mp4',
  424. 'title': 'See the aurora borealis from space in stunning new NASA video',
  425. 'description': 'md5:74752b7358afb99939c5f8bb2d1d04b1',
  426. 'upload_date': '20160420',
  427. 'timestamp': 1461152093,
  428. 'duration': 69.0,
  429. 'thumbnail': 'https://media-cldnry.s-nbcnews.com/image/upload/MSNBC/Components/Video/201604/2016-04-20T11-35-09-133Z--1280x720.jpg',
  430. },
  431. },
  432. {
  433. 'url': 'http://www.msnbc.com/all-in-with-chris-hayes/watch/the-chaotic-gop-immigration-vote-314487875924',
  434. 'md5': '6d236bf4f3dddc226633ce6e2c3f814d',
  435. 'info_dict': {
  436. 'id': '314487875924',
  437. 'ext': 'mp4',
  438. 'title': 'The chaotic GOP immigration vote',
  439. 'description': 'The Republican House votes on a border bill that has no chance of getting through the Senate or signed by the President and is drawing criticism from all sides.',
  440. 'thumbnail': r're:^https?://.*\.jpg$',
  441. 'timestamp': 1406937606,
  442. 'upload_date': '20140802',
  443. 'duration': 940.0,
  444. },
  445. },
  446. {
  447. 'url': 'http://www.nbcnews.com/watch/dateline/full-episode--deadly-betrayal-386250819952',
  448. 'only_matching': True,
  449. },
  450. {
  451. # From http://www.vulture.com/2016/06/letterman-couldnt-care-less-about-late-night.html
  452. 'url': 'http://www.nbcnews.com/widget/video-embed/701714499682',
  453. 'only_matching': True,
  454. },
  455. ]
  456. def _real_extract(self, url):
  457. video_id = self._match_id(url)
  458. webpage = self._download_webpage(url, video_id)
  459. data = self._search_nextjs_data(webpage, video_id)['props']['initialState']
  460. video_data = try_get(data, lambda x: x['video']['current'], dict)
  461. if not video_data:
  462. video_data = data['article']['content'][0]['primaryMedia']['video']
  463. title = video_data['headline']['primary']
  464. formats = []
  465. for va in video_data.get('videoAssets', []):
  466. public_url = va.get('publicUrl')
  467. if not public_url:
  468. continue
  469. if '://link.theplatform.com/' in public_url:
  470. public_url = update_url_query(public_url, {'format': 'redirect'})
  471. format_id = va.get('format')
  472. if format_id == 'M3U':
  473. formats.extend(self._extract_m3u8_formats(
  474. public_url, video_id, 'mp4', 'm3u8_native',
  475. m3u8_id=format_id, fatal=False))
  476. continue
  477. tbr = int_or_none(va.get('bitrate'), 1000)
  478. formats.append({
  479. 'format_id': join_nonempty(format_id, tbr),
  480. 'url': public_url,
  481. 'width': int_or_none(va.get('width')),
  482. 'height': int_or_none(va.get('height')),
  483. 'tbr': tbr,
  484. 'ext': 'mp4',
  485. })
  486. subtitles = {}
  487. closed_captioning = video_data.get('closedCaptioning')
  488. if closed_captioning:
  489. for cc_url in closed_captioning.values():
  490. if not cc_url:
  491. continue
  492. subtitles.setdefault('en', []).append({
  493. 'url': cc_url,
  494. })
  495. return {
  496. 'id': video_id,
  497. 'title': title,
  498. 'description': try_get(video_data, lambda x: x['description']['primary']),
  499. 'thumbnail': try_get(video_data, lambda x: x['primaryImage']['url']['primary']),
  500. 'duration': parse_duration(video_data.get('duration')),
  501. 'timestamp': unified_timestamp(video_data.get('datePublished')),
  502. 'formats': formats,
  503. 'subtitles': subtitles,
  504. }
  505. class NBCOlympicsIE(InfoExtractor):
  506. IE_NAME = 'nbcolympics'
  507. _VALID_URL = r'https?://www\.nbcolympics\.com/videos?/(?P<id>[0-9a-z-]+)'
  508. _TEST = {
  509. # Geo-restricted to US
  510. 'url': 'http://www.nbcolympics.com/video/justin-roses-son-leo-was-tears-after-his-dad-won-gold',
  511. 'md5': '54fecf846d05429fbaa18af557ee523a',
  512. 'info_dict': {
  513. 'id': 'WjTBzDXx5AUq',
  514. 'display_id': 'justin-roses-son-leo-was-tears-after-his-dad-won-gold',
  515. 'ext': 'mp4',
  516. 'title': 'Rose\'s son Leo was in tears after his dad won gold',
  517. 'description': 'Olympic gold medalist Justin Rose gets emotional talking to the impact his win in men\'s golf has already had on his children.',
  518. 'timestamp': 1471274964,
  519. 'upload_date': '20160815',
  520. 'uploader': 'NBCU-SPORTS',
  521. },
  522. 'skip': '404 Not Found',
  523. }
  524. def _real_extract(self, url):
  525. display_id = self._match_id(url)
  526. webpage = self._download_webpage(url, display_id)
  527. try:
  528. drupal_settings = self._parse_json(self._search_regex(
  529. r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);',
  530. webpage, 'drupal settings'), display_id)
  531. iframe_url = drupal_settings['vod']['iframe_url']
  532. theplatform_url = iframe_url.replace(
  533. 'vplayer.nbcolympics.com', 'player.theplatform.com')
  534. except RegexNotFoundError:
  535. theplatform_url = self._search_regex(
  536. r"([\"'])embedUrl\1: *([\"'])(?P<embedUrl>.+)\2",
  537. webpage, 'embedding URL', group='embedUrl')
  538. return {
  539. '_type': 'url_transparent',
  540. 'url': theplatform_url,
  541. 'ie_key': ThePlatformIE.ie_key(),
  542. 'display_id': display_id,
  543. }
  544. class NBCOlympicsStreamIE(AdobePassIE):
  545. IE_NAME = 'nbcolympics:stream'
  546. _VALID_URL = r'https?://stream\.nbcolympics\.com/(?P<id>[0-9a-z-]+)'
  547. _TESTS = [
  548. {
  549. 'note': 'Tokenized m3u8 source URL',
  550. 'url': 'https://stream.nbcolympics.com/womens-soccer-group-round-11',
  551. 'info_dict': {
  552. 'id': '2019740',
  553. 'ext': 'mp4',
  554. 'title': r"re:Women's Group Stage - Netherlands vs\. Brazil [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$",
  555. },
  556. 'params': {
  557. 'skip_download': 'm3u8',
  558. },
  559. 'skip': 'Livestream',
  560. }, {
  561. 'note': 'Plain m3u8 source URL',
  562. 'url': 'https://stream.nbcolympics.com/gymnastics-event-finals-mens-floor-pommel-horse-womens-vault-bars',
  563. 'info_dict': {
  564. 'id': '2021729',
  565. 'ext': 'mp4',
  566. 'title': r're:Event Finals: M Floor, W Vault, M Pommel, W Uneven Bars [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
  567. },
  568. 'params': {
  569. 'skip_download': 'm3u8',
  570. },
  571. 'skip': 'Livestream',
  572. },
  573. ]
  574. def _real_extract(self, url):
  575. display_id = self._match_id(url)
  576. webpage = self._download_webpage(url, display_id)
  577. pid = self._search_regex(r'pid\s*=\s*(\d+);', webpage, 'pid')
  578. event_config = self._download_json(
  579. f'http://stream.nbcolympics.com/data/event_config_{pid}.json',
  580. pid, 'Downloading event config')['eventConfig']
  581. title = event_config['eventTitle']
  582. is_live = {'live': True, 'replay': False}.get(event_config.get('eventStatus'))
  583. source_url = self._download_json(
  584. f'https://api-leap.nbcsports.com/feeds/assets/{pid}?application=NBCOlympics&platform=desktop&format=nbc-player&env=staging',
  585. pid, 'Downloading leap config',
  586. )['videoSources'][0]['cdnSources']['primary'][0]['sourceUrl']
  587. if event_config.get('cdnToken'):
  588. ap_resource = self._get_mvpd_resource(
  589. event_config.get('resourceId', 'NBCOlympics'),
  590. re.sub(r'[^\w\d ]+', '', event_config['eventTitle']), pid,
  591. event_config.get('ratingId', 'NO VALUE'))
  592. media_token = self._extract_mvpd_auth(url, pid, event_config.get('requestorId', 'NBCOlympics'), ap_resource)
  593. source_url = self._download_json(
  594. 'https://tokens.playmakerservices.com/', pid, 'Retrieving tokenized URL',
  595. data=json.dumps({
  596. 'application': 'NBCSports',
  597. 'authentication-type': 'adobe-pass',
  598. 'cdn': 'akamai',
  599. 'pid': pid,
  600. 'platform': 'desktop',
  601. 'requestorId': 'NBCOlympics',
  602. 'resourceId': base64.b64encode(ap_resource.encode()).decode(),
  603. 'token': base64.b64encode(media_token.encode()).decode(),
  604. 'url': source_url,
  605. 'version': 'v1',
  606. }).encode(),
  607. )['akamai'][0]['tokenizedUrl']
  608. formats = self._extract_m3u8_formats(source_url, pid, 'mp4', live=is_live)
  609. for f in formats:
  610. # -http_seekable requires ffmpeg 4.3+ but it doesnt seem possible to
  611. # download with ffmpeg without this option
  612. f['downloader_options'] = {'ffmpeg_args': ['-seekable', '0', '-http_seekable', '0', '-icy', '0']}
  613. return {
  614. 'id': pid,
  615. 'display_id': display_id,
  616. 'title': title,
  617. 'formats': formats,
  618. 'is_live': is_live,
  619. }
  620. class NBCStationsIE(InfoExtractor):
  621. _DOMAIN_RE = '|'.join(map(re.escape, (
  622. 'nbcbayarea', 'nbcboston', 'nbcchicago', 'nbcconnecticut', 'nbcdfw', 'nbclosangeles',
  623. 'nbcmiami', 'nbcnewyork', 'nbcphiladelphia', 'nbcsandiego', 'nbcwashington',
  624. 'necn', 'telemundo52', 'telemundoarizona', 'telemundochicago', 'telemundonuevainglaterra',
  625. )))
  626. _VALID_URL = rf'https?://(?:www\.)?(?P<site>{_DOMAIN_RE})\.com/(?:[^/?#]+/)*(?P<id>[^/?#]+)/?(?:$|[#?])'
  627. _TESTS = [{
  628. 'url': 'https://www.nbclosangeles.com/news/local/large-structure-fire-in-downtown-la-prompts-smoke-odor-advisory/2968618/',
  629. 'info_dict': {
  630. 'id': '2968618',
  631. 'ext': 'mp4',
  632. 'title': 'Large Structure Fire in Downtown LA Prompts Smoke Odor Advisory',
  633. 'description': 'md5:417ed3c2d91fe9d301e6db7b0942f182',
  634. 'duration': 112.513,
  635. 'timestamp': 1661135892,
  636. 'upload_date': '20220822',
  637. 'uploader': 'NBC 4',
  638. 'channel_id': 'KNBC',
  639. 'channel': 'nbclosangeles',
  640. },
  641. 'params': {
  642. 'skip_download': 'm3u8',
  643. },
  644. }, {
  645. 'url': 'https://www.telemundoarizona.com/responde/huracan-complica-reembolso-para-televidente-de-tucson/2247002/',
  646. 'info_dict': {
  647. 'id': '2247002',
  648. 'ext': 'mp4',
  649. 'title': 'Huracán complica que televidente de Tucson reciba reembolso',
  650. 'description': 'md5:af298dc73aab74d4fca6abfb12acb6cf',
  651. 'duration': 172.406,
  652. 'timestamp': 1660886507,
  653. 'upload_date': '20220819',
  654. 'uploader': 'Telemundo Arizona',
  655. 'channel_id': 'KTAZ',
  656. 'channel': 'telemundoarizona',
  657. },
  658. 'params': {
  659. 'skip_download': 'm3u8',
  660. },
  661. }, {
  662. # direct mp4 link
  663. 'url': 'https://www.nbcboston.com/weather/video-weather/highs-near-freezing-in-boston-on-wednesday/2961135/',
  664. 'md5': '9bf8c41dc7abbb75b1a44f1491a4cc85',
  665. 'info_dict': {
  666. 'id': '2961135',
  667. 'ext': 'mp4',
  668. 'title': 'Highs Near Freezing in Boston on Wednesday',
  669. 'description': 'md5:3ec486609a926c99f00a3512e6c0e85b',
  670. 'duration': 235.669,
  671. 'timestamp': 1675268656,
  672. 'upload_date': '20230201',
  673. 'uploader': '',
  674. 'channel_id': 'WBTS',
  675. 'channel': 'nbcboston',
  676. },
  677. }]
  678. _RESOLUTIONS = {
  679. '1080': '1920',
  680. '720': '1280',
  681. '540': '960',
  682. '360': '640',
  683. '234': '416',
  684. }
  685. def _real_extract(self, url):
  686. channel, video_id = self._match_valid_url(url).group('site', 'id')
  687. webpage = self._download_webpage(url, video_id)
  688. nbc_data = self._search_json(
  689. r'<script>\s*var\s+nbc\s*=', webpage, 'NBC JSON data', video_id)
  690. pdk_acct = nbc_data.get('pdkAcct') or 'Yh1nAC'
  691. fw_ssid = traverse_obj(nbc_data, ('video', 'fwSSID'))
  692. video_data = self._search_json(
  693. r'data-videos="\[', webpage, 'video data', video_id, default={}, transform_source=unescapeHTML)
  694. video_data.update(self._search_json(
  695. r'data-meta="', webpage, 'metadata', video_id, default={}, transform_source=unescapeHTML))
  696. if not video_data:
  697. raise ExtractorError('No video metadata found in webpage', expected=True)
  698. info, formats = {}, []
  699. is_live = int_or_none(video_data.get('mpx_is_livestream')) == 1
  700. query = {
  701. 'formats': 'MPEG-DASH none,M3U none,MPEG-DASH none,MPEG4,MP3',
  702. 'format': 'SMIL',
  703. 'fwsitesection': fw_ssid,
  704. 'fwNetworkID': traverse_obj(nbc_data, ('video', 'fwNetworkID'), default='382114'),
  705. 'pprofile': 'ots_desktop_html',
  706. 'sensitive': 'false',
  707. 'w': '1920',
  708. 'h': '1080',
  709. 'mode': 'LIVE' if is_live else 'on-demand',
  710. 'vpaid': 'script',
  711. 'schema': '2.0',
  712. 'sdk': 'PDK 6.1.3',
  713. }
  714. if is_live:
  715. player_id = traverse_obj(video_data, ((None, ('video', 'meta')), (
  716. 'mpx_m3upid', 'mpx_pid', 'pid_streaming_web_medium')), get_all=False)
  717. info['title'] = f'{channel} livestream'
  718. else:
  719. player_id = traverse_obj(video_data, (
  720. (None, ('video', 'meta')), ('pid_streaming_web_high', 'mpx_pid')), get_all=False)
  721. date_string = traverse_obj(video_data, 'date_string', 'date_gmt')
  722. if date_string:
  723. date_string = self._search_regex(
  724. r'datetime="([^"]+)"', date_string, 'date string', fatal=False)
  725. else:
  726. date_string = traverse_obj(
  727. nbc_data, ('dataLayer', 'adobe', ('prop70', 'eVar70', 'eVar59')), get_all=False)
  728. video_url = traverse_obj(video_data, ((None, ('video', 'meta')), 'mp4_url'), get_all=False)
  729. if video_url:
  730. ext = determine_ext(video_url)
  731. height = self._search_regex(r'\d+-(\d+)p', url_basename(video_url), 'height', default=None)
  732. formats.append({
  733. 'url': video_url,
  734. 'ext': ext,
  735. 'width': int_or_none(self._RESOLUTIONS.get(height)),
  736. 'height': int_or_none(height),
  737. 'format_id': f'http-{ext}',
  738. })
  739. info.update({
  740. 'title': video_data.get('title') or traverse_obj(nbc_data, (
  741. 'dataLayer', (None, 'adobe'), ('contenttitle', 'title', 'prop22')), get_all=False),
  742. 'description':
  743. traverse_obj(video_data, 'summary', 'excerpt', 'video_hero_text')
  744. or clean_html(traverse_obj(nbc_data, ('dataLayer', 'summary'))),
  745. 'timestamp': unified_timestamp(date_string),
  746. })
  747. smil = None
  748. if player_id and fw_ssid:
  749. smil = self._download_xml(
  750. f'https://link.theplatform.com/s/{pdk_acct}/{player_id}', video_id,
  751. note='Downloading SMIL data', query=query, fatal=is_live)
  752. if not isinstance(smil, xml.etree.ElementTree.Element):
  753. smil = None
  754. subtitles = self._parse_smil_subtitles(smil, default_ns) if smil is not None else {}
  755. for video in smil.findall(self._xpath_ns('.//video', default_ns)) if smil is not None else []:
  756. info['duration'] = float_or_none(remove_end(video.get('dur'), 'ms'), 1000)
  757. video_src_url = video.get('src')
  758. ext = mimetype2ext(video.get('type'), default=determine_ext(video_src_url))
  759. if ext == 'm3u8':
  760. fmts, subs = self._extract_m3u8_formats_and_subtitles(
  761. video_src_url, video_id, 'mp4', m3u8_id='hls', fatal=is_live,
  762. live=is_live, errnote='No HLS formats found')
  763. formats.extend(fmts)
  764. self._merge_subtitles(subs, target=subtitles)
  765. elif video_src_url:
  766. formats.append({
  767. 'url': video_src_url,
  768. 'format_id': f'https-{ext}',
  769. 'ext': ext,
  770. 'width': int_or_none(video.get('width')),
  771. 'height': int_or_none(video.get('height')),
  772. })
  773. if not formats:
  774. self.raise_no_formats('No video content found in webpage', expected=True)
  775. elif is_live:
  776. try:
  777. self._request_webpage(
  778. HEADRequest(formats[0]['url']), video_id, note='Checking live status')
  779. except ExtractorError:
  780. raise UserNotLive(video_id=channel)
  781. return {
  782. 'id': video_id,
  783. 'channel': channel,
  784. 'channel_id': nbc_data.get('callLetters'),
  785. 'uploader': nbc_data.get('on_air_name'),
  786. 'formats': formats,
  787. 'subtitles': subtitles,
  788. 'is_live': is_live,
  789. **info,
  790. }