brightcove.py 42 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944
  1. import base64
  2. import re
  3. import struct
  4. import urllib.parse
  5. import xml.etree.ElementTree
  6. from .adobepass import AdobePassIE
  7. from .common import InfoExtractor
  8. from ..compat import compat_etree_fromstring
  9. from ..networking.exceptions import HTTPError
  10. from ..utils import (
  11. ExtractorError,
  12. UnsupportedError,
  13. clean_html,
  14. dict_get,
  15. extract_attributes,
  16. find_xpath_attr,
  17. fix_xml_ampersands,
  18. float_or_none,
  19. int_or_none,
  20. join_nonempty,
  21. js_to_json,
  22. mimetype2ext,
  23. parse_iso8601,
  24. parse_qs,
  25. smuggle_url,
  26. str_or_none,
  27. try_get,
  28. unescapeHTML,
  29. unsmuggle_url,
  30. update_url_query,
  31. url_or_none,
  32. )
  33. class BrightcoveLegacyIE(InfoExtractor):
  34. IE_NAME = 'brightcove:legacy'
  35. _VALID_URL = r'(?:https?://.*brightcove\.com/(services|viewer).*?\?|brightcove:)(?P<query>.*)'
  36. _TESTS = [
  37. {
  38. # From http://www.8tv.cat/8aldia/videos/xavier-sala-i-martin-aquesta-tarda-a-8-al-dia/
  39. 'url': 'http://c.brightcove.com/services/viewer/htmlFederated?playerID=1654948606001&flashID=myExperience&%40videoPlayer=2371591881001',
  40. 'md5': '5423e113865d26e40624dce2e4b45d95',
  41. 'note': 'Test Brightcove downloads and detection in GenericIE',
  42. 'info_dict': {
  43. 'id': '2371591881001',
  44. 'ext': 'mp4',
  45. 'title': 'Xavier Sala i Martín: “Un banc que no presta és un banc zombi que no serveix per a res”',
  46. 'uploader': '8TV',
  47. 'description': 'md5:a950cc4285c43e44d763d036710cd9cd',
  48. 'timestamp': 1368213670,
  49. 'upload_date': '20130510',
  50. 'uploader_id': '1589608506001',
  51. },
  52. 'skip': 'The player has been deactivated by the content owner',
  53. },
  54. {
  55. # From http://medianetwork.oracle.com/video/player/1785452137001
  56. 'url': 'http://c.brightcove.com/services/viewer/htmlFederated?playerID=1217746023001&flashID=myPlayer&%40videoPlayer=1785452137001',
  57. 'info_dict': {
  58. 'id': '1785452137001',
  59. 'ext': 'flv',
  60. 'title': 'JVMLS 2012: Arrays 2.0 - Opportunities and Challenges',
  61. 'description': 'John Rose speaks at the JVM Language Summit, August 1, 2012.',
  62. 'uploader': 'Oracle',
  63. 'timestamp': 1344975024,
  64. 'upload_date': '20120814',
  65. 'uploader_id': '1460825906',
  66. },
  67. 'skip': 'video not playable',
  68. },
  69. {
  70. # From http://mashable.com/2013/10/26/thermoelectric-bracelet-lets-you-control-your-body-temperature/
  71. 'url': 'http://c.brightcove.com/services/viewer/federated_f9?&playerID=1265504713001&publisherID=AQ%7E%7E%2CAAABBzUwv1E%7E%2CxP-xFHVUstiMFlNYfvF4G9yFnNaqCw_9&videoID=2750934548001',
  72. 'info_dict': {
  73. 'id': '2750934548001',
  74. 'ext': 'mp4',
  75. 'title': 'This Bracelet Acts as a Personal Thermostat',
  76. 'description': 'md5:547b78c64f4112766ccf4e151c20b6a0',
  77. # 'uploader': 'Mashable',
  78. 'timestamp': 1382041798,
  79. 'upload_date': '20131017',
  80. 'uploader_id': '1130468786001',
  81. },
  82. },
  83. {
  84. # test that the default referer works
  85. # from http://national.ballet.ca/interact/video/Lost_in_Motion_II/
  86. 'url': 'http://link.brightcove.com/services/player/bcpid756015033001?bckey=AQ~~,AAAApYJi_Ck~,GxhXCegT1Dp39ilhXuxMJxasUhVNZiil&bctid=2878862109001',
  87. 'info_dict': {
  88. 'id': '2878862109001',
  89. 'ext': 'mp4',
  90. 'title': 'Lost in Motion II',
  91. 'description': 'md5:363109c02998fee92ec02211bd8000df',
  92. 'uploader': 'National Ballet of Canada',
  93. },
  94. 'skip': 'Video gone',
  95. },
  96. {
  97. # test flv videos served by akamaihd.net
  98. # From http://www.redbull.com/en/bike/stories/1331655643987/replay-uci-dh-world-cup-2014-from-fort-william
  99. 'url': 'http://c.brightcove.com/services/viewer/htmlFederated?%40videoPlayer=ref%3Aevent-stream-356&linkBaseURL=http%3A%2F%2Fwww.redbull.com%2Fen%2Fbike%2Fvideos%2F1331655630249%2Freplay-uci-fort-william-2014-dh&playerKey=AQ%7E%7E%2CAAAApYJ7UqE%7E%2Cxqr_zXk0I-zzNndy8NlHogrCb5QdyZRf&playerID=1398061561001#__youtubedl_smuggle=%7B%22Referer%22%3A+%22http%3A%2F%2Fwww.redbull.com%2Fen%2Fbike%2Fstories%2F1331655643987%2Freplay-uci-dh-world-cup-2014-from-fort-william%22%7D',
  100. # The md5 checksum changes on each download
  101. 'info_dict': {
  102. 'id': '3750436379001',
  103. 'ext': 'flv',
  104. 'title': 'UCI MTB World Cup 2014: Fort William, UK - Downhill Finals',
  105. 'uploader': 'RBTV Old (do not use)',
  106. 'description': 'UCI MTB World Cup 2014: Fort William, UK - Downhill Finals',
  107. 'timestamp': 1409122195,
  108. 'upload_date': '20140827',
  109. 'uploader_id': '710858724001',
  110. },
  111. 'skip': 'Video gone',
  112. },
  113. {
  114. # playlist with 'videoList'
  115. # from http://support.brightcove.com/en/video-cloud/docs/playlist-support-single-video-players
  116. 'url': 'http://c.brightcove.com/services/viewer/htmlFederated?playerID=3550052898001&playerKey=AQ%7E%7E%2CAAABmA9XpXk%7E%2C-Kp7jNgisre1fG5OdqpAFUTcs0lP_ZoL',
  117. 'info_dict': {
  118. 'title': 'Sealife',
  119. 'id': '3550319591001',
  120. },
  121. 'playlist_mincount': 7,
  122. 'skip': 'Unsupported URL',
  123. },
  124. {
  125. # playlist with 'playlistTab' (https://github.com/ytdl-org/youtube-dl/issues/9965)
  126. 'url': 'http://c.brightcove.com/services/json/experience/runtime/?command=get_programming_for_experience&playerKey=AQ%7E%7E,AAABXlLMdok%7E,NJ4EoMlZ4rZdx9eU1rkMVd8EaYPBBUlg',
  127. 'info_dict': {
  128. 'id': '1522758701001',
  129. 'title': 'Lesson 08',
  130. },
  131. 'playlist_mincount': 10,
  132. 'skip': 'Unsupported URL',
  133. },
  134. {
  135. # playerID inferred from bcpid
  136. # from http://www.un.org/chinese/News/story.asp?NewsID=27724
  137. 'url': 'https://link.brightcove.com/services/player/bcpid1722935254001/?bctid=5360463607001&autoStart=false&secureConnections=true&width=650&height=350',
  138. 'only_matching': True, # Tested in GenericIE
  139. },
  140. ]
  141. _WEBPAGE_TESTS = [{
  142. # embedded brightcove video
  143. # it also tests brightcove videos that need to set the 'Referer'
  144. # in the http requests
  145. 'url': 'http://www.bfmtv.com/video/bfmbusiness/cours-bourse/cours-bourse-l-analyse-technique-154522/',
  146. 'info_dict': {
  147. 'id': '2765128793001',
  148. 'ext': 'mp4',
  149. 'title': 'Le cours de bourse : l’analyse technique',
  150. 'description': 'md5:7e9ad046e968cb2d1114004aba466fd9',
  151. 'uploader': 'BFM BUSINESS',
  152. },
  153. 'params': {
  154. 'skip_download': True,
  155. },
  156. 'skip': '404 Not Found',
  157. }, {
  158. # embedded with itemprop embedURL and video id spelled as `idVideo`
  159. 'url': 'http://bfmbusiness.bfmtv.com/mediaplayer/chroniques/olivier-delamarche/',
  160. 'info_dict': {
  161. 'id': '5255628253001',
  162. 'ext': 'mp4',
  163. 'title': 'md5:37c519b1128915607601e75a87995fc0',
  164. 'description': 'md5:37f7f888b434bb8f8cc8dbd4f7a4cf26',
  165. 'uploader': 'BFM BUSINESS',
  166. 'uploader_id': '876450612001',
  167. 'timestamp': 1482255315,
  168. 'upload_date': '20161220',
  169. },
  170. 'params': {
  171. 'skip_download': True,
  172. },
  173. 'skip': 'Redirects, page gone',
  174. }, {
  175. # https://github.com/ytdl-org/youtube-dl/issues/2253
  176. 'url': 'http://bcove.me/i6nfkrc3',
  177. 'md5': '0ba9446db037002366bab3b3eb30c88c',
  178. 'info_dict': {
  179. 'id': '3101154703001',
  180. 'ext': 'mp4',
  181. 'title': 'Still no power',
  182. 'uploader': 'thestar.com',
  183. 'description': 'Mississauga resident David Farmer is still out of power as a result of the ice storm a month ago. To keep the house warm, Farmer cuts wood from his property for a wood burning stove downstairs.',
  184. },
  185. 'skip': 'video gone',
  186. }, {
  187. # https://github.com/ytdl-org/youtube-dl/issues/3541
  188. 'url': 'http://www.kijk.nl/sbs6/leermijvrouwenkennen/videos/jqMiXKAYan2S/aflevering-1',
  189. 'info_dict': {
  190. 'id': '3866516442001',
  191. 'ext': 'mp4',
  192. 'title': 'Leer mij vrouwen kennen: Aflevering 1',
  193. 'description': 'Leer mij vrouwen kennen: Aflevering 1',
  194. 'uploader': 'SBS Broadcasting',
  195. },
  196. 'skip': 'Restricted to Netherlands, 404 Not Found',
  197. 'params': {
  198. 'skip_download': True, # m3u8 download
  199. },
  200. }, {
  201. # Brightcove video in <iframe>
  202. 'url': 'http://www.un.org/chinese/News/story.asp?NewsID=27724',
  203. 'md5': '36d74ef5e37c8b4a2ce92880d208b968',
  204. 'info_dict': {
  205. 'id': '5360463607001',
  206. 'ext': 'mp4',
  207. 'title': '叙利亚失明儿童在废墟上演唱《心跳》 呼吁获得正常童年生活',
  208. 'description': '联合国儿童基金会中东和北非区域大使、作曲家扎德·迪拉尼(Zade Dirani)在3月15日叙利亚冲突爆发7周年纪念日之际发布了为叙利亚谱写的歌曲《心跳》(HEARTBEAT),为受到六年冲突影响的叙利亚儿童发出强烈呐喊,呼吁世界做出共同努力,使叙利亚儿童重新获得享有正常童年生活的权利。',
  209. 'uploader': 'United Nations',
  210. 'uploader_id': '1362235914001',
  211. 'timestamp': 1489593889,
  212. 'upload_date': '20170315',
  213. },
  214. 'skip': '404 Not Found',
  215. }, {
  216. # Brightcove with UUID in videoPlayer
  217. 'url': 'http://www8.hp.com/cn/zh/home.html',
  218. 'info_dict': {
  219. 'id': '5255815316001',
  220. 'ext': 'mp4',
  221. 'title': 'Sprocket Video - China',
  222. 'description': 'Sprocket Video - China',
  223. 'uploader': 'HP-Video Gallery',
  224. 'timestamp': 1482263210,
  225. 'upload_date': '20161220',
  226. 'uploader_id': '1107601872001',
  227. },
  228. 'params': {
  229. 'skip_download': True, # m3u8 download
  230. },
  231. 'skip': 'video rotates...weekly?',
  232. }, {
  233. # Multiple brightcove videos
  234. # https://github.com/ytdl-org/youtube-dl/issues/2283
  235. 'url': 'http://www.newyorker.com/online/blogs/newsdesk/2014/01/always-never-nuclear-command-and-control.html',
  236. 'info_dict': {
  237. 'id': 'always-never',
  238. 'title': 'Always / Never - The New Yorker',
  239. },
  240. 'playlist_count': 3,
  241. 'params': {
  242. 'extract_flat': False,
  243. 'skip_download': True,
  244. },
  245. 'skip': 'Redirects, page gone',
  246. }, {
  247. # BrightcoveInPageEmbed embed
  248. 'url': 'http://www.geekandsundry.com/tabletop-bonus-wils-final-thoughts-on-dread/',
  249. 'info_dict': {
  250. 'id': '4238694884001',
  251. 'ext': 'flv',
  252. 'title': 'Tabletop: Dread, Last Thoughts',
  253. 'description': 'Tabletop: Dread, Last Thoughts',
  254. 'duration': 51690,
  255. },
  256. 'skip': 'Redirects, page gone',
  257. }, {
  258. # Brightcove embed, with no valid 'renditions' but valid 'IOSRenditions'
  259. # This video can't be played in browsers if Flash disabled and UA set to iPhone, which is actually a false alarm
  260. 'url': 'https://dl.dropboxusercontent.com/u/29092637/interview.html',
  261. 'info_dict': {
  262. 'id': '4785848093001',
  263. 'ext': 'mp4',
  264. 'title': 'The Cardinal Pell Interview',
  265. 'description': 'Sky News Contributor Andrew Bolt interviews George Pell in Rome, following the Cardinal\'s evidence before the Royal Commission into Child Abuse. ',
  266. 'uploader': 'GlobeCast Australia - GlobeStream',
  267. 'uploader_id': '2733773828001',
  268. 'upload_date': '20160304',
  269. 'timestamp': 1457083087,
  270. },
  271. 'params': {
  272. # m3u8 downloads
  273. 'skip_download': True,
  274. },
  275. 'skip': '404 Not Found',
  276. }, {
  277. # Brightcove embed with whitespace around attribute names
  278. 'url': 'http://www.stack.com/video/3167554373001/learn-to-hit-open-three-pointers-with-damian-lillard-s-baseline-drift-drill',
  279. 'info_dict': {
  280. 'id': '3167554373001',
  281. 'ext': 'mp4',
  282. 'title': "Learn to Hit Open Three-Pointers With Damian Lillard's Baseline Drift Drill",
  283. 'description': 'md5:57bacb0e0f29349de4972bfda3191713',
  284. 'uploader_id': '1079349493',
  285. 'upload_date': '20140207',
  286. 'timestamp': 1391810548,
  287. },
  288. 'params': {
  289. 'skip_download': True,
  290. },
  291. 'skip': '410 Gone',
  292. }]
  293. @classmethod
  294. def _build_brightcove_url(cls, object_str):
  295. """
  296. Build a Brightcove url from a xml string containing
  297. <object class="BrightcoveExperience">{params}</object>
  298. """
  299. # Fix up some stupid HTML, see https://github.com/ytdl-org/youtube-dl/issues/1553
  300. object_str = re.sub(r'(<param(?:\s+[a-zA-Z0-9_]+="[^"]*")*)>',
  301. lambda m: m.group(1) + '/>', object_str)
  302. # Fix up some stupid XML, see https://github.com/ytdl-org/youtube-dl/issues/1608
  303. object_str = object_str.replace('<--', '<!--')
  304. # remove namespace to simplify extraction
  305. object_str = re.sub(r'(<object[^>]*)(xmlns=".*?")', r'\1', object_str)
  306. object_str = fix_xml_ampersands(object_str)
  307. try:
  308. object_doc = compat_etree_fromstring(object_str.encode())
  309. except xml.etree.ElementTree.ParseError:
  310. return
  311. fv_el = find_xpath_attr(object_doc, './param', 'name', 'flashVars')
  312. if fv_el is not None:
  313. flashvars = dict(
  314. (k, v[0])
  315. for k, v in urllib.parse.parse_qs(fv_el.attrib['value']).items())
  316. else:
  317. flashvars = {}
  318. data_url = object_doc.attrib.get('data', '')
  319. data_url_params = parse_qs(data_url)
  320. def find_param(name):
  321. if name in flashvars:
  322. return flashvars[name]
  323. node = find_xpath_attr(object_doc, './param', 'name', name)
  324. if node is not None:
  325. return node.attrib['value']
  326. return data_url_params.get(name)
  327. params = {}
  328. player_id = find_param('playerID') or find_param('playerId')
  329. if player_id is None:
  330. raise ExtractorError('Cannot find player ID')
  331. params['playerID'] = player_id
  332. player_key = find_param('playerKey')
  333. # Not all pages define this value
  334. if player_key is not None:
  335. params['playerKey'] = player_key
  336. # These fields hold the id of the video
  337. video_player = find_param('@videoPlayer') or find_param('videoId') or find_param('videoID') or find_param('@videoList')
  338. if video_player is not None:
  339. if isinstance(video_player, list):
  340. video_player = video_player[0]
  341. video_player = video_player.strip()
  342. # UUID is also possible for videoPlayer (e.g.
  343. # http://www.popcornflix.com/hoodies-vs-hooligans/7f2d2b87-bbf2-4623-acfb-ea942b4f01dd
  344. # or http://www8.hp.com/cn/zh/home.html)
  345. if not (re.match(
  346. r'^(?:\d+|[\da-fA-F]{8}-?[\da-fA-F]{4}-?[\da-fA-F]{4}-?[\da-fA-F]{4}-?[\da-fA-F]{12})$',
  347. video_player) or video_player.startswith('ref:')):
  348. return None
  349. params['@videoPlayer'] = video_player
  350. link_base = find_param('linkBaseURL')
  351. if link_base is not None:
  352. params['linkBaseURL'] = link_base
  353. return cls._make_brightcove_url(params)
  354. @classmethod
  355. def _build_brightcove_url_from_js(cls, object_js):
  356. # The layout of JS is as follows:
  357. # customBC.createVideo = function (width, height, playerID, playerKey, videoPlayer, VideoRandomID) {
  358. # // build Brightcove <object /> XML
  359. # }
  360. m = re.search(
  361. r'''(?x)customBC\.createVideo\(
  362. .*? # skipping width and height
  363. ["\'](?P<playerID>\d+)["\']\s*,\s* # playerID
  364. ["\'](?P<playerKey>AQ[^"\']{48})[^"\']*["\']\s*,\s* # playerKey begins with AQ and is 50 characters
  365. # in length, however it's appended to itself
  366. # in places, so truncate
  367. ["\'](?P<videoID>\d+)["\'] # @videoPlayer
  368. ''', object_js)
  369. if m:
  370. return cls._make_brightcove_url(m.groupdict())
  371. @classmethod
  372. def _make_brightcove_url(cls, params):
  373. return update_url_query(
  374. 'https://c.brightcove.com/services/viewer/htmlFederated', params)
  375. @classmethod
  376. def _extract_brightcove_url(cls, webpage):
  377. """Try to extract the brightcove url from the webpage, returns None
  378. if it can't be found
  379. """
  380. urls = cls._extract_brightcove_urls(webpage)
  381. return urls[0] if urls else None
  382. @classmethod
  383. def _extract_brightcove_urls(cls, webpage):
  384. """Return a list of all Brightcove URLs from the webpage """
  385. url_m = re.search(
  386. r'''(?x)
  387. <meta\s+
  388. (?:property|itemprop)=([\'"])(?:og:video|embedURL)\1[^>]+
  389. content=([\'"])(?P<url>https?://(?:secure|c)\.brightcove.com/(?:(?!\2).)+)\2
  390. ''', webpage)
  391. if url_m:
  392. url = unescapeHTML(url_m.group('url'))
  393. # Some sites don't add it, we can't download with this url, for example:
  394. # http://www.ktvu.com/videos/news/raw-video-caltrain-releases-video-of-man-almost/vCTZdY/
  395. if 'playerKey' in url or 'videoId' in url or 'idVideo' in url:
  396. return [url]
  397. matches = re.findall(
  398. r'''(?sx)<object
  399. (?:
  400. [^>]+?class=[\'"][^>]*?BrightcoveExperience.*?[\'"] |
  401. [^>]*?>\s*<param\s+name="movie"\s+value="https?://[^/]*brightcove\.com/
  402. ).+?>\s*</object>''',
  403. webpage)
  404. if matches:
  405. return list(filter(None, [cls._build_brightcove_url(m) for m in matches]))
  406. matches = re.findall(r'(customBC\.createVideo\(.+?\);)', webpage)
  407. if matches:
  408. return list(filter(None, [
  409. cls._build_brightcove_url_from_js(custom_bc)
  410. for custom_bc in matches]))
  411. return [src for _, src in re.findall(
  412. r'<iframe[^>]+src=([\'"])((?:https?:)?//link\.brightcove\.com/services/player/(?!\1).+)\1', webpage)]
  413. def _extract_from_webpage(self, url, webpage):
  414. bc_urls = self._extract_brightcove_urls(webpage)
  415. for bc_url in bc_urls:
  416. yield self.url_result(smuggle_url(bc_url, {'Referer': url}), BrightcoveLegacyIE)
  417. def _real_extract(self, url):
  418. url, smuggled_data = unsmuggle_url(url, {})
  419. # Change the 'videoId' and others field to '@videoPlayer'
  420. url = re.sub(r'(?<=[?&])(videoI(d|D)|idVideo|bctid)', '%40videoPlayer', url)
  421. # Change bckey (used by bcove.me urls) to playerKey
  422. url = re.sub(r'(?<=[?&])bckey', 'playerKey', url)
  423. mobj = self._match_valid_url(url)
  424. query_str = mobj.group('query')
  425. query = urllib.parse.parse_qs(query_str)
  426. video_player = query.get('@videoPlayer')
  427. if video_player:
  428. # We set the original url as the default 'Referer' header
  429. referer = query.get('linkBaseURL', [None])[0] or smuggled_data.get('Referer', url)
  430. video_id = video_player[0]
  431. if 'playerID' not in query:
  432. mobj = re.search(r'/bcpid(\d+)', url)
  433. if mobj is not None:
  434. query['playerID'] = [mobj.group(1)]
  435. publisher_id = query.get('publisherId')
  436. if publisher_id and publisher_id[0].isdigit():
  437. publisher_id = publisher_id[0]
  438. if not publisher_id:
  439. player_key = query.get('playerKey')
  440. if player_key and ',' in player_key[0]:
  441. player_key = player_key[0]
  442. else:
  443. player_id = query.get('playerID')
  444. if player_id and player_id[0].isdigit():
  445. headers = {}
  446. if referer:
  447. headers['Referer'] = referer
  448. player_page = self._download_webpage(
  449. 'https://link.brightcove.com/services/player/bcpid' + player_id[0],
  450. video_id, headers=headers, fatal=False)
  451. if player_page:
  452. player_key = self._search_regex(
  453. r'<param\s+name="playerKey"\s+value="([\w~,-]+)"',
  454. player_page, 'player key', fatal=False)
  455. if player_key:
  456. enc_pub_id = player_key.split(',')[1].replace('~', '=')
  457. publisher_id = struct.unpack('>Q', base64.urlsafe_b64decode(enc_pub_id))[0]
  458. if publisher_id:
  459. brightcove_new_url = f'https://players.brightcove.net/{publisher_id}/default_default/index.html?videoId={video_id}'
  460. if referer:
  461. brightcove_new_url = smuggle_url(brightcove_new_url, {'referrer': referer})
  462. return self.url_result(brightcove_new_url, BrightcoveNewIE.ie_key(), video_id)
  463. # TODO: figure out if it's possible to extract playlistId from playerKey
  464. # elif 'playerKey' in query:
  465. # player_key = query['playerKey']
  466. # return self._get_playlist_info(player_key[0])
  467. raise UnsupportedError(url)
  468. class BrightcoveNewBaseIE(AdobePassIE):
  469. def _parse_brightcove_metadata(self, json_data, video_id, headers={}):
  470. title = json_data['name'].strip()
  471. formats, subtitles = [], {}
  472. sources = json_data.get('sources') or []
  473. for source in sources:
  474. container = source.get('container')
  475. ext = mimetype2ext(source.get('type'))
  476. src = source.get('src')
  477. if ext == 'm3u8' or container == 'M2TS':
  478. if not src:
  479. continue
  480. fmts, subs = self._extract_m3u8_formats_and_subtitles(
  481. src, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)
  482. subtitles = self._merge_subtitles(subtitles, subs)
  483. elif ext == 'mpd':
  484. if not src:
  485. continue
  486. fmts, subs = self._extract_mpd_formats_and_subtitles(src, video_id, 'dash', fatal=False)
  487. subtitles = self._merge_subtitles(subtitles, subs)
  488. else:
  489. streaming_src = source.get('streaming_src')
  490. stream_name, app_name = source.get('stream_name'), source.get('app_name')
  491. if not src and not streaming_src and (not stream_name or not app_name):
  492. continue
  493. tbr = float_or_none(source.get('avg_bitrate'), 1000)
  494. height = int_or_none(source.get('height'))
  495. width = int_or_none(source.get('width'))
  496. f = {
  497. 'tbr': tbr,
  498. 'filesize': int_or_none(source.get('size')),
  499. 'container': container,
  500. 'ext': ext or container.lower(),
  501. }
  502. if width == 0 and height == 0:
  503. f.update({
  504. 'vcodec': 'none',
  505. })
  506. else:
  507. f.update({
  508. 'width': width,
  509. 'height': height,
  510. 'vcodec': source.get('codec'),
  511. })
  512. def build_format_id(kind):
  513. return join_nonempty(kind, tbr and f'{int(tbr)}k', height and f'{height}p')
  514. if src or streaming_src:
  515. f.update({
  516. 'url': src or streaming_src,
  517. 'format_id': build_format_id('http' if src else 'http-streaming'),
  518. 'source_preference': 0 if src else -1,
  519. })
  520. else:
  521. f.update({
  522. 'url': app_name,
  523. 'play_path': stream_name,
  524. 'format_id': build_format_id('rtmp'),
  525. })
  526. fmts = [f]
  527. # https://support.brightcove.com/playback-api-video-fields-reference#key_systems_object
  528. if container == 'WVM' or source.get('key_systems') or ext == 'ism':
  529. for f in fmts:
  530. f['has_drm'] = True
  531. formats.extend(fmts)
  532. if not formats:
  533. errors = json_data.get('errors')
  534. if errors:
  535. error = errors[0]
  536. self.raise_no_formats(
  537. error.get('message') or error.get('error_subcode') or error['error_code'], expected=True)
  538. headers.pop('Authorization', None) # or else http formats will give error 400
  539. for f in formats:
  540. f.setdefault('http_headers', {}).update(headers)
  541. for text_track in json_data.get('text_tracks', []):
  542. if text_track.get('kind') != 'captions':
  543. continue
  544. text_track_url = url_or_none(text_track.get('src'))
  545. if not text_track_url:
  546. continue
  547. lang = (str_or_none(text_track.get('srclang'))
  548. or str_or_none(text_track.get('label')) or 'en').lower()
  549. subtitles.setdefault(lang, []).append({
  550. 'url': text_track_url,
  551. })
  552. is_live = False
  553. duration = float_or_none(json_data.get('duration'), 1000)
  554. if duration is not None and duration <= 0:
  555. is_live = True
  556. common_res = [(160, 90), (320, 180), (480, 720), (640, 360), (768, 432), (1024, 576), (1280, 720), (1366, 768), (1920, 1080)]
  557. thumb_base_url = dict_get(json_data, ('poster', 'thumbnail'))
  558. thumbnails = [{
  559. 'url': re.sub(r'\d+x\d+', f'{w}x{h}', thumb_base_url),
  560. 'width': w,
  561. 'height': h,
  562. } for w, h in common_res] if thumb_base_url else None
  563. return {
  564. 'id': video_id,
  565. 'title': title,
  566. 'description': clean_html(json_data.get('description')),
  567. 'thumbnails': thumbnails,
  568. 'duration': duration,
  569. 'timestamp': parse_iso8601(json_data.get('published_at')),
  570. 'uploader_id': json_data.get('account_id'),
  571. 'formats': formats,
  572. 'subtitles': subtitles,
  573. 'tags': json_data.get('tags', []),
  574. 'is_live': is_live,
  575. }
  576. class BrightcoveNewIE(BrightcoveNewBaseIE):
  577. IE_NAME = 'brightcove:new'
  578. _VALID_URL = r'https?://players\.brightcove\.net/(?P<account_id>\d+)/(?P<player_id>[^/]+)_(?P<embed>[^/]+)/index\.html\?.*(?P<content_type>video|playlist)Id=(?P<video_id>\d+|ref:[^&]+)'
  579. _TESTS = [{
  580. 'url': 'http://players.brightcove.net/929656772001/e41d32dc-ec74-459e-a845-6c69f7b724ea_default/index.html?videoId=4463358922001',
  581. 'md5': 'c8100925723840d4b0d243f7025703be',
  582. 'info_dict': {
  583. 'id': '4463358922001',
  584. 'ext': 'mp4',
  585. 'title': 'Meet the man behind Popcorn Time',
  586. 'description': 'md5:eac376a4fe366edc70279bfb681aea16',
  587. 'duration': 165.768,
  588. 'timestamp': 1441391203,
  589. 'upload_date': '20150904',
  590. 'uploader_id': '929656772001',
  591. 'formats': 'mincount:20',
  592. },
  593. 'skip': '404 Not Found',
  594. }, {
  595. # with rtmp streams
  596. 'url': 'http://players.brightcove.net/4036320279001/5d112ed9-283f-485f-a7f9-33f42e8bc042_default/index.html?videoId=4279049078001',
  597. 'info_dict': {
  598. 'id': '4279049078001',
  599. 'ext': 'mp4',
  600. 'title': 'Titansgrave: Chapter 0',
  601. 'description': 'Titansgrave: Chapter 0',
  602. 'duration': 1242.058,
  603. 'timestamp': 1433556729,
  604. 'upload_date': '20150606',
  605. 'uploader_id': '4036320279001',
  606. 'formats': 'mincount:39',
  607. },
  608. 'params': {
  609. # m3u8 download
  610. 'skip_download': True,
  611. },
  612. }, {
  613. # playlist stream
  614. 'url': 'https://players.brightcove.net/1752604059001/S13cJdUBz_default/index.html?playlistId=5718313430001',
  615. 'info_dict': {
  616. 'id': '5718313430001',
  617. 'title': 'No Audio Playlist',
  618. },
  619. 'playlist_count': 7,
  620. 'params': {
  621. # m3u8 download
  622. 'skip_download': True,
  623. },
  624. }, {
  625. 'url': 'http://players.brightcove.net/5690807595001/HyZNerRl7_default/index.html?playlistId=5743160747001',
  626. 'only_matching': True,
  627. }, {
  628. # ref: prefixed video id
  629. 'url': 'http://players.brightcove.net/3910869709001/21519b5c-4b3b-4363-accb-bdc8f358f823_default/index.html?videoId=ref:7069442',
  630. 'only_matching': True,
  631. }, {
  632. # non numeric ref: prefixed video id
  633. 'url': 'http://players.brightcove.net/710858724001/default_default/index.html?videoId=ref:event-stream-356',
  634. 'only_matching': True,
  635. }, {
  636. # unavailable video without message but with error_code
  637. 'url': 'http://players.brightcove.net/1305187701/c832abfb-641b-44eb-9da0-2fe76786505f_default/index.html?videoId=4377407326001',
  638. 'only_matching': True,
  639. }]
  640. _WEBPAGE_TESTS = [{
  641. # brightcove player url embed
  642. 'url': 'https://nbc-2.com/weather/forecast/2022/11/16/forecast-warmest-day-of-the-week/',
  643. 'md5': '2934d5372b354d27083ccf8575dbfee2',
  644. 'info_dict': {
  645. 'id': '6315650313112',
  646. 'title': 'First Alert Forecast: November 15, 2022',
  647. 'ext': 'mp4',
  648. 'tags': ['nbc2', 'forecast'],
  649. 'uploader_id': '6146886170001',
  650. 'thumbnail': r're:^https?://.*\.jpg$',
  651. 'timestamp': 1668574571,
  652. 'duration': 233.375,
  653. 'upload_date': '20221116',
  654. },
  655. }, {
  656. # embedded with video tag only
  657. 'url': 'https://www.gooddishtv.com/tiktok-rapping-chef-mr-pyrex',
  658. 'info_dict': {
  659. 'id': 'tiktok-rapping-chef-mr-pyrex',
  660. 'title': 'TikTok\'s Rapping Chef Makes Jambalaya for the Hosts',
  661. 'thumbnail': r're:^https?://.*\.jpg$',
  662. 'age_limit': 0,
  663. 'description': 'Just in time for Mardi Gras',
  664. },
  665. 'playlist': [{
  666. 'info_dict': {
  667. 'id': '6299189544001',
  668. 'ext': 'mp4',
  669. 'title': 'TGD_01-032_5',
  670. 'thumbnail': r're:^https?://.*\.jpg$',
  671. 'tags': [],
  672. 'timestamp': 1646078943,
  673. 'uploader_id': '1569565978001',
  674. 'upload_date': '20220228',
  675. 'duration': 217.195,
  676. },
  677. }, {
  678. 'info_dict': {
  679. 'id': '6305565995112',
  680. 'ext': 'mp4',
  681. 'title': 'TGD 01-087 (Airs 05.25.22)_Segment 5',
  682. 'thumbnail': r're:^https?://.*\.jpg$',
  683. 'tags': [],
  684. 'timestamp': 1651604591,
  685. 'uploader_id': '1569565978001',
  686. 'upload_date': '20220503',
  687. 'duration': 310.421,
  688. },
  689. }],
  690. }, {
  691. # Brightcove:new type [2].
  692. 'url': 'http://www.delawaresportszone.com/video-st-thomas-more-earns-first-trip-to-basketball-semis',
  693. 'md5': '2b35148fcf48da41c9fb4591650784f3',
  694. 'info_dict': {
  695. 'id': '5348741021001',
  696. 'ext': 'mp4',
  697. 'upload_date': '20170306',
  698. 'uploader_id': '4191638492001',
  699. 'timestamp': 1488769918,
  700. 'title': 'VIDEO: St. Thomas More earns first trip to basketball semis',
  701. },
  702. 'skip': '404 Not Found',
  703. }, {
  704. # Alternative brightcove <video> attributes
  705. 'url': 'http://www.programme-tv.net/videos/extraits/81095-guillaume-canet-evoque-les-rumeurs-d-infidelite-de-marion-cotillard-avec-brad-pitt-dans-vivement-dimanche/',
  706. 'info_dict': {
  707. 'id': '81095-guillaume-canet-evoque-les-rumeurs-d-infidelite-de-marion-cotillard-avec-brad-pitt-dans-vivement-dimanche',
  708. 'title': "Guillaume Canet évoque les rumeurs d'infidélité de Marion Cotillard avec Brad Pitt dans Vivement Dimanche, Extraits : toutes les vidéos avec Télé-Loisirs",
  709. },
  710. 'playlist': [{
  711. 'md5': '732d22ba3d33f2f3fc253c39f8f36523',
  712. 'info_dict': {
  713. 'id': '5311302538001',
  714. 'ext': 'mp4',
  715. 'title': "Guillaume Canet évoque les rumeurs d'infidélité de Marion Cotillard avec Brad Pitt dans Vivement Dimanche",
  716. 'description': "Guillaume Canet évoque les rumeurs d'infidélité de Marion Cotillard avec Brad Pitt dans Vivement Dimanche (France 2, 5 février 2017)",
  717. 'timestamp': 1486321708,
  718. 'upload_date': '20170205',
  719. 'uploader_id': '800000640001',
  720. },
  721. 'only_matching': True,
  722. }],
  723. 'skip': '404 Not Found',
  724. }, {
  725. # Brightcove URL in single quotes
  726. 'url': 'http://www.sportsnet.ca/baseball/mlb/sn-presents-russell-martin-world-citizen/',
  727. 'md5': '4ae374f1f8b91c889c4b9203c8c752af',
  728. 'info_dict': {
  729. 'id': '4255764656001',
  730. 'ext': 'mp4',
  731. 'title': 'SN Presents: Russell Martin, World Citizen',
  732. 'description': 'To understand why he was the Toronto Blue Jays’ top off-season priority is to appreciate his background and upbringing in Montreal, where he first developed his baseball skills. Written and narrated by Stephen Brunt.',
  733. 'uploader': 'Rogers Sportsnet',
  734. 'uploader_id': '1704050871',
  735. 'upload_date': '20150525',
  736. 'timestamp': 1432570283,
  737. },
  738. 'skip': 'Page no longer has URL, now has javascript',
  739. }]
  740. @staticmethod
  741. def _extract_url(ie, webpage):
  742. urls = BrightcoveNewIE._extract_brightcove_urls(ie, webpage)
  743. return urls[0] if urls else None
  744. @staticmethod
  745. def _extract_brightcove_urls(ie, webpage):
  746. # Reference:
  747. # 1. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideoiniframe
  748. # 2. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#tag
  749. # 3. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideousingjavascript
  750. # 4. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/in-page-embed-player-implementation.html
  751. # 5. https://support.brightcove.com/en/video-cloud/docs/dynamically-assigning-videos-player
  752. entries = []
  753. # Look for iframe embeds [1]
  754. for _, url in re.findall(
  755. r'<iframe[^>]+src=(["\'])((?:https?:)?//players\.brightcove\.net/\d+/[^/]+/index\.html.+?)\1', webpage):
  756. entries.append(url if url.startswith(('http:', 'https:')) else 'https:' + url)
  757. # Look for <video> tags [2] and embed_in_page embeds [3]
  758. # [2] looks like:
  759. for video, script_tag, account_id, player_id, embed in re.findall(
  760. r'''(?isx)
  761. (<video(?:-js)?\s+[^>]*\bdata-video-id\s*=\s*['"]?[^>]+>)
  762. (?:.*?
  763. (<script[^>]+
  764. src=["\'](?:https?:)?//players\.brightcove\.net/
  765. (\d+)/([^/]+)_([^/]+)/index(?:\.min)?\.js
  766. )
  767. )?
  768. ''', webpage):
  769. attrs = extract_attributes(video)
  770. # According to examples from [4] it's unclear whether video id
  771. # may be optional and what to do when it is
  772. video_id = attrs.get('data-video-id')
  773. if not video_id:
  774. continue
  775. account_id = account_id or attrs.get('data-account')
  776. if not account_id:
  777. continue
  778. player_id = player_id or attrs.get('data-player') or 'default'
  779. embed = embed or attrs.get('data-embed') or 'default'
  780. bc_url = f'https://players.brightcove.net/{account_id}/{player_id}_{embed}/index.html?videoId={video_id}'
  781. # Some brightcove videos may be embedded with video tag only and
  782. # without script tag or any mentioning of brightcove at all. Such
  783. # embeds are considered ambiguous since they are matched based only
  784. # on data-video-id and data-account attributes and in the wild may
  785. # not be brightcove embeds at all. Let's check reconstructed
  786. # brightcove URLs in case of such embeds and only process valid
  787. # ones. By this we ensure there is indeed a brightcove embed.
  788. if not script_tag and not ie._is_valid_url(
  789. bc_url, video_id, 'possible brightcove video'):
  790. continue
  791. entries.append(bc_url)
  792. return entries
  793. def _extract_from_webpage(self, url, webpage):
  794. bc_urls = self._extract_brightcove_urls(self, webpage)
  795. for bc_url in bc_urls:
  796. yield self.url_result(smuggle_url(bc_url, {'referrer': url}), BrightcoveNewIE)
  797. def _real_extract(self, url):
  798. url, smuggled_data = unsmuggle_url(url, {})
  799. self._initialize_geo_bypass({
  800. 'countries': smuggled_data.get('geo_countries'),
  801. 'ip_blocks': smuggled_data.get('geo_ip_blocks'),
  802. })
  803. account_id, player_id, embed, content_type, video_id = self._match_valid_url(url).groups()
  804. policy_key_id = f'{account_id}_{player_id}'
  805. policy_key = self.cache.load('brightcove', policy_key_id)
  806. policy_key_extracted = False
  807. store_pk = lambda x: self.cache.store('brightcove', policy_key_id, x)
  808. def extract_policy_key():
  809. base_url = f'https://players.brightcove.net/{account_id}/{player_id}_{embed}/'
  810. config = self._download_json(
  811. base_url + 'config.json', video_id, fatal=False) or {}
  812. policy_key = try_get(
  813. config, lambda x: x['video_cloud']['policy_key'])
  814. if not policy_key:
  815. webpage = self._download_webpage(
  816. base_url + 'index.min.js', video_id)
  817. catalog = self._search_regex(
  818. r'catalog\(({.+?})\);', webpage, 'catalog', default=None)
  819. if catalog:
  820. catalog = self._parse_json(
  821. js_to_json(catalog), video_id, fatal=False)
  822. if catalog:
  823. policy_key = catalog.get('policyKey')
  824. if not policy_key:
  825. policy_key = self._search_regex(
  826. r'policyKey\s*:\s*(["\'])(?P<pk>.+?)\1',
  827. webpage, 'policy key', group='pk')
  828. store_pk(policy_key)
  829. return policy_key
  830. token = smuggled_data.get('token')
  831. api_url = f'https://{"edge-auth" if token else "edge"}.api.brightcove.com/playback/v1/accounts/{account_id}/{content_type}s/{video_id}'
  832. headers = {'Authorization': f'Bearer {token}'} if token else {}
  833. referrer = smuggled_data.get('referrer') # XXX: notice the spelling/case of the key
  834. if referrer:
  835. headers.update({
  836. 'Referer': referrer,
  837. 'Origin': re.search(r'https?://[^/]+', referrer).group(0),
  838. })
  839. for _ in range(2):
  840. if not policy_key:
  841. policy_key = extract_policy_key()
  842. policy_key_extracted = True
  843. headers['Accept'] = f'application/json;pk={policy_key}'
  844. try:
  845. json_data = self._download_json(api_url, video_id, headers=headers)
  846. break
  847. except ExtractorError as e:
  848. if isinstance(e.cause, HTTPError) and e.cause.status in (401, 403):
  849. json_data = self._parse_json(e.cause.response.read().decode(), video_id)[0]
  850. message = json_data.get('message') or json_data['error_code']
  851. if json_data.get('error_subcode') == 'CLIENT_GEO':
  852. self.raise_geo_restricted(msg=message)
  853. elif json_data.get('error_code') == 'INVALID_POLICY_KEY' and not policy_key_extracted:
  854. policy_key = None
  855. store_pk(None)
  856. continue
  857. raise ExtractorError(message, expected=True)
  858. raise
  859. errors = json_data.get('errors')
  860. if errors and errors[0].get('error_subcode') == 'TVE_AUTH':
  861. custom_fields = json_data['custom_fields']
  862. tve_token = self._extract_mvpd_auth(
  863. smuggled_data['source_url'], video_id,
  864. custom_fields['bcadobepassrequestorid'],
  865. custom_fields['bcadobepassresourceid'])
  866. json_data = self._download_json(
  867. api_url, video_id, headers={
  868. 'Accept': f'application/json;pk={policy_key}',
  869. }, query={
  870. 'tveToken': tve_token,
  871. })
  872. if content_type == 'playlist':
  873. return self.playlist_result(
  874. [self._parse_brightcove_metadata(vid, vid.get('id'), headers)
  875. for vid in json_data.get('videos', []) if vid.get('id')],
  876. json_data.get('id'), json_data.get('name'),
  877. json_data.get('description'))
  878. return self._parse_brightcove_metadata(
  879. json_data, video_id, headers=headers)