import hashlib
import hmac
import re
import time

from .adobepass import AdobePassIE
from .once import OnceIE
from ..networking import HEADRequest, Request
from ..utils import (
    ExtractorError,
    determine_ext,
    find_xpath_attr,
    float_or_none,
    int_or_none,
    mimetype2ext,
    parse_qs,
    traverse_obj,
    unsmuggle_url,
    update_url,
    update_url_query,
    urlhandle_detect_ext,
    xpath_with_ns,
)

default_ns = 'http://www.w3.org/2005/SMIL21/Language'
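# helper to qualify XPath expressions with the SMIL 2.1 namespace above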
_x = lambda p: xpath_with_ns(p, {'smil': default_ns})


class ThePlatformBaseIE(OnceIE):
    _TP_TLD = 'com'

    def _extract_theplatform_smil(self, smil_url, video_id, note='Downloading SMIL data'):
        meta = self._download_xml(
            smil_url, video_id, note=note, query={'format': 'SMIL'},
            headers=self.geo_verification_headers())
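        # ThePlatform reports errors inside the SMIL itself: a <ref> element
        # whose src points at an error file, with the human-readable message
        # in its 'abstract' attribute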
        error_element = find_xpath_attr(meta, _x('.//smil:ref'), 'src')
        if error_element is not None:
            exception = find_xpath_attr(
                error_element, _x('.//smil:param'), 'name', 'exception')
            if exception is not None:
                if exception.get('value') == 'GeoLocationBlocked':
                    self.raise_geo_restricted(error_element.attrib['abstract'])
                elif error_element.attrib['src'].startswith(
                        f'http://link.theplatform.{self._TP_TLD}/s/errorFiles/Unavailable.'):
                    raise ExtractorError(
                        error_element.attrib['abstract'], expected=True)

        smil_formats, subtitles = self._parse_smil_formats_and_subtitles(
            meta, smil_url, video_id, namespace=default_ns,
            # the parameters are from syfy.com, other sites may use others,
            # they also work for nbc.com
            f4m_params={'g': 'UXWGVKRWHFSP', 'hdcore': '3.0.3'},
            transform_rtmp_url=lambda streamer, src: (streamer, 'mp4:' + src))

        formats = []
        for _format in smil_formats:
            if OnceIE.suitable(_format['url']):
                formats.extend(self._extract_once_formats(_format['url']))
            else:
                media_url = _format['url']
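                # the hdnea2 cookie looks like an Akamai-style auth token;
                # passing its value back as the hdnea3 query parameter lets the
                # HLS manifest request through (an inference, not documented)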
                if determine_ext(media_url) == 'm3u8':
                    hdnea2 = self._get_cookies(media_url).get('hdnea2')
                    if hdnea2:
                        _format['url'] = update_url_query(media_url, {'hdnea3': hdnea2.value})

                formats.append(_format)

        return formats, subtitles

    def _download_theplatform_metadata(self, path, video_id):
        info_url = f'http://link.theplatform.{self._TP_TLD}/s/{path}?format=preview'
        return self._download_json(info_url, video_id)

    def _parse_theplatform_metadata(self, info):
        subtitles = {}
        captions = info.get('captions')
        if isinstance(captions, list):
            for caption in captions:
                lang, src, mime = caption.get('lang', 'en'), caption.get('src'), caption.get('type')
                subtitles.setdefault(lang, []).append({
                    'ext': mimetype2ext(mime),
                    'url': src,
                })

        duration = info.get('duration')
        tp_chapters = info.get('chapters', [])
        chapters = []
        if tp_chapters:
            def _add_chapter(start_time, end_time):
                start_time = float_or_none(start_time, 1000)
                end_time = float_or_none(end_time, 1000)
                if start_time is None or end_time is None:
                    return
                chapters.append({
                    'start_time': start_time,
                    'end_time': end_time,
                })
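            # chapter times are in milliseconds; the last chapter may lack an
            # endTime, so fall back to the clip's total duration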
            for chapter in tp_chapters[:-1]:
                _add_chapter(chapter.get('startTime'), chapter.get('endTime'))
            _add_chapter(tp_chapters[-1].get('startTime'), tp_chapters[-1].get('endTime') or duration)

        def extract_site_specific_field(field):
            # A number of sites have custom-prefixed keys, e.g. 'cbc$seasonNumber'
            return traverse_obj(info, lambda k, v: v and k.endswith(f'${field}'), get_all=False)

        return {
            'title': info['title'],
            'subtitles': subtitles,
            'description': info['description'],
            'thumbnail': info['defaultThumbnailUrl'],
            'duration': float_or_none(duration, 1000),
            'timestamp': int_or_none(info.get('pubDate'), 1000) or None,
            'uploader': info.get('billingCode'),
            'chapters': chapters,
            'creator': traverse_obj(info, ('author', {str})) or None,
            'categories': traverse_obj(info, (
                'categories', lambda _, v: v.get('label') in ('category', None), 'name', {str})) or None,
            'tags': traverse_obj(info, ('keywords', {lambda x: re.split(r'[;,]\s?', x) if x else None})),
            'location': extract_site_specific_field('region'),
            'series': extract_site_specific_field('show'),
            'season_number': int_or_none(extract_site_specific_field('seasonNumber')),
            'media_type': extract_site_specific_field('programmingType') or extract_site_specific_field('type'),
        }

    def _extract_theplatform_metadata(self, path, video_id):
        info = self._download_theplatform_metadata(path, video_id)
        return self._parse_theplatform_metadata(info)


class ThePlatformIE(ThePlatformBaseIE, AdobePassIE):
    _VALID_URL = r'''(?x)
        (?:https?://(?:link|player)\.theplatform\.com/[sp]/(?P<provider_id>[^/]+)/
           (?:(?:(?:[^/]+/)+select/)?(?P<media>media/(?:guid/\d+/)?)?|(?P<config>(?:[^/\?]+/(?:swf|config)|onsite)/select/))?
         |theplatform:)(?P<id>[^/\?&]+)'''
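    # matches link/player.theplatform.com/[sp]/... URLs as well as bare
    # 'theplatform:<id>' references (see the tests below for examples)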
    _EMBED_REGEX = [
        r'''(?x)
            <meta\s+
                property=(["'])(?:og:video(?::(?:secure_)?url)?|twitter:player)\1\s+
                content=(["'])(?P<url>https?://player\.theplatform\.com/p/.+?)\2''',
        r'(?s)<(?:iframe|script)[^>]+src=(["\'])(?P<url>(?:https?:)?//player\.theplatform\.com/p/.+?)\1',
    ]

    _TESTS = [{
        # from http://www.metacafe.com/watch/cb-e9I_cZgTgIPd/blackberrys_big_bold_z30/
        'url': 'http://link.theplatform.com/s/dJ5BDC/e9I_cZgTgIPd/meta.smil?format=smil&Tracking=true&mbr=true',
        'info_dict': {
            'id': 'e9I_cZgTgIPd',
            'ext': 'flv',
            'title': 'Blackberry\'s big, bold Z30',
            'description': 'The Z30 is Blackberry\'s biggest, baddest mobile messaging device yet.',
            'duration': 247,
            'timestamp': 1383239700,
            'upload_date': '20131031',
            'uploader': 'CBSI-NEW',
        },
        'params': {
            # rtmp download
            'skip_download': True,
        },
        'skip': '404 Not Found',
    }, {
        # from http://www.cnet.com/videos/tesla-model-s-a-second-step-towards-a-cleaner-motoring-future/
        'url': 'http://link.theplatform.com/s/kYEXFC/22d_qsQ6MIRT',
        'info_dict': {
            'id': '22d_qsQ6MIRT',
            'ext': 'flv',
            'description': 'md5:ac330c9258c04f9d7512cf26b9595409',
            'title': 'Tesla Model S: A second step towards a cleaner motoring future',
            'timestamp': 1426176191,
            'upload_date': '20150312',
            'uploader': 'CBSI-NEW',
        },
        'params': {
            # rtmp download
            'skip_download': True,
        },
        'skip': 'CNet no longer uses ThePlatform',
    }, {
        'url': 'https://player.theplatform.com/p/D6x-PC/pulse_preview/embed/select/media/yMBg9E8KFxZD',
        'info_dict': {
            'id': 'yMBg9E8KFxZD',
            'ext': 'mp4',
            'description': 'md5:644ad9188d655b742f942bf2e06b002d',
            'title': 'HIGHLIGHTS: USA bag first ever series Cup win',
            'uploader': 'EGSM',
        },
        'skip': 'Dead link',
    }, {
        'url': 'http://player.theplatform.com/p/NnzsPC/widget/select/media/4Y0TlYUr_ZT7',
        'only_matching': True,
    }, {
        'url': 'http://player.theplatform.com/p/2E2eJC/nbcNewsOffsite?guid=tdy_or_siri_150701',
        'md5': 'fb96bb3d85118930a5b055783a3bd992',
        'info_dict': {
            'id': 'tdy_or_siri_150701',
            'ext': 'mp4',
            'title': 'iPhone Siri’s sassy response to a math question has people talking',
            'description': 'md5:a565d1deadd5086f3331d57298ec6333',
            'duration': 83.0,
            'thumbnail': r're:^https?://.*\.jpg$',
            'timestamp': 1435752600,
            'upload_date': '20150701',
            'uploader': 'NBCU-NEWS',
        },
        'skip': 'Error: Player PID "nbcNewsOffsite" is disabled',
    }, {
        # From http://www.nbc.com/the-blacklist/video/sir-crispin-crandall/2928790?onid=137781#vc137781=1
        # geo-restricted (US), HLS encrypted with AES-128
        'url': 'http://player.theplatform.com/p/NnzsPC/onsite_universal/select/media/guid/2410887629/2928790?fwsitesection=nbc_the_blacklist_video_library&autoPlay=true&carouselID=137781',
        'only_matching': True,
    }]
    @classmethod
    def _extract_embed_urls(cls, url, webpage):
        # Are whitespaces ignored in URLs?
        # https://github.com/ytdl-org/youtube-dl/issues/12044
        for embed_url in super()._extract_embed_urls(url, webpage):
            yield re.sub(r'\s', '', embed_url)

    @staticmethod
    def _sign_url(url, sig_key, sig_secret, life=600, include_qs=False):
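        # a sketch of the signature layout implemented below:
        #   sig = flags + expiration + HMAC-SHA1(key, unhex(flags + expiration + hex(path))) + hex(secret)
        # where flags is '10' (sign the query string too) or '00', and
        # expiration is a lowercase-hex Unix timestamp `life` seconds from now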
        flags = '10' if include_qs else '00'
        expiration_date = '%x' % (int(time.time()) + life)

        def str_to_hex(str_data):
            return str_data.encode('ascii').hex()

        relative_path = re.match(r'https?://link\.theplatform\.com/s/([^?]+)', url).group(1)
        clear_text = bytes.fromhex(flags + expiration_date + str_to_hex(relative_path))
        checksum = hmac.new(sig_key.encode('ascii'), clear_text, hashlib.sha1).hexdigest()
        sig = flags + expiration_date + checksum + str_to_hex(sig_secret)
        return f'{url}&sig={sig}'
    def _real_extract(self, url):
        url, smuggled_data = unsmuggle_url(url, {})
        self._initialize_geo_bypass({
            'countries': smuggled_data.get('geo_countries'),
        })

        mobj = self._match_valid_url(url)
        provider_id = mobj.group('provider_id')
        video_id = mobj.group('id')

        if not provider_id:
            provider_id = 'dJ5BDC'

        path = provider_id + '/'
        if mobj.group('media'):
            path += mobj.group('media')
        path += video_id

        qs_dict = parse_qs(url)
        if 'guid' in qs_dict:
            webpage = self._download_webpage(url, video_id)
            scripts = re.findall(r'<script[^>]+src="([^"]+)"', webpage)
            feed_id = None
            # The feed id is usually in the last script, but there is no
            # reliable pattern for the relevant script's filename, so try
            # them one by one, last first
            for script in reversed(scripts):
                feed_script = self._download_webpage(
                    self._proto_relative_url(script, 'http:'),
                    video_id, 'Downloading feed script')
                feed_id = self._search_regex(
                    r'defaultFeedId\s*:\s*"([^"]+)"', feed_script,
                    'default feed id', default=None)
                if feed_id is not None:
                    break
            if feed_id is None:
                raise ExtractorError('Unable to find feed id')
            return self.url_result('http://feed.theplatform.com/f/{}/{}?byGuid={}'.format(
                provider_id, feed_id, qs_dict['guid'][0]))
        if smuggled_data.get('force_smil_url', False):
            smil_url = url
        # Explicitly specified SMIL (see https://github.com/ytdl-org/youtube-dl/issues/7385)
        elif '/guid/' in url:
            headers = {}
            source_url = smuggled_data.get('source_url')
            if source_url:
                headers['Referer'] = source_url
            request = Request(url, headers=headers)
            webpage = self._download_webpage(request, video_id)
            smil_url = self._search_regex(
                r'<link[^>]+href=(["\'])(?P<url>.+?)\1[^>]+type=["\']application/smil\+xml',
                webpage, 'smil url', group='url')
            path = self._search_regex(
                r'link\.theplatform\.com/s/((?:[^/?#&]+/)+[^/?#&]+)', smil_url, 'path')
            # parenthesized so the separator, not the whole parameter, is conditional
            smil_url += ('?' if '?' not in smil_url else '&') + 'formats=m3u,mpeg4'
        elif mobj.group('config'):
            config_url = url + '&form=json'
            config_url = config_url.replace('swf/', 'config/')
            config_url = config_url.replace('onsite/', 'onsite/config/')
            config = self._download_json(config_url, video_id, 'Downloading config')
            release_url = config.get('releaseUrl') or f'http://link.theplatform.com/s/{path}?mbr=true'
            smil_url = release_url + '&formats=MPEG4&manifest=f4m'
        else:
            smil_url = f'http://link.theplatform.com/s/{path}?mbr=true'

        sig = smuggled_data.get('sig')
        if sig:
            smil_url = self._sign_url(smil_url, sig['key'], sig['secret'])

        formats, subtitles = self._extract_theplatform_smil(smil_url, video_id)

        # With some sites, manifest URL must be forced to extract HLS formats
        if not traverse_obj(formats, lambda _, v: v['format_id'].startswith('hls')):
            m3u8_url = update_url(url, query='mbr=true&manifest=m3u', fragment=None)
            urlh = self._request_webpage(
                HEADRequest(m3u8_url), video_id, 'Checking for HLS formats', 'No HLS formats found', fatal=False)
            if urlh and urlhandle_detect_ext(urlh) == 'm3u8':
                m3u8_fmts, m3u8_subs = self._extract_m3u8_formats_and_subtitles(
                    m3u8_url, video_id, m3u8_id='hls', fatal=False)
                formats.extend(m3u8_fmts)
                self._merge_subtitles(m3u8_subs, target=subtitles)

        ret = self._extract_theplatform_metadata(path, video_id)
        combined_subtitles = self._merge_subtitles(ret.get('subtitles', {}), subtitles)
        ret.update({
            'id': video_id,
            'formats': formats,
            'subtitles': combined_subtitles,
        })

        return ret


class ThePlatformFeedIE(ThePlatformBaseIE):
    _URL_TEMPLATE = '%s//feed.theplatform.com/f/%s/%s?form=json&%s'
    _VALID_URL = r'https?://feed\.theplatform\.com/f/(?P<provider_id>[^/]+)/(?P<feed_id>[^?/]+)\?(?:[^&]+&)*(?P<filter>by(?:Gui|I)d=(?P<id>[^&]+))'
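    # feed URLs select a single entry via a byGuid=<guid> or byId=<id> filter,
    # which is passed through to the feed request unchanged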
    _TESTS = [{
        # From http://player.theplatform.com/p/7wvmTC/MSNBCEmbeddedOffSite?guid=n_hardball_5biden_140207
        'url': 'http://feed.theplatform.com/f/7wvmTC/msnbc_video-p-test?form=json&pretty=true&range=-40&byGuid=n_hardball_5biden_140207',
        'md5': '6e32495b5073ab414471b615c5ded394',
        'info_dict': {
            'id': 'n_hardball_5biden_140207',
            'ext': 'mp4',
            'title': 'The Biden factor: will Joe run in 2016?',
            'description': 'Could Vice President Joe Biden be preparing a 2016 campaign? Mark Halperin and Sam Stein weigh in.',
            'thumbnail': r're:^https?://.*\.jpg$',
            'upload_date': '20140208',
            'timestamp': 1391824260,
            'duration': 467.0,
            'categories': ['MSNBC/Issues/Democrats', 'MSNBC/Issues/Elections/Election 2016'],
            'uploader': 'NBCU-NEWS',
        },
    }, {
        'url': 'http://feed.theplatform.com/f/2E2eJC/nnd_NBCNews?byGuid=nn_netcast_180306.Copy.01',
        'only_matching': True,
    }]
    def _extract_feed_info(self, provider_id, feed_id, filter_query, video_id, custom_fields=None, asset_types_query={}, account_id=None):
        real_url = self._URL_TEMPLATE % (self.http_scheme(), provider_id, feed_id, filter_query)
        entry = self._download_json(real_url, video_id)['entries'][0]
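        # with a known account id, build the canonical /media/guid/ SMIL URL;
        # otherwise fall back to the entry's own public URL, which may be absent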
        main_smil_url = 'http://link.theplatform.com/s/%s/media/guid/%d/%s' % (provider_id, account_id, entry['guid']) if account_id else entry.get('plmedia$publicUrl')

        formats = []
        subtitles = {}
        first_video_id = None
        duration = None
        asset_types = []
        for item in entry['media$content']:
            smil_url = item['plfile$url']
            cur_video_id = ThePlatformIE._match_id(smil_url)
            if first_video_id is None:
                first_video_id = cur_video_id
                duration = float_or_none(item.get('plfile$duration'))
            file_asset_types = item.get('plfile$assetTypes') or parse_qs(smil_url)['assetTypes']
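            # download one SMIL per distinct asset type; a type already seen
            # on an earlier media file is skipped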
            for asset_type in file_asset_types:
                if asset_type in asset_types:
                    continue
                asset_types.append(asset_type)
                query = {
                    'mbr': 'true',
                    'formats': item['plfile$format'],
                    'assetTypes': asset_type,
                }
                if asset_type in asset_types_query:
                    query.update(asset_types_query[asset_type])
                cur_formats, cur_subtitles = self._extract_theplatform_smil(update_url_query(
                    main_smil_url or smil_url, query), video_id, f'Downloading SMIL data for {asset_type}')
                formats.extend(cur_formats)
                subtitles = self._merge_subtitles(subtitles, cur_subtitles)

        thumbnails = [{
            'url': thumbnail['plfile$url'],
            'width': int_or_none(thumbnail.get('plfile$width')),
            'height': int_or_none(thumbnail.get('plfile$height')),
        } for thumbnail in entry.get('media$thumbnails', [])]

        timestamp = int_or_none(entry.get('media$availableDate'), scale=1000)
        categories = [item['media$name'] for item in entry.get('media$categories', [])]

        ret = self._extract_theplatform_metadata(f'{provider_id}/{first_video_id}', video_id)
        subtitles = self._merge_subtitles(subtitles, ret['subtitles'])
        ret.update({
            'id': video_id,
            'formats': formats,
            'subtitles': subtitles,
            'thumbnails': thumbnails,
            'duration': duration,
            'timestamp': timestamp,
            'categories': categories,
        })
        if custom_fields:
            ret.update(custom_fields(entry))

        return ret

    def _real_extract(self, url):
        mobj = self._match_valid_url(url)

        video_id = mobj.group('id')
        provider_id = mobj.group('provider_id')
        feed_id = mobj.group('feed_id')
        filter_query = mobj.group('filter')

        return self._extract_feed_info(provider_id, feed_id, filter_query, video_id)