# mainstreaming.py
import re

from .common import InfoExtractor
from ..utils import (
    ExtractorError,
    int_or_none,
    js_to_json,
    parse_duration,
    traverse_obj,
    try_get,
    urljoin,
)


  11. class MainStreamingIE(InfoExtractor):
  12. _VALID_URL = r'https?://(?:webtools-?)?(?P<host>[A-Za-z0-9-]*\.msvdn\.net)/(?:embed|amp_embed|content)/(?P<id>\w+)'
  13. _EMBED_REGEX = [rf'<iframe[^>]+?src=["\']?(?P<url>{_VALID_URL})["\']?']
  14. IE_DESC = 'MainStreaming Player'
  15. _TESTS = [
  16. {
  17. # Live stream offline, has alternative content id
  18. 'url': 'https://webtools-e18da6642b684f8aa9ae449862783a56.msvdn.net/embed/53EN6GxbWaJC',
  19. 'info_dict': {
  20. 'id': '53EN6GxbWaJC',
  21. 'title': 'Diretta homepage 2021-12-31 12:00',
  22. 'description': '',
  23. 'live_status': 'was_live',
  24. 'ext': 'mp4',
  25. 'thumbnail': r're:https?://[A-Za-z0-9-]*\.msvdn.net/image/\w+/poster',
  26. },
  27. 'expected_warnings': [
  28. 'Ignoring alternative content ID: WDAF1KOWUpH3',
  29. 'MainStreaming said: Live event is OFFLINE',
  30. ],
  31. 'skip': 'live stream offline',
  32. }, {
  33. # playlist
  34. 'url': 'https://webtools-e18da6642b684f8aa9ae449862783a56.msvdn.net/embed/WDAF1KOWUpH3',
  35. 'info_dict': {
  36. 'id': 'WDAF1KOWUpH3',
  37. 'title': 'Playlist homepage',
  38. },
  39. 'playlist_mincount': 2,
  40. }, {
  41. # livestream
  42. 'url': 'https://webtools-859c1818ed614cc5b0047439470927b0.msvdn.net/embed/tDoFkZD3T1Lw',
  43. 'info_dict': {
  44. 'id': 'tDoFkZD3T1Lw',
  45. 'title': r're:Class CNBC Live \d{4}-\d{2}-\d{2} \d{2}:\d{2}$',
  46. 'live_status': 'is_live',
  47. 'ext': 'mp4',
  48. 'thumbnail': r're:https?://[A-Za-z0-9-]*\.msvdn.net/image/\w+/poster',
  49. },
  50. 'skip': 'live stream',
  51. }, {
  52. 'url': 'https://webtools-f5842579ff984c1c98d63b8d789673eb.msvdn.net/embed/EUlZfGWkGpOd?autoPlay=false',
  53. 'info_dict': {
  54. 'id': 'EUlZfGWkGpOd',
  55. 'title': 'La Settimana ',
  56. 'description': '03 Ottobre ore 02:00',
  57. 'ext': 'mp4',
  58. 'live_status': 'not_live',
  59. 'thumbnail': r're:https?://[A-Za-z0-9-]*\.msvdn.net/image/\w+/poster',
  60. 'duration': 1512,
  61. },
  62. }, {
  63. # video without webtools- prefix
  64. 'url': 'https://f5842579ff984c1c98d63b8d789673eb.msvdn.net/embed/MfuWmzL2lGkA?autoplay=false&T=1635860445',
  65. 'info_dict': {
  66. 'id': 'MfuWmzL2lGkA',
  67. 'title': 'TG Mattina',
  68. 'description': '06 Ottobre ore 08:00',
  69. 'ext': 'mp4',
  70. 'live_status': 'not_live',
  71. 'thumbnail': r're:https?://[A-Za-z0-9-]*\.msvdn.net/image/\w+/poster',
  72. 'duration': 789.04,
  73. },
  74. }, {
  75. # always-on livestream with DVR
  76. 'url': 'https://webtools-f5842579ff984c1c98d63b8d789673eb.msvdn.net/embed/HVvPMzy',
  77. 'info_dict': {
  78. 'id': 'HVvPMzy',
  79. 'title': r're:^Diretta LaC News24 \d{4}-\d{2}-\d{2} \d{2}:\d{2}$',
  80. 'description': 'canale all news',
  81. 'live_status': 'is_live',
  82. 'ext': 'mp4',
  83. 'thumbnail': r're:https?://[A-Za-z0-9-]*\.msvdn.net/image/\w+/poster',
  84. },
  85. 'params': {
  86. 'skip_download': True,
  87. },
  88. }, {
  89. # no host
  90. 'url': 'https://webtools.msvdn.net/embed/MfuWmzL2lGkA',
  91. 'only_matching': True,
  92. }, {
  93. 'url': 'https://859c1818ed614cc5b0047439470927b0.msvdn.net/amp_embed/tDoFkZD3T1Lw',
  94. 'only_matching': True,
  95. }, {
  96. 'url': 'https://859c1818ed614cc5b0047439470927b0.msvdn.net/content/tDoFkZD3T1Lw#',
  97. 'only_matching': True,
  98. },
  99. ]
  100. def _playlist_entries(self, host, playlist_content):
  101. for entry in playlist_content:
  102. content_id = entry.get('contentID')
  103. yield {
  104. '_type': 'url',
  105. 'ie_key': MainStreamingIE.ie_key(),
  106. 'id': content_id,
  107. 'duration': int_or_none(traverse_obj(entry, ('duration', 'totalSeconds'))),
  108. 'title': entry.get('title'),
  109. 'url': f'https://{host}/embed/{content_id}',
  110. }
  111. @staticmethod
  112. def _get_webtools_host(host):
  113. if not host.startswith('webtools'):
  114. host = 'webtools' + ('-' if not host.startswith('.') else '') + host
  115. return host
  116. def _get_webtools_base_url(self, host):
  117. return f'{self.http_scheme()}//{self._get_webtools_host(host)}'
  118. def _call_api(self, host: str, path: str, item_id: str, query=None, note='Downloading API JSON', fatal=False):
  119. # JSON API, does not appear to be documented
  120. return self._call_webtools_api(host, '/api/v2/' + path, item_id, query, note, fatal)
  121. def _call_webtools_api(self, host: str, path: str, item_id: str, query=None, note='Downloading webtools API JSON', fatal=False):
  122. # webtools docs: https://webtools.msvdn.net/
  123. return self._download_json(
  124. urljoin(self._get_webtools_base_url(host), path), item_id, query=query, note=note, fatal=fatal)
  125. def _real_extract(self, url):
  126. host, video_id = self._match_valid_url(url).groups()
  127. content_info = try_get(
  128. self._call_api(
  129. host, f'content/{video_id}', video_id, note='Downloading content info API JSON'), lambda x: x['playerContentInfo'])
  130. # Fallback
  131. if not content_info:
  132. webpage = self._download_webpage(url, video_id)
  133. player_config = self._parse_json(
  134. self._search_regex(
  135. r'config\s*=\s*({.+?})\s*;', webpage, 'mainstreaming player config',
  136. default='{}', flags=re.DOTALL),
  137. video_id, transform_source=js_to_json, fatal=False) or {}
  138. content_info = player_config['contentInfo']
  139. host = content_info.get('host') or host
  140. video_id = content_info.get('contentID') or video_id
  141. title = content_info.get('title')
  142. description = traverse_obj(content_info, 'longDescription', 'shortDescription', expected_type=str)
  143. live_status = 'not_live'
  144. if content_info.get('drmEnabled'):
  145. self.report_drm(video_id)
  146. alternative_content_id = content_info.get('alternativeContentID')
  147. if alternative_content_id:
  148. self.report_warning(f'Ignoring alternative content ID: {alternative_content_id}')
  149. content_type = int_or_none(content_info.get('contentType'))
  150. format_base_url = None
  151. formats = []
  152. subtitles = {}
  153. # Live content
  154. if content_type == 20:
  155. dvr_enabled = traverse_obj(content_info, ('playerSettings', 'dvrEnabled'), expected_type=bool)
  156. format_base_url = f"https://{host}/live/{content_info['liveSourceID']}/{video_id}/%s{'?DVR' if dvr_enabled else ''}"
  157. live_status = 'is_live'
  158. heartbeat = self._call_api(host, f'heartbeat/{video_id}', video_id, note='Checking stream status') or {}
  159. if heartbeat.get('heartBeatUp') is False:
  160. self.raise_no_formats(f'MainStreaming said: {heartbeat.get("responseMessage")}', expected=True)
  161. live_status = 'was_live'
  162. # Playlist
  163. elif content_type == 31:
  164. return self.playlist_result(
  165. self._playlist_entries(host, content_info.get('playlistContents')), video_id, title, description)
  166. # Normal video content?
  167. elif content_type == 10:
  168. format_base_url = f'https://{host}/vod/{video_id}/%s'
  169. # Progressive format
  170. # Note: in https://webtools.msvdn.net/loader/playerV2.js there is mention of original.mp3 format,
  171. # however it seems to be the same as original.mp4?
  172. formats.append({'url': format_base_url % 'original.mp4', 'format_note': 'original', 'quality': 1})
  173. else:
  174. self.raise_no_formats(f'Unknown content type {content_type}')
  175. if format_base_url:
  176. m3u8_formats, m3u8_subs = self._extract_m3u8_formats_and_subtitles(
  177. format_base_url % 'playlist.m3u8', video_id=video_id, fatal=False)
  178. mpd_formats, mpd_subs = self._extract_mpd_formats_and_subtitles(
  179. format_base_url % 'manifest.mpd', video_id=video_id, fatal=False)
  180. subtitles = self._merge_subtitles(m3u8_subs, mpd_subs)
  181. formats.extend(m3u8_formats + mpd_formats)
  182. return {
  183. 'id': video_id,
  184. 'title': title,
  185. 'description': description,
  186. 'formats': formats,
  187. 'live_status': live_status,
  188. 'duration': parse_duration(content_info.get('duration')),
  189. 'tags': content_info.get('tags'),
  190. 'subtitles': subtitles,
  191. 'thumbnail': urljoin(self._get_webtools_base_url(host), f'image/{video_id}/poster'),
  192. }