glomex.py 8.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216
  1. import re
  2. import urllib.parse
  3. from .common import InfoExtractor
  4. from ..utils import (
  5. ExtractorError,
  6. determine_ext,
  7. extract_attributes,
  8. int_or_none,
  9. parse_qs,
  10. smuggle_url,
  11. unescapeHTML,
  12. unsmuggle_url,
  13. )
  14. class GlomexBaseIE(InfoExtractor):
  15. _DEFAULT_ORIGIN_URL = 'https://player.glomex.com/'
  16. _API_URL = 'https://integration-cloudfront-eu-west-1.mes.glomex.cloud/'
  17. @staticmethod
  18. def _smuggle_origin_url(url, origin_url):
  19. if origin_url is None:
  20. return url
  21. return smuggle_url(url, {'origin': origin_url})
  22. @classmethod
  23. def _unsmuggle_origin_url(cls, url, fallback_origin_url=None):
  24. defaults = {'origin': fallback_origin_url or cls._DEFAULT_ORIGIN_URL}
  25. unsmuggled_url, data = unsmuggle_url(url, default=defaults)
  26. return unsmuggled_url, data['origin']
  27. def _get_videoid_type(self, video_id):
  28. _VIDEOID_TYPES = {
  29. 'v': 'video',
  30. 'pl': 'playlist',
  31. 'rl': 'related videos playlist',
  32. 'cl': 'curated playlist',
  33. }
  34. prefix = video_id.split('-')[0]
  35. return _VIDEOID_TYPES.get(prefix, 'unknown type')
  36. def _download_api_data(self, video_id, integration, current_url=None):
  37. query = {
  38. 'integration_id': integration,
  39. 'playlist_id': video_id,
  40. 'current_url': current_url or self._DEFAULT_ORIGIN_URL,
  41. }
  42. video_id_type = self._get_videoid_type(video_id)
  43. return self._download_json(
  44. self._API_URL,
  45. video_id, f'Downloading {video_id_type} JSON',
  46. f'Unable to download {video_id_type} JSON',
  47. query=query)
  48. def _download_and_extract_api_data(self, video_id, integration, current_url):
  49. api_data = self._download_api_data(video_id, integration, current_url)
  50. videos = api_data['videos']
  51. if not videos:
  52. raise ExtractorError(f'no videos found for {video_id}')
  53. videos = [self._extract_api_data(video, video_id) for video in videos]
  54. return videos[0] if len(videos) == 1 else self.playlist_result(videos, video_id)
  55. def _extract_api_data(self, video, video_id):
  56. if video.get('error_code') == 'contentGeoblocked':
  57. self.raise_geo_restricted(countries=video['geo_locations'])
  58. formats, subs = [], {}
  59. for format_id, format_url in video['source'].items():
  60. ext = determine_ext(format_url)
  61. if ext == 'm3u8':
  62. formats_, subs_ = self._extract_m3u8_formats_and_subtitles(
  63. format_url, video_id, 'mp4', m3u8_id=format_id,
  64. fatal=False)
  65. formats.extend(formats_)
  66. self._merge_subtitles(subs_, target=subs)
  67. else:
  68. formats.append({
  69. 'url': format_url,
  70. 'format_id': format_id,
  71. })
  72. if video.get('language'):
  73. for fmt in formats:
  74. fmt['language'] = video['language']
  75. images = (video.get('images') or []) + [video.get('image') or {}]
  76. thumbnails = [{
  77. 'id': image.get('id'),
  78. 'url': f'{image["url"]}/profile:player-960x540',
  79. 'width': 960,
  80. 'height': 540,
  81. } for image in images if image.get('url')]
  82. self._remove_duplicate_formats(thumbnails)
  83. return {
  84. 'id': video.get('clip_id') or video_id,
  85. 'title': video.get('title'),
  86. 'description': video.get('description'),
  87. 'thumbnails': thumbnails,
  88. 'duration': int_or_none(video.get('clip_duration')),
  89. 'timestamp': video.get('created_at'),
  90. 'formats': formats,
  91. 'subtitles': subs,
  92. }
  93. class GlomexIE(GlomexBaseIE):
  94. IE_NAME = 'glomex'
  95. IE_DESC = 'Glomex videos'
  96. _VALID_URL = r'https?://video\.glomex\.com/[^/]+/(?P<id>v-[^-]+)'
  97. _INTEGRATION_ID = '19syy24xjn1oqlpc'
  98. _TESTS = [{
  99. 'url': 'https://video.glomex.com/sport/v-cb24uwg77hgh-nach-2-0-sieg-guardiola-mit-mancity-vor-naechstem-titel',
  100. 'md5': 'cec33a943c4240c9cb33abea8c26242e',
  101. 'info_dict': {
  102. 'id': 'v-cb24uwg77hgh',
  103. 'ext': 'mp4',
  104. 'title': 'md5:38a90cedcfadd72982c81acf13556e0c',
  105. 'description': 'md5:1ea6b6caff1443fcbbba159e432eedb8',
  106. 'duration': 29600,
  107. 'timestamp': 1619895017,
  108. 'upload_date': '20210501',
  109. },
  110. }]
  111. def _real_extract(self, url):
  112. video_id = self._match_id(url)
  113. return self.url_result(
  114. GlomexEmbedIE.build_player_url(video_id, self._INTEGRATION_ID, url),
  115. GlomexEmbedIE.ie_key(), video_id)
  116. class GlomexEmbedIE(GlomexBaseIE):
  117. IE_NAME = 'glomex:embed'
  118. IE_DESC = 'Glomex embedded videos'
  119. _BASE_PLAYER_URL = '//player.glomex.com/integration/1/iframe-player.html'
  120. _BASE_PLAYER_URL_RE = re.escape(_BASE_PLAYER_URL).replace('/1/', r'/[^/]/')
  121. _VALID_URL = rf'https?:{_BASE_PLAYER_URL_RE}\?([^#]+&)?playlistId=(?P<id>[^#&]+)'
  122. _TESTS = [{
  123. 'url': 'https://player.glomex.com/integration/1/iframe-player.html?integrationId=4059a013k56vb2yd&playlistId=v-cfa6lye0dkdd-sf',
  124. 'md5': '68f259b98cc01918ac34180142fce287',
  125. 'info_dict': {
  126. 'id': 'v-cfa6lye0dkdd-sf',
  127. 'ext': 'mp4',
  128. 'timestamp': 1635337199,
  129. 'duration': 133080,
  130. 'upload_date': '20211027',
  131. 'description': 'md5:e741185fc309310ff5d0c789b437be66',
  132. 'title': 'md5:35647293513a6c92363817a0fb0a7961',
  133. },
  134. }, {
  135. 'url': 'https://player.glomex.com/integration/1/iframe-player.html?origin=fullpage&integrationId=19syy24xjn1oqlpc&playlistId=rl-vcb49w1fb592p&playlistIndex=0',
  136. 'info_dict': {
  137. 'id': 'rl-vcb49w1fb592p',
  138. },
  139. 'playlist_count': 100,
  140. }, {
  141. 'url': 'https://player.glomex.com/integration/1/iframe-player.html?playlistId=cl-bgqaata6aw8x&integrationId=19syy24xjn1oqlpc',
  142. 'info_dict': {
  143. 'id': 'cl-bgqaata6aw8x',
  144. },
  145. 'playlist_mincount': 2,
  146. }]
  147. @classmethod
  148. def build_player_url(cls, video_id, integration, origin_url=None):
  149. query_string = urllib.parse.urlencode({
  150. 'playlistId': video_id,
  151. 'integrationId': integration,
  152. })
  153. return cls._smuggle_origin_url(f'https:{cls._BASE_PLAYER_URL}?{query_string}', origin_url)
  154. @classmethod
  155. def _extract_embed_urls(cls, url, webpage):
  156. # https://docs.glomex.com/publisher/video-player-integration/javascript-api/
  157. quot_re = r'["\']'
  158. regex = fr'''(?x)
  159. <iframe[^>]+?src=(?P<q>{quot_re})(?P<url>
  160. (?:https?:)?{cls._BASE_PLAYER_URL_RE}\?(?:(?!(?P=q)).)+
  161. )(?P=q)'''
  162. for mobj in re.finditer(regex, webpage):
  163. embed_url = unescapeHTML(mobj.group('url'))
  164. if cls.suitable(embed_url):
  165. yield cls._smuggle_origin_url(embed_url, url)
  166. regex = fr'''(?x)
  167. <glomex-player [^>]+?>|
  168. <div[^>]* data-glomex-player=(?P<q>{quot_re})true(?P=q)[^>]*>'''
  169. for mobj in re.finditer(regex, webpage):
  170. attrs = extract_attributes(mobj.group(0))
  171. if attrs.get('data-integration-id') and attrs.get('data-playlist-id'):
  172. yield cls.build_player_url(attrs['data-playlist-id'], attrs['data-integration-id'], url)
  173. # naive parsing of inline scripts for hard-coded integration parameters
  174. regex = fr'''(?x)
  175. (?P<is_js>dataset\.)?%s\s*(?(is_js)=|:)\s*
  176. (?P<q>{quot_re})(?P<id>(?:(?!(?P=q)).)+)(?P=q)\s'''
  177. for mobj in re.finditer(r'(?x)<script[^<]*>.+?</script>', webpage):
  178. script = mobj.group(0)
  179. integration_id = re.search(regex % 'integrationId', script)
  180. if not integration_id:
  181. continue
  182. playlist_id = re.search(regex % 'playlistId', script)
  183. if playlist_id:
  184. yield cls.build_player_url(playlist_id, integration_id, url)
  185. def _real_extract(self, url):
  186. url, origin_url = self._unsmuggle_origin_url(url)
  187. playlist_id = self._match_id(url)
  188. integration = parse_qs(url).get('integrationId', [None])[0]
  189. if not integration:
  190. raise ExtractorError('No integrationId in URL', expected=True)
  191. return self._download_and_extract_api_data(playlist_id, integration, origin_url)