sproutvideo.py 8.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198
  1. import base64
  2. import urllib.parse
  3. from .common import InfoExtractor
  4. from ..networking.exceptions import HTTPError
  5. from ..utils import (
  6. ExtractorError,
  7. int_or_none,
  8. qualities,
  9. remove_start,
  10. smuggle_url,
  11. unsmuggle_url,
  12. update_url_query,
  13. url_or_none,
  14. urlencode_postdata,
  15. )
  16. from ..utils.traversal import traverse_obj
  17. class SproutVideoIE(InfoExtractor):
  18. _NO_SCHEME_RE = r'//videos\.sproutvideo\.com/embed/(?P<id>[\da-f]+)/[\da-f]+'
  19. _VALID_URL = rf'https?:{_NO_SCHEME_RE}'
  20. _EMBED_REGEX = [rf'<iframe [^>]*\bsrc=["\'](?P<url>(?:https?:)?{_NO_SCHEME_RE}[^"\']*)["\']']
  21. _TESTS = [{
  22. 'url': 'https://videos.sproutvideo.com/embed/4c9dddb01910e3c9c4/0fc24387c4f24ee3',
  23. 'md5': '1343ce1a6cb39d67889bfa07c7b02b0e',
  24. 'info_dict': {
  25. 'id': '4c9dddb01910e3c9c4',
  26. 'ext': 'mp4',
  27. 'title': 'Adrien Labaeye : Berlin, des communautés aux communs',
  28. 'duration': 576,
  29. 'thumbnail': r're:https?://images\.sproutvideo\.com/.+\.jpg',
  30. },
  31. }, {
  32. 'url': 'https://videos.sproutvideo.com/embed/a79fdcb21f1be2c62e/93bf31e41e39ca27',
  33. 'md5': 'cebae5cf558cca83271917cf4ec03f26',
  34. 'info_dict': {
  35. 'id': 'a79fdcb21f1be2c62e',
  36. 'ext': 'mp4',
  37. 'title': 'HS_01_Live Stream 2023-01-14 10:00',
  38. 'duration': 703,
  39. 'thumbnail': r're:https?://images\.sproutvideo\.com/.+\.jpg',
  40. },
  41. }, {
  42. # http formats 'sd' and 'hd' are available
  43. 'url': 'https://videos.sproutvideo.com/embed/119cd6bc1a18e6cd98/30751a1761ae5b90',
  44. 'md5': 'f368c78df07e78a749508b221528672c',
  45. 'info_dict': {
  46. 'id': '119cd6bc1a18e6cd98',
  47. 'ext': 'mp4',
  48. 'title': '3. Updating your Partner details',
  49. 'thumbnail': r're:https?://images\.sproutvideo\.com/.+\.jpg',
  50. 'duration': 60,
  51. },
  52. 'params': {'format': 'hd'},
  53. }, {
  54. # subtitles
  55. 'url': 'https://videos.sproutvideo.com/embed/119dd8ba121ee0cc98/4ee50c88a343215d?type=hd',
  56. 'md5': '7f6798f037d7a3e3e07e67959de68fc6',
  57. 'info_dict': {
  58. 'id': '119dd8ba121ee0cc98',
  59. 'ext': 'mp4',
  60. 'title': 'Recipients Setup - Domestic Wire Only',
  61. 'thumbnail': r're:https?://images\.sproutvideo\.com/.+\.jpg',
  62. 'duration': 77,
  63. 'subtitles': {'en': 'count:1'},
  64. },
  65. }]
  66. _WEBPAGE_TESTS = [{
  67. 'url': 'https://www.solidarum.org/vivre-ensemble/adrien-labaeye-berlin-des-communautes-aux-communs',
  68. 'info_dict': {
  69. 'id': '4c9dddb01910e3c9c4',
  70. 'ext': 'mp4',
  71. 'title': 'Adrien Labaeye : Berlin, des communautés aux communs',
  72. 'duration': 576,
  73. 'thumbnail': r're:https?://images\.sproutvideo\.com/.+\.jpg',
  74. },
  75. }]
  76. _M3U8_URL_TMPL = 'https://{base}.videos.sproutvideo.com/{s3_user_hash}/{s3_video_hash}/video/index.m3u8'
  77. _QUALITIES = ('hd', 'uhd', 'source') # Exclude 'sd' to prioritize hls formats above it
  78. @staticmethod
  79. def _policy_to_qs(policy, signature_key, as_string=False):
  80. query = {}
  81. for key, value in policy['signatures'][signature_key].items():
  82. query[remove_start(key, 'CloudFront-')] = value
  83. query['sessionID'] = policy['sessionID']
  84. return urllib.parse.urlencode(query, doseq=True) if as_string else query
  85. @classmethod
  86. def _extract_embed_urls(cls, url, webpage):
  87. for embed_url in super()._extract_embed_urls(url, webpage):
  88. if embed_url.startswith('//'):
  89. embed_url = f'https:{embed_url}'
  90. yield smuggle_url(embed_url, {'referer': url})
  91. def _real_extract(self, url):
  92. url, smuggled_data = unsmuggle_url(url, {})
  93. video_id = self._match_id(url)
  94. webpage = self._download_webpage(
  95. url, video_id, headers=traverse_obj(smuggled_data, {'Referer': 'referer'}))
  96. data = self._search_json(
  97. r'var\s+dat\s*=\s*["\']', webpage, 'data', video_id, contains_pattern=r'[A-Za-z0-9+/=]+',
  98. end_pattern=r'["\'];', transform_source=lambda x: base64.b64decode(x).decode())
  99. formats, subtitles = [], {}
  100. headers = {
  101. 'Accept': '*/*',
  102. 'Origin': 'https://videos.sproutvideo.com',
  103. 'Referer': url,
  104. }
  105. # HLS extraction is fatal; only attempt it if the JSON data says it's available
  106. if traverse_obj(data, 'hls'):
  107. manifest_query = self._policy_to_qs(data, 'm')
  108. fragment_query = self._policy_to_qs(data, 't', as_string=True)
  109. key_query = self._policy_to_qs(data, 'k', as_string=True)
  110. formats.extend(self._extract_m3u8_formats(
  111. self._M3U8_URL_TMPL.format(**data), video_id, 'mp4',
  112. m3u8_id='hls', headers=headers, query=manifest_query))
  113. for fmt in formats:
  114. fmt.update({
  115. 'url': update_url_query(fmt['url'], manifest_query),
  116. 'extra_param_to_segment_url': fragment_query,
  117. 'extra_param_to_key_url': key_query,
  118. })
  119. if downloads := traverse_obj(data, ('downloads', {dict.items}, lambda _, v: url_or_none(v[1]))):
  120. quality = qualities(self._QUALITIES)
  121. acodec = 'none' if data.get('has_audio') is False else None
  122. formats.extend([{
  123. 'format_id': str(format_id),
  124. 'url': format_url,
  125. 'ext': 'mp4',
  126. 'quality': quality(format_id),
  127. 'acodec': acodec,
  128. } for format_id, format_url in downloads])
  129. for sub_data in traverse_obj(data, ('subtitleData', lambda _, v: url_or_none(v['src']))):
  130. subtitles.setdefault(sub_data.get('srclang', 'en'), []).append({
  131. 'url': sub_data['src'],
  132. })
  133. return {
  134. 'id': video_id,
  135. 'formats': formats,
  136. 'subtitles': subtitles,
  137. 'http_headers': headers,
  138. **traverse_obj(data, {
  139. 'title': ('title', {str}),
  140. 'duration': ('duration', {int_or_none}),
  141. 'thumbnail': ('posterframe_url', {url_or_none}),
  142. }),
  143. }
  144. class VidsIoIE(InfoExtractor):
  145. IE_NAME = 'vids.io'
  146. _VALID_URL = r'https?://[\w-]+\.vids\.io/videos/(?P<id>[\da-f]+)/(?P<display_id>[\w-]+)'
  147. _TESTS = [{
  148. 'url': 'https://how-to-video.vids.io/videos/799cd8b11c10efc1f0/how-to-video-live-streaming',
  149. 'md5': '9bbbb2c0c0739eb163b80f87b8d77c9e',
  150. 'info_dict': {
  151. 'id': '799cd8b11c10efc1f0',
  152. 'ext': 'mp4',
  153. 'title': 'How to Video: Live Streaming',
  154. 'duration': 2787,
  155. 'thumbnail': r're:https?://images\.sproutvideo\.com/.+\.jpg',
  156. },
  157. }]
  158. def _real_extract(self, url):
  159. video_id, display_id = self._match_valid_url(url).group('id', 'display_id')
  160. webpage, urlh = self._download_webpage_handle(url, display_id, expected_status=403)
  161. if urlh.status == 403:
  162. password = self.get_param('videopassword')
  163. if not password:
  164. raise ExtractorError(
  165. 'This video is password-protected; use the --video-password option', expected=True)
  166. try:
  167. webpage = self._download_webpage(
  168. url, display_id, 'Submitting video password',
  169. data=urlencode_postdata({
  170. 'password': password,
  171. **self._hidden_inputs(webpage),
  172. }))
  173. # Requests with user's session cookie `_sproutvideo_session` are now authorized
  174. except ExtractorError as e:
  175. if isinstance(e.cause, HTTPError) and e.cause.status == 403:
  176. raise ExtractorError('Incorrect password', expected=True)
  177. raise
  178. if embed_url := next(SproutVideoIE._extract_embed_urls(url, webpage), None):
  179. return self.url_result(embed_url, SproutVideoIE, video_id)
  180. raise ExtractorError('Unable to extract any SproutVideo embed url')