videoken.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337
  1. import base64
  2. import functools
  3. import math
  4. import re
  5. import time
  6. import urllib.parse
  7. from .common import InfoExtractor
  8. from .slideslive import SlidesLiveIE
  9. from ..utils import (
  10. ExtractorError,
  11. InAdvancePagedList,
  12. int_or_none,
  13. remove_start,
  14. traverse_obj,
  15. update_url_query,
  16. url_or_none,
  17. )
  18. class VideoKenBaseIE(InfoExtractor):
  19. _ORGANIZATIONS = {
  20. 'videos.icts.res.in': 'icts',
  21. 'videos.cncf.io': 'cncf',
  22. 'videos.neurips.cc': 'neurips',
  23. }
  24. _BASE_URL_RE = rf'https?://(?P<host>{"|".join(map(re.escape, _ORGANIZATIONS))})/'
  25. _PAGE_SIZE = 12
  26. def _get_org_id_and_api_key(self, org, video_id):
  27. details = self._download_json(
  28. f'https://analytics.videoken.com/api/videolake/{org}/details', video_id,
  29. note='Downloading organization ID and API key', headers={
  30. 'Accept': 'application/json',
  31. })
  32. return details['id'], details['apikey']
  33. def _create_slideslive_url(self, video_url, video_id, referer):
  34. if not video_url and not video_id:
  35. return
  36. elif not video_url or 'embed/sign-in' in video_url:
  37. video_url = f'https://slideslive.com/embed/{remove_start(video_id, "slideslive-")}'
  38. if url_or_none(referer):
  39. return update_url_query(video_url, {
  40. 'embed_parent_url': referer,
  41. 'embed_container_origin': f'https://{urllib.parse.urlparse(referer).hostname}',
  42. })
  43. return video_url
  44. def _extract_videos(self, videos, url):
  45. for video in traverse_obj(videos, (('videos', 'results'), ...)):
  46. video_id = traverse_obj(video, 'youtube_id', 'videoid')
  47. if not video_id:
  48. continue
  49. ie_key = None
  50. if traverse_obj(video, 'type', 'source') == 'youtube':
  51. video_url = video_id
  52. ie_key = 'Youtube'
  53. else:
  54. video_url = traverse_obj(video, 'embed_url', 'embeddableurl', expected_type=url_or_none)
  55. if not video_url:
  56. continue
  57. elif urllib.parse.urlparse(video_url).hostname == 'slideslive.com':
  58. ie_key = SlidesLiveIE
  59. video_url = self._create_slideslive_url(video_url, video_id, url)
  60. yield self.url_result(video_url, ie_key, video_id)
  61. class VideoKenIE(VideoKenBaseIE):
  62. _VALID_URL = VideoKenBaseIE._BASE_URL_RE + r'(?:(?:topic|category)/[^/#?]+/)?video/(?P<id>[\w-]+)'
  63. _TESTS = [{
  64. # neurips -> videoken -> slideslive
  65. 'url': 'https://videos.neurips.cc/video/slideslive-38922815',
  66. 'info_dict': {
  67. 'id': '38922815',
  68. 'ext': 'mp4',
  69. 'title': 'Efficient Processing of Deep Neural Network: from Algorithms to Hardware Architectures',
  70. 'timestamp': 1630939331,
  71. 'upload_date': '20210906',
  72. 'thumbnail': r're:^https?://.*\.(?:jpg|png)',
  73. 'thumbnails': 'count:330',
  74. 'chapters': 'count:329',
  75. },
  76. 'params': {
  77. 'skip_download': 'm3u8',
  78. },
  79. 'expected_warnings': ['Failed to download VideoKen API JSON'],
  80. }, {
  81. # neurips -> videoken -> slideslive -> youtube
  82. 'url': 'https://videos.neurips.cc/topic/machine%20learning/video/slideslive-38923348',
  83. 'info_dict': {
  84. 'id': '2Xa_dt78rJE',
  85. 'ext': 'mp4',
  86. 'display_id': '38923348',
  87. 'title': 'Machine Education',
  88. 'description': 'Watch full version of this video at https://slideslive.com/38923348.',
  89. 'channel': 'SlidesLive Videos - G2',
  90. 'channel_id': 'UCOExahQQ588Da8Nft_Ltb9w',
  91. 'channel_url': 'https://www.youtube.com/channel/UCOExahQQ588Da8Nft_Ltb9w',
  92. 'uploader': 'SlidesLive Videos - G2',
  93. 'uploader_id': 'UCOExahQQ588Da8Nft_Ltb9w',
  94. 'uploader_url': 'http://www.youtube.com/channel/UCOExahQQ588Da8Nft_Ltb9w',
  95. 'duration': 2504,
  96. 'timestamp': 1618922125,
  97. 'upload_date': '20200131',
  98. 'age_limit': 0,
  99. 'channel_follower_count': int,
  100. 'view_count': int,
  101. 'availability': 'unlisted',
  102. 'live_status': 'not_live',
  103. 'playable_in_embed': True,
  104. 'categories': ['People & Blogs'],
  105. 'tags': [],
  106. 'thumbnail': r're:^https?://.*\.(?:jpg|webp)',
  107. 'thumbnails': 'count:78',
  108. 'chapters': 'count:77',
  109. },
  110. 'params': {
  111. 'skip_download': 'm3u8',
  112. },
  113. 'expected_warnings': ['Failed to download VideoKen API JSON'],
  114. }, {
  115. # icts -> videoken -> youtube
  116. 'url': 'https://videos.icts.res.in/topic/random%20variable/video/zysIsojYdvc',
  117. 'info_dict': {
  118. 'id': 'zysIsojYdvc',
  119. 'ext': 'mp4',
  120. 'title': 'Small-worlds, complex networks and random graphs (Lecture 3) by Remco van der Hofstad',
  121. 'description': 'md5:87433069d79719eeadc1962cc2ace00b',
  122. 'channel': 'International Centre for Theoretical Sciences',
  123. 'channel_id': 'UCO3xnVTHzB7l-nc8mABUJIQ',
  124. 'channel_url': 'https://www.youtube.com/channel/UCO3xnVTHzB7l-nc8mABUJIQ',
  125. 'uploader': 'International Centre for Theoretical Sciences',
  126. 'uploader_id': 'ICTStalks',
  127. 'uploader_url': 'http://www.youtube.com/user/ICTStalks',
  128. 'duration': 3372,
  129. 'upload_date': '20191004',
  130. 'age_limit': 0,
  131. 'live_status': 'not_live',
  132. 'availability': 'public',
  133. 'playable_in_embed': True,
  134. 'channel_follower_count': int,
  135. 'like_count': int,
  136. 'view_count': int,
  137. 'categories': ['Science & Technology'],
  138. 'tags': [],
  139. 'thumbnail': r're:^https?://.*\.(?:jpg|webp)',
  140. 'thumbnails': 'count:42',
  141. 'chapters': 'count:20',
  142. },
  143. 'params': {
  144. 'skip_download': 'm3u8',
  145. },
  146. }, {
  147. 'url': 'https://videos.cncf.io/category/478/video/IL4nxbmUIX8',
  148. 'only_matching': True,
  149. }, {
  150. 'url': 'https://videos.cncf.io/topic/kubernetes/video/YAM2d7yTrrI',
  151. 'only_matching': True,
  152. }, {
  153. 'url': 'https://videos.icts.res.in/video/d7HuP_abpKU',
  154. 'only_matching': True,
  155. }]
  156. def _real_extract(self, url):
  157. hostname, video_id = self._match_valid_url(url).group('host', 'id')
  158. org_id, _ = self._get_org_id_and_api_key(self._ORGANIZATIONS[hostname], video_id)
  159. details = self._download_json(
  160. 'https://analytics.videoken.com/api/videoinfo_private', video_id, query={
  161. 'videoid': video_id,
  162. 'org_id': org_id,
  163. }, headers={'Accept': 'application/json'}, note='Downloading VideoKen API JSON',
  164. errnote='Failed to download VideoKen API JSON', fatal=False)
  165. if details:
  166. return next(self._extract_videos({'videos': [details]}, url))
  167. # fallback for API error 400 response
  168. elif video_id.startswith('slideslive-'):
  169. return self.url_result(
  170. self._create_slideslive_url(None, video_id, url), SlidesLiveIE, video_id)
  171. elif re.match(r'^[\w-]{11}$', video_id):
  172. return self.url_result(video_id, 'Youtube', video_id)
  173. else:
  174. raise ExtractorError('Unable to extract without VideoKen API response')
  175. class VideoKenPlayerIE(VideoKenBaseIE):
  176. _VALID_URL = r'https?://player\.videoken\.com/embed/slideslive-(?P<id>\d+)'
  177. _TESTS = [{
  178. 'url': 'https://player.videoken.com/embed/slideslive-38968434',
  179. 'info_dict': {
  180. 'id': '38968434',
  181. 'ext': 'mp4',
  182. 'title': 'Deep Learning with Label Differential Privacy',
  183. 'timestamp': 1643377020,
  184. 'upload_date': '20220128',
  185. 'thumbnail': r're:^https?://.*\.(?:jpg|png)',
  186. 'thumbnails': 'count:30',
  187. 'chapters': 'count:29',
  188. },
  189. 'params': {
  190. 'skip_download': 'm3u8',
  191. },
  192. }]
  193. def _real_extract(self, url):
  194. video_id = self._match_id(url)
  195. return self.url_result(
  196. self._create_slideslive_url(None, video_id, url), SlidesLiveIE, video_id)
  197. class VideoKenPlaylistIE(VideoKenBaseIE):
  198. _VALID_URL = VideoKenBaseIE._BASE_URL_RE + r'(?:category/\d+/)?playlist/(?P<id>\d+)'
  199. _TESTS = [{
  200. 'url': 'https://videos.icts.res.in/category/1822/playlist/381',
  201. 'playlist_mincount': 117,
  202. 'info_dict': {
  203. 'id': '381',
  204. 'title': 'Cosmology - The Next Decade',
  205. },
  206. }]
  207. def _real_extract(self, url):
  208. hostname, playlist_id = self._match_valid_url(url).group('host', 'id')
  209. org_id, _ = self._get_org_id_and_api_key(self._ORGANIZATIONS[hostname], playlist_id)
  210. videos = self._download_json(
  211. f'https://analytics.videoken.com/api/{org_id}/playlistitems/{playlist_id}/',
  212. playlist_id, headers={'Accept': 'application/json'}, note='Downloading API JSON')
  213. return self.playlist_result(self._extract_videos(videos, url), playlist_id, videos.get('title'))
  214. class VideoKenCategoryIE(VideoKenBaseIE):
  215. _VALID_URL = VideoKenBaseIE._BASE_URL_RE + r'category/(?P<id>\d+)/?(?:$|[?#])'
  216. _TESTS = [{
  217. 'url': 'https://videos.icts.res.in/category/1822/',
  218. 'playlist_mincount': 500,
  219. 'info_dict': {
  220. 'id': '1822',
  221. 'title': 'Programs',
  222. },
  223. }, {
  224. 'url': 'https://videos.neurips.cc/category/350/',
  225. 'playlist_mincount': 34,
  226. 'info_dict': {
  227. 'id': '350',
  228. 'title': 'NeurIPS 2018',
  229. },
  230. }, {
  231. 'url': 'https://videos.cncf.io/category/479/',
  232. 'playlist_mincount': 328,
  233. 'info_dict': {
  234. 'id': '479',
  235. 'title': 'KubeCon + CloudNativeCon Europe\'19',
  236. },
  237. }]
  238. def _get_category_page(self, category_id, org_id, page=1, note=None):
  239. return self._download_json(
  240. f'https://analytics.videoken.com/api/videolake/{org_id}/category_videos', category_id,
  241. fatal=False, note=note if note else f'Downloading category page {page}',
  242. query={
  243. 'category_id': category_id,
  244. 'page_number': page,
  245. 'length': self._PAGE_SIZE,
  246. }, headers={'Accept': 'application/json'}) or {}
  247. def _entries(self, category_id, org_id, url, page):
  248. videos = self._get_category_page(category_id, org_id, page + 1)
  249. yield from self._extract_videos(videos, url)
  250. def _real_extract(self, url):
  251. hostname, category_id = self._match_valid_url(url).group('host', 'id')
  252. org_id, _ = self._get_org_id_and_api_key(self._ORGANIZATIONS[hostname], category_id)
  253. category_info = self._get_category_page(category_id, org_id, note='Downloading category info')
  254. category = category_info['category_name']
  255. total_pages = math.ceil(int(category_info['recordsTotal']) / self._PAGE_SIZE)
  256. return self.playlist_result(InAdvancePagedList(
  257. functools.partial(self._entries, category_id, org_id, url),
  258. total_pages, self._PAGE_SIZE), category_id, category)
  259. class VideoKenTopicIE(VideoKenBaseIE):
  260. _VALID_URL = VideoKenBaseIE._BASE_URL_RE + r'topic/(?P<id>[^/#?]+)/?(?:$|[?#])'
  261. _TESTS = [{
  262. 'url': 'https://videos.neurips.cc/topic/machine%20learning/',
  263. 'playlist_mincount': 500,
  264. 'info_dict': {
  265. 'id': 'machine_learning',
  266. 'title': 'machine learning',
  267. },
  268. }, {
  269. 'url': 'https://videos.icts.res.in/topic/gravitational%20waves/',
  270. 'playlist_mincount': 77,
  271. 'info_dict': {
  272. 'id': 'gravitational_waves',
  273. 'title': 'gravitational waves',
  274. },
  275. }, {
  276. 'url': 'https://videos.cncf.io/topic/prometheus/',
  277. 'playlist_mincount': 134,
  278. 'info_dict': {
  279. 'id': 'prometheus',
  280. 'title': 'prometheus',
  281. },
  282. }]
  283. def _get_topic_page(self, topic, org_id, search_id, api_key, page=1, note=None):
  284. return self._download_json(
  285. 'https://es.videoken.com/api/v1.0/get_results', topic, fatal=False, query={
  286. 'orgid': org_id,
  287. 'size': self._PAGE_SIZE,
  288. 'query': topic,
  289. 'page': page,
  290. 'sort': 'upload_desc',
  291. 'filter': 'all',
  292. 'token': api_key,
  293. 'is_topic': 'true',
  294. 'category': '',
  295. 'searchid': search_id,
  296. }, headers={'Accept': 'application/json'},
  297. note=note if note else f'Downloading topic page {page}') or {}
  298. def _entries(self, topic, org_id, search_id, api_key, url, page):
  299. videos = self._get_topic_page(topic, org_id, search_id, api_key, page + 1)
  300. yield from self._extract_videos(videos, url)
  301. def _real_extract(self, url):
  302. hostname, topic_id = self._match_valid_url(url).group('host', 'id')
  303. topic = urllib.parse.unquote(topic_id)
  304. topic_id = topic.replace(' ', '_')
  305. org_id, api_key = self._get_org_id_and_api_key(self._ORGANIZATIONS[hostname], topic)
  306. search_id = base64.b64encode(f':{topic}:{int(time.time())}:transient'.encode()).decode()
  307. total_pages = int_or_none(self._get_topic_page(
  308. topic, org_id, search_id, api_key, note='Downloading topic info')['total_no_of_pages'])
  309. return self.playlist_result(InAdvancePagedList(
  310. functools.partial(self._entries, topic, org_id, search_id, api_key, url),
  311. total_pages, self._PAGE_SIZE), topic_id, topic)