# rtvcplay.py (11 KB)
  1. import re
  2. from .common import InfoExtractor
  3. from ..utils import (
  4. ExtractorError,
  5. clean_html,
  6. determine_ext,
  7. float_or_none,
  8. int_or_none,
  9. js_to_json,
  10. mimetype2ext,
  11. traverse_obj,
  12. url_or_none,
  13. urljoin,
  14. )
  15. class RTVCPlayBaseIE(InfoExtractor):
  16. _BASE_VALID_URL = r'https?://(?:www\.)?rtvcplay\.co'
  17. def _extract_player_config(self, webpage, video_id):
  18. return self._search_json(
  19. r'<script\b[^>]*>[^<]*(?:var|let|const)\s+config\s*=', re.sub(r'"\s*\+\s*"', '', webpage),
  20. 'player_config', video_id, transform_source=js_to_json)
  21. def _extract_formats_and_subtitles_player_config(self, player_config, video_id):
  22. formats, subtitles = [], {}
  23. for source in traverse_obj(player_config, ('sources', ..., lambda _, v: url_or_none(v['url']))):
  24. ext = mimetype2ext(source.get('mimetype'), default=determine_ext(source['url']))
  25. if ext == 'm3u8':
  26. fmts, subs = self._extract_m3u8_formats_and_subtitles(
  27. source['url'], video_id, 'mp4', fatal=False)
  28. formats.extend(fmts)
  29. self._merge_subtitles(subs, target=subtitles)
  30. else:
  31. formats.append({
  32. 'url': source['url'],
  33. 'ext': ext,
  34. })
  35. return formats, subtitles
  36. class RTVCPlayIE(RTVCPlayBaseIE):
  37. _VALID_URL = RTVCPlayBaseIE._BASE_VALID_URL + r'/(?P<category>(?!embed)[^/]+)/(?:[^?#]+/)?(?P<id>[\w-]+)'
  38. _TESTS = [{
  39. 'url': 'https://www.rtvcplay.co/en-vivo/canal-institucional',
  40. 'info_dict': {
  41. 'id': 'canal-institucional',
  42. 'title': r're:^Canal Institucional',
  43. 'description': 'md5:eff9e548394175928059320c006031ea',
  44. 'thumbnail': r're:^https?://.*\.(?:jpg|png)',
  45. 'live_status': 'is_live',
  46. 'ext': 'mp4',
  47. },
  48. 'params': {
  49. 'skip_download': 'Livestream',
  50. },
  51. }, {
  52. 'url': 'https://www.rtvcplay.co/en-vivo/senal-colombia',
  53. 'info_dict': {
  54. 'id': 'senal-colombia',
  55. 'title': r're:^Señal Colombia',
  56. 'description': 'md5:799f16a401d97f40c33a2c6a3e2a507b',
  57. 'thumbnail': r're:^https?://.*\.(?:jpg|png)',
  58. 'live_status': 'is_live',
  59. 'ext': 'mp4',
  60. },
  61. 'params': {
  62. 'skip_download': 'Livestream',
  63. },
  64. }, {
  65. 'url': 'https://www.rtvcplay.co/en-vivo/radio-nacional',
  66. 'info_dict': {
  67. 'id': 'radio-nacional',
  68. 'title': r're:^Radio Nacional',
  69. 'description': 'md5:5de009bc6a9fa79d2a6cf0b73f977d53',
  70. 'thumbnail': r're:^https?://.*\.(?:jpg|png)',
  71. 'live_status': 'is_live',
  72. 'ext': 'mp4',
  73. },
  74. 'params': {
  75. 'skip_download': 'Livestream',
  76. },
  77. }, {
  78. 'url': 'https://www.rtvcplay.co/peliculas-ficcion/senoritas',
  79. 'md5': '1288ee6f6d1330d880f98bff2ed710a3',
  80. 'info_dict': {
  81. 'id': 'senoritas',
  82. 'title': 'Señoritas',
  83. 'description': 'md5:f095a2bb52cb6cf279daf6302f86fb32',
  84. 'thumbnail': r're:^https?://.*\.(?:jpg|png)',
  85. 'ext': 'mp4',
  86. },
  87. }, {
  88. 'url': 'https://www.rtvcplay.co/competencias-basicas-ciudadanas-y-socioemocionales/profe-en-tu-casa/james-regresa-clases-28022022',
  89. 'md5': 'f040a7380a269ad633cf837384d5e9fc',
  90. 'info_dict': {
  91. 'id': 'james-regresa-clases-28022022',
  92. 'title': 'James regresa a clases - 28/02/2022',
  93. 'description': 'md5:c5dcdf757c7ab29305e8763c6007e675',
  94. 'ext': 'mp4',
  95. },
  96. }, {
  97. 'url': 'https://www.rtvcplay.co/peliculas-documentales/llinas-el-cerebro-y-el-universo',
  98. 'info_dict': {
  99. 'id': 'llinas-el-cerebro-y-el-universo',
  100. 'title': 'Llinás, el cerebro y el universo',
  101. 'description': 'md5:add875bf2309bb52b3e8b9b06116d9b0',
  102. 'thumbnail': r're:^https?://.*\.(?:jpg|png)',
  103. },
  104. 'playlist_mincount': 3,
  105. }, {
  106. 'url': 'https://www.rtvcplay.co/competencias-basicas-ciudadanas-y-socioemocionales/profe-en-tu-casa',
  107. 'info_dict': {
  108. 'id': 'profe-en-tu-casa',
  109. 'title': 'Profe en tu casa',
  110. 'description': 'md5:47dbe20e263194413b1db2a2805a4f2e',
  111. 'thumbnail': r're:^https?://.*\.(?:jpg|png)',
  112. },
  113. 'playlist_mincount': 537,
  114. }, {
  115. 'url': 'https://www.rtvcplay.co/series-al-oido/relato-de-un-naufrago-una-travesia-del-periodismo-a-la-literatura',
  116. 'info_dict': {
  117. 'id': 'relato-de-un-naufrago-una-travesia-del-periodismo-a-la-literatura',
  118. 'title': 'Relato de un náufrago: una travesía del periodismo a la literatura',
  119. 'description': 'md5:6da28fdca4a5a568ea47ef65ef775603',
  120. 'thumbnail': r're:^https?://.*\.(?:jpg|png)',
  121. },
  122. 'playlist_mincount': 5,
  123. }, {
  124. 'url': 'https://www.rtvcplay.co/series-al-oido/diez-versiones',
  125. 'info_dict': {
  126. 'id': 'diez-versiones',
  127. 'title': 'Diez versiones',
  128. 'description': 'md5:997471ed971cb3fd8e41969457675306',
  129. 'thumbnail': r're:^https?://.*\.(?:jpg|png)',
  130. },
  131. 'playlist_mincount': 20,
  132. }]
  133. def _real_extract(self, url):
  134. video_id, category = self._match_valid_url(url).group('id', 'category')
  135. webpage = self._download_webpage(url, video_id)
  136. hydration = self._search_json(
  137. r'window\.__RTVCPLAY_STATE__\s*=', webpage, 'hydration',
  138. video_id, transform_source=js_to_json)['content']['currentContent']
  139. asset_id = traverse_obj(hydration, ('video', 'assetid'))
  140. if asset_id:
  141. hls_url = hydration['base_url_hls'].replace('[node:field_asset_id]', asset_id)
  142. else:
  143. hls_url = traverse_obj(hydration, ('channel', 'hls'))
  144. metadata = traverse_obj(hydration, {
  145. 'title': 'title',
  146. 'description': 'description',
  147. 'thumbnail': ((('channel', 'image', 'logo'), ('resource', 'image', 'cover_desktop')), 'path'),
  148. }, get_all=False)
  149. # Probably it's a program's page
  150. if not hls_url:
  151. seasons = traverse_obj(
  152. hydration, ('widgets', lambda _, y: y['type'] == 'seasonList', 'contents'),
  153. get_all=False)
  154. if not seasons:
  155. podcast_episodes = hydration.get('audios')
  156. if not podcast_episodes:
  157. raise ExtractorError('Could not find asset_id nor program playlist nor podcast episodes')
  158. return self.playlist_result([
  159. self.url_result(episode['file'], url_transparent=True, **traverse_obj(episode, {
  160. 'title': 'title',
  161. 'description': ('description', {clean_html}),
  162. 'episode_number': ('chapter_number', {float_or_none}, {int_or_none}),
  163. 'season_number': ('season', {int_or_none}),
  164. })) for episode in podcast_episodes], video_id, **metadata)
  165. entries = [self.url_result(
  166. urljoin(url, episode['slug']), url_transparent=True,
  167. **traverse_obj(season, {
  168. 'season': 'title',
  169. 'season_number': ('season', {int_or_none}),
  170. }), **traverse_obj(episode, {
  171. 'title': 'title',
  172. 'thumbnail': ('image', 'cover', 'path'),
  173. 'episode_number': ('chapter_number', {int_or_none}),
  174. })) for season in seasons for episode in traverse_obj(season, ('contents', ...))]
  175. return self.playlist_result(entries, video_id, **metadata)
  176. formats, subtitles = self._extract_m3u8_formats_and_subtitles(hls_url, video_id, 'mp4')
  177. return {
  178. 'id': video_id,
  179. 'formats': formats,
  180. 'subtitles': subtitles,
  181. 'is_live': category == 'en-vivo',
  182. **metadata,
  183. }
  184. class RTVCPlayEmbedIE(RTVCPlayBaseIE):
  185. _VALID_URL = RTVCPlayBaseIE._BASE_VALID_URL + r'/embed/(?P<id>[\w-]+)'
  186. _TESTS = [{
  187. 'url': 'https://www.rtvcplay.co/embed/72b0e699-248b-4929-a4a8-3782702fa7f9',
  188. 'md5': 'ed529aeaee7aa2a72afe91ac7d1177a8',
  189. 'info_dict': {
  190. 'id': '72b0e699-248b-4929-a4a8-3782702fa7f9',
  191. 'title': 'Tráiler: Señoritas',
  192. 'thumbnail': r're:^https?://.*\.(?:jpg|png)',
  193. 'ext': 'mp4',
  194. },
  195. }]
  196. def _real_extract(self, url):
  197. video_id = self._match_id(url)
  198. webpage = self._download_webpage(url, video_id)
  199. player_config = self._extract_player_config(webpage, video_id)
  200. formats, subtitles = self._extract_formats_and_subtitles_player_config(player_config, video_id)
  201. asset_id = traverse_obj(player_config, ('rtvcplay', 'assetid'))
  202. metadata = {} if not asset_id else self._download_json(
  203. f'https://cms.rtvcplay.co/api/v1/video/asset-id/{asset_id}', video_id, fatal=False)
  204. return {
  205. 'id': video_id,
  206. 'formats': formats,
  207. 'subtitles': subtitles,
  208. **traverse_obj(metadata, {
  209. 'title': 'title',
  210. 'description': 'description',
  211. 'thumbnail': ('image', ..., 'thumbnail', 'path'),
  212. }, get_all=False),
  213. }
  214. class RTVCKalturaIE(RTVCPlayBaseIE):
  215. _VALID_URL = r'https?://media\.rtvc\.gov\.co/kalturartvc/(?P<id>[\w-]+)'
  216. _TESTS = [{
  217. 'url': 'https://media.rtvc.gov.co/kalturartvc/indexSC.html',
  218. 'info_dict': {
  219. 'id': 'indexSC',
  220. 'title': r're:^Señal Colombia',
  221. 'description': 'md5:799f16a401d97f40c33a2c6a3e2a507b',
  222. 'thumbnail': r're:^https?://.*\.(?:jpg|png)',
  223. 'live_status': 'is_live',
  224. 'ext': 'mp4',
  225. },
  226. 'params': {
  227. 'skip_download': 'Livestream',
  228. },
  229. }]
  230. def _real_extract(self, url):
  231. video_id = self._match_id(url)
  232. webpage = self._download_webpage(url, video_id)
  233. player_config = self._extract_player_config(webpage, video_id)
  234. formats, subtitles = self._extract_formats_and_subtitles_player_config(player_config, video_id)
  235. channel_id = traverse_obj(player_config, ('rtvcplay', 'channelId'))
  236. metadata = {} if not channel_id else self._download_json(
  237. f'https://cms.rtvcplay.co/api/v1/taxonomy_term/streaming/{channel_id}', video_id, fatal=False)
  238. fmts, subs = self._extract_m3u8_formats_and_subtitles(
  239. traverse_obj(metadata, ('channel', 'hls')), video_id, 'mp4', fatal=False)
  240. formats.extend(fmts)
  241. self._merge_subtitles(subs, target=subtitles)
  242. return {
  243. 'id': video_id,
  244. 'formats': formats,
  245. 'subtitles': subtitles,
  246. 'is_live': True,
  247. **traverse_obj(metadata, {
  248. 'title': 'title',
  249. 'description': 'description',
  250. 'thumbnail': ('channel', 'image', 'logo', 'path'),
  251. }),
  252. }