tv5mondeplus.py 7.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177
  1. import urllib.parse
  2. from .common import InfoExtractor
  3. from ..utils import (
  4. clean_html,
  5. determine_ext,
  6. extract_attributes,
  7. get_element_by_class,
  8. get_element_html_by_class,
  9. int_or_none,
  10. url_or_none,
  11. )
  12. from ..utils.traversal import traverse_obj
  13. class TV5MondePlusIE(InfoExtractor):
  14. IE_NAME = 'TV5MONDE'
  15. _VALID_URL = r'https?://(?:www\.)?tv5monde\.com/tv/video/(?P<id>[^/?#]+)'
  16. _TESTS = [{
  17. # documentary
  18. 'url': 'https://www.tv5monde.com/tv/video/65931-baudouin-l-heritage-d-un-roi-baudouin-l-heritage-d-un-roi',
  19. 'md5': 'd2a708902d3df230a357c99701aece05',
  20. 'info_dict': {
  21. 'id': '3FPa7JMu21_6D4BA7b',
  22. 'display_id': '65931-baudouin-l-heritage-d-un-roi-baudouin-l-heritage-d-un-roi',
  23. 'ext': 'mp4',
  24. 'title': "Baudouin, l'héritage d'un roi",
  25. 'thumbnail': 'https://psi.tv5monde.com/upsilon-images/960x540/6f/baudouin-f49c6b0e.jpg',
  26. 'duration': 4842,
  27. 'upload_date': '20240130',
  28. 'timestamp': 1706641242,
  29. 'episode': "BAUDOUIN, L'HERITAGE D'UN ROI",
  30. 'description': 'md5:78125c74a5cac06d7743a2d09126edad',
  31. 'series': "Baudouin, l'héritage d'un roi",
  32. },
  33. }, {
  34. # series episode
  35. 'url': 'https://www.tv5monde.com/tv/video/52952-toute-la-vie-mardi-23-mars-2021',
  36. 'md5': 'f5e09637cadd55639c05874e22eb56bf',
  37. 'info_dict': {
  38. 'id': 'obRRZ8m6g9_6D4BA7b',
  39. 'display_id': '52952-toute-la-vie-mardi-23-mars-2021',
  40. 'ext': 'mp4',
  41. 'title': 'Toute la vie',
  42. 'description': 'md5:a824a2e1dfd94cf45fa379a1fb43ce65',
  43. 'thumbnail': 'https://psi.tv5monde.com/media/image/960px/5880553.jpg',
  44. 'duration': 2526,
  45. 'upload_date': '20230721',
  46. 'timestamp': 1689971646,
  47. 'series': 'Toute la vie',
  48. 'episode': 'Mardi 23 mars 2021',
  49. },
  50. }, {
  51. # movie
  52. 'url': 'https://www.tv5monde.com/tv/video/8771-ce-fleuve-qui-nous-charrie-ce-fleuve-qui-nous-charrie-p001-ce-fleuve-qui-nous-charrie',
  53. 'md5': '87cefc34e10a6bf4f7823cccd7b36eb2',
  54. 'info_dict': {
  55. 'id': 'DOcfvdLKXL_6D4BA7b',
  56. 'display_id': '8771-ce-fleuve-qui-nous-charrie-ce-fleuve-qui-nous-charrie-p001-ce-fleuve-qui-nous-charrie',
  57. 'ext': 'mp4',
  58. 'title': 'Ce fleuve qui nous charrie',
  59. 'description': 'md5:62ba3f875343c7fc4082bdfbbc1be992',
  60. 'thumbnail': 'https://psi.tv5monde.com/media/image/960px/5476617.jpg',
  61. 'duration': 5300,
  62. 'upload_date': '20210822',
  63. 'timestamp': 1629594105,
  64. 'episode': 'CE FLEUVE QUI NOUS CHARRIE-P001-CE FLEUVE QUI NOUS CHARRIE',
  65. 'series': 'Ce fleuve qui nous charrie',
  66. },
  67. }, {
  68. # news
  69. 'url': 'https://www.tv5monde.com/tv/video/70402-tv5monde-le-journal-edition-du-08-05-24-11h',
  70. 'md5': 'c62977d6d10754a2ecebba70ad370479',
  71. 'info_dict': {
  72. 'id': 'LgQFrOCNsc_6D4BA7b',
  73. 'display_id': '70402-tv5monde-le-journal-edition-du-08-05-24-11h',
  74. 'ext': 'mp4',
  75. 'title': 'TV5MONDE, le journal',
  76. 'description': 'md5:777dc209eaa4423b678477c36b0b04a8',
  77. 'thumbnail': 'https://psi.tv5monde.com/media/image/960px/6184105.jpg',
  78. 'duration': 854,
  79. 'upload_date': '20240508',
  80. 'timestamp': 1715159640,
  81. 'series': 'TV5MONDE, le journal',
  82. 'episode': 'EDITION DU 08/05/24 - 11H',
  83. },
  84. }]
  85. _GEO_BYPASS = False
  86. @staticmethod
  87. def _extract_subtitles(data_captions):
  88. subtitles = {}
  89. for f in traverse_obj(data_captions, ('files', lambda _, v: url_or_none(v['file']))):
  90. subtitles.setdefault(f.get('label') or 'fra', []).append({'url': f['file']})
  91. return subtitles
  92. def _real_extract(self, url):
  93. display_id = self._match_id(url)
  94. webpage = self._download_webpage(url, display_id)
  95. if ">Ce programme n'est malheureusement pas disponible pour votre zone géographique.<" in webpage:
  96. self.raise_geo_restricted(countries=['FR'])
  97. vpl_data = extract_attributes(self._search_regex(
  98. r'(<[^>]+class="video_player_loader"[^>]+>)',
  99. webpage, 'video player loader'))
  100. video_files = self._parse_json(
  101. vpl_data['data-broadcast'], display_id)
  102. formats = []
  103. video_id = None
  104. def process_video_files(v):
  105. nonlocal video_id
  106. for video_file in v:
  107. v_url = video_file.get('url')
  108. if not v_url:
  109. continue
  110. if video_file.get('type') == 'application/deferred':
  111. d_param = urllib.parse.quote(v_url)
  112. token = video_file.get('token')
  113. if not token:
  114. continue
  115. deferred_json = self._download_json(
  116. f'https://api.tv5monde.com/player/asset/{d_param}/resolve?condenseKS=true', display_id,
  117. note='Downloading deferred info', headers={'Authorization': f'Bearer {token}'}, fatal=False)
  118. v_url = traverse_obj(deferred_json, (0, 'url', {url_or_none}))
  119. if not v_url:
  120. continue
  121. # data-guid from the webpage isn't stable, use the material id from the json urls
  122. video_id = self._search_regex(
  123. r'materials/([\da-zA-Z]{10}_[\da-fA-F]{7})/', v_url, 'video id', default=None)
  124. process_video_files(deferred_json)
  125. video_format = video_file.get('format') or determine_ext(v_url)
  126. if video_format == 'm3u8':
  127. formats.extend(self._extract_m3u8_formats(
  128. v_url, display_id, 'mp4', 'm3u8_native',
  129. m3u8_id='hls', fatal=False))
  130. elif video_format == 'mpd':
  131. formats.extend(self._extract_mpd_formats(
  132. v_url, display_id, fatal=False))
  133. else:
  134. formats.append({
  135. 'url': v_url,
  136. 'format_id': video_format,
  137. })
  138. process_video_files(video_files)
  139. metadata = self._parse_json(
  140. vpl_data.get('data-metadata') or '{}', display_id, fatal=False)
  141. if not video_id:
  142. video_id = self._search_regex(
  143. (r'data-guid=["\']([\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})',
  144. r'id_contenu["\']\s:\s*(\d+)'), webpage, 'video id',
  145. default=display_id)
  146. return {
  147. **traverse_obj(metadata, ('content', {
  148. 'id': ('id', {str}),
  149. 'title': ('title', {str}),
  150. 'episode': ('title', {str}),
  151. 'series': ('series', {str}),
  152. 'timestamp': ('publishDate_ts', {int_or_none}),
  153. 'duration': ('duration', {int_or_none}),
  154. })),
  155. 'id': video_id,
  156. 'display_id': display_id,
  157. 'title': clean_html(get_element_by_class('main-title', webpage)),
  158. 'description': clean_html(get_element_by_class('text', get_element_html_by_class('ep-summary', webpage) or '')),
  159. 'thumbnail': url_or_none(vpl_data.get('data-image')),
  160. 'formats': formats,
  161. 'subtitles': self._extract_subtitles(self._parse_json(
  162. traverse_obj(vpl_data, ('data-captions', {str}), default='{}'), display_id, fatal=False)),
  163. }