radiofrance.py 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473
  1. import itertools
  2. import re
  3. import urllib.parse
  4. from .common import InfoExtractor
  5. from ..utils import (
  6. int_or_none,
  7. join_nonempty,
  8. js_to_json,
  9. parse_duration,
  10. strftime_or_none,
  11. traverse_obj,
  12. unified_strdate,
  13. urljoin,
  14. )
  15. class RadioFranceIE(InfoExtractor):
  16. _VALID_URL = r'^https?://maison\.radiofrance\.fr/radiovisions/(?P<id>[^?#]+)'
  17. IE_NAME = 'radiofrance'
  18. _TEST = {
  19. 'url': 'http://maison.radiofrance.fr/radiovisions/one-one',
  20. 'md5': 'bdbb28ace95ed0e04faab32ba3160daf',
  21. 'info_dict': {
  22. 'id': 'one-one',
  23. 'ext': 'ogg',
  24. 'title': 'One to one',
  25. 'description': "Plutôt que d'imaginer la radio de demain comme technologie ou comme création de contenu, je veux montrer que quelles que soient ses évolutions, j'ai l'intime conviction que la radio continuera d'être un grand média de proximité pour les auditeurs.",
  26. 'uploader': 'Thomas Hercouët',
  27. },
  28. }
  29. def _real_extract(self, url):
  30. m = self._match_valid_url(url)
  31. video_id = m.group('id')
  32. webpage = self._download_webpage(url, video_id)
  33. title = self._html_search_regex(r'<h1>(.*?)</h1>', webpage, 'title')
  34. description = self._html_search_regex(
  35. r'<div class="bloc_page_wrapper"><div class="text">(.*?)</div>',
  36. webpage, 'description', fatal=False)
  37. uploader = self._html_search_regex(
  38. r'<div class="credit">&nbsp;&nbsp;&copy;&nbsp;(.*?)</div>',
  39. webpage, 'uploader', fatal=False)
  40. formats_str = self._html_search_regex(
  41. r'class="jp-jplayer[^"]*" data-source="([^"]+)">',
  42. webpage, 'audio URLs')
  43. formats = [
  44. {
  45. 'format_id': fm[0],
  46. 'url': fm[1],
  47. 'vcodec': 'none',
  48. 'quality': i,
  49. }
  50. for i, fm in
  51. enumerate(re.findall(r"([a-z0-9]+)\s*:\s*'([^']+)'", formats_str))
  52. ]
  53. return {
  54. 'id': video_id,
  55. 'title': title,
  56. 'formats': formats,
  57. 'description': description,
  58. 'uploader': uploader,
  59. }
  60. class RadioFranceBaseIE(InfoExtractor):
  61. _VALID_URL_BASE = r'https?://(?:www\.)?radiofrance\.fr'
  62. _STATIONS_RE = '|'.join(map(re.escape, (
  63. 'franceculture',
  64. 'franceinfo',
  65. 'franceinter',
  66. 'francemusique',
  67. 'fip',
  68. 'mouv',
  69. )))
  70. def _extract_data_from_webpage(self, webpage, display_id, key):
  71. return traverse_obj(self._search_json(
  72. r'\bconst\s+data\s*=', webpage, key, display_id,
  73. contains_pattern=r'\[\{(?s:.+)\}\]', transform_source=js_to_json),
  74. (..., 'data', key, {dict}), get_all=False) or {}
  75. class FranceCultureIE(RadioFranceBaseIE):
  76. _VALID_URL = rf'''(?x)
  77. {RadioFranceBaseIE._VALID_URL_BASE}
  78. /(?:{RadioFranceBaseIE._STATIONS_RE})
  79. /podcasts/(?:[^?#]+/)?(?P<display_id>[^?#]+)-(?P<id>\d{{6,}})(?:$|[?#])
  80. '''
  81. _TESTS = [
  82. {
  83. 'url': 'https://www.radiofrance.fr/franceculture/podcasts/science-en-questions/la-physique-d-einstein-aiderait-elle-a-comprendre-le-cerveau-8440487',
  84. 'info_dict': {
  85. 'id': '8440487',
  86. 'display_id': 'la-physique-d-einstein-aiderait-elle-a-comprendre-le-cerveau',
  87. 'ext': 'mp3',
  88. 'title': 'La physique d’Einstein aiderait-elle à comprendre le cerveau ?',
  89. 'description': 'Existerait-il un pont conceptuel entre la physique de l’espace-temps et les neurosciences ?',
  90. 'thumbnail': r're:^https?://.*\.(?:jpg|png)',
  91. 'upload_date': '20220514',
  92. 'duration': 2750,
  93. },
  94. },
  95. {
  96. 'url': 'https://www.radiofrance.fr/franceinter/podcasts/le-7-9-30/le-7-9-30-du-vendredi-10-mars-2023-2107675',
  97. 'info_dict': {
  98. 'id': '2107675',
  99. 'display_id': 'le-7-9-30-du-vendredi-10-mars-2023',
  100. 'title': 'Inflation alimentaire : comment en sortir ? - Régis Debray et Claude Grange - Cybèle Idelot',
  101. 'description': 'md5:36ee74351ede77a314fdebb94026b916',
  102. 'thumbnail': r're:^https?://.*\.(?:jpg|png)',
  103. 'upload_date': '20230310',
  104. 'duration': 8977,
  105. 'ext': 'mp3',
  106. },
  107. },
  108. {
  109. 'url': 'https://www.radiofrance.fr/franceinter/podcasts/la-rafle-du-vel-d-hiv-une-affaire-d-etat/les-racines-du-crime-episode-1-3715507',
  110. 'only_matching': True,
  111. }, {
  112. 'url': 'https://www.radiofrance.fr/franceinfo/podcasts/le-billet-sciences/sante-bientot-un-vaccin-contre-l-asthme-allergique-3057200',
  113. 'only_matching': True,
  114. },
  115. ]
  116. def _real_extract(self, url):
  117. video_id, display_id = self._match_valid_url(url).group('id', 'display_id')
  118. webpage = self._download_webpage(url, display_id)
  119. # _search_json_ld doesn't correctly handle this. See https://github.com/yt-dlp/yt-dlp/pull/3874#discussion_r891903846
  120. video_data = self._search_json('', webpage, 'audio data', display_id, contains_pattern=r'{\s*"@type"\s*:\s*"AudioObject".+}')
  121. return {
  122. 'id': video_id,
  123. 'display_id': display_id,
  124. 'url': video_data['contentUrl'],
  125. 'vcodec': 'none' if video_data.get('encodingFormat') == 'mp3' else None,
  126. 'duration': parse_duration(video_data.get('duration')),
  127. 'title': self._html_search_regex(r'(?s)<h1[^>]*itemprop="[^"]*name[^"]*"[^>]*>(.+?)</h1>',
  128. webpage, 'title', default=self._og_search_title(webpage)),
  129. 'description': self._html_search_regex(
  130. r'(?s)<meta name="description"\s*content="([^"]+)', webpage, 'description', default=None),
  131. 'thumbnail': self._og_search_thumbnail(webpage),
  132. 'uploader': self._html_search_regex(
  133. r'(?s)<span class="author">(.*?)</span>', webpage, 'uploader', default=None),
  134. 'upload_date': unified_strdate(self._search_regex(
  135. r'"datePublished"\s*:\s*"([^"]+)', webpage, 'timestamp', fatal=False)),
  136. }
  137. class RadioFranceLiveIE(RadioFranceBaseIE):
  138. _VALID_URL = rf'''(?x)
  139. https?://(?:www\.)?radiofrance\.fr
  140. /(?P<id>{RadioFranceBaseIE._STATIONS_RE})
  141. /?(?P<substation_id>radio-[\w-]+)?(?:[#?]|$)
  142. '''
  143. _TESTS = [{
  144. 'url': 'https://www.radiofrance.fr/franceinter/',
  145. 'info_dict': {
  146. 'id': 'franceinter',
  147. 'title': str,
  148. 'live_status': 'is_live',
  149. 'ext': 'aac',
  150. },
  151. 'params': {
  152. 'skip_download': 'Livestream',
  153. },
  154. }, {
  155. 'url': 'https://www.radiofrance.fr/franceculture',
  156. 'info_dict': {
  157. 'id': 'franceculture',
  158. 'title': str,
  159. 'live_status': 'is_live',
  160. 'ext': 'aac',
  161. },
  162. 'params': {
  163. 'skip_download': 'Livestream',
  164. },
  165. }, {
  166. 'url': 'https://www.radiofrance.fr/mouv/radio-musique-kids-family',
  167. 'info_dict': {
  168. 'id': 'mouv-radio-musique-kids-family',
  169. 'title': str,
  170. 'live_status': 'is_live',
  171. 'ext': 'aac',
  172. },
  173. 'params': {
  174. 'skip_download': 'Livestream',
  175. },
  176. }, {
  177. 'url': 'https://www.radiofrance.fr/mouv/radio-rnb-soul',
  178. 'info_dict': {
  179. 'id': 'mouv-radio-rnb-soul',
  180. 'title': str,
  181. 'live_status': 'is_live',
  182. 'ext': 'aac',
  183. },
  184. 'params': {
  185. 'skip_download': 'Livestream',
  186. },
  187. }, {
  188. 'url': 'https://www.radiofrance.fr/mouv/radio-musique-mix',
  189. 'info_dict': {
  190. 'id': 'mouv-radio-musique-mix',
  191. 'title': str,
  192. 'live_status': 'is_live',
  193. 'ext': 'aac',
  194. },
  195. 'params': {
  196. 'skip_download': 'Livestream',
  197. },
  198. }, {
  199. 'url': 'https://www.radiofrance.fr/fip/radio-rock',
  200. 'info_dict': {
  201. 'id': 'fip-radio-rock',
  202. 'title': str,
  203. 'live_status': 'is_live',
  204. 'ext': 'aac',
  205. },
  206. 'params': {
  207. 'skip_download': 'Livestream',
  208. },
  209. }, {
  210. 'url': 'https://www.radiofrance.fr/mouv',
  211. 'only_matching': True,
  212. }]
  213. def _real_extract(self, url):
  214. station_id, substation_id = self._match_valid_url(url).group('id', 'substation_id')
  215. if substation_id:
  216. webpage = self._download_webpage(url, station_id)
  217. api_response = self._extract_data_from_webpage(webpage, station_id, 'webRadioData')
  218. else:
  219. api_response = self._download_json(
  220. f'https://www.radiofrance.fr/{station_id}/api/live', station_id)
  221. formats, subtitles = [], {}
  222. for media_source in traverse_obj(api_response, (('now', None), 'media', 'sources', lambda _, v: v['url'])):
  223. if media_source.get('format') == 'hls':
  224. fmts, subs = self._extract_m3u8_formats_and_subtitles(media_source['url'], station_id, fatal=False)
  225. formats.extend(fmts)
  226. self._merge_subtitles(subs, target=subtitles)
  227. else:
  228. formats.append({
  229. 'url': media_source['url'],
  230. 'abr': media_source.get('bitrate'),
  231. })
  232. return {
  233. 'id': join_nonempty(station_id, substation_id),
  234. 'title': traverse_obj(api_response, ('visual', 'legend')) or join_nonempty(
  235. ('now', 'firstLine', 'title'), ('now', 'secondLine', 'title'), from_dict=api_response, delim=' - '),
  236. 'formats': formats,
  237. 'subtitles': subtitles,
  238. 'is_live': True,
  239. }
  240. class RadioFrancePlaylistBaseIE(RadioFranceBaseIE):
  241. """Subclasses must set _METADATA_KEY"""
  242. def _call_api(self, content_id, cursor, page_num):
  243. raise NotImplementedError('This method must be implemented by subclasses')
  244. def _generate_playlist_entries(self, content_id, content_response):
  245. for page_num in itertools.count(2):
  246. for entry in content_response['items']:
  247. yield self.url_result(
  248. f'https://www.radiofrance.fr/{entry["path"]}', url_transparent=True, **traverse_obj(entry, {
  249. 'title': 'title',
  250. 'description': 'standFirst',
  251. 'timestamp': ('publishedDate', {int_or_none}),
  252. 'thumbnail': ('visual', 'src'),
  253. }))
  254. next_cursor = traverse_obj(content_response, (('pagination', None), 'next'), get_all=False)
  255. if not next_cursor:
  256. break
  257. content_response = self._call_api(content_id, next_cursor, page_num)
  258. def _real_extract(self, url):
  259. display_id = self._match_id(url)
  260. metadata = self._download_json(
  261. 'https://www.radiofrance.fr/api/v2.1/path', display_id,
  262. query={'value': urllib.parse.urlparse(url).path})['content']
  263. content_id = metadata['id']
  264. return self.playlist_result(
  265. self._generate_playlist_entries(content_id, metadata[self._METADATA_KEY]), content_id,
  266. display_id=display_id, **{**traverse_obj(metadata, {
  267. 'title': 'title',
  268. 'description': 'standFirst',
  269. 'thumbnail': ('visual', 'src'),
  270. }), **traverse_obj(metadata, {
  271. 'title': 'name',
  272. 'description': 'role',
  273. })})
  274. class RadioFrancePodcastIE(RadioFrancePlaylistBaseIE):
  275. _VALID_URL = rf'''(?x)
  276. {RadioFranceBaseIE._VALID_URL_BASE}
  277. /(?:{RadioFranceBaseIE._STATIONS_RE})
  278. /podcasts/(?P<id>[\w-]+)/?(?:[?#]|$)
  279. '''
  280. _TESTS = [{
  281. 'url': 'https://www.radiofrance.fr/franceinfo/podcasts/le-billet-vert',
  282. 'info_dict': {
  283. 'id': 'eaf6ef81-a980-4f1c-a7d1-8a75ecd54b17',
  284. 'display_id': 'le-billet-vert',
  285. 'title': 'Le billet sciences',
  286. 'description': 'md5:eb1007b34b0c0a680daaa71525bbd4c1',
  287. 'thumbnail': r're:^https?://.*\.(?:jpg|png)',
  288. },
  289. 'playlist_mincount': 11,
  290. }, {
  291. 'url': 'https://www.radiofrance.fr/franceinter/podcasts/jean-marie-le-pen-l-obsession-nationale',
  292. 'info_dict': {
  293. 'id': '566fd524-3074-4fbc-ac69-8696f2152a54',
  294. 'display_id': 'jean-marie-le-pen-l-obsession-nationale',
  295. 'title': 'Jean-Marie Le Pen, l\'obsession nationale',
  296. 'description': 'md5:a07c0cfb894f6d07a62d0ad12c4b7d73',
  297. 'thumbnail': r're:^https?://.*\.(?:jpg|png)',
  298. },
  299. 'playlist_count': 7,
  300. }, {
  301. 'url': 'https://www.radiofrance.fr/franceculture/podcasts/serie-thomas-grjebine',
  302. 'info_dict': {
  303. 'id': '63c1ddc9-9f15-457a-98b2-411bac63f48d',
  304. 'display_id': 'serie-thomas-grjebine',
  305. 'title': 'Thomas Grjebine',
  306. },
  307. 'playlist_count': 1,
  308. }, {
  309. 'url': 'https://www.radiofrance.fr/fip/podcasts/certains-l-aiment-fip',
  310. 'info_dict': {
  311. 'id': '143dff38-e956-4a5d-8576-1c0b7242b99e',
  312. 'display_id': 'certains-l-aiment-fip',
  313. 'title': 'Certains l’aiment Fip',
  314. 'description': 'md5:ff974672ba00d4fd5be80fb001c5b27e',
  315. 'thumbnail': r're:^https?://.*\.(?:jpg|png)',
  316. },
  317. 'playlist_mincount': 321,
  318. }, {
  319. 'url': 'https://www.radiofrance.fr/franceinter/podcasts/le-7-9',
  320. 'only_matching': True,
  321. }, {
  322. 'url': 'https://www.radiofrance.fr/mouv/podcasts/dirty-mix',
  323. 'only_matching': True,
  324. }]
  325. _METADATA_KEY = 'expressions'
  326. def _call_api(self, podcast_id, cursor, page_num):
  327. return self._download_json(
  328. f'https://www.radiofrance.fr/api/v2.1/concepts/{podcast_id}/expressions', podcast_id,
  329. note=f'Downloading page {page_num}', query={'pageCursor': cursor})
  330. class RadioFranceProfileIE(RadioFrancePlaylistBaseIE):
  331. _VALID_URL = rf'{RadioFranceBaseIE._VALID_URL_BASE}/personnes/(?P<id>[\w-]+)'
  332. _TESTS = [{
  333. 'url': 'https://www.radiofrance.fr/personnes/thomas-pesquet?p=3',
  334. 'info_dict': {
  335. 'id': '86c62790-e481-11e2-9f7b-782bcb6744eb',
  336. 'display_id': 'thomas-pesquet',
  337. 'title': 'Thomas Pesquet',
  338. 'description': 'Astronaute à l\'agence spatiale européenne',
  339. },
  340. 'playlist_mincount': 212,
  341. }, {
  342. 'url': 'https://www.radiofrance.fr/personnes/eugenie-bastie',
  343. 'info_dict': {
  344. 'id': '9593050b-0183-4972-a0b5-d8f699079e02',
  345. 'display_id': 'eugenie-bastie',
  346. 'title': 'Eugénie Bastié',
  347. 'description': 'Journaliste et essayiste',
  348. 'thumbnail': r're:^https?://.*\.(?:jpg|png)',
  349. },
  350. 'playlist_mincount': 39,
  351. }, {
  352. 'url': 'https://www.radiofrance.fr/personnes/lea-salame',
  353. 'only_matching': True,
  354. }]
  355. _METADATA_KEY = 'documents'
  356. def _call_api(self, profile_id, cursor, page_num):
  357. resp = self._download_json(
  358. f'https://www.radiofrance.fr/api/v2.1/taxonomy/{profile_id}/documents', profile_id,
  359. note=f'Downloading page {page_num}', query={
  360. 'relation': 'personality',
  361. 'cursor': cursor,
  362. })
  363. resp['next'] = traverse_obj(resp, ('pagination', 'next'))
  364. return resp
  365. class RadioFranceProgramScheduleIE(RadioFranceBaseIE):
  366. _VALID_URL = rf'''(?x)
  367. {RadioFranceBaseIE._VALID_URL_BASE}
  368. /(?P<station>{RadioFranceBaseIE._STATIONS_RE})
  369. /grille-programmes(?:\?date=(?P<date>[\d-]+))?
  370. '''
  371. _TESTS = [{
  372. 'url': 'https://www.radiofrance.fr/franceinter/grille-programmes?date=17-02-2023',
  373. 'info_dict': {
  374. 'id': 'franceinter-program-20230217',
  375. 'upload_date': '20230217',
  376. },
  377. 'playlist_count': 25,
  378. }, {
  379. 'url': 'https://www.radiofrance.fr/franceculture/grille-programmes?date=01-02-2023',
  380. 'info_dict': {
  381. 'id': 'franceculture-program-20230201',
  382. 'upload_date': '20230201',
  383. },
  384. 'playlist_count': 25,
  385. }, {
  386. 'url': 'https://www.radiofrance.fr/mouv/grille-programmes?date=19-03-2023',
  387. 'info_dict': {
  388. 'id': 'mouv-program-20230319',
  389. 'upload_date': '20230319',
  390. },
  391. 'playlist_count': 3,
  392. }, {
  393. 'url': 'https://www.radiofrance.fr/francemusique/grille-programmes?date=18-03-2023',
  394. 'info_dict': {
  395. 'id': 'francemusique-program-20230318',
  396. 'upload_date': '20230318',
  397. },
  398. 'playlist_count': 15,
  399. }, {
  400. 'url': 'https://www.radiofrance.fr/franceculture/grille-programmes',
  401. 'only_matching': True,
  402. }]
  403. def _generate_playlist_entries(self, webpage_url, api_response):
  404. for entry in traverse_obj(api_response, ('steps', lambda _, v: v['expression']['path'])):
  405. yield self.url_result(
  406. urljoin(webpage_url, f'/{entry["expression"]["path"]}'), ie=FranceCultureIE,
  407. url_transparent=True, **traverse_obj(entry, {
  408. 'title': ('expression', 'title'),
  409. 'thumbnail': ('expression', 'visual', 'src'),
  410. 'timestamp': ('startTime', {int_or_none}),
  411. 'series_id': ('concept', 'id'),
  412. 'series': ('concept', 'title'),
  413. }))
  414. def _real_extract(self, url):
  415. station, date = self._match_valid_url(url).group('station', 'date')
  416. webpage = self._download_webpage(url, station)
  417. grid_data = self._extract_data_from_webpage(webpage, station, 'grid')
  418. upload_date = strftime_or_none(grid_data.get('date'), '%Y%m%d')
  419. return self.playlist_result(
  420. self._generate_playlist_entries(url, grid_data),
  421. join_nonempty(station, 'program', upload_date), upload_date=upload_date)