# theguardian.py
import itertools

from .common import InfoExtractor
from ..utils import (
    clean_html,
    extract_attributes,
    get_element_by_class,
    get_element_html_by_class,
    get_elements_html_by_class,
    parse_qs,
    traverse_obj,
    unified_strdate,
    urljoin,
)
  14. class TheGuardianPodcastIE(InfoExtractor):
  15. _VALID_URL = r'https?://(?:www\.)?theguardian\.com/\w+/audio/\d{4}/\w{3}/\d{1,2}/(?P<id>[\w-]+)'
  16. _TESTS = [{
  17. 'url': 'https://www.theguardian.com/news/audio/2023/nov/03/we-are-just-getting-started-the-plastic-eating-bacteria-that-could-change-the-world-podcast',
  18. 'md5': 'd1771744681789b4cd7da2a08e487702',
  19. 'info_dict': {
  20. 'id': 'we-are-just-getting-started-the-plastic-eating-bacteria-that-could-change-the-world-podcast',
  21. 'ext': 'mp3',
  22. 'title': '‘We are just getting started’: the plastic-eating bacteria that could change the world – podcast',
  23. 'description': 'md5:cfd3df2791d394d2ab62cd571d5207ee',
  24. 'creator': 'Stephen Buranyi',
  25. 'thumbnail': 'md5:73c12558fcb3b0e2a59422bfb33b3f79',
  26. 'release_date': '20231103',
  27. },
  28. }, {
  29. 'url': 'https://www.theguardian.com/news/audio/2023/oct/30/the-trials-of-robert-habeck-is-the-worlds-most-powerful-green-politician-doomed-to-fail-podcast',
  30. 'md5': 'd1771744681789b4cd7da2a08e487702',
  31. 'info_dict': {
  32. 'id': 'the-trials-of-robert-habeck-is-the-worlds-most-powerful-green-politician-doomed-to-fail-podcast',
  33. 'ext': 'mp3',
  34. 'title': 'The trials of Robert Habeck: is the world’s most powerful green politician doomed to fail? – podcast',
  35. 'description': 'md5:1b5cf6582d1771c6b7077784b5456994',
  36. 'creator': 'Philip Oltermann',
  37. 'thumbnail': 'md5:6e5c5ec43843e956e20be793722e9080',
  38. 'release_date': '20231030',
  39. },
  40. }, {
  41. 'url': 'https://www.theguardian.com/football/audio/2023/nov/06/arsenal-feel-hard-done-by-and-luton-hold-liverpool-football-weekly',
  42. 'md5': 'a2fcff6f8e060a95b1483295273dc35e',
  43. 'info_dict': {
  44. 'id': 'arsenal-feel-hard-done-by-and-luton-hold-liverpool-football-weekly',
  45. 'ext': 'mp3',
  46. 'title': 'Arsenal feel hard done by and Luton hold Liverpool – Football Weekly',
  47. 'description': 'md5:286a9fbddaeb7c83cc65d1c4a5330b2a',
  48. 'creator': 'Max Rushden',
  49. 'thumbnail': 'md5:93eb7d6440f1bb94eb3a6cad63f48afd',
  50. 'release_date': '20231106',
  51. },
  52. }, {
  53. 'url': 'https://www.theguardian.com/politics/audio/2023/nov/02/the-covid-inquiry-politics-weekly-uk-podcast',
  54. 'md5': '06a0f7e9701a80c8064a5d35690481ec',
  55. 'info_dict': {
  56. 'id': 'the-covid-inquiry-politics-weekly-uk-podcast',
  57. 'ext': 'mp3',
  58. 'title': 'The Covid inquiry | Politics Weekly UK - podcast',
  59. 'description': 'md5:207c98859c14903582b17d25b014046e',
  60. 'creator': 'Gaby Hinsliff',
  61. 'thumbnail': 'md5:28932a7b5a25b057be330d2ed70ea7f3',
  62. 'release_date': '20231102',
  63. },
  64. }]
  65. def _real_extract(self, url):
  66. video_id = self._match_id(url)
  67. webpage = self._download_webpage(url, video_id)
  68. return {
  69. 'id': video_id,
  70. 'title': self._og_search_title(webpage) or get_element_by_class('content__headline', webpage),
  71. 'description': self._og_search_description(webpage),
  72. 'creator': self._html_search_meta('author', webpage),
  73. 'thumbnail': self._og_search_thumbnail(webpage),
  74. 'release_date': unified_strdate(self._html_search_meta('article:published_time', webpage)),
  75. 'url': extract_attributes(get_element_html_by_class(
  76. 'podcast__player', webpage) or '').get('data-source'),
  77. }
  78. class TheGuardianPodcastPlaylistIE(InfoExtractor):
  79. _VALID_URL = r'https?://(?:www\.)?theguardian\.com/\w+/series/(?P<id>[\w-]+)(?:\?page=\d+)?'
  80. _TESTS = [{
  81. 'url': 'https://www.theguardian.com/football/series/theguardianswomensfootballweekly',
  82. 'info_dict': {
  83. 'id': 'theguardianswomensfootballweekly',
  84. 'title': "The Guardian's Women's Football Weekly",
  85. 'description': 'md5:e2cc021311e582d29935a73614a43f51',
  86. },
  87. 'playlist_mincount': 69,
  88. }, {
  89. 'url': 'https://www.theguardian.com/news/series/todayinfocus?page=2',
  90. 'info_dict': {
  91. 'id': 'todayinfocus',
  92. 'title': 'Today in Focus',
  93. 'description': 'md5:0f097764fc0d359e0b6eb537be0387e2',
  94. },
  95. 'playlist_mincount': 1261,
  96. }, {
  97. 'url': 'https://www.theguardian.com/news/series/the-audio-long-read',
  98. 'info_dict': {
  99. 'id': 'the-audio-long-read',
  100. 'title': 'The Audio Long Read',
  101. 'description': 'md5:5462994a27527309562b25b6defc4ef3',
  102. },
  103. 'playlist_mincount': 996,
  104. }]
  105. def _entries(self, url, playlist_id):
  106. for page in itertools.count(1):
  107. webpage, urlh = self._download_webpage_handle(
  108. url, playlist_id, f'Downloading page {page}', query={'page': page})
  109. if 'page' not in parse_qs(urlh.url):
  110. break
  111. episodes = get_elements_html_by_class('fc-item--type-media', webpage)
  112. yield from traverse_obj(episodes, (..., {extract_attributes}, 'data-id'))
  113. def _real_extract(self, url):
  114. podcast_id = self._match_id(url)
  115. webpage = self._download_webpage(url, podcast_id)
  116. title = clean_html(get_element_by_class(
  117. 'index-page-header__title', webpage) or get_element_by_class('flagship-audio__title', webpage))
  118. description = self._og_search_description(webpage) or self._html_search_meta(
  119. 'description', webpage)
  120. return self.playlist_from_matches(
  121. self._entries(url, podcast_id), podcast_id, title, description=description,
  122. ie=TheGuardianPodcastIE, getter=lambda x: urljoin('https://www.theguardian.com', x))