simplecast.py 5.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151
  1. from .common import InfoExtractor
  2. from ..utils import (
  3. clean_podcast_url,
  4. int_or_none,
  5. parse_iso8601,
  6. strip_or_none,
  7. try_get,
  8. urlencode_postdata,
  9. )
  10. class SimplecastBaseIE(InfoExtractor):
  11. _UUID_REGEX = r'[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12}'
  12. _API_BASE = 'https://api.simplecast.com/'
  13. def _call_api(self, path_tmpl, video_id):
  14. return self._download_json(
  15. self._API_BASE + path_tmpl % video_id, video_id)
  16. def _call_search_api(self, resource, resource_id, resource_url):
  17. return self._download_json(
  18. f'https://api.simplecast.com/{resource}s/search', resource_id,
  19. data=urlencode_postdata({'url': resource_url}))
  20. def _parse_episode(self, episode):
  21. episode_id = episode['id']
  22. title = episode['title'].strip()
  23. audio_file = episode.get('audio_file') or {}
  24. audio_file_url = audio_file.get('url') or episode.get('audio_file_url') or episode['enclosure_url']
  25. season = episode.get('season') or {}
  26. season_href = season.get('href')
  27. season_id = None
  28. if season_href:
  29. season_id = self._search_regex(
  30. rf'https?://api.simplecast.com/seasons/({self._UUID_REGEX})',
  31. season_href, 'season id', default=None)
  32. webpage_url = episode.get('episode_url')
  33. channel_url = None
  34. if webpage_url:
  35. channel_url = self._search_regex(
  36. r'(https?://[^/]+\.simplecast\.com)',
  37. webpage_url, 'channel url', default=None)
  38. return {
  39. 'id': episode_id,
  40. 'display_id': episode.get('slug'),
  41. 'title': title,
  42. 'url': clean_podcast_url(audio_file_url),
  43. 'webpage_url': webpage_url,
  44. 'channel_url': channel_url,
  45. 'series': try_get(episode, lambda x: x['podcast']['title']),
  46. 'season_number': int_or_none(season.get('number')),
  47. 'season_id': season_id,
  48. 'thumbnail': episode.get('image_url'),
  49. 'episode_id': episode_id,
  50. 'episode_number': int_or_none(episode.get('number')),
  51. 'description': strip_or_none(episode.get('description')),
  52. 'timestamp': parse_iso8601(episode.get('published_at')),
  53. 'duration': int_or_none(episode.get('duration')),
  54. 'filesize': int_or_none(audio_file.get('size') or episode.get('audio_file_size')),
  55. }
  56. class SimplecastIE(SimplecastBaseIE):
  57. IE_NAME = 'simplecast'
  58. _VALID_URL = rf'https?://(?:api\.simplecast\.com/episodes|player\.simplecast\.com)/(?P<id>{SimplecastBaseIE._UUID_REGEX})'
  59. _EMBED_REGEX = [rf'''(?x)<iframe[^>]+src=["\']
  60. (?P<url>https?://(?:
  61. embed\.simplecast\.com/[0-9a-f]{8}|
  62. player\.simplecast\.com/{SimplecastBaseIE._UUID_REGEX}
  63. ))''']
  64. _COMMON_TEST_INFO = {
  65. 'display_id': 'errant-signal-chris-franklin-new-wave-video-essays',
  66. 'id': 'b6dc49a2-9404-4853-9aa9-9cfc097be876',
  67. 'ext': 'mp3',
  68. 'title': 'Errant Signal - Chris Franklin & New Wave Video Essays',
  69. 'episode_number': 1,
  70. 'episode_id': 'b6dc49a2-9404-4853-9aa9-9cfc097be876',
  71. 'description': 'md5:34752789d3d2702e2d2c975fbd14f357',
  72. 'season_number': 1,
  73. 'season_id': 'e23df0da-bae4-4531-8bbf-71364a88dc13',
  74. 'series': 'The RE:BIND.io Podcast',
  75. 'duration': 5343,
  76. 'timestamp': 1580979475,
  77. 'upload_date': '20200206',
  78. 'webpage_url': r're:^https?://the-re-bind-io-podcast\.simplecast\.com/episodes/errant-signal-chris-franklin-new-wave-video-essays',
  79. 'channel_url': r're:^https?://the-re-bind-io-podcast\.simplecast\.com$',
  80. }
  81. _TESTS = [{
  82. 'url': 'https://api.simplecast.com/episodes/b6dc49a2-9404-4853-9aa9-9cfc097be876',
  83. 'md5': '8c93be7be54251bf29ee97464eabe61c',
  84. 'info_dict': _COMMON_TEST_INFO,
  85. }, {
  86. 'url': 'https://player.simplecast.com/b6dc49a2-9404-4853-9aa9-9cfc097be876',
  87. 'only_matching': True,
  88. }]
  89. def _real_extract(self, url):
  90. episode_id = self._match_id(url)
  91. episode = self._call_api('episodes/%s', episode_id)
  92. return self._parse_episode(episode)
  93. class SimplecastEpisodeIE(SimplecastBaseIE):
  94. IE_NAME = 'simplecast:episode'
  95. _VALID_URL = r'https?://(?!api\.)[^/]+\.simplecast\.com/episodes/(?P<id>[^/?&#]+)'
  96. _TEST = {
  97. 'url': 'https://the-re-bind-io-podcast.simplecast.com/episodes/errant-signal-chris-franklin-new-wave-video-essays',
  98. 'md5': '8c93be7be54251bf29ee97464eabe61c',
  99. 'info_dict': SimplecastIE._COMMON_TEST_INFO,
  100. }
  101. def _real_extract(self, url):
  102. mobj = self._match_valid_url(url)
  103. episode = self._call_search_api(
  104. 'episode', mobj.group(1), mobj.group(0))
  105. return self._parse_episode(episode)
  106. class SimplecastPodcastIE(SimplecastBaseIE):
  107. IE_NAME = 'simplecast:podcast'
  108. _VALID_URL = r'https?://(?!(?:api|cdn|embed|feeds|player)\.)(?P<id>[^/]+)\.simplecast\.com(?!/episodes/[^/?&#]+)'
  109. _TESTS = [{
  110. 'url': 'https://the-re-bind-io-podcast.simplecast.com',
  111. 'playlist_mincount': 33,
  112. 'info_dict': {
  113. 'id': '07d28d26-7522-42eb-8c53-2bdcfc81c43c',
  114. 'title': 'The RE:BIND.io Podcast',
  115. },
  116. }, {
  117. 'url': 'https://the-re-bind-io-podcast.simplecast.com/episodes',
  118. 'only_matching': True,
  119. }]
  120. def _real_extract(self, url):
  121. subdomain = self._match_id(url)
  122. site = self._call_search_api('site', subdomain, url)
  123. podcast = site['podcast']
  124. podcast_id = podcast['id']
  125. podcast_title = podcast.get('title')
  126. def entries():
  127. episodes = self._call_api('podcasts/%s/episodes', podcast_id)
  128. for episode in (episodes.get('collection') or []):
  129. info = self._parse_episode(episode)
  130. info['series'] = podcast_title
  131. yield info
  132. return self.playlist_result(entries(), podcast_id, podcast_title)