podbayfm.py 3.0 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788
  1. from .common import InfoExtractor
  2. from ..utils import (
  3. OnDemandPagedList,
  4. clean_html,
  5. int_or_none,
  6. jwt_decode_hs256,
  7. url_or_none,
  8. )
  9. from ..utils.traversal import traverse_obj
  def result_from_props(props):
      """Build an episode info dict from a podbay.fm episode properties object.

      Selected keys of *props* are mapped onto result fields with
      ``traverse_obj``; ``ext`` and ``vcodec`` are then set to the fixed
      values ``'mp3'`` and ``'none'``.
      """
      # Result field -> traversal path (with its converter) inside `props`.
      field_paths = {
          'id': ('_id', {str}),
          'title': ('title', {str}),
          'url': ('mediaURL', {url_or_none}),
          'description': ('description', {clean_html}),
          # `image` is run through jwt_decode_hs256 and its `url` key is read.
          'thumbnail': ('image', {jwt_decode_hs256}, 'url', {url_or_none}),
          'timestamp': ('timestamp', {int_or_none}),
          'duration': ('duration', {int_or_none}),
      }
      extracted = traverse_obj(props, field_paths)
      # The fixed keys come last so they take precedence over extracted ones.
      return {**extracted, 'ext': 'mp3', 'vcodec': 'none'}
  class PodbayFMIE(InfoExtractor):
      """Extractor for a single podbay.fm episode page (``/p/<slug>/e/<id>``)."""

      _VALID_URL = r'https?://podbay\.fm/p/[^/?#]+/e/(?P<id>\d+)'
      _TESTS = [{
          'url': 'https://podbay.fm/p/behind-the-bastards/e/1647338400',
          'md5': '895ac8505de349515f5ee8a4a3195c93',
          'info_dict': {
              'id': '62306451f4a48e58d0c4d6a8',
              'title': 'Part One: Kissinger',
              'ext': 'mp3',
              'description': r're:^We begin our epic six part series on Henry Kissinger.+',
              'thumbnail': r're:^https?://.*\.jpg',
              'timestamp': 1647338400,
              'duration': 5001,
              'upload_date': '20220315',
          },
      }]

      def _real_extract(self, url):
          """Download the episode page and return its info dict.

          The episode properties are read from ``props.pageProps.episode`` of
          the data returned by ``_search_nextjs_data`` and converted with
          ``result_from_props``.
          """
          video_id = self._match_id(url)
          page_html = self._download_webpage(url, video_id)
          nextjs_data = self._search_nextjs_data(page_html, video_id)
          episode_props = nextjs_data['props']['pageProps']['episode']
          return result_from_props(episode_props)
  class PodbayFMChannelIE(InfoExtractor):
      """Playlist extractor for a podbay.fm podcast page (``/p/<slug>``)."""

      _VALID_URL = r'https?://podbay\.fm/p/(?P<id>[^/?#]+)/?(?:$|[?#])'
      _TESTS = [{
          'url': 'https://podbay.fm/p/behind-the-bastards',
          'info_dict': {
              'id': 'behind-the-bastards',
              'title': 'Behind the Bastards',
          },
          'playlist_mincount': 21,
      }]
      # Page size handed to OnDemandPagedList in _real_extract.
      _PAGE_SIZE = 10

      def _fetch_page(self, channel_id, pagenum):
          """Download one page of the channel listing and return its ``podcast`` object.

          *channel_id* is the podcast slug; *pagenum* is sent to the API as-is
          and shown one-based in the progress note.
          """
          # _download_json expects (url, video_id, note): the slug is the ID and
          # the progress message is the note. Passing them the other way round
          # labels the request with the message as if it were the video ID.
          return self._download_json(
              f'https://podbay.fm/api/podcast?reverse=true&page={pagenum}&slug={channel_id}',
              channel_id, note=f'Downloading channel JSON page {pagenum + 1}')['podcast']

      @staticmethod
      def _results_from_page(channel_id, page):
          """Turn one page object into a list of episode info dicts.

          Each entry is the output of ``result_from_props`` for the episode,
          attributed to ``PodbayFMIE`` and given the episode page URL.
          """
          return [{
              **result_from_props(e),
              'extractor': PodbayFMIE.IE_NAME,
              'extractor_key': PodbayFMIE.ie_key(),
              # The site uses the episode timestamp as the identifier in episode URLs
              'webpage_url': f'https://podbay.fm/p/{channel_id}/e/{e["timestamp"]}',
          } for e in page['episodes']]

      def _real_extract(self, url):
          """Return a lazily paged playlist of all episodes of the channel."""
          channel_id = self._match_id(url)
          # Page 0 is fetched eagerly because the playlist title comes from it;
          # the pager below reuses it instead of downloading it a second time.
          first_page = self._fetch_page(channel_id, 0)
          entries = OnDemandPagedList(
              lambda pagenum: self._results_from_page(
                  channel_id, self._fetch_page(channel_id, pagenum) if pagenum else first_page),
              self._PAGE_SIZE)
          return self.playlist_result(entries, channel_id, first_page.get('title'))