# listennotes.py (4.1 KB)
  1. import re
  2. from .common import InfoExtractor
  3. from ..utils import (
  4. clean_html,
  5. extract_attributes,
  6. get_element_by_class,
  7. get_element_html_by_id,
  8. get_element_text_and_html_by_tag,
  9. parse_duration,
  10. strip_or_none,
  11. traverse_obj,
  12. try_call,
  13. )
  14. class ListenNotesIE(InfoExtractor):
  15. _VALID_URL = r'https?://(?:www\.)?listennotes\.com/podcasts/[^/]+/[^/]+-(?P<id>.+)/'
  16. _TESTS = [{
  17. 'url': 'https://www.listennotes.com/podcasts/thriving-on-overload/tim-oreilly-on-noticing-KrDgvNb_u1n/',
  18. 'md5': '5b91a32f841e5788fb82b72a1a8af7f7',
  19. 'info_dict': {
  20. 'id': 'KrDgvNb_u1n',
  21. 'ext': 'mp3',
  22. 'title': 'md5:32236591a921adf17bbdbf0441b6c0e9',
  23. 'description': 'md5:c581ed197eeddcee55a67cdb547c8cbd',
  24. 'duration': 2148.0,
  25. 'channel': 'Thriving on Overload',
  26. 'channel_id': 'ed84wITivxF',
  27. 'episode_id': 'e1312583fa7b4e24acfbb5131050be00',
  28. 'thumbnail': 'https://production.listennotes.com/podcasts/thriving-on-overload-ross-dawson-1wb_KospA3P-ed84wITivxF.300x300.jpg',
  29. 'channel_url': 'https://www.listennotes.com/podcasts/thriving-on-overload-ross-dawson-ed84wITivxF/',
  30. 'cast': ['Tim O’Reilly', 'Cookie Monster', 'Lao Tzu', 'Wallace Steven', 'Eric Raymond', 'Christine Peterson', 'John Maynard Keyne', 'Ross Dawson'],
  31. },
  32. }, {
  33. 'url': 'https://www.listennotes.com/podcasts/ask-noah-show/episode-177-wireguard-with-lwEA3154JzG/',
  34. 'md5': '62fb4ffe7fc525632a1138bf72a5ce53',
  35. 'info_dict': {
  36. 'id': 'lwEA3154JzG',
  37. 'ext': 'mp3',
  38. 'title': 'Episode 177: WireGuard with Jason Donenfeld',
  39. 'description': 'md5:24744f36456a3e95f83c1193a3458594',
  40. 'duration': 3861.0,
  41. 'channel': 'Ask Noah Show',
  42. 'channel_id': '4DQTzdS5-j7',
  43. 'episode_id': '8c8954b95e0b4859ad1eecec8bf6d3a4',
  44. 'channel_url': 'https://www.listennotes.com/podcasts/ask-noah-show-noah-j-chelliah-4DQTzdS5-j7/',
  45. 'thumbnail': 'https://production.listennotes.com/podcasts/ask-noah-show-noah-j-chelliah-cfbRUw9Gs3F-4DQTzdS5-j7.300x300.jpg',
  46. 'cast': ['noah showlink', 'noah show', 'noah dashboard', 'jason donenfeld'],
  47. },
  48. }]
  49. def _clean_description(self, description):
  50. return clean_html(re.sub(r'(</?(div|p)>\s*)+', '<br/><br/>', description or ''))
  51. def _real_extract(self, url):
  52. audio_id = self._match_id(url)
  53. webpage = self._download_webpage(url, audio_id)
  54. data = self._search_json(
  55. r'<script id="original-content"[^>]+\btype="application/json">', webpage, 'content', audio_id)
  56. data.update(extract_attributes(get_element_html_by_id(
  57. r'episode-play-button-toolbar|episode-no-play-button-toolbar', webpage, escape_value=False)))
  58. duration, description = self._search_regex(
  59. r'(?P<duration>[\d:]+)\s*-\s*(?P<description>.+)',
  60. self._html_search_meta(['og:description', 'description', 'twitter:description'], webpage),
  61. 'description', fatal=False, group=('duration', 'description')) or (None, None)
  62. return {
  63. 'id': audio_id,
  64. 'url': data['audio'],
  65. 'title': (data.get('data-title')
  66. or try_call(lambda: get_element_text_and_html_by_tag('h1', webpage)[0])
  67. or self._html_search_meta(('og:title', 'title', 'twitter:title'), webpage, 'title')),
  68. 'description': (self._clean_description(get_element_by_class('ln-text-p', webpage))
  69. or strip_or_none(description)),
  70. 'duration': parse_duration(traverse_obj(data, 'audio_length', 'data-duration') or duration),
  71. 'episode_id': traverse_obj(data, 'uuid', 'data-episode-uuid'),
  72. **traverse_obj(data, {
  73. 'thumbnail': 'data-image',
  74. 'channel': 'data-channel-title',
  75. 'cast': ('nlp_entities', ..., 'name'),
  76. 'channel_url': 'channel_url',
  77. 'channel_id': 'channel_short_uuid',
  78. }),
  79. }