noice.py 4.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116
  1. from .common import InfoExtractor
  2. from ..utils import (
  3. clean_html,
  4. determine_ext,
  5. int_or_none,
  6. parse_iso8601,
  7. traverse_obj,
  8. variadic,
  9. )
  10. class NoicePodcastIE(InfoExtractor):
  11. _VALID_URL = r'https?://open\.noice\.id/content/(?P<id>[a-fA-F0-9-]+)'
  12. _TESTS = [{
  13. 'url': 'https://open.noice.id/content/7694bb04-ff0f-40fa-a60b-5b39f29584b2',
  14. 'info_dict': {
  15. 'id': '7694bb04-ff0f-40fa-a60b-5b39f29584b2',
  16. 'ext': 'm4a',
  17. 'season': 'Season 1',
  18. 'description': 'md5:58d1274e6857b6fbbecf47075885380d',
  19. 'release_date': '20221115',
  20. 'timestamp': 1668496642,
  21. 'season_number': 1,
  22. 'upload_date': '20221115',
  23. 'release_timestamp': 1668496642,
  24. 'title': 'Eps 1. Belajar dari Wishnutama: Kreatif Bukan Followers! (bersama Wishnutama)',
  25. 'modified_date': '20221121',
  26. 'categories': ['Bisnis dan Keuangan'],
  27. 'duration': 3567,
  28. 'modified_timestamp': 1669030647,
  29. 'thumbnail': 'https://images.noiceid.cc/catalog/content-1668496302560',
  30. 'channel_id': '9dab1024-5b92-4265-ae1c-63da87359832',
  31. 'like_count': int,
  32. 'channel': 'Noice Space Talks',
  33. 'comment_count': int,
  34. 'dislike_count': int,
  35. 'channel_follower_count': int,
  36. },
  37. }, {
  38. 'url': 'https://open.noice.id/content/222134e4-99f2-456f-b8a2-b8be404bf063',
  39. 'info_dict': {
  40. 'id': '222134e4-99f2-456f-b8a2-b8be404bf063',
  41. 'ext': 'm4a',
  42. 'release_timestamp': 1653488220,
  43. 'description': 'md5:35074f6190cef52b05dd133bb2ef460e',
  44. 'upload_date': '20220525',
  45. 'timestamp': 1653460637,
  46. 'release_date': '20220525',
  47. 'thumbnail': 'https://images.noiceid.cc/catalog/content-1653460337625',
  48. 'title': 'Eps 1: Dijodohin Sama Anak Pak RT',
  49. 'modified_timestamp': 1669030647,
  50. 'season_number': 1,
  51. 'modified_date': '20221121',
  52. 'categories': ['Cerita dan Drama'],
  53. 'duration': 1830,
  54. 'season': 'Season 1',
  55. 'channel_id': '60193f6b-d24d-4b23-913b-ceed5a731e74',
  56. 'dislike_count': int,
  57. 'like_count': int,
  58. 'comment_count': int,
  59. 'channel': 'Dear Jerome',
  60. 'channel_follower_count': int,
  61. },
  62. }]
  63. def _get_formats_and_subtitles(self, media_url, video_id):
  64. formats, subtitles = [], {}
  65. for url in variadic(media_url):
  66. ext = determine_ext(url)
  67. if ext == 'm3u8':
  68. fmts, subs = self._extract_m3u8_formats_and_subtitles(url, video_id)
  69. formats.extend(fmts)
  70. self._merge_subtitles(subs, target=subtitles)
  71. else:
  72. formats.append({
  73. 'url': url,
  74. 'ext': 'mp3',
  75. 'vcodec': 'none',
  76. 'acodec': 'mp3',
  77. })
  78. return formats, subtitles
  79. def _real_extract(self, url):
  80. display_id = self._match_id(url)
  81. webpage = self._download_webpage(url, display_id)
  82. nextjs_data = self._search_nextjs_data(webpage, display_id)['props']['pageProps']['contentDetails']
  83. media_url_list = traverse_obj(nextjs_data, (('rawContentUrl', 'url'), ))
  84. formats, subtitles = self._get_formats_and_subtitles(media_url_list, display_id)
  85. return {
  86. 'id': nextjs_data.get('id') or display_id,
  87. 'title': nextjs_data.get('title') or self._html_search_meta('og:title', webpage),
  88. 'formats': formats,
  89. 'subtitles': subtitles,
  90. 'description': (nextjs_data.get('description') or clean_html(nextjs_data.get('htmlDescription'))
  91. or self._html_search_meta(['description', 'og:description'], webpage)),
  92. 'thumbnail': nextjs_data.get('image') or self._html_search_meta('og:image', webpage),
  93. 'timestamp': parse_iso8601(nextjs_data.get('createdAt')),
  94. 'release_timestamp': parse_iso8601(nextjs_data.get('publishedAt')),
  95. 'modified_timestamp': parse_iso8601(
  96. nextjs_data.get('updatedAt') or self._html_search_meta('og:updated_time', webpage)),
  97. 'duration': int_or_none(nextjs_data.get('duration')),
  98. 'categories': traverse_obj(nextjs_data, ('genres', ..., 'name')),
  99. 'season': nextjs_data.get('seasonName'),
  100. 'season_number': int_or_none(nextjs_data.get('seasonNumber')),
  101. 'channel': traverse_obj(nextjs_data, ('catalog', 'title')),
  102. 'channel_id': traverse_obj(nextjs_data, ('catalog', 'id'), 'catalogId'),
  103. **traverse_obj(nextjs_data, ('meta', 'aggregations', {
  104. 'like_count': 'likes',
  105. 'dislike_count': 'dislikes',
  106. 'comment_count': 'comments',
  107. 'channel_follower_count': 'followers',
  108. })),
  109. }