radiocomercial.py 6.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154
  1. import itertools
  2. from .common import InfoExtractor
  3. from ..networking.exceptions import HTTPError
  4. from ..utils import (
  5. ExtractorError,
  6. extract_attributes,
  7. get_element_by_class,
  8. get_element_html_by_class,
  9. get_element_text_and_html_by_tag,
  10. get_elements_html_by_class,
  11. int_or_none,
  12. join_nonempty,
  13. try_call,
  14. unified_strdate,
  15. update_url,
  16. urljoin,
  17. )
  18. from ..utils.traversal import traverse_obj
  19. class RadioComercialIE(InfoExtractor):
  20. _VALID_URL = r'https?://(?:www\.)?radiocomercial\.pt/podcasts/[^/?#]+/t?(?P<season>\d+)/(?P<id>[\w-]+)'
  21. _TESTS = [{
  22. 'url': 'https://radiocomercial.pt/podcasts/o-homem-que-mordeu-o-cao/t6/taylor-swift-entranhando-se-que-nem-uma-espada-no-ventre-dos-fas#page-content-wrapper',
  23. 'md5': '5f4fe8e485b29d2e8fd495605bc2c7e4',
  24. 'info_dict': {
  25. 'id': 'taylor-swift-entranhando-se-que-nem-uma-espada-no-ventre-dos-fas',
  26. 'ext': 'mp3',
  27. 'title': 'Taylor Swift entranhando-se que nem uma espada no ventre dos fãs.',
  28. 'release_date': '20231025',
  29. 'thumbnail': r're:https://radiocomercial.pt/upload/[^.]+.jpg',
  30. 'season': 'Season 6',
  31. 'season_number': 6,
  32. },
  33. }, {
  34. 'url': 'https://radiocomercial.pt/podcasts/convenca-me-num-minuto/t3/convenca-me-num-minuto-que-os-lobisomens-existem',
  35. 'md5': '47e96c273aef96a8eb160cd6cf46d782',
  36. 'info_dict': {
  37. 'id': 'convenca-me-num-minuto-que-os-lobisomens-existem',
  38. 'ext': 'mp3',
  39. 'title': 'Convença-me num minuto que os lobisomens existem',
  40. 'release_date': '20231026',
  41. 'thumbnail': r're:https://radiocomercial.pt/upload/[^.]+.jpg',
  42. 'season': 'Season 3',
  43. 'season_number': 3,
  44. },
  45. }, {
  46. 'url': 'https://radiocomercial.pt/podcasts/inacreditavel-by-ines-castel-branco/t2/o-desastre-de-aviao',
  47. 'md5': '69be64255420fec23b7259955d771e54',
  48. 'info_dict': {
  49. 'id': 'o-desastre-de-aviao',
  50. 'ext': 'mp3',
  51. 'title': 'O desastre de avião',
  52. 'description': 'md5:8a82beeb372641614772baab7246245f',
  53. 'release_date': '20231101',
  54. 'thumbnail': r're:https://radiocomercial.pt/upload/[^.]+.jpg',
  55. 'season': 'Season 2',
  56. 'season_number': 2,
  57. },
  58. 'params': {
  59. # inconsistant md5
  60. 'skip_download': True,
  61. },
  62. }, {
  63. 'url': 'https://radiocomercial.pt/podcasts/tnt-todos-no-top/2023/t-n-t-29-de-outubro',
  64. 'md5': '91d32d4d4b1407272068b102730fc9fa',
  65. 'info_dict': {
  66. 'id': 't-n-t-29-de-outubro',
  67. 'ext': 'mp3',
  68. 'title': 'T.N.T 29 de outubro',
  69. 'release_date': '20231029',
  70. 'thumbnail': r're:https://radiocomercial.pt/upload/[^.]+.jpg',
  71. 'season': 'Season 2023',
  72. 'season_number': 2023,
  73. },
  74. }]
  75. def _real_extract(self, url):
  76. video_id, season = self._match_valid_url(url).group('id', 'season')
  77. webpage = self._download_webpage(url, video_id)
  78. return {
  79. 'id': video_id,
  80. 'title': self._html_extract_title(webpage),
  81. 'description': self._og_search_description(webpage, default=None),
  82. 'release_date': unified_strdate(get_element_by_class(
  83. 'date', get_element_html_by_class('descriptions', webpage) or '')),
  84. 'thumbnail': self._og_search_thumbnail(webpage),
  85. 'season_number': int_or_none(season),
  86. 'url': extract_attributes(get_element_html_by_class('audiofile', webpage) or '').get('href'),
  87. }
  88. class RadioComercialPlaylistIE(InfoExtractor):
  89. _VALID_URL = r'https?://(?:www\.)?radiocomercial\.pt/podcasts/(?P<id>[\w-]+)(?:/t?(?P<season>\d+))?/?(?:$|[?#])'
  90. _TESTS = [{
  91. 'url': 'https://radiocomercial.pt/podcasts/convenca-me-num-minuto/t3',
  92. 'info_dict': {
  93. 'id': 'convenca-me-num-minuto_t3',
  94. 'title': 'Convença-me num Minuto - Temporada 3',
  95. },
  96. 'playlist_mincount': 32,
  97. }, {
  98. 'url': 'https://radiocomercial.pt/podcasts/o-homem-que-mordeu-o-cao',
  99. 'info_dict': {
  100. 'id': 'o-homem-que-mordeu-o-cao',
  101. 'title': 'O Homem Que Mordeu o Cão',
  102. },
  103. 'playlist_mincount': 19,
  104. }, {
  105. 'url': 'https://radiocomercial.pt/podcasts/as-minhas-coisas-favoritas',
  106. 'info_dict': {
  107. 'id': 'as-minhas-coisas-favoritas',
  108. 'title': 'As Minhas Coisas Favoritas',
  109. },
  110. 'playlist_mincount': 131,
  111. }, {
  112. 'url': 'https://radiocomercial.pt/podcasts/tnt-todos-no-top/t2023',
  113. 'info_dict': {
  114. 'id': 'tnt-todos-no-top_t2023',
  115. 'title': 'TNT - Todos No Top - Temporada 2023',
  116. },
  117. 'playlist_mincount': 39,
  118. }]
  119. def _entries(self, url, playlist_id):
  120. for page in itertools.count(1):
  121. try:
  122. webpage = self._download_webpage(
  123. f'{url}/{page}', playlist_id, f'Downloading page {page}')
  124. except ExtractorError as e:
  125. if isinstance(e.cause, HTTPError) and e.cause.status == 404:
  126. break
  127. raise
  128. episodes = get_elements_html_by_class('tm-ouvir-podcast', webpage)
  129. if not episodes:
  130. break
  131. for url_path in traverse_obj(episodes, (..., {extract_attributes}, 'href')):
  132. episode_url = urljoin(url, url_path)
  133. if RadioComercialIE.suitable(episode_url):
  134. yield episode_url
  135. def _real_extract(self, url):
  136. podcast, season = self._match_valid_url(url).group('id', 'season')
  137. playlist_id = join_nonempty(podcast, season, delim='_t')
  138. url = update_url(url, query=None, fragment=None)
  139. webpage = self._download_webpage(url, playlist_id)
  140. name = try_call(lambda: get_element_text_and_html_by_tag('h1', webpage)[0])
  141. title = name if name == season else join_nonempty(name, season, delim=' - Temporada ')
  142. return self.playlist_from_matches(
  143. self._entries(url, playlist_id), playlist_id, title, ie=RadioComercialIE)