# sverigesradio.py (5.4 KB)
  1. from .common import InfoExtractor
  2. from ..utils import (
  3. determine_ext,
  4. extract_attributes,
  5. get_element_by_id,
  6. get_element_html_by_class,
  7. int_or_none,
  8. str_or_none,
  9. traverse_obj,
  10. url_or_none,
  11. )
  12. class SverigesRadioBaseIE(InfoExtractor):
  13. _BASE_URL = 'https://sverigesradio.se/sida/playerajax/'
  14. _QUALITIES = ['low', 'medium', 'high']
  15. _EXT_TO_CODEC_MAP = {
  16. 'mp3': 'mp3',
  17. 'm4a': 'aac',
  18. }
  19. _CODING_FORMAT_TO_ABR_MAP = {
  20. 5: 128,
  21. 11: 192,
  22. 12: 32,
  23. 13: 96,
  24. }
  25. def _real_extract(self, url):
  26. audio_id, display_id = self._match_valid_url(url).group('id', 'slug')
  27. if not audio_id:
  28. webpage = self._download_webpage(url, display_id)
  29. audio_id = (
  30. traverse_obj(
  31. get_element_html_by_class('audio-button', webpage),
  32. ({extract_attributes}, ('data-audio-id', 'data-publication-id')), get_all=False)
  33. or self._parse_json(get_element_by_id('gtm-metadata', webpage), display_id)['pageId'])
  34. query = {
  35. 'id': audio_id,
  36. 'type': self._AUDIO_TYPE,
  37. }
  38. item = self._download_json(
  39. self._BASE_URL + 'audiometadata', audio_id,
  40. 'Downloading audio JSON metadata', query=query)['items'][0]
  41. query['format'] = 'iis'
  42. urls = []
  43. formats = []
  44. for quality in self._QUALITIES:
  45. query['quality'] = quality
  46. audio_url_data = self._download_json(
  47. self._BASE_URL + 'getaudiourl', audio_id,
  48. f'Downloading {quality} format JSON metadata',
  49. fatal=False, query=query) or {}
  50. audio_url = audio_url_data.get('audioUrl')
  51. if not audio_url or audio_url in urls:
  52. continue
  53. urls.append(audio_url)
  54. ext = determine_ext(audio_url)
  55. coding_format = audio_url_data.get('codingFormat')
  56. abr = int_or_none(self._search_regex(
  57. r'_a(\d+)\.m4a', audio_url, 'audio bitrate',
  58. default=None)) or self._CODING_FORMAT_TO_ABR_MAP.get(coding_format)
  59. formats.append({
  60. 'abr': abr,
  61. 'acodec': self._EXT_TO_CODEC_MAP.get(ext),
  62. 'ext': ext,
  63. 'format_id': str_or_none(coding_format),
  64. 'vcodec': 'none',
  65. 'url': audio_url,
  66. })
  67. return {
  68. 'id': audio_id,
  69. 'formats': formats,
  70. **traverse_obj(item, {
  71. 'title': 'subtitle',
  72. 'series': 'title',
  73. 'duration': ('duration', {int_or_none}),
  74. 'thumbnail': ('displayimageurl', {url_or_none}),
  75. 'description': 'description',
  76. }),
  77. }
  78. class SverigesRadioPublicationIE(SverigesRadioBaseIE):
  79. IE_NAME = 'sverigesradio:publication'
  80. _VALID_URL = r'https?://(?:www\.)?sverigesradio\.se/(?:sida/)?(?:artikel|gruppsida)(?:\.aspx\?.*?\bartikel=(?P<id>[0-9]+)|/(?P<slug>[\w-]+))'
  81. _TESTS = [{
  82. 'url': 'https://sverigesradio.se/sida/artikel.aspx?programid=83&artikel=7038546',
  83. 'md5': '6a4917e1923fccb080e5a206a5afa542',
  84. 'info_dict': {
  85. 'id': '7038546',
  86. 'ext': 'm4a',
  87. 'duration': 132,
  88. 'series': 'Nyheter (Ekot)',
  89. 'title': 'Esa Teittinen: Sanningen har inte kommit fram',
  90. 'description': 'md5:daf7ce66a8f0a53d5465a5984d3839df',
  91. 'thumbnail': r're:^https?://.*\.jpg',
  92. },
  93. }, {
  94. 'url': 'https://sverigesradio.se/artikel/tysk-fotbollsfeber-bayern-munchens-10-ariga-segersvit-kan-brytas',
  95. 'md5': 'f8a914ad50f491bb74eed403ab4bfef6',
  96. 'info_dict': {
  97. 'id': '8360345',
  98. 'ext': 'm4a',
  99. 'title': 'Tysk fotbollsfeber när Bayern Münchens 10-åriga segersvit kan brytas',
  100. 'series': 'Radiosporten',
  101. 'description': 'md5:5254610e20ce527ecb3a6102a06dcc5f',
  102. 'duration': 72,
  103. 'thumbnail': r're:^https?://.*\.jpg',
  104. },
  105. }, {
  106. 'url': 'https://sverigesradio.se/sida/gruppsida.aspx?programid=3304&grupp=6247&artikel=7146887',
  107. 'only_matching': True,
  108. }]
  109. _AUDIO_TYPE = 'publication'
  110. class SverigesRadioEpisodeIE(SverigesRadioBaseIE):
  111. IE_NAME = 'sverigesradio:episode'
  112. _VALID_URL = r'https?://(?:www\.)?sverigesradio\.se/(?:sida/)?avsnitt/(?:(?P<id>\d+)|(?P<slug>[\w-]+))(?:$|[#?])'
  113. _TESTS = [{
  114. 'url': 'https://sverigesradio.se/avsnitt/1140922?programid=1300',
  115. 'md5': '20dc4d8db24228f846be390b0c59a07c',
  116. 'info_dict': {
  117. 'id': '1140922',
  118. 'ext': 'mp3',
  119. 'duration': 3307,
  120. 'series': 'Konflikt',
  121. 'title': 'Metoo och valen',
  122. 'description': 'md5:fcb5c1f667f00badcc702b196f10a27e',
  123. 'thumbnail': r're:^https?://.*\.jpg',
  124. },
  125. }, {
  126. 'url': 'https://sverigesradio.se/avsnitt/p4-live-med-first-aid-kit-scandinavium-mars-2023',
  127. 'md5': 'ce17fb82520a8033dbb846993d5589fe',
  128. 'info_dict': {
  129. 'id': '2160416',
  130. 'ext': 'm4a',
  131. 'title': 'P4 Live med First Aid Kit',
  132. 'description': 'md5:6d5b78eed3d2b65f6de04daa45e9285d',
  133. 'thumbnail': r're:^https?://.*\.jpg',
  134. 'series': 'P4 Live',
  135. 'duration': 5640,
  136. },
  137. }]
  138. _AUDIO_TYPE = 'episode'