polskieradio.py 24 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609
  1. import itertools
  2. import json
  3. import math
  4. import re
  5. import urllib.parse
  6. from .common import InfoExtractor
  7. from ..utils import (
  8. ExtractorError,
  9. InAdvancePagedList,
  10. determine_ext,
  11. extract_attributes,
  12. int_or_none,
  13. js_to_json,
  14. parse_iso8601,
  15. strip_or_none,
  16. traverse_obj,
  17. unescapeHTML,
  18. unified_timestamp,
  19. url_or_none,
  20. urljoin,
  21. )
  22. class PolskieRadioBaseExtractor(InfoExtractor):
  23. def _extract_webpage_player_entries(self, webpage, playlist_id, base_data):
  24. media_urls = set()
  25. for data_media in re.findall(r'<[^>]+data-media="?({[^>]+})"?', webpage):
  26. media = self._parse_json(data_media, playlist_id, transform_source=unescapeHTML, fatal=False)
  27. if not media.get('file') or not media.get('desc'):
  28. continue
  29. media_url = self._proto_relative_url(media['file'])
  30. if media_url in media_urls:
  31. continue
  32. media_urls.add(media_url)
  33. entry = base_data.copy()
  34. entry.update({
  35. 'id': str(media['id']),
  36. 'url': media_url,
  37. 'duration': int_or_none(media.get('length')),
  38. 'vcodec': 'none' if media.get('provider') == 'audio' else None,
  39. })
  40. entry_title = urllib.parse.unquote(media['desc'])
  41. if entry_title:
  42. entry['title'] = entry_title
  43. yield entry
  44. class PolskieRadioLegacyIE(PolskieRadioBaseExtractor):
  45. # legacy sites
  46. IE_NAME = 'polskieradio:legacy'
  47. _VALID_URL = r'https?://(?:www\.)?polskieradio(?:24)?\.pl/\d+/\d+/[Aa]rtykul/(?P<id>\d+)'
  48. _TESTS = [{
  49. 'url': 'https://www.polskieradio.pl/8/2382/Artykul/2534482,Zagarysci-Poezja-jak-spoiwo',
  50. 'info_dict': {
  51. 'id': '2534482',
  52. 'title': 'Żagaryści. Poezja jak spoiwo',
  53. 'description': 'md5:f18d95d5dcba747a09b635e21a4c0695',
  54. },
  55. 'playlist': [{
  56. 'md5': 'd07559829f61d5a93a75755987ded760',
  57. 'info_dict': {
  58. 'id': '2516679',
  59. 'ext': 'mp3',
  60. 'title': 'md5:c6e1234e0b747ad883cb91b7ad06b98c',
  61. 'timestamp': 1592654400,
  62. 'upload_date': '20200620',
  63. 'duration': 1430,
  64. 'thumbnail': r're:^https?://static\.prsa\.pl/images/.*\.jpg$',
  65. },
  66. }],
  67. }, {
  68. # PR4 audition - other frontend
  69. 'url': 'https://www.polskieradio.pl/10/6071/Artykul/2610977,Poglos-29-pazdziernika-godz-2301',
  70. 'info_dict': {
  71. 'id': '2610977',
  72. 'ext': 'mp3',
  73. 'title': 'Pogłos 29 października godz. 23:01',
  74. },
  75. }, {
  76. 'url': 'https://polskieradio24.pl/130/4503/Artykul/2621876,Narusza-nasza-suwerennosc-Publicysci-o-uzaleznieniu-funduszy-UE-od-praworzadnosci',
  77. 'only_matching': True,
  78. }]
  79. def _real_extract(self, url):
  80. playlist_id = self._match_id(url)
  81. webpage, urlh = self._download_webpage_handle(url, playlist_id)
  82. if PolskieRadioIE.suitable(urlh.url):
  83. return self.url_result(urlh.url, PolskieRadioIE, playlist_id)
  84. content = self._search_regex(
  85. r'(?s)<div[^>]+class="\s*this-article\s*"[^>]*>(.+?)<div[^>]+class="tags"[^>]*>',
  86. webpage, 'content', default=None)
  87. timestamp = unified_timestamp(self._html_search_regex(
  88. r'(?s)<span[^>]+id="datetime2"[^>]*>(.+?)</span>',
  89. webpage, 'timestamp', default=None))
  90. thumbnail_url = self._og_search_thumbnail(webpage, default=None)
  91. title = self._og_search_title(webpage).strip()
  92. description = strip_or_none(self._og_search_description(webpage, default=None))
  93. description = description.replace('\xa0', ' ') if description is not None else None
  94. if not content:
  95. return {
  96. 'id': playlist_id,
  97. 'url': self._proto_relative_url(
  98. self._search_regex(
  99. r"source:\s*'(//static\.prsa\.pl/[^']+)'",
  100. webpage, 'audition record url')),
  101. 'title': title,
  102. 'description': description,
  103. 'timestamp': timestamp,
  104. 'thumbnail': thumbnail_url,
  105. }
  106. entries = self._extract_webpage_player_entries(content, playlist_id, {
  107. 'title': title,
  108. 'timestamp': timestamp,
  109. 'thumbnail': thumbnail_url,
  110. })
  111. return self.playlist_result(entries, playlist_id, title, description)
  112. class PolskieRadioIE(PolskieRadioBaseExtractor):
  113. # new next.js sites
  114. _VALID_URL = r'https?://(?:[^/]+\.)?(?:polskieradio(?:24)?|radiokierowcow)\.pl/artykul/(?P<id>\d+)'
  115. _TESTS = [{
  116. # articleData, attachments
  117. 'url': 'https://jedynka.polskieradio.pl/artykul/1587943',
  118. 'info_dict': {
  119. 'id': '1587943',
  120. 'title': 'Prof. Andrzej Nowak: o historii nie da się myśleć beznamiętnie',
  121. 'description': 'md5:12f954edbf3120c5e7075e17bf9fc5c5',
  122. },
  123. 'playlist': [{
  124. 'md5': '2984ee6ce9046d91fc233bc1a864a09a',
  125. 'info_dict': {
  126. 'id': '7a85d429-5356-4def-a347-925e4ae7406b',
  127. 'ext': 'mp3',
  128. 'title': 'md5:d4623290d4ac983bf924061c75c23a0d',
  129. },
  130. }],
  131. }, {
  132. # post, legacy html players
  133. 'url': 'https://trojka.polskieradio.pl/artykul/2589163,Czy-wciaz-otrzymujemy-zdjecia-z-sond-Voyager',
  134. 'info_dict': {
  135. 'id': '2589163',
  136. 'title': 'Czy wciąż otrzymujemy zdjęcia z sond Voyager?',
  137. 'description': 'md5:cf1a7f348d63a2db9c0d7a63d1669473',
  138. },
  139. 'playlist': [{
  140. 'info_dict': {
  141. 'id': '2577880',
  142. 'ext': 'mp3',
  143. 'title': 'md5:a57d10a0c02abd34dd675cb33707ad5a',
  144. 'duration': 321,
  145. },
  146. }],
  147. }, {
  148. # data, legacy
  149. 'url': 'https://radiokierowcow.pl/artykul/2694529',
  150. 'info_dict': {
  151. 'id': '2694529',
  152. 'title': 'Zielona fala reliktem przeszłości?',
  153. 'description': 'md5:f20a9a7ed9cb58916c54add94eae3bc0',
  154. },
  155. 'playlist_count': 3,
  156. }, {
  157. 'url': 'https://trojka.polskieradio.pl/artykul/1632955',
  158. 'only_matching': True,
  159. }, {
  160. # with mp4 video
  161. 'url': 'https://trojka.polskieradio.pl/artykul/1634903',
  162. 'only_matching': True,
  163. }, {
  164. 'url': 'https://jedynka.polskieradio.pl/artykul/3042436,Polityka-wschodnia-ojca-i-syna-Wladyslawa-Lokietka-i-Kazimierza-Wielkiego',
  165. 'only_matching': True,
  166. }]
  167. def _real_extract(self, url):
  168. playlist_id = self._match_id(url)
  169. webpage = self._download_webpage(url, playlist_id)
  170. article_data = traverse_obj(
  171. self._search_nextjs_data(webpage, playlist_id), (
  172. 'props', 'pageProps', (('data', 'articleData'), 'post', 'data')), get_all=False)
  173. title = strip_or_none(article_data['title'])
  174. description = strip_or_none(article_data.get('lead'))
  175. entries = [{
  176. 'url': entry['file'],
  177. 'ext': determine_ext(entry.get('fileName')),
  178. 'id': self._search_regex(
  179. r'([a-f\d]{8}-(?:[a-f\d]{4}-){3}[a-f\d]{12})', entry['file'], 'entry id'),
  180. 'title': strip_or_none(entry.get('description')) or title,
  181. } for entry in article_data.get('attachments') or () if entry.get('fileType') in ('Audio', )]
  182. if not entries:
  183. # some legacy articles have no json attachments, but players in body
  184. entries = self._extract_webpage_player_entries(article_data['content'], playlist_id, {
  185. 'title': title,
  186. })
  187. return self.playlist_result(entries, playlist_id, title, description)
  188. class PolskieRadioAuditionIE(InfoExtractor):
  189. # new next.js sites
  190. IE_NAME = 'polskieradio:audition'
  191. _VALID_URL = r'https?://(?:[^/]+\.)?polskieradio\.pl/audycj[ae]/(?P<id>\d+)'
  192. _TESTS = [{
  193. # articles, PR1
  194. 'url': 'https://jedynka.polskieradio.pl/audycje/5102',
  195. 'info_dict': {
  196. 'id': '5102',
  197. 'title': 'Historia żywa',
  198. 'thumbnail': r're:https://static\.prsa\.pl/images/.+',
  199. },
  200. 'playlist_mincount': 38,
  201. }, {
  202. # episodes, PR1
  203. 'url': 'https://jedynka.polskieradio.pl/audycje/5769',
  204. 'info_dict': {
  205. 'id': '5769',
  206. 'title': 'AgroFakty',
  207. 'thumbnail': r're:https://static\.prsa\.pl/images/.+',
  208. },
  209. 'playlist_mincount': 269,
  210. }, {
  211. # both episodes and articles, PR3
  212. 'url': 'https://trojka.polskieradio.pl/audycja/8906',
  213. 'info_dict': {
  214. 'id': '8906',
  215. 'title': 'Trójka budzi',
  216. 'thumbnail': r're:https://static\.prsa\.pl/images/.+',
  217. },
  218. 'playlist_mincount': 722,
  219. }, {
  220. # some articles were "promoted to main page" and thus link to old frontend
  221. 'url': 'https://trojka.polskieradio.pl/audycja/305',
  222. 'info_dict': {
  223. 'id': '305',
  224. 'title': 'Co w mowie piszczy?',
  225. 'thumbnail': r're:https://static\.prsa\.pl/images/.+',
  226. },
  227. 'playlist_count': 1523,
  228. }]
  229. def _call_lp3(self, path, query, video_id, note):
  230. return self._download_json(
  231. f'https://lp3test.polskieradio.pl/{path}', video_id, note,
  232. query=query, headers={'x-api-key': '9bf6c5a2-a7d0-4980-9ed7-a3f7291f2a81'})
  233. def _entries(self, playlist_id, has_episodes, has_articles):
  234. for i in itertools.count(0) if has_episodes else []:
  235. page = self._call_lp3(
  236. 'AudioArticle/GetListByCategoryId', {
  237. 'categoryId': playlist_id,
  238. 'PageSize': 10,
  239. 'skip': i,
  240. 'format': 400,
  241. }, playlist_id, f'Downloading episode list page {i + 1}')
  242. if not traverse_obj(page, 'data'):
  243. break
  244. for episode in page['data']:
  245. yield {
  246. 'id': str(episode['id']),
  247. 'url': episode['file'],
  248. 'title': episode.get('title'),
  249. 'duration': int_or_none(episode.get('duration')),
  250. 'timestamp': parse_iso8601(episode.get('datePublic')),
  251. }
  252. for i in itertools.count(0) if has_articles else []:
  253. page = self._call_lp3(
  254. 'Article/GetListByCategoryId', {
  255. 'categoryId': playlist_id,
  256. 'PageSize': 9,
  257. 'skip': i,
  258. 'format': 400,
  259. }, playlist_id, f'Downloading article list page {i + 1}')
  260. if not traverse_obj(page, 'data'):
  261. break
  262. for article in page['data']:
  263. yield {
  264. '_type': 'url_transparent',
  265. 'id': str(article['id']),
  266. 'url': article['url'],
  267. 'title': article.get('shortTitle'),
  268. 'description': traverse_obj(article, ('description', 'lead')),
  269. 'timestamp': parse_iso8601(article.get('datePublic')),
  270. }
  271. def _real_extract(self, url):
  272. playlist_id = self._match_id(url)
  273. page_props = traverse_obj(
  274. self._search_nextjs_data(self._download_webpage(url, playlist_id), playlist_id),
  275. ('props', 'pageProps', ('data', None)), get_all=False)
  276. has_episodes = bool(traverse_obj(page_props, 'episodes', 'audios'))
  277. has_articles = bool(traverse_obj(page_props, 'articles'))
  278. return self.playlist_result(
  279. self._entries(playlist_id, has_episodes, has_articles), playlist_id,
  280. title=traverse_obj(page_props, ('details', 'name')),
  281. description=traverse_obj(page_props, ('details', 'description', 'lead')),
  282. thumbnail=traverse_obj(page_props, ('details', 'photo')))
class PolskieRadioCategoryIE(InfoExtractor):
    # legacy sites
    IE_NAME = 'polskieradio:category'
    _VALID_URL = r'https?://(?:www\.)?polskieradio\.pl/(?:\d+(?:,[^/]+)?/|[^/]+/Tag)(?P<id>\d+)'
    _TESTS = [{
        'url': 'http://www.polskieradio.pl/37,RedakcjaKatolicka/4143,Kierunek-Krakow',
        'info_dict': {
            'id': '4143',
            'title': 'Kierunek Kraków',
        },
        'playlist_mincount': 61,
    }, {
        'url': 'http://www.polskieradio.pl/10,czworka/214,muzyka',
        'info_dict': {
            'id': '214',
            'title': 'Muzyka',
        },
        'playlist_mincount': 61,
    }, {
        # billennium tabs
        'url': 'https://www.polskieradio.pl/8/2385',
        'info_dict': {
            'id': '2385',
            'title': 'Droga przez mąkę',
        },
        'playlist_mincount': 111,
    }, {
        'url': 'https://www.polskieradio.pl/10/4930',
        'info_dict': {
            'id': '4930',
            'title': 'Teraz K-pop!',
        },
        'playlist_mincount': 392,
    }, {
        # post back pages, audio content directly without articles
        'url': 'https://www.polskieradio.pl/8,dwojka/7376,nowa-mowa',
        'info_dict': {
            'id': '7376',
            'title': 'Nowa mowa',
        },
        'playlist_mincount': 244,
    }, {
        'url': 'https://www.polskieradio.pl/Krzysztof-Dziuba/Tag175458',
        'info_dict': {
            'id': '175458',
            'title': 'Krzysztof Dziuba',
        },
        'playlist_mincount': 420,
    }, {
        'url': 'http://www.polskieradio.pl/8,Dwojka/196,Publicystyka',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Reject URLs that PolskieRadioLegacyIE accepts; otherwise use the inherited check."""
        return False if PolskieRadioLegacyIE.suitable(url) else super().suitable(url)

    def _entries(self, url, page, category_id):
        """Yield the entries of a category listing, following its pagination.

        *page* is the HTML of the first listing page. Each round scans the
        current content for article links and inline ``data-media`` players,
        then fetches the next page with one of three mechanisms, chosen once
        from the markup of the first page.
        """
        content = page
        # The pagination flavour is detected from the first page only
        is_billennium_tabs = 'onclick="TB_LoadTab(' in page
        is_post_back = 'onclick="__doPostBack(' in page
        # For tabbed pages the pager markup is tracked separately from the
        # content, since the tab endpoint returns them as separate fields
        pagination = page if is_billennium_tabs else None
        # Counting starts at 2: page_num is only used to label the download of
        # the *next* page, and the first page was already supplied
        for page_num in itertools.count(2):
            # Links to legacy articles found inside <article> elements
            for a_entry, entry_id in re.findall(
                    r'(?s)<article[^>]+>.*?(<a[^>]+href=["\'](?:(?:https?)?://[^/]+)?/\d+/\d+/Artykul/(\d+)[^>]+>).*?</article>',
                    content):
                entry = extract_attributes(a_entry)
                if entry.get('href'):
                    yield self.url_result(
                        urljoin(url, entry['href']), PolskieRadioLegacyIE, entry_id, entry.get('title'))
            # Audio players embedded directly in the listing
            for a_entry in re.findall(r'<span data-media=({[^ ]+})', content):
                yield traverse_obj(self._parse_json(a_entry, category_id), {
                    'url': 'file',
                    'id': 'uid',
                    'duration': 'length',
                    'title': ('title', {urllib.parse.unquote}),
                    'description': ('desc', {urllib.parse.unquote}),
                })
            if is_billennium_tabs:
                # The arguments of the "next" link's TB_LoadTab(...) call are
                # read as a JSON array: a '[' is prepended to the matched text
                # NOTE(review): close_objects=1 presumably supplies the closing
                # bracket - confirm against _search_json
                params = self._search_json(
                    r'<div[^>]+class=["\']next["\'][^>]*>\s*<a[^>]+onclick=["\']TB_LoadTab\(',
                    pagination, 'next page params', category_id, default=None, close_objects=1,
                    contains_pattern='.+', transform_source=lambda x: f'[{js_to_json(unescapeHTML(x))}')
                if not params:
                    break
                # The positional arguments are paired with these field names to
                # form the JSON request body
                tab_content = self._download_json(
                    'https://www.polskieradio.pl/CMS/TemplateBoxesManagement/TemplateBoxTabContent.aspx/GetTabContent',
                    category_id, f'Downloading page {page_num}', headers={'content-type': 'application/json'},
                    data=json.dumps(dict(zip((
                        'boxInstanceId', 'tabId', 'categoryType', 'sectionId', 'categoryId', 'pagerMode',
                        'subjectIds', 'tagIndexId', 'queryString', 'name', 'openArticlesInParentTemplate',
                        'idSectionFromUrl', 'maxDocumentAge', 'showCategoryForArticle', 'pageNumber',
                    ), params))).encode())['d']
                content, pagination = tab_content['Content'], tab_content.get('PagerContent')
            elif is_post_back:
                # Find the __doPostBack target whose argument is 'Next'
                target = self._search_regex(
                    r'onclick=(?:["\'])__doPostBack\((?P<q1>["\'])(?P<target>[\w$]+)(?P=q1)\s*,\s*(?P<q2>["\'])Next(?P=q2)',
                    content, 'pagination postback target', group='target', default=None)
                if not target:
                    break
                # Re-submit the current page's hidden inputs to the same URL,
                # with the event target/argument set for the "Next" link
                content = self._download_webpage(
                    url, category_id, f'Downloading page {page_num}',
                    data=urllib.parse.urlencode({
                        **self._hidden_inputs(content),
                        '__EVENTTARGET': target,
                        '__EVENTARGUMENT': 'Next',
                    }).encode())
            else:
                # Plain pagination: follow the href of the "next" link
                next_url = urljoin(url, self._search_regex(
                    r'<div[^>]+class=["\']next["\'][^>]*>\s*<a[^>]+href=(["\'])(?P<url>(?:(?!\1).)+)\1',
                    content, 'next page url', group='url', default=None))
                if not next_url:
                    break
                content = self._download_webpage(next_url, category_id, f'Downloading page {page_num}')

    def _real_extract(self, url):
        """Return the category as a playlist, or delegate to PolskieRadioAuditionIE.

        Delegation happens when the request ends up on a URL that
        PolskieRadioAuditionIE handles.
        """
        category_id = self._match_id(url)
        webpage, urlh = self._download_webpage_handle(url, category_id)
        if PolskieRadioAuditionIE.suitable(urlh.url):
            return self.url_result(urlh.url, PolskieRadioAuditionIE, category_id)
        # The title is the <title> text with its trailing site suffix removed
        title = self._html_search_regex(
            r'<title>([^<]+)(?: - [^<]+ - [^<]+| w [Pp]olskie[Rr]adio\.pl\s*)</title>',
            webpage, 'title', fatal=False)
        return self.playlist_result(
            self._entries(url, webpage, category_id),
            category_id, title)
  406. class PolskieRadioPlayerIE(InfoExtractor):
  407. IE_NAME = 'polskieradio:player'
  408. _VALID_URL = r'https?://player\.polskieradio\.pl/anteny/(?P<id>[^/]+)'
  409. _BASE_URL = 'https://player.polskieradio.pl'
  410. _PLAYER_URL = 'https://player.polskieradio.pl/main.bundle.js'
  411. _STATIONS_API_URL = 'https://apipr.polskieradio.pl/api/stacje'
  412. _TESTS = [{
  413. 'url': 'https://player.polskieradio.pl/anteny/trojka',
  414. 'info_dict': {
  415. 'id': '3',
  416. 'ext': 'm4a',
  417. 'title': 'Trójka',
  418. },
  419. 'params': {
  420. 'format': 'bestaudio',
  421. 'skip_download': 'endless stream',
  422. },
  423. }]
  424. def _get_channel_list(self, channel_url='no_channel'):
  425. player_code = self._download_webpage(
  426. self._PLAYER_URL, channel_url,
  427. note='Downloading js player')
  428. channel_list = js_to_json(self._search_regex(
  429. r';var r="anteny",a=(\[.+?\])},', player_code, 'channel list'))
  430. return self._parse_json(channel_list, channel_url)
  431. def _real_extract(self, url):
  432. channel_url = self._match_id(url)
  433. channel_list = self._get_channel_list(channel_url)
  434. channel = next((c for c in channel_list if c.get('url') == channel_url), None)
  435. if not channel:
  436. raise ExtractorError('Channel not found')
  437. station_list = self._download_json(self._STATIONS_API_URL, channel_url,
  438. note='Downloading stream url list',
  439. headers={
  440. 'Accept': 'application/json',
  441. 'Referer': url,
  442. 'Origin': self._BASE_URL,
  443. })
  444. station = next((s for s in station_list
  445. if s.get('Name') == (channel.get('streamName') or channel.get('name'))), None)
  446. if not station:
  447. raise ExtractorError('Station not found even though we extracted channel')
  448. formats = []
  449. for stream_url in station['Streams']:
  450. stream_url = self._proto_relative_url(stream_url)
  451. if stream_url.endswith('/playlist.m3u8'):
  452. formats.extend(self._extract_m3u8_formats(stream_url, channel_url, live=True))
  453. elif stream_url.endswith('/manifest.f4m'):
  454. formats.extend(self._extract_mpd_formats(stream_url, channel_url))
  455. elif stream_url.endswith('/Manifest'):
  456. formats.extend(self._extract_ism_formats(stream_url, channel_url))
  457. else:
  458. formats.append({
  459. 'url': stream_url,
  460. })
  461. return {
  462. 'id': str(channel['id']),
  463. 'formats': formats,
  464. 'title': channel.get('name') or channel.get('streamName'),
  465. 'display_id': channel_url,
  466. 'thumbnail': f'{self._BASE_URL}/images/{channel_url}-color-logo.png',
  467. 'is_live': True,
  468. }
  469. class PolskieRadioPodcastBaseExtractor(InfoExtractor):
  470. _API_BASE = 'https://apipodcasts.polskieradio.pl/api'
  471. def _parse_episode(self, data):
  472. return {
  473. 'id': data['guid'],
  474. 'formats': [{
  475. 'url': data['url'],
  476. 'filesize': int_or_none(data.get('fileSize')),
  477. }],
  478. 'title': data['title'],
  479. 'description': data.get('description'),
  480. 'duration': int_or_none(data.get('length')),
  481. 'timestamp': parse_iso8601(data.get('publishDate')),
  482. 'thumbnail': url_or_none(data.get('image')),
  483. 'series': data.get('podcastTitle'),
  484. 'episode': data['title'],
  485. }
  486. class PolskieRadioPodcastListIE(PolskieRadioPodcastBaseExtractor):
  487. IE_NAME = 'polskieradio:podcast:list'
  488. _VALID_URL = r'https?://podcasty\.polskieradio\.pl/podcast/(?P<id>\d+)'
  489. _TESTS = [{
  490. 'url': 'https://podcasty.polskieradio.pl/podcast/8/',
  491. 'info_dict': {
  492. 'id': '8',
  493. 'title': 'Śniadanie w Trójce',
  494. 'description': 'md5:57abcc27bc4c6a6b25baa3061975b9ef',
  495. 'uploader': 'Beata Michniewicz',
  496. },
  497. 'playlist_mincount': 714,
  498. }]
  499. _PAGE_SIZE = 10
  500. def _call_api(self, podcast_id, page):
  501. return self._download_json(
  502. f'{self._API_BASE}/Podcasts/{podcast_id}/?pageSize={self._PAGE_SIZE}&page={page}',
  503. podcast_id, f'Downloading page {page}')
  504. def _real_extract(self, url):
  505. podcast_id = self._match_id(url)
  506. data = self._call_api(podcast_id, 1)
  507. def get_page(page_num):
  508. page_data = self._call_api(podcast_id, page_num + 1) if page_num else data
  509. yield from (self._parse_episode(ep) for ep in page_data['items'])
  510. return {
  511. '_type': 'playlist',
  512. 'entries': InAdvancePagedList(
  513. get_page, math.ceil(data['itemCount'] / self._PAGE_SIZE), self._PAGE_SIZE),
  514. 'id': str(data['id']),
  515. 'title': data.get('title'),
  516. 'description': data.get('description'),
  517. 'uploader': data.get('announcer'),
  518. }
  519. class PolskieRadioPodcastIE(PolskieRadioPodcastBaseExtractor):
  520. IE_NAME = 'polskieradio:podcast'
  521. _VALID_URL = r'https?://podcasty\.polskieradio\.pl/track/(?P<id>[a-f\d]{8}(?:-[a-f\d]{4}){4}[a-f\d]{8})'
  522. _TESTS = [{
  523. 'url': 'https://podcasty.polskieradio.pl/track/6eafe403-cb8f-4756-b896-4455c3713c32',
  524. 'info_dict': {
  525. 'id': '6eafe403-cb8f-4756-b896-4455c3713c32',
  526. 'ext': 'mp3',
  527. 'title': 'Theresa May rezygnuje. Co dalej z brexitem?',
  528. 'description': 'md5:e41c409a29d022b70ef0faa61dbded60',
  529. 'episode': 'Theresa May rezygnuje. Co dalej z brexitem?',
  530. 'duration': 2893,
  531. 'thumbnail': 'https://static.prsa.pl/images/58649376-c8a0-4ba2-a714-78b383285f5f.jpg',
  532. 'series': 'Raport o stanie świata',
  533. },
  534. }]
  535. def _real_extract(self, url):
  536. podcast_id = self._match_id(url)
  537. data = self._download_json(
  538. f'{self._API_BASE}/audio',
  539. podcast_id, 'Downloading podcast metadata',
  540. data=json.dumps({
  541. 'guids': [podcast_id],
  542. }).encode(),
  543. headers={
  544. 'Content-Type': 'application/json',
  545. })
  546. return self._parse_episode(data[0])