# prx.py (16 KB)

import itertools

from .common import InfoExtractor, SearchInfoExtractor
from ..utils import (
    ExtractorError,
    clean_html,
    int_or_none,
    mimetype2ext,
    str_or_none,
    traverse_obj,
    unified_timestamp,
    url_or_none,
    urljoin,
)


  13. class PRXBaseIE(InfoExtractor):
  14. PRX_BASE_URL_RE = r'https?://(?:(?:beta|listen)\.)?prx.org/%s'
  15. def _call_api(self, item_id, path, query=None, fatal=True, note='Downloading CMS API JSON'):
  16. return self._download_json(
  17. urljoin('https://cms.prx.org/api/v1/', path), item_id, query=query, fatal=fatal, note=note)
  18. @staticmethod
  19. def _get_prx_embed_response(response, section):
  20. return traverse_obj(response, ('_embedded', f'prx:{section}'))
  21. @staticmethod
  22. def _extract_file_link(response):
  23. return url_or_none(traverse_obj(
  24. response, ('_links', 'enclosure', 'href'), expected_type=str))
  25. @classmethod
  26. def _extract_image(cls, image_response):
  27. if not isinstance(image_response, dict):
  28. return
  29. return {
  30. 'id': str_or_none(image_response.get('id')),
  31. 'filesize': image_response.get('size'),
  32. 'width': image_response.get('width'),
  33. 'height': image_response.get('height'),
  34. 'url': cls._extract_file_link(image_response),
  35. }
  36. @classmethod
  37. def _extract_base_info(cls, response):
  38. if not isinstance(response, dict):
  39. return
  40. item_id = str_or_none(response.get('id'))
  41. if not item_id:
  42. return
  43. thumbnail_dict = cls._extract_image(cls._get_prx_embed_response(response, 'image'))
  44. description = (
  45. clean_html(response.get('description'))
  46. or response.get('shortDescription'))
  47. return {
  48. 'id': item_id,
  49. 'title': response.get('title') or item_id,
  50. 'thumbnails': [thumbnail_dict] if thumbnail_dict else None,
  51. 'description': description,
  52. 'release_timestamp': unified_timestamp(response.get('releasedAt')),
  53. 'timestamp': unified_timestamp(response.get('createdAt')),
  54. 'modified_timestamp': unified_timestamp(response.get('updatedAt')),
  55. 'duration': int_or_none(response.get('duration')),
  56. 'tags': response.get('tags'),
  57. 'episode_number': int_or_none(response.get('episodeIdentifier')),
  58. 'season_number': int_or_none(response.get('seasonIdentifier')),
  59. }
  60. @classmethod
  61. def _extract_series_info(cls, series_response):
  62. base_info = cls._extract_base_info(series_response)
  63. if not base_info:
  64. return
  65. account_info = cls._extract_account_info(
  66. cls._get_prx_embed_response(series_response, 'account')) or {}
  67. return {
  68. **base_info,
  69. 'channel_id': account_info.get('channel_id'),
  70. 'channel_url': account_info.get('channel_url'),
  71. 'channel': account_info.get('channel'),
  72. 'series': base_info.get('title'),
  73. 'series_id': base_info.get('id'),
  74. }
  75. @classmethod
  76. def _extract_account_info(cls, account_response):
  77. base_info = cls._extract_base_info(account_response)
  78. if not base_info:
  79. return
  80. name = account_response.get('name')
  81. return {
  82. **base_info,
  83. 'title': name,
  84. 'channel_id': base_info.get('id'),
  85. 'channel_url': 'https://beta.prx.org/accounts/{}'.format(base_info.get('id')),
  86. 'channel': name,
  87. }
  88. @classmethod
  89. def _extract_story_info(cls, story_response):
  90. base_info = cls._extract_base_info(story_response)
  91. if not base_info:
  92. return
  93. series = cls._extract_series_info(
  94. cls._get_prx_embed_response(story_response, 'series')) or {}
  95. account = cls._extract_account_info(
  96. cls._get_prx_embed_response(story_response, 'account')) or {}
  97. return {
  98. **base_info,
  99. 'series': series.get('series'),
  100. 'series_id': series.get('series_id'),
  101. 'channel_id': account.get('channel_id'),
  102. 'channel_url': account.get('channel_url'),
  103. 'channel': account.get('channel'),
  104. }
  105. def _entries(self, item_id, endpoint, entry_func, query=None):
  106. """
  107. Extract entries from paginated list API
  108. @param entry_func: Function to generate entry from response item
  109. """
  110. total = 0
  111. for page in itertools.count(1):
  112. response = self._call_api(f'{item_id}: page {page}', endpoint, query={
  113. **(query or {}),
  114. 'page': page,
  115. 'per': 100,
  116. })
  117. items = self._get_prx_embed_response(response, 'items')
  118. if not response or not items:
  119. break
  120. yield from filter(None, map(entry_func, items))
  121. total += response['count']
  122. if total >= response['total']:
  123. break
  124. def _story_playlist_entry(self, response):
  125. story = self._extract_story_info(response)
  126. if not story:
  127. return
  128. story.update({
  129. '_type': 'url',
  130. 'url': 'https://beta.prx.org/stories/{}'.format(story['id']),
  131. 'ie_key': PRXStoryIE.ie_key(),
  132. })
  133. return story
  134. def _series_playlist_entry(self, response):
  135. series = self._extract_series_info(response)
  136. if not series:
  137. return
  138. series.update({
  139. '_type': 'url',
  140. 'url': 'https://beta.prx.org/series/{}'.format(series['id']),
  141. 'ie_key': PRXSeriesIE.ie_key(),
  142. })
  143. return series
  144. class PRXStoryIE(PRXBaseIE):
  145. _VALID_URL = PRXBaseIE.PRX_BASE_URL_RE % r'stories/(?P<id>\d+)'
  146. _TESTS = [
  147. {
  148. # Story with season and episode details
  149. 'url': 'https://beta.prx.org/stories/399200',
  150. 'info_dict': {
  151. 'id': '399200',
  152. 'title': 'Fly Me To The Moon',
  153. 'description': 'md5:43230168390b95d3322048d8a56bf2bb',
  154. 'release_timestamp': 1640250000,
  155. 'timestamp': 1640208972,
  156. 'modified_timestamp': 1641318202,
  157. 'duration': 1004,
  158. 'tags': 'count:7',
  159. 'episode_number': 8,
  160. 'season_number': 5,
  161. 'series': 'AirSpace',
  162. 'series_id': '38057',
  163. 'channel_id': '220986',
  164. 'channel_url': 'https://beta.prx.org/accounts/220986',
  165. 'channel': 'Air and Space Museum',
  166. },
  167. 'playlist': [{
  168. 'info_dict': {
  169. 'id': '399200_part1',
  170. 'title': 'Fly Me To The Moon',
  171. 'description': 'md5:43230168390b95d3322048d8a56bf2bb',
  172. 'release_timestamp': 1640250000,
  173. 'timestamp': 1640208972,
  174. 'modified_timestamp': 1641318202,
  175. 'duration': 530,
  176. 'tags': 'count:7',
  177. 'episode_number': 8,
  178. 'season_number': 5,
  179. 'series': 'AirSpace',
  180. 'series_id': '38057',
  181. 'channel_id': '220986',
  182. 'channel_url': 'https://beta.prx.org/accounts/220986',
  183. 'channel': 'Air and Space Museum',
  184. 'ext': 'mp3',
  185. 'upload_date': '20211222',
  186. 'episode': 'Episode 8',
  187. 'release_date': '20211223',
  188. 'season': 'Season 5',
  189. 'modified_date': '20220104',
  190. },
  191. }, {
  192. 'info_dict': {
  193. 'id': '399200_part2',
  194. 'title': 'Fly Me To The Moon',
  195. 'description': 'md5:43230168390b95d3322048d8a56bf2bb',
  196. 'release_timestamp': 1640250000,
  197. 'timestamp': 1640208972,
  198. 'modified_timestamp': 1641318202,
  199. 'duration': 474,
  200. 'tags': 'count:7',
  201. 'episode_number': 8,
  202. 'season_number': 5,
  203. 'series': 'AirSpace',
  204. 'series_id': '38057',
  205. 'channel_id': '220986',
  206. 'channel_url': 'https://beta.prx.org/accounts/220986',
  207. 'channel': 'Air and Space Museum',
  208. 'ext': 'mp3',
  209. 'upload_date': '20211222',
  210. 'episode': 'Episode 8',
  211. 'release_date': '20211223',
  212. 'season': 'Season 5',
  213. 'modified_date': '20220104',
  214. },
  215. },
  216. ],
  217. }, {
  218. # Story with only split audio
  219. 'url': 'https://beta.prx.org/stories/326414',
  220. 'info_dict': {
  221. 'id': '326414',
  222. 'title': 'Massachusetts v EPA',
  223. 'description': 'md5:744fffba08f19f4deab69fa8d49d5816',
  224. 'timestamp': 1592509124,
  225. 'modified_timestamp': 1592510457,
  226. 'duration': 3088,
  227. 'tags': 'count:0',
  228. 'series': 'Outside/In',
  229. 'series_id': '36252',
  230. 'channel_id': '206',
  231. 'channel_url': 'https://beta.prx.org/accounts/206',
  232. 'channel': 'New Hampshire Public Radio',
  233. },
  234. 'playlist_count': 4,
  235. }, {
  236. # Story with single combined audio
  237. 'url': 'https://beta.prx.org/stories/400404',
  238. 'info_dict': {
  239. 'id': '400404',
  240. 'title': 'Cafe Chill (Episode 2022-01)',
  241. 'thumbnails': 'count:1',
  242. 'description': 'md5:9f1b5a3cbd64fb159d08c3baa31f1539',
  243. 'timestamp': 1641233952,
  244. 'modified_timestamp': 1641234248,
  245. 'duration': 3540,
  246. 'series': 'Café Chill',
  247. 'series_id': '37762',
  248. 'channel_id': '5767',
  249. 'channel_url': 'https://beta.prx.org/accounts/5767',
  250. 'channel': 'C89.5 - KNHC Seattle',
  251. 'ext': 'mp3',
  252. 'tags': 'count:0',
  253. 'thumbnail': r're:https?://cms\.prx\.org/pub/\w+/0/web/story_image/767965/medium/Aurora_Over_Trees\.jpg',
  254. 'upload_date': '20220103',
  255. 'modified_date': '20220103',
  256. },
  257. }, {
  258. 'url': 'https://listen.prx.org/stories/399200',
  259. 'only_matching': True,
  260. },
  261. ]
  262. def _extract_audio_pieces(self, audio_response):
  263. return [{
  264. 'format_id': str_or_none(piece_response.get('id')),
  265. 'format_note': str_or_none(piece_response.get('label')),
  266. 'filesize': int_or_none(piece_response.get('size')),
  267. 'duration': int_or_none(piece_response.get('duration')),
  268. 'ext': mimetype2ext(piece_response.get('contentType')),
  269. 'asr': int_or_none(piece_response.get('frequency'), scale=1000),
  270. 'abr': int_or_none(piece_response.get('bitRate')),
  271. 'url': self._extract_file_link(piece_response),
  272. 'vcodec': 'none',
  273. } for piece_response in sorted(
  274. self._get_prx_embed_response(audio_response, 'items') or [],
  275. key=lambda p: int_or_none(p.get('position')))]
  276. def _extract_story(self, story_response):
  277. info = self._extract_story_info(story_response)
  278. if not info:
  279. return
  280. audio_pieces = self._extract_audio_pieces(
  281. self._get_prx_embed_response(story_response, 'audio'))
  282. if len(audio_pieces) == 1:
  283. return {
  284. 'formats': audio_pieces,
  285. **info,
  286. }
  287. entries = [{
  288. **info,
  289. 'id': '{}_part{}'.format(info['id'], (idx + 1)),
  290. 'formats': [fmt],
  291. } for idx, fmt in enumerate(audio_pieces)]
  292. return {
  293. '_type': 'multi_video',
  294. 'entries': entries,
  295. **info,
  296. }
  297. def _real_extract(self, url):
  298. story_id = self._match_id(url)
  299. response = self._call_api(story_id, f'stories/{story_id}')
  300. return self._extract_story(response)
  301. class PRXSeriesIE(PRXBaseIE):
  302. _VALID_URL = PRXBaseIE.PRX_BASE_URL_RE % r'series/(?P<id>\d+)'
  303. _TESTS = [
  304. {
  305. 'url': 'https://beta.prx.org/series/36252',
  306. 'info_dict': {
  307. 'id': '36252',
  308. 'title': 'Outside/In',
  309. 'thumbnails': 'count:1',
  310. 'description': 'md5:a6bedc5f810777bcb09ab30ff9059114',
  311. 'timestamp': 1470684964,
  312. 'modified_timestamp': 1582308830,
  313. 'channel_id': '206',
  314. 'channel_url': 'https://beta.prx.org/accounts/206',
  315. 'channel': 'New Hampshire Public Radio',
  316. 'series': 'Outside/In',
  317. 'series_id': '36252',
  318. },
  319. 'playlist_mincount': 39,
  320. }, {
  321. # Blank series
  322. 'url': 'https://beta.prx.org/series/25038',
  323. 'info_dict': {
  324. 'id': '25038',
  325. 'title': '25038',
  326. 'timestamp': 1207612800,
  327. 'modified_timestamp': 1207612800,
  328. 'channel_id': '206',
  329. 'channel_url': 'https://beta.prx.org/accounts/206',
  330. 'channel': 'New Hampshire Public Radio',
  331. 'series': '25038',
  332. 'series_id': '25038',
  333. },
  334. 'playlist_count': 0,
  335. },
  336. ]
  337. def _extract_series(self, series_response):
  338. info = self._extract_series_info(series_response)
  339. return {
  340. '_type': 'playlist',
  341. 'entries': self._entries(info['id'], 'series/{}/stories'.format(info['id']), self._story_playlist_entry),
  342. **info,
  343. }
  344. def _real_extract(self, url):
  345. series_id = self._match_id(url)
  346. response = self._call_api(series_id, f'series/{series_id}')
  347. return self._extract_series(response)
  348. class PRXAccountIE(PRXBaseIE):
  349. _VALID_URL = PRXBaseIE.PRX_BASE_URL_RE % r'accounts/(?P<id>\d+)'
  350. _TESTS = [{
  351. 'url': 'https://beta.prx.org/accounts/206',
  352. 'info_dict': {
  353. 'id': '206',
  354. 'title': 'New Hampshire Public Radio',
  355. 'description': 'md5:277f2395301d0aca563c80c70a18ee0a',
  356. 'channel_id': '206',
  357. 'channel_url': 'https://beta.prx.org/accounts/206',
  358. 'channel': 'New Hampshire Public Radio',
  359. 'thumbnails': 'count:1',
  360. },
  361. 'playlist_mincount': 380,
  362. }]
  363. def _extract_account(self, account_response):
  364. info = self._extract_account_info(account_response)
  365. series = self._entries(
  366. info['id'], f'accounts/{info["id"]}/series', self._series_playlist_entry)
  367. stories = self._entries(
  368. info['id'], f'accounts/{info["id"]}/stories', self._story_playlist_entry)
  369. return {
  370. '_type': 'playlist',
  371. 'entries': itertools.chain(series, stories),
  372. **info,
  373. }
  374. def _real_extract(self, url):
  375. account_id = self._match_id(url)
  376. response = self._call_api(account_id, f'accounts/{account_id}')
  377. return self._extract_account(response)
  378. class PRXStoriesSearchIE(PRXBaseIE, SearchInfoExtractor):
  379. IE_DESC = 'PRX Stories Search'
  380. IE_NAME = 'prxstories:search'
  381. _SEARCH_KEY = 'prxstories'
  382. def _search_results(self, query):
  383. yield from self._entries(
  384. f'query {query}', 'stories/search', self._story_playlist_entry, query={'q': query})
  385. class PRXSeriesSearchIE(PRXBaseIE, SearchInfoExtractor):
  386. IE_DESC = 'PRX Series Search'
  387. IE_NAME = 'prxseries:search'
  388. _SEARCH_KEY = 'prxseries'
  389. def _search_results(self, query):
  390. yield from self._entries(
  391. f'query {query}', 'series/search', self._series_playlist_entry, query={'q': query})