art19.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303
  1. import re
  2. from .common import InfoExtractor
  3. from ..utils import float_or_none, int_or_none, parse_iso8601, url_or_none
  4. from ..utils.traversal import traverse_obj
  class Art19IE(InfoExtractor):
      """Extractor for individual ART19 podcast episodes.

      Matches episode pages on art19.com and direct ``.mp3`` links on
      rss.art19.com. Embedded episodes are found through ``_EMBED_REGEX``
      (iframes) and ``_extract_embed_urls`` (``art19-web-player`` divs).
      """

      # 8-4-4-4-12 lowercase hex digits; each separating dash is optional
      _UUID_REGEX = r'[\da-f]{8}-?[\da-f]{4}-?[\da-f]{4}-?[\da-f]{4}-?[\da-f]{12}'
      _VALID_URL = [
          rf'https?://(?:www\.)?art19\.com/shows/[^/#?]+/episodes/(?P<id>{_UUID_REGEX})',
          rf'https?://rss\.art19\.com/episodes/(?P<id>{_UUID_REGEX})\.mp3',
      ]
      # Only the art19.com episode-page form (first _VALID_URL entry) is searched for in iframes
      _EMBED_REGEX = [rf'<iframe[^>]+\bsrc=[\'"](?P<url>{_VALID_URL[0]})']
      _TESTS = [{
          'url': 'https://rss.art19.com/episodes/5ba1413c-48b8-472b-9cc3-cfd952340bdb.mp3',
          'info_dict': {
              'id': '5ba1413c-48b8-472b-9cc3-cfd952340bdb',
              'ext': 'mp3',
              'title': 'Why Did DeSantis Drop Out?',
              'series': 'The Daily Briefing',
              'release_timestamp': 1705941275,
              'description': 'md5:da38961da4a3f7e419471365e3c6b49f',
              'episode': 'Episode 582',
              'thumbnail': r're:^https?://content\.production\.cdn\.art19\.com.*\.jpeg$',
              'series_id': 'ed52a0ab-08b1-4def-8afc-549e4d93296d',
              'upload_date': '20240122',
              'timestamp': 1705940815,
              'episode_number': 582,
              'modified_date': '20240122',
              'episode_id': '5ba1413c-48b8-472b-9cc3-cfd952340bdb',
              'modified_timestamp': 1705941275,
              'release_date': '20240122',
              'duration': 527.4,
          },
      }, {
          'url': 'https://art19.com/shows/scamfluencers/episodes/8319b776-4153-4d22-8630-631f204a03dd',
          'info_dict': {
              'id': '8319b776-4153-4d22-8630-631f204a03dd',
              'ext': 'mp3',
              'title': 'Martha Stewart: The Homemaker Hustler Part 2',
              'modified_date': '20240116',
              'upload_date': '20240105',
              'modified_timestamp': 1705435802,
              'episode_id': '8319b776-4153-4d22-8630-631f204a03dd',
              'series_id': 'd3c9b8ca-26b3-42f4-9bd8-21d1a9031e75',
              'thumbnail': r're:^https?://content\.production\.cdn\.art19\.com.*\.jpeg$',
              'description': 'md5:4aa7cfd1358dc57e729835bc208d7893',
              'release_timestamp': 1705305660,
              'release_date': '20240115',
              'timestamp': 1704481536,
              'episode_number': 88,
              'series': 'Scamfluencers',
              'duration': 2588.37501,
              'episode': 'Episode 88',
          },
      }]
      _WEBPAGE_TESTS = [{
          'url': 'https://www.nu.nl/formule-1/6291456/verstappen-wordt-een-synoniem-voor-formule-1.html',
          'info_dict': {
              'id': '7d42626a-7301-47db-bb8a-3b6f054d77d7',
              'ext': 'mp3',
              'title': "'Verstappen wordt een synoniem voor Formule 1'",
              'season': 'Seizoen 6',
              'description': 'md5:39a7159a31c4cda312b2e893bdd5c071',
              'episode_id': '7d42626a-7301-47db-bb8a-3b6f054d77d7',
              'duration': 3061.82111,
              'series_id': '93f4e113-2a60-4609-a564-755058fa40d8',
              'release_date': '20231126',
              'modified_timestamp': 1701156004,
              'thumbnail': r're:^https?://content\.production\.cdn\.art19\.com.*\.jpeg$',
              'season_number': 6,
              'episode_number': 52,
              'modified_date': '20231128',
              'upload_date': '20231126',
              'timestamp': 1701025981,
              'season_id': '36097c1e-7455-490d-a2fe-e2f10b4d5f26',
              'series': 'De Boordradio',
              'release_timestamp': 1701026308,
              'episode': 'Episode 52',
          },
      }, {
          'url': 'https://www.wishtv.com/podcast-episode/larry-bucshon-announces-retirement-from-congress/',
          'info_dict': {
              'id': '8da368bd-08d1-46d0-afaa-c134a4af7dc0',
              'ext': 'mp3',
              'title': 'Larry Bucshon announces retirement from congress',
              'upload_date': '20240115',
              'episode_number': 148,
              'episode': 'Episode 148',
              'thumbnail': r're:^https?://content\.production\.cdn\.art19\.com.*\.jpeg$',
              'release_date': '20240115',
              'timestamp': 1705328205,
              'release_timestamp': 1705329275,
              'series': 'All INdiana Politics',
              'modified_date': '20240117',
              'modified_timestamp': 1705458901,
              'series_id': 'c4af6c27-b10f-4ff2-9f84-0f407df86ff1',
              'episode_id': '8da368bd-08d1-46d0-afaa-c134a4af7dc0',
              'description': 'md5:53b5239e4d14973a87125c217c255b2a',
              'duration': 1256.18848,
          },
      }]

      @classmethod
      def _extract_embed_urls(cls, url, webpage):
          """Yield episode URLs embedded in *webpage*.

          Everything produced by the parent implementation is yielded first,
          followed by one rss.art19.com ``.mp3`` URL for each
          ``art19-web-player`` div that carries a ``data-episode-id`` attribute.
          """
          yield from super()._extract_embed_urls(url, webpage)

          # The class attribute must appear before data-episode-id inside the same <div> tag
          player_div_re = (
              r'<div[^>]+\bclass=[\'"][^\'"]*art19-web-player[^\'"]*[\'"]'
              rf'[^>]+\bdata-episode-id=[\'"]({cls._UUID_REGEX})[\'"]')
          for mobj in re.finditer(player_div_re, webpage):
              yield f'https://rss.art19.com/episodes/{mobj.group(1)}.mp3'

      def _real_extract(self, url):
          """Return the info dict for the episode that *url* points to.

          Two JSON documents are requested, both with ``fatal=False``: the
          player metadata from art19.com and the RSS metadata from
          rss.art19.com. The direct ``.mp3`` URL is always listed as a format;
          further formats come from the RSS ``content.media`` mapping.
          """
          episode_id = self._match_id(url)

          player_info = self._download_json(
              f'https://art19.com/episodes/{episode_id}', episode_id,
              note='Downloading player metadata', fatal=False,
              headers={'Accept': 'application/vnd.art19.v0+json'})
          rss_info = self._download_json(
              f'https://rss.art19.com/episodes/{episode_id}.json', episode_id, fatal=False,
              note='Downloading RSS metadata')

          # This format is built from the episode id alone, independent of either response
          direct_format = {
              'format_id': 'direct',
              'url': f'https://rss.art19.com/episodes/{episode_id}.mp3',
              'vcodec': 'none',
              'acodec': 'mp3',
          }
          formats = [direct_format]

          media_items = traverse_obj(rss_info, ('content', 'media', {dict.items}, ...))
          for media_key, media in media_items:
              # 'waveform_bin' entries are never offered as a format
              if media_key == 'waveform_bin':
                  continue
              media_url = traverse_obj(media, ('url', {url_or_none}))
              if media_url:
                  formats.append({
                      'format_id': media_key,
                      'url': media_url,
                      'vcodec': 'none',
                      'acodec': media_key,
                      # 'ogg' is ranked below every other media entry
                      'quality': -2 if media_key == 'ogg' else -1,
                  })

          info = {
              'id': episode_id,
              'formats': formats,
          }
          info.update(traverse_obj(player_info, ('episode', {
              'title': ('title', {str}),
              'description': ('description_plain', {str}),
              'episode_id': ('id', {str}),
              'episode_number': ('episode_number', {int_or_none}),
              'season_id': ('season_id', {str}),
              'series_id': ('series_id', {str}),
              'timestamp': ('created_at', {parse_iso8601}),
              'release_timestamp': ('released_at', {parse_iso8601}),
              'modified_timestamp': ('updated_at', {parse_iso8601}),
          })))
          # Applied last, so RSS values replace player values for keys present in both
          info.update(traverse_obj(rss_info, ('content', {
              'title': ('episode_title', {str}),
              'description': ('episode_description_plain', {str}),
              'episode_id': ('episode_id', {str}),
              'episode_number': ('episode_number', {int_or_none}),
              'season': ('season_title', {str}),
              'season_id': ('season_id', {str}),
              'season_number': ('season_number', {int_or_none}),
              'series': ('series_title', {str}),
              'series_id': ('series_id', {str}),
              'thumbnail': ('cover_image', {url_or_none}),
              'duration': ('duration', {float_or_none}),
          })))
          return info
  class Art19ShowIE(InfoExtractor):
      """Extractor for ART19 shows, returned as a playlist of episode URLs.

      Matches show pages on art19.com (optionally ending in ``/embed``) and
      show addresses on rss.art19.com. Embedded shows are found through
      ``_EMBED_REGEX`` (iframes) and ``_extract_embed_urls``
      (``art19-web-player`` divs).
      """

      # Shared prefix of the art19.com show URL, reused by _VALID_URL and _EMBED_REGEX
      _VALID_URL_BASE = r'https?://(?:www\.)?art19\.com/shows/(?P<id>[\w-]+)(?:/embed)?/?'
      _VALID_URL = [
          rf'{_VALID_URL_BASE}(?:$|[#?])',
          r'https?://rss\.art19\.com/(?P<id>[\w-]+)/?(?:$|[#?])',
      ]
      _EMBED_REGEX = [rf'<iframe[^>]+\bsrc=[\'"](?P<url>{_VALID_URL_BASE}[^\'"])']
      _TESTS = [{
          'url': 'https://www.art19.com/shows/5898c087-a14f-48dc-b6fc-a2280a1ff6e0/',
          'info_dict': {
              '_type': 'playlist',
              'id': '5898c087-a14f-48dc-b6fc-a2280a1ff6e0',
              'display_id': 'echt-gebeurd',
              'title': 'Echt Gebeurd',
              'description': 'md5:5fd11dc80b76e51ffd34b6067fd5e560',
              'timestamp': 1492642167,
              'upload_date': '20170419',
              'modified_timestamp': int,
              'modified_date': str,
              'tags': 'count:7',
          },
          'playlist_mincount': 425,
      }, {
          'url': 'https://www.art19.com/shows/echt-gebeurd',
          'info_dict': {
              '_type': 'playlist',
              'id': '5898c087-a14f-48dc-b6fc-a2280a1ff6e0',
              'display_id': 'echt-gebeurd',
              'title': 'Echt Gebeurd',
              'description': 'md5:5fd11dc80b76e51ffd34b6067fd5e560',
              'timestamp': 1492642167,
              'upload_date': '20170419',
              'modified_timestamp': int,
              'modified_date': str,
              'tags': 'count:7',
          },
          'playlist_mincount': 425,
      }, {
          'url': 'https://rss.art19.com/scamfluencers',
          'info_dict': {
              '_type': 'playlist',
              'id': 'd3c9b8ca-26b3-42f4-9bd8-21d1a9031e75',
              'display_id': 'scamfluencers',
              'title': 'Scamfluencers',
              'description': 'md5:7d239d670c0ced6dadbf71c4caf764b7',
              'timestamp': 1647368573,
              'upload_date': '20220315',
              'modified_timestamp': int,
              'modified_date': str,
              'tags': [],
          },
          'playlist_mincount': 90,
      }, {
          'url': 'https://art19.com/shows/enthuellt/embed',
          'info_dict': {
              '_type': 'playlist',
              'id': 'e2cacf57-bb8a-4263-aa81-719bcdd4f80c',
              'display_id': 'enthuellt',
              'title': 'Enthüllt',
              'description': 'md5:17752246643414a2fd51744fc9a1c08e',
              'timestamp': 1601645860,
              'upload_date': '20201002',
              'modified_timestamp': int,
              'modified_date': str,
              'tags': 'count:10',
          },
          'playlist_mincount': 10,
      }]
      _WEBPAGE_TESTS = [{
          'url': 'https://deconstructingyourself.com/deconstructing-yourself-podcast',
          'info_dict': {
              '_type': 'playlist',
              'id': 'cfbb9b01-c295-4adb-8726-adde7c03cf21',
              'display_id': 'deconstructing-yourself',
              'title': 'Deconstructing Yourself',
              'description': 'md5:dab5082b28b248a35476abf64768854d',
              'timestamp': 1570581181,
              'upload_date': '20191009',
              'modified_timestamp': int,
              'modified_date': str,
              'tags': 'count:5',
          },
          'playlist_mincount': 80,
      }, {
          'url': 'https://chicagoreader.com/columns-opinion/podcasts/ben-joravsky-show-podcast-episodes/',
          'info_dict': {
              '_type': 'playlist',
              'id': '9dfa2c37-ab87-4c13-8388-4897914313ec',
              'display_id': 'the-ben-joravsky-show',
              'title': 'The Ben Joravsky Show',
              'description': 'md5:c0f3ec0ee0dbea764390e521adc8780a',
              'timestamp': 1550875095,
              'upload_date': '20190222',
              'modified_timestamp': int,
              'modified_date': str,
              'tags': ['Chicago Politics', 'chicago', 'Ben Joravsky'],
          },
          'playlist_mincount': 1900,
      }]

      @classmethod
      def _extract_embed_urls(cls, url, webpage):
          """Yield show URLs embedded in *webpage*.

          Everything produced by the parent implementation is yielded first,
          followed by one art19.com show URL for each ``art19-web-player`` div
          that carries a ``data-series-id`` attribute.
          """
          yield from super()._extract_embed_urls(url, webpage)

          # The class attribute must appear before data-series-id inside the same <div> tag
          player_div_re = (
              r'<div[^>]+\bclass=[\'"][^\'"]*art19-web-player[^\'"]*[\'"]'
              r'[^>]+\bdata-series-id=[\'"]([\w-]+)[\'"]')
          for mobj in re.finditer(player_div_re, webpage):
              yield f'https://art19.com/shows/{mobj.group(1)}'

      def _real_extract(self, url):
          """Return a playlist result for the show that *url* points to.

          Downloads the series metadata from art19.com and emits one URL result
          per listed episode id, each pointing at the episode's rss.art19.com
          ``.mp3`` address and assigned to ``Art19IE``.
          """
          series_id = self._match_id(url)
          show_info = self._download_json(
              f'https://art19.com/series/{series_id}', series_id, note='Downloading series metadata',
              headers={'Accept': 'application/vnd.art19.v0+json'})

          episode_ids = traverse_obj(show_info, ('series', 'episode_ids', ..., {str}))
          entries = [
              self.url_result(f'https://rss.art19.com/episodes/{ep_id}.mp3', Art19IE)
              for ep_id in episode_ids
          ]

          playlist = {
              '_type': 'playlist',
              'entries': entries,
          }
          # The playlist 'id' is taken from the metadata, not from the URL match
          playlist.update(traverse_obj(show_info, ('series', {
              'id': ('id', {str}),
              'display_id': ('slug', {str}),
              'title': ('title', {str}),
              'description': ('description_plain', {str}),
              'timestamp': ('created_at', {parse_iso8601}),
              'modified_timestamp': ('updated_at', {parse_iso8601}),
          })))
          # Tags are read from the top level of the response, next to 'series'
          playlist['tags'] = traverse_obj(show_info, ('tags', ..., 'name', {str}))
          return playlist