nytimes.py 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420
  1. import json
  2. import uuid
  3. from .common import InfoExtractor
  4. from ..utils import (
  5. ExtractorError,
  6. clean_html,
  7. determine_ext,
  8. extract_attributes,
  9. float_or_none,
  10. get_elements_html_by_class,
  11. int_or_none,
  12. merge_dicts,
  13. mimetype2ext,
  14. parse_iso8601,
  15. remove_end,
  16. remove_start,
  17. str_or_none,
  18. traverse_obj,
  19. url_or_none,
  20. )
  class NYTimesBaseIE(InfoExtractor):
      """Shared helpers for the New York Times extractors.

      Offers a GraphQL video lookup and converts the rendition/thumbnail
      records found in NYT JSON into format and thumbnail dicts.
      """

      # Namespace from which the per-video UUIDs are derived in `_call_api`
      _DNS_NAMESPACE = uuid.UUID('36dd619a-56dc-595b-9e09-37f4152c7b5d')
      # Sent verbatim in the `Nyt-Token` request header
      _TOKEN = 'MIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEAuNIzKBOFB77aT/jN/FQ+/QVKWq5V1ka1AYmCR9hstz1pGNPH5ajOU9gAqta0T89iPnhjwla+3oec/Z3kGjxbpv6miQXufHFq3u2RC6HyU458cLat5kVPSOQCe3VVB5NRpOlRuwKHqn0txfxnwSSj8mqzstR997d3gKB//RO9zE16y3PoWlDQXkASngNJEWvL19iob/xwAkfEWCjyRILWFY0JYX3AvLMSbq7wsqOCE5srJpo7rRU32zsByhsp1D5W9OYqqwDmflsgCEQy2vqTsJjrJohuNg+urMXNNZ7Y3naMoqttsGDrWVxtPBafKMI8pM2ReNZBbGQsQXRzQNo7+QIDAQAB'
      _GRAPHQL_API = 'https://samizdat-graphql.nytimes.com/graphql/v2'
      _GRAPHQL_QUERY = '''query VideoQuery($id: String!) {
    video(id: $id) {
      ... on Video {
        bylines {
          renderedRepresentation
        }
        duration
        firstPublished
        promotionalHeadline
        promotionalMedia {
          ... on Image {
            crops {
              name
              renditions {
                name
                width
                height
                url
              }
            }
          }
        }
        renditions {
          type
          width
          height
          url
          bitrate
        }
        summary
      }
    }
  }'''

      def _call_api(self, media_id):
          """Fetch the GraphQL `video` record for *media_id*.

          Returns the `data.video` dict of the response, or an empty dict when
          the (non-fatal) request fails or the response carries no such dict.
          """
          # reference: `id-to-uri.js`
          video_uuid = uuid.uuid5(self._DNS_NAMESPACE, 'video')
          media_uuid = uuid.uuid5(video_uuid, media_id)

          payload = json.dumps({
              'query': self._GRAPHQL_QUERY,
              'variables': {'id': f'nyt://video/{media_uuid}'},
          }, separators=(',', ':')).encode()
          response = self._download_json(
              self._GRAPHQL_API, media_id, 'Downloading JSON from GraphQL API',
              data=payload, headers={
                  'Content-Type': 'application/json',
                  'Nyt-App-Type': 'vhs',
                  'Nyt-App-Version': 'v3.52.21',
                  'Nyt-Token': self._TOKEN,
                  'Origin': 'https://nytimes.com',
              }, fatal=False)

          return traverse_obj(response, ('data', 'video', {dict})) or {}

      def _extract_thumbnails(self, thumbs):
          """Turn rendition records into thumbnail dicts.

          Only records whose `url` passes `url_or_none` are kept; `width` and
          `height` are coerced with `int_or_none`. Returns None when nothing
          matches.
          """
          return traverse_obj(thumbs, (lambda _, v: url_or_none(v['url']), {
              'url': 'url',
              'width': ('width', {int_or_none}),
              'height': ('height', {int_or_none}),
          }), default=None)

      def _extract_formats_and_subtitles(self, video_id, content_media_json):
          """Build formats and subtitles from the `renditions` of a media record.

          Returns a `(formats, subtitles)` tuple. Renditions without a URL, of
          type `thumbs`, or repeating an already seen URL are skipped.
          """
          seen_urls = []
          formats = []
          subtitles = {}
          for video in traverse_obj(content_media_json, ('renditions', ..., {dict})):
              video_url = video.get('url')
              format_id = video.get('type')
              if not video_url or format_id == 'thumbs' or video_url in seen_urls:
                  continue
              seen_urls.append(video_url)
              ext = mimetype2ext(video.get('mimetype')) or determine_ext(video_url)
              if ext == 'm3u8':
                  m3u8_fmts, m3u8_subs = self._extract_m3u8_formats_and_subtitles(
                      video_url, video_id, 'mp4', 'm3u8_native',
                      m3u8_id=format_id or 'hls', fatal=False)
                  formats.extend(m3u8_fmts)
                  self._merge_subtitles(m3u8_subs, target=subtitles)
              elif ext == 'mpd':
                  continue  # all mpd urls give 404 errors
              else:
                  formats.append({
                      'url': video_url,
                      'format_id': format_id,
                      'vcodec': video.get('videoencoding') or video.get('video_codec'),
                      'width': int_or_none(video.get('width')),
                      'height': int_or_none(video.get('height')),
                      # The size is looked up under either key, as a bare value
                      # or nested under `value`; the first usable integer wins
                      'filesize': traverse_obj(video, (
                          ('file_size', 'fileSize'), (None, 'value'), {int_or_none}), get_all=False),
                      'tbr': int_or_none(video.get('bitrate'), 1000) or None,
                      'ext': ext,
                  })

          return formats, subtitles

      def _extract_video(self, media_id):
          """Return the info dict for the video with numeric ID *media_id*."""
          data = self._call_api(media_id)
          formats, subtitles = self._extract_formats_and_subtitles(media_id, data)
          bylines = traverse_obj(data, (
              'bylines', ..., 'renderedRepresentation', {lambda x: remove_start(x, 'By ')}))

          return {
              'id': media_id,
              'title': data.get('promotionalHeadline'),
              'description': data.get('summary'),
              'timestamp': parse_iso8601(data.get('firstPublished')),
              'duration': float_or_none(data.get('duration'), scale=1000),
              # Joining an empty byline list would give '', so report the field
              # as missing (None) instead of as an empty string
              'creator': ', '.join(bylines) or None,  # TODO: change to 'creators'
              'formats': formats,
              'subtitles': subtitles,
              'thumbnails': self._extract_thumbnails(
                  traverse_obj(data, ('promotionalMedia', 'crops', ..., 'renditions', ...))),
          }
  class NYTimesIE(NYTimesBaseIE):
      """Extractor for nytimes.com `/video/` pages and graphics8 `bcvideo` embeds."""

      _VALID_URL = r'https?://(?:(?:www\.)?nytimes\.com/video/(?:[^/]+/)+?|graphics8\.nytimes\.com/bcvideo/\d+(?:\.\d+)?/iframe/embed\.html\?videoId=)(?P<id>\d+)'
      _EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//graphics8\.nytimes\.com/bcvideo/[^/]+/iframe/embed\.html.+?)\1>']
      _TESTS = [{
          'url': 'http://www.nytimes.com/video/opinion/100000002847155/verbatim-what-is-a-photocopier.html?playlistId=100000001150263',
          'md5': 'a553aa344014e3723d33893d89d4defc',
          'info_dict': {
              'id': '100000002847155',
              'ext': 'mp4',
              'title': 'Verbatim: What Is a Photocopier?',
              'description': 'md5:93603dada88ddbda9395632fdc5da260',
              'timestamp': 1398646132,
              'upload_date': '20140428',
              'creator': 'Brett Weiner',
              'thumbnail': r're:https?://\w+\.nyt.com/images/.+\.jpg',
              'duration': 419,
          },
      }, {
          'url': 'http://www.nytimes.com/video/travel/100000003550828/36-hours-in-dubai.html',
          'only_matching': True,
      }]

      def _real_extract(self, url):
          """Look up the numeric ID matched in *url* and return its info dict."""
          return self._extract_video(self._match_id(url))
  class NYTimesArticleIE(NYTimesBaseIE):
      """Extractor for media embedded in dated nytimes.com articles.

      Matches `/YYYY/MM/DD/<section>/...` URLs, except those whose section
      starts with `books` or `podcasts`.
      """

      _VALID_URL = r'https?://(?:www\.)?nytimes\.com/\d{4}/\d{2}/\d{2}/(?!books|podcasts)[^/?#]+/(?:\w+/)?(?P<id>[^./?#]+)(?:\.html)?'
      _TESTS = [{
          'url': 'http://www.nytimes.com/2015/04/14/business/owner-of-gravity-payments-a-credit-card-processor-is-setting-a-new-minimum-wage-70000-a-year.html?_r=0',
          'md5': '3eb5ddb1d6f86254fe4f233826778737',
          'info_dict': {
              'id': '100000003628438',
              'ext': 'mp4',
              'title': 'One Company’s New Minimum Wage: $70,000 a Year',
              'description': 'md5:89ba9ab67ca767bb92bf823d1f138433',
              'timestamp': 1429047468,
              'upload_date': '20150414',
              'uploader': 'Matthew Williams',
              'creator': 'Patricia Cohen',
              'thumbnail': r're:https?://\w+\.nyt.com/images/.*\.jpg',
              'duration': 119.0,
          },
      }, {
          # article with audio and no video
          'url': 'https://www.nytimes.com/2023/09/29/health/mosquitoes-genetic-engineering.html',
          'md5': '2365b3555c8aa7f4dd34ca735ad02e6a',
          'info_dict': {
              'id': '100000009110381',
              'ext': 'mp3',
              'title': 'The Gamble: Can Genetically Modified Mosquitoes End Disease?',
              'description': 'md5:9ff8b47acbaf7f3ca8c732f5c815be2e',
              'timestamp': 1695960700,
              'upload_date': '20230929',
              'creator': 'Stephanie Nolen, Natalija Gormalova',
              'thumbnail': r're:https?://\w+\.nyt.com/images/.*\.jpg',
              'duration': 1322,
          },
      }, {
          'url': 'https://www.nytimes.com/2023/11/29/business/dealbook/kamala-harris-biden-voters.html',
          'md5': '3eb5ddb1d6f86254fe4f233826778737',
          'info_dict': {
              'id': '100000009202270',
              'ext': 'mp4',
              'title': 'Kamala Harris Defends Biden Policies, but Says ‘More Work’ Needed to Reach Voters',
              'description': 'md5:de4212a7e19bb89e4fb14210ca915f1f',
              'timestamp': 1701290997,
              'upload_date': '20231129',
              'uploader': 'By The New York Times',
              'creator': 'Katie Rogers',
              'thumbnail': r're:https?://\w+\.nyt.com/images/.*\.jpg',
              'duration': 97.631,
          },
          'params': {
              'skip_download': 'm3u8',
          },
      }, {
          # multiple videos in the same article
          'url': 'https://www.nytimes.com/2023/12/02/business/air-traffic-controllers-safety.html',
          'info_dict': {
              'id': 'air-traffic-controllers-safety',
              'title': 'Drunk and Asleep on the Job: Air Traffic Controllers Pushed to the Brink',
              'description': 'md5:549e5a5e935bf7d048be53ba3d2c863d',
              'upload_date': '20231202',
              'creator': 'Emily Steel, Sydney Ember',
              'timestamp': 1701511264,
          },
          'playlist_count': 3,
      }, {
          'url': 'https://www.nytimes.com/2023/12/02/business/media/netflix-squid-game-challenge.html',
          'only_matching': True,
      }]

      def _extract_content_from_block(self, block):
          """Build a partial info dict from one `Video`/`Audio` article block.

          The result holds the block's own metadata (`id`, `uploader`,
          `duration`, `timestamp`, `series`) plus `thumbnails`, `formats` and
          `subtitles`.
          """
          details = traverse_obj(block, {
              'id': ('sourceId', {str}),
              'uploader': ('bylines', ..., 'renderedRepresentation', {str}),
              # `duration` is divided by 1000, `length` is taken as is; with
              # get_all=False the first usable value wins
              'duration': (None, (('duration', {lambda x: float_or_none(x, scale=1000)}), ('length', {int_or_none}))),
              'timestamp': ('firstPublished', {parse_iso8601}),
              'series': ('podcastSeries', {str}),
          }, get_all=False)

          formats, subtitles = self._extract_formats_and_subtitles(details.get('id'), block)
          # audio articles will have an url and no formats
          url = traverse_obj(block, ('fileUrl', {url_or_none}))
          if not formats and url:
              formats.append({'url': url, 'vcodec': 'none'})

          return {
              **details,
              'thumbnails': self._extract_thumbnails(traverse_obj(
                  block, ('promotionalMedia', 'crops', ..., 'renditions', ...))),
              'formats': formats,
              'subtitles': subtitles,
          }

      def _real_extract(self, url):
          """Extract every video/audio block of the article at *url*.

          Returns a playlist when the article has several media blocks and a
          single info dict otherwise. Raises ExtractorError when no media block
          is found.
          """
          page_id = self._match_id(url)
          webpage = self._download_webpage(url, page_id)
          # `undefined` is swapped for `null` before parsing, presumably because
          # the preloaded blob is a JS literal rather than strict JSON
          art_json = self._search_json(
              r'window\.__preloadedData\s*=', webpage, 'media details', page_id,
              transform_source=lambda x: x.replace('undefined', 'null'))['initialData']['data']['article']

          # Media can sit either under a content item's `ledeMedia` or be the
          # content item itself
          blocks = traverse_obj(art_json, (
              'sprinkledBody', 'content', ..., ('ledeMedia', None),
              lambda _, v: v['__typename'] in ('Video', 'Audio')))
          if not blocks:
              raise ExtractorError('Unable to extract any media blocks from webpage')

          creator_names = traverse_obj(art_json, ('bylines', ..., 'creators', ..., 'displayName'))
          common_info = {
              'title': remove_end(self._html_extract_title(webpage), ' - The New York Times'),
              'description': traverse_obj(art_json, (
                  'sprinkledBody', 'content', ..., 'summary', 'content', ..., 'text', {str}),
                  get_all=False) or self._html_search_meta(['og:description', 'twitter:description'], webpage),
              'timestamp': traverse_obj(art_json, ('firstPublished', {parse_iso8601})),
              # Joining an empty list would give '', so report the field as
              # missing (None) instead of as an empty string
              'creator': ', '.join(creator_names) or None,  # TODO: change to 'creators' (list)
              'thumbnails': self._extract_thumbnails(traverse_obj(
                  art_json, ('promotionalMedia', 'assetCrops', ..., 'renditions', ...))),
          }

          # Block-level metadata is passed first to merge_dicts, article-level
          # metadata second
          entries = [
              merge_dicts(self._extract_content_from_block(block), common_info)
              for block in blocks]

          if len(entries) > 1:
              return self.playlist_result(entries, page_id, **common_info)

          return {
              # Fallback ID; replaced by the block's own `id` when it has one
              'id': page_id,
              **entries[0],
          }
  class NYTimesCookingIE(NYTimesBaseIE):
      """Extractor for cooking.nytimes.com `/guides/` pages."""

      IE_NAME = 'NYTimesCookingGuide'
      _VALID_URL = r'https?://cooking\.nytimes\.com/guides/(?P<id>[\w-]+)'
      _TESTS = [{
          'url': 'https://cooking.nytimes.com/guides/13-how-to-cook-a-turkey',
          'info_dict': {
              'id': '13-how-to-cook-a-turkey',
              'title': 'How to Cook a Turkey',
              'description': 'md5:726cfd3f9b161bdf5c279879e8050ca0',
          },
          'playlist_count': 2,
      }, {
          # single video example
          'url': 'https://cooking.nytimes.com/guides/50-how-to-make-mac-and-cheese',
          'md5': '64415805fe0b8640fce6b0b9def5989a',
          'info_dict': {
              'id': '100000005835845',
              'ext': 'mp4',
              'title': 'How to Make Mac and Cheese',
              'description': 'md5:b8f2f33ec1fb7523b21367147c9594f1',
              'timestamp': 1522950315,
              'upload_date': '20180405',
              'duration': 9.51,
              'creator': 'Alison Roman',
              'thumbnail': r're:https?://\w+\.nyt.com/images/.*\.jpg',
          },
      }, {
          'url': 'https://cooking.nytimes.com/guides/20-how-to-frost-a-cake',
          'md5': '64415805fe0b8640fce6b0b9def5989a',
          'info_dict': {
              'id': '20-how-to-frost-a-cake',
              'title': 'How to Frost a Cake',
              'description': 'md5:a31fe3b98a8ce7b98aae097730c269cd',
          },
          'playlist_count': 8,
      }]

      def _real_extract(self, url):
          """Extract the lead video of a guide, or a playlist of all its videos.

          A playlist is returned when the page has `video-item` elements with a
          `data-video-id`; the lead video is appended as its last entry.
          """
          page_id = self._match_id(url)
          webpage = self._download_webpage(url, page_id)
          title = self._html_search_meta(['og:title', 'twitter:title'], webpage)
          description = self._html_search_meta(['og:description', 'twitter:description'], webpage)

          lead_video_id = self._search_regex(
              r'data-video-player-id="(\d+)"></div>', webpage, 'lead video')
          video_items = get_elements_html_by_class('video-item', webpage)
          media_ids = traverse_obj(video_items, (..., {extract_attributes}, 'data-video-id'))

          if not media_ids:
              # Single-video guide: page-level metadata overrides the API's
              return {
                  **self._extract_video(lead_video_id),
                  'title': title,
                  'description': description,
                  'creator': self._search_regex(  # TODO: change to 'creators'
                      r'<span itemprop="author">([^<]+)</span></p>', webpage, 'author', default=None),
              }

          media_ids.append(lead_video_id)
          entries = [self._extract_video(media_id) for media_id in media_ids]
          return self.playlist_result(entries, page_id, title, description)
  class NYTimesCookingRecipeIE(InfoExtractor):
      """Extractor for cooking.nytimes.com `/recipes/<numeric id>` pages."""

      _VALID_URL = r'https?://cooking\.nytimes\.com/recipes/(?P<id>\d+)'
      _TESTS = [{
          'url': 'https://cooking.nytimes.com/recipes/1017817-cranberry-curd-tart',
          'md5': '579e83bbe8e61e9de67f80edba8a78a8',
          'info_dict': {
              'id': '1017817',
              'ext': 'mp4',
              'title': 'Cranberry Curd Tart',
              'description': 'md5:ad77a3fc321db636256d4343c5742152',
              'timestamp': 1447804800,
              'upload_date': '20151118',
              'creator': 'David Tanis',
              'thumbnail': r're:https?://\w+\.nyt.com/images/.*\.jpg',
          },
      }, {
          'url': 'https://cooking.nytimes.com/recipes/1024781-neapolitan-checkerboard-cookies',
          'md5': '58df35998241dcf0620e99e646331b42',
          'info_dict': {
              'id': '1024781',
              'ext': 'mp4',
              'title': 'Neapolitan Checkerboard Cookies',
              'description': 'md5:ba12394c585ababea951cb6d2fcc6631',
              'timestamp': 1701302400,
              'upload_date': '20231130',
              'creator': 'Sue Li',
              'thumbnail': r're:https?://\w+\.nyt.com/images/.*\.jpg',
          },
      }, {
          'url': 'https://cooking.nytimes.com/recipes/1019516-overnight-oats',
          'md5': '2fe7965a3adc899913b8e25ada360823',
          'info_dict': {
              'id': '1019516',
              'ext': 'mp4',
              'timestamp': 1546387200,
              'description': 'md5:8856ce10239161bd2596ac335b9f9bfb',
              'upload_date': '20190102',
              'title': 'Overnight Oats',
              'creator': 'Genevieve Ko',
              'thumbnail': r're:https?://\w+\.nyt.com/images/.*\.jpg',
          },
      }]

      def _real_extract(self, url):
          """Return the info dict for the video attached to the recipe at *url*.

          Recipe data is read from the page's Next.js payload; formats come from
          the HLS manifest referenced by its `videoSrc` entry.
          """
          page_id = self._match_id(url)
          webpage = self._download_webpage(url, page_id)
          nextjs_data = self._search_nextjs_data(webpage, page_id)
          recipe_data = nextjs_data['props']['pageProps']['recipe']

          formats, subtitles = self._extract_m3u8_formats_and_subtitles(
              recipe_data['videoSrc'], page_id, 'mp4', m3u8_id='hls')

          metadata = traverse_obj(recipe_data, {
              'id': ('id', {str_or_none}),
              'title': ('title', {str}),
              'description': ('topnote', {clean_html}),
              'timestamp': ('publishedAt', {int_or_none}),
              'creator': ('contentAttribution', 'cardByline', {str}),
          })
          thumb_urls = traverse_obj(
              recipe_data, ('image', 'crops', 'recipe', ..., {url_or_none}))

          return {
              **metadata,
              'formats': formats,
              'subtitles': subtitles,
              'thumbnails': [{'url': thumb_url} for thumb_url in thumb_urls],
          }