# tumblr.py 16 KB

import re

from .common import InfoExtractor
from ..utils import (
    ExtractorError,
    int_or_none,
    traverse_obj,
    urlencode_postdata,
)
  8. class TumblrIE(InfoExtractor):
  9. _VALID_URL = r'https?://(?P<blog_name>[^/?#&]+)\.tumblr\.com/(?:post|video)/(?P<id>[0-9]+)(?:$|[/?#])'
  10. _NETRC_MACHINE = 'tumblr'
  11. _LOGIN_URL = 'https://www.tumblr.com/login'
  12. _OAUTH_URL = 'https://www.tumblr.com/api/v2/oauth2/token'
  13. _TESTS = [{
  14. 'url': 'http://tatianamaslanydaily.tumblr.com/post/54196191430/orphan-black-dvd-extra-behind-the-scenes',
  15. 'md5': '479bb068e5b16462f5176a6828829767',
  16. 'info_dict': {
  17. 'id': '54196191430',
  18. 'ext': 'mp4',
  19. 'title': 'md5:dfac39636969fe6bf1caa2d50405f069',
  20. 'description': 'md5:390ab77358960235b6937ab3b8528956',
  21. 'uploader_id': 'tatianamaslanydaily',
  22. 'uploader_url': 'https://tatianamaslanydaily.tumblr.com/',
  23. 'thumbnail': r're:^https?://.*\.jpg',
  24. 'duration': 127,
  25. 'like_count': int,
  26. 'repost_count': int,
  27. 'age_limit': 0,
  28. 'tags': ['Orphan Black', 'Tatiana Maslany', 'Interview', 'Video', 'OB S1 DVD Extras'],
  29. },
  30. }, {
  31. 'note': 'multiple formats',
  32. 'url': 'https://maskofthedragon.tumblr.com/post/626907179849564160/mona-talking-in-english',
  33. 'md5': 'f43ff8a8861712b6cf0e0c2bd84cfc68',
  34. 'info_dict': {
  35. 'id': '626907179849564160',
  36. 'ext': 'mp4',
  37. 'title': 'Mona\xa0“talking” in\xa0“english”',
  38. 'description': 'md5:082a3a621530cb786ad2b7592a6d9e2c',
  39. 'uploader_id': 'maskofthedragon',
  40. 'uploader_url': 'https://maskofthedragon.tumblr.com/',
  41. 'thumbnail': r're:^https?://.*\.jpg',
  42. 'duration': 7,
  43. 'like_count': int,
  44. 'repost_count': int,
  45. 'age_limit': 0,
  46. 'tags': 'count:19',
  47. },
  48. 'params': {
  49. 'format': 'hd',
  50. },
  51. }, {
  52. 'note': 'non-iframe video (with related posts)',
  53. 'url': 'https://shieldfoss.tumblr.com/post/675519763813908480',
  54. 'md5': '12bdb75661ef443bffe5a4dac1dbf118',
  55. 'info_dict': {
  56. 'id': '675519763813908480',
  57. 'ext': 'mp4',
  58. 'title': 'Shieldfoss',
  59. 'uploader_id': 'nerviovago',
  60. 'uploader_url': 'https://nerviovago.tumblr.com/',
  61. 'thumbnail': r're:^https?://.*\.jpg',
  62. 'like_count': int,
  63. 'repost_count': int,
  64. 'age_limit': 0,
  65. 'tags': [],
  66. },
  67. }, {
  68. 'note': 'dashboard only (original post)',
  69. 'url': 'https://jujanon.tumblr.com/post/159704441298/my-baby-eating',
  70. 'md5': '029f7c91ab386701b211e3d494d2d95e',
  71. 'info_dict': {
  72. 'id': '159704441298',
  73. 'ext': 'mp4',
  74. 'title': 'md5:ba79365861101f4911452728d2950561',
  75. 'description': 'md5:773738196cea76b6996ec71e285bdabc',
  76. 'uploader_id': 'jujanon',
  77. 'uploader_url': 'https://jujanon.tumblr.com/',
  78. 'thumbnail': r're:^https?://.*\.jpg',
  79. 'like_count': int,
  80. 'repost_count': int,
  81. 'age_limit': 0,
  82. 'tags': ['crabs', 'my video', 'my pets'],
  83. },
  84. }, {
  85. 'note': 'dashboard only (reblog)',
  86. 'url': 'https://bartlebyshop.tumblr.com/post/180294460076/duality-of-bird',
  87. 'md5': '04334e7cadb1af680d162912559f51a5',
  88. 'info_dict': {
  89. 'id': '180294460076',
  90. 'ext': 'mp4',
  91. 'title': 'duality of bird',
  92. 'description': 'duality of bird',
  93. 'uploader_id': 'todaysbird',
  94. 'uploader_url': 'https://todaysbird.tumblr.com/',
  95. 'thumbnail': r're:^https?://.*\.jpg',
  96. 'like_count': int,
  97. 'repost_count': int,
  98. 'age_limit': 0,
  99. 'tags': [],
  100. },
  101. }, {
  102. 'note': 'dashboard only (external)',
  103. 'url': 'https://afloweroutofstone.tumblr.com/post/675661759168823296/the-blues-remembers-everything-the-country-forgot',
  104. 'info_dict': {
  105. 'id': 'q67_fd7b8SU',
  106. 'ext': 'mp4',
  107. 'title': 'The Blues Remembers Everything the Country Forgot',
  108. 'alt_title': 'The Blues Remembers Everything the Country Forgot',
  109. 'description': 'md5:1a6b4097e451216835a24c1023707c79',
  110. 'release_date': '20201224',
  111. 'creator': 'md5:c2239ba15430e87c3b971ba450773272',
  112. 'uploader': 'Moor Mother - Topic',
  113. 'upload_date': '20201223',
  114. 'uploader_id': 'UCxrMtFBRkFvQJ_vVM4il08w',
  115. 'uploader_url': 'http://www.youtube.com/channel/UCxrMtFBRkFvQJ_vVM4il08w',
  116. 'thumbnail': r're:^https?://i.ytimg.com/.*',
  117. 'channel': 'Moor Mother - Topic',
  118. 'channel_id': 'UCxrMtFBRkFvQJ_vVM4il08w',
  119. 'channel_url': 'https://www.youtube.com/channel/UCxrMtFBRkFvQJ_vVM4il08w',
  120. 'channel_follower_count': int,
  121. 'duration': 181,
  122. 'view_count': int,
  123. 'like_count': int,
  124. 'age_limit': 0,
  125. 'categories': ['Music'],
  126. 'tags': 'count:7',
  127. 'live_status': 'not_live',
  128. 'playable_in_embed': True,
  129. 'availability': 'public',
  130. 'track': 'The Blues Remembers Everything the Country Forgot',
  131. 'artist': 'md5:c2239ba15430e87c3b971ba450773272',
  132. 'album': 'Brass',
  133. 'release_year': 2020,
  134. },
  135. 'add_ie': ['Youtube'],
  136. }, {
  137. 'url': 'http://naked-yogi.tumblr.com/post/118312946248/naked-smoking-stretching',
  138. 'md5': 'de07e5211d60d4f3a2c3df757ea9f6ab',
  139. 'info_dict': {
  140. 'id': 'Wmur',
  141. 'ext': 'mp4',
  142. 'title': 'naked smoking & stretching',
  143. 'upload_date': '20150506',
  144. 'timestamp': 1430931613,
  145. 'age_limit': 18,
  146. 'uploader_id': '1638622',
  147. 'uploader': 'naked-yogi',
  148. },
  149. # 'add_ie': ['Vidme'],
  150. 'skip': 'dead embedded video host',
  151. }, {
  152. 'url': 'https://prozdvoices.tumblr.com/post/673201091169681408/what-recording-voice-acting-sounds-like',
  153. 'md5': 'a0063fc8110e6c9afe44065b4ea68177',
  154. 'info_dict': {
  155. 'id': 'eomhW5MLGWA',
  156. 'ext': 'mp4',
  157. 'title': 'what recording voice acting sounds like',
  158. 'description': 'md5:1da3faa22d0e0b1d8b50216c284ee798',
  159. 'uploader': 'ProZD',
  160. 'upload_date': '20220112',
  161. 'uploader_id': 'ProZD',
  162. 'uploader_url': 'http://www.youtube.com/user/ProZD',
  163. 'thumbnail': r're:^https?://i.ytimg.com/.*',
  164. 'channel': 'ProZD',
  165. 'channel_id': 'UC6MFZAOHXlKK1FI7V0XQVeA',
  166. 'channel_url': 'https://www.youtube.com/channel/UC6MFZAOHXlKK1FI7V0XQVeA',
  167. 'channel_follower_count': int,
  168. 'duration': 20,
  169. 'view_count': int,
  170. 'like_count': int,
  171. 'age_limit': 0,
  172. 'categories': ['Film & Animation'],
  173. 'tags': [],
  174. 'live_status': 'not_live',
  175. 'playable_in_embed': True,
  176. 'availability': 'public',
  177. },
  178. 'add_ie': ['Youtube'],
  179. }, {
  180. 'url': 'https://dominustempori.tumblr.com/post/673572712813297664/youtubes-all-right-for-some-pretty-cool',
  181. 'md5': '203e9eb8077e3f45bfaeb4c86c1467b8',
  182. 'info_dict': {
  183. 'id': '87816359',
  184. 'ext': 'mov',
  185. 'title': 'Harold Ramis',
  186. 'description': 'md5:be8e68cbf56ce0785c77f0c6c6dfaf2c',
  187. 'uploader': 'Resolution Productions Group',
  188. 'uploader_id': 'resolutionproductions',
  189. 'uploader_url': 'https://vimeo.com/resolutionproductions',
  190. 'upload_date': '20140227',
  191. 'thumbnail': r're:^https?://i.vimeocdn.com/video/.*',
  192. 'timestamp': 1393523719,
  193. 'duration': 291,
  194. },
  195. 'add_ie': ['Vimeo'],
  196. }, {
  197. 'url': 'http://sutiblr.tumblr.com/post/139638707273',
  198. 'md5': '2dd184b3669e049ba40563a7d423f95c',
  199. 'info_dict': {
  200. 'id': 'ir7qBEIKqvq',
  201. 'ext': 'mp4',
  202. 'title': 'Vine by sutiblr',
  203. 'alt_title': 'Vine by sutiblr',
  204. 'uploader': 'sutiblr',
  205. 'uploader_id': '1198993975374495744',
  206. 'upload_date': '20160220',
  207. 'like_count': int,
  208. 'comment_count': int,
  209. 'repost_count': int,
  210. 'thumbnail': r're:^https?://.*\.jpg',
  211. 'timestamp': 1455940159,
  212. 'view_count': int,
  213. },
  214. 'add_ie': ['Vine'],
  215. }, {
  216. 'url': 'https://silami.tumblr.com/post/84250043974/my-bad-river-flows-in-you-impression-on-maschine',
  217. 'md5': '3c92d7c3d867f14ccbeefa2119022277',
  218. 'info_dict': {
  219. 'id': 'nYtvtTPuTl',
  220. 'ext': 'mp4',
  221. 'title': 'Video by silbulterman',
  222. 'description': '#maschine',
  223. 'uploader_id': '242859024',
  224. 'thumbnail': r're:^https?://.*\.jpg',
  225. 'timestamp': 1398801174,
  226. 'like_count': int,
  227. 'uploader': 'Sil',
  228. 'channel': 'silbulterman',
  229. 'comment_count': int,
  230. 'upload_date': '20140429',
  231. },
  232. 'add_ie': ['Instagram'],
  233. }]
  234. _providers = {
  235. 'instagram': 'Instagram',
  236. 'vimeo': 'Vimeo',
  237. 'vine': 'Vine',
  238. 'youtube': 'Youtube',
  239. }
  240. _ACCESS_TOKEN = None
  241. def _initialize_pre_login(self):
  242. login_page = self._download_webpage(
  243. self._LOGIN_URL, None, 'Downloading login page', fatal=False)
  244. if login_page:
  245. self._ACCESS_TOKEN = self._search_regex(
  246. r'"API_TOKEN":\s*"(\w+)"', login_page, 'API access token', fatal=False)
  247. if not self._ACCESS_TOKEN:
  248. self.report_warning('Failed to get access token; metadata will be missing and some videos may not work')
  249. def _perform_login(self, username, password):
  250. if not self._ACCESS_TOKEN:
  251. return
  252. self._download_json(
  253. self._OAUTH_URL, None, 'Logging in',
  254. data=urlencode_postdata({
  255. 'password': password,
  256. 'grant_type': 'password',
  257. 'username': username,
  258. }), headers={
  259. 'Content-Type': 'application/x-www-form-urlencoded',
  260. 'Authorization': f'Bearer {self._ACCESS_TOKEN}',
  261. },
  262. errnote='Login failed', fatal=False)
  263. def _real_extract(self, url):
  264. blog, video_id = self._match_valid_url(url).groups()
  265. url = f'http://{blog}.tumblr.com/post/{video_id}/'
  266. webpage, urlh = self._download_webpage_handle(url, video_id)
  267. redirect_url = urlh.url
  268. api_only = bool(self._search_regex(
  269. r'(tumblr.com|^)/(safe-mode|login_required|blog/view)',
  270. redirect_url, 'redirect', default=None))
  271. if api_only and not self._ACCESS_TOKEN:
  272. raise ExtractorError('Cannot get data for dashboard-only post without access token')
  273. post_json = {}
  274. if self._ACCESS_TOKEN:
  275. post_json = traverse_obj(
  276. self._download_json(
  277. f'https://www.tumblr.com/api/v2/blog/{blog}/posts/{video_id}/permalink',
  278. video_id, headers={'Authorization': f'Bearer {self._ACCESS_TOKEN}'}, fatal=False),
  279. ('response', 'timeline', 'elements', 0)) or {}
  280. content_json = traverse_obj(post_json, ('trail', 0, 'content'), ('content')) or []
  281. video_json = next(
  282. (item for item in content_json if item.get('type') == 'video'), {})
  283. media_json = video_json.get('media') or {}
  284. if api_only and not media_json.get('url') and not video_json.get('url'):
  285. raise ExtractorError('Failed to find video data for dashboard-only post')
  286. if not media_json.get('url') and video_json.get('url'):
  287. # external video host
  288. return self.url_result(
  289. video_json['url'],
  290. self._providers.get(video_json.get('provider'), 'Generic'))
  291. video_url = self._og_search_video_url(webpage, default=None)
  292. duration = None
  293. formats = []
  294. # iframes can supply duration and sometimes additional formats, so check for one
  295. iframe_url = self._search_regex(
  296. fr'src=\'(https?://www\.tumblr\.com/video/{blog}/{video_id}/[^\']+)\'',
  297. webpage, 'iframe url', default=None)
  298. if iframe_url:
  299. iframe = self._download_webpage(
  300. iframe_url, video_id, 'Downloading iframe page',
  301. headers={'Referer': redirect_url})
  302. options = self._parse_json(
  303. self._search_regex(
  304. r'data-crt-options=(["\'])(?P<options>.+?)\1', iframe,
  305. 'hd video url', default='', group='options'),
  306. video_id, fatal=False)
  307. if options:
  308. duration = int_or_none(options.get('duration'))
  309. hd_url = options.get('hdUrl')
  310. if hd_url:
  311. # there are multiple formats; extract them
  312. # ignore other sources of width/height data as they may be wrong
  313. sources = []
  314. sd_url = self._search_regex(
  315. r'<source[^>]+src=(["\'])(?P<url>.+?)\1', iframe,
  316. 'sd video url', default=None, group='url')
  317. if sd_url:
  318. sources.append((sd_url, 'sd'))
  319. sources.append((hd_url, 'hd'))
  320. formats = [{
  321. 'url': video_url,
  322. 'format_id': format_id,
  323. 'height': int_or_none(self._search_regex(
  324. r'_(\d+)\.\w+$', video_url, 'height', default=None)),
  325. 'quality': quality,
  326. } for quality, (video_url, format_id) in enumerate(sources)]
  327. if not media_json.get('url') and not video_url and not iframe_url:
  328. # external video host (but we weren't able to figure it out from the api)
  329. iframe_url = self._search_regex(
  330. r'src=["\'](https?://safe\.txmblr\.com/svc/embed/inline/[^"\']+)["\']',
  331. webpage, 'embed iframe url', default=None)
  332. return self.url_result(iframe_url or redirect_url, 'Generic')
  333. formats = formats or [{
  334. 'url': media_json.get('url') or video_url,
  335. 'width': int_or_none(
  336. media_json.get('width') or self._og_search_property('video:width', webpage, default=None)),
  337. 'height': int_or_none(
  338. media_json.get('height') or self._og_search_property('video:height', webpage, default=None)),
  339. }]
  340. # the url we're extracting from might be an original post or it might be a reblog.
  341. # if it's a reblog, og:description will be the reblogger's comment, not the uploader's.
  342. # content_json is always the op, so if it exists but has no text, there's no description
  343. if content_json:
  344. description = '\n\n'.join(
  345. item.get('text') for item in content_json if item.get('type') == 'text') or None
  346. else:
  347. description = self._og_search_description(webpage, default=None)
  348. uploader_id = traverse_obj(post_json, 'reblogged_root_name', 'blog_name')
  349. return {
  350. 'id': video_id,
  351. 'title': post_json.get('summary') or (blog if api_only else self._html_search_regex(
  352. r'(?s)<title>(?P<title>.*?)(?: \| Tumblr)?</title>', webpage, 'title')),
  353. 'description': description,
  354. 'thumbnail': (traverse_obj(video_json, ('poster', 0, 'url'))
  355. or self._og_search_thumbnail(webpage, default=None)),
  356. 'uploader_id': uploader_id,
  357. 'uploader_url': f'https://{uploader_id}.tumblr.com/' if uploader_id else None,
  358. 'duration': duration,
  359. 'like_count': post_json.get('like_count'),
  360. 'repost_count': post_json.get('reblog_count'),
  361. 'age_limit': {True: 18, False: 0}.get(post_json.get('is_nsfw')),
  362. 'tags': post_json.get('tags'),
  363. 'formats': formats,
  364. }