pinterest.py 9.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247
  1. import json
  2. from .common import InfoExtractor
  3. from ..utils import (
  4. determine_ext,
  5. float_or_none,
  6. int_or_none,
  7. str_or_none,
  8. strip_or_none,
  9. traverse_obj,
  10. unified_timestamp,
  11. url_or_none,
  12. )
  13. class PinterestBaseIE(InfoExtractor):
  14. _VALID_URL_BASE = r'''(?x)
  15. https?://(?:[^/]+\.)?pinterest\.(?:
  16. com|fr|de|ch|jp|cl|ca|it|co\.uk|nz|ru|com\.au|at|pt|co\.kr|es|com\.mx|
  17. dk|ph|th|com\.uy|co|nl|info|kr|ie|vn|com\.vn|ec|mx|in|pe|co\.at|hu|
  18. co\.in|co\.nz|id|com\.ec|com\.py|tw|be|uk|com\.bo|com\.pe)'''
  19. def _call_api(self, resource, video_id, options):
  20. return self._download_json(
  21. f'https://www.pinterest.com/resource/{resource}Resource/get/',
  22. video_id, f'Download {resource} JSON metadata', query={
  23. 'data': json.dumps({'options': options}),
  24. })['resource_response']
  25. def _extract_video(self, data, extract_formats=True):
  26. video_id = data['id']
  27. thumbnails = []
  28. images = data.get('images')
  29. if isinstance(images, dict):
  30. for thumbnail in images.values():
  31. if not isinstance(thumbnail, dict):
  32. continue
  33. thumbnail_url = url_or_none(thumbnail.get('url'))
  34. if not thumbnail_url:
  35. continue
  36. thumbnails.append({
  37. 'url': thumbnail_url,
  38. 'width': int_or_none(thumbnail.get('width')),
  39. 'height': int_or_none(thumbnail.get('height')),
  40. })
  41. info = {
  42. 'title': strip_or_none(traverse_obj(data, 'title', 'grid_title', default='')),
  43. 'description': traverse_obj(data, 'seo_description', 'description'),
  44. 'timestamp': unified_timestamp(data.get('created_at')),
  45. 'thumbnails': thumbnails,
  46. 'uploader': traverse_obj(data, ('closeup_attribution', 'full_name')),
  47. 'uploader_id': str_or_none(traverse_obj(data, ('closeup_attribution', 'id'))),
  48. 'repost_count': int_or_none(data.get('repin_count')),
  49. 'comment_count': int_or_none(data.get('comment_count')),
  50. 'categories': traverse_obj(data, ('pin_join', 'visual_annotation'), expected_type=list),
  51. 'tags': traverse_obj(data, 'hashtags', expected_type=list),
  52. }
  53. urls = []
  54. formats = []
  55. duration = None
  56. domain = data.get('domain', '')
  57. if domain.lower() != 'uploaded by user' and traverse_obj(data, ('embed', 'src')):
  58. if not info['title']:
  59. info['title'] = None
  60. return {
  61. '_type': 'url_transparent',
  62. 'url': data['embed']['src'],
  63. **info,
  64. }
  65. elif extract_formats:
  66. video_list = traverse_obj(
  67. data, ('videos', 'video_list'),
  68. ('story_pin_data', 'pages', ..., 'blocks', ..., 'video', 'video_list'),
  69. expected_type=dict, get_all=False, default={})
  70. for format_id, format_dict in video_list.items():
  71. if not isinstance(format_dict, dict):
  72. continue
  73. format_url = url_or_none(format_dict.get('url'))
  74. if not format_url or format_url in urls:
  75. continue
  76. urls.append(format_url)
  77. duration = float_or_none(format_dict.get('duration'), scale=1000)
  78. ext = determine_ext(format_url)
  79. if 'hls' in format_id.lower() or ext == 'm3u8':
  80. formats.extend(self._extract_m3u8_formats(
  81. format_url, video_id, 'mp4', entry_protocol='m3u8_native',
  82. m3u8_id=format_id, fatal=False))
  83. else:
  84. formats.append({
  85. 'url': format_url,
  86. 'format_id': format_id,
  87. 'width': int_or_none(format_dict.get('width')),
  88. 'height': int_or_none(format_dict.get('height')),
  89. 'duration': duration,
  90. })
  91. return {
  92. 'id': video_id,
  93. 'formats': formats,
  94. 'duration': duration,
  95. 'webpage_url': f'https://www.pinterest.com/pin/{video_id}/',
  96. 'extractor_key': PinterestIE.ie_key(),
  97. 'extractor': PinterestIE.IE_NAME,
  98. **info,
  99. }
  100. class PinterestIE(PinterestBaseIE):
  101. _VALID_URL = rf'{PinterestBaseIE._VALID_URL_BASE}/pin/(?P<id>\d+)'
  102. _TESTS = [{
  103. # formats found in data['videos']
  104. 'url': 'https://www.pinterest.com/pin/664281013778109217/',
  105. 'md5': '6550c2af85d6d9f3fe3b88954d1577fc',
  106. 'info_dict': {
  107. 'id': '664281013778109217',
  108. 'ext': 'mp4',
  109. 'title': 'Origami',
  110. 'description': 'md5:e29801cab7d741ea8c741bc50c8d00ab',
  111. 'duration': 57.7,
  112. 'timestamp': 1593073622,
  113. 'upload_date': '20200625',
  114. 'repost_count': int,
  115. 'comment_count': int,
  116. 'categories': list,
  117. 'tags': list,
  118. 'thumbnail': r're:^https?://.*\.(?:jpg|png)$',
  119. },
  120. }, {
  121. # formats found in data['story_pin_data']
  122. 'url': 'https://www.pinterest.com/pin/1084663891475263837/',
  123. 'md5': '069ac19919ab9e1e13fa60de46290b03',
  124. 'info_dict': {
  125. 'id': '1084663891475263837',
  126. 'ext': 'mp4',
  127. 'title': 'Gadget, Cool products, Amazon product, technology, Kitchen gadgets',
  128. 'description': 'md5:d0a4b6ae996ff0c6eed83bc869598d13',
  129. 'uploader': 'CoolCrazyGadgets',
  130. 'uploader_id': '1084664028912989237',
  131. 'upload_date': '20211003',
  132. 'timestamp': 1633246654.0,
  133. 'duration': 14.9,
  134. 'comment_count': int,
  135. 'repost_count': int,
  136. 'categories': 'count:9',
  137. 'tags': list,
  138. 'thumbnail': r're:^https?://.*\.(?:jpg|png)$',
  139. },
  140. }, {
  141. # vimeo.com embed
  142. 'url': 'https://www.pinterest.ca/pin/441282463481903715/',
  143. 'info_dict': {
  144. 'id': '111691128',
  145. 'ext': 'mp4',
  146. 'title': 'Tonite Let\'s All Make Love In London (1967)',
  147. 'description': 'md5:8190f37b3926807809ec57ec21aa77b2',
  148. 'uploader': 'Vimeo',
  149. 'uploader_id': '473792960706651251',
  150. 'upload_date': '20180120',
  151. 'timestamp': 1516409040,
  152. 'duration': 3404,
  153. 'comment_count': int,
  154. 'repost_count': int,
  155. 'categories': 'count:9',
  156. 'tags': [],
  157. 'thumbnail': r're:^https?://.*\.(?:jpg|png)$',
  158. 'uploader_url': 'https://vimeo.com/willardandrade',
  159. },
  160. 'params': {
  161. 'skip_download': 'm3u8',
  162. },
  163. }, {
  164. 'url': 'https://co.pinterest.com/pin/824721750502199491/',
  165. 'only_matching': True,
  166. }]
  167. def _real_extract(self, url):
  168. video_id = self._match_id(url)
  169. data = self._call_api(
  170. 'Pin', video_id, {
  171. 'field_set_key': 'unauth_react_main_pin',
  172. 'id': video_id,
  173. })['data']
  174. return self._extract_video(data)
  175. class PinterestCollectionIE(PinterestBaseIE):
  176. _VALID_URL = rf'{PinterestBaseIE._VALID_URL_BASE}/(?P<username>[^/]+)/(?P<id>[^/?#&]+)'
  177. _TESTS = [{
  178. 'url': 'https://www.pinterest.ca/mashal0407/cool-diys/',
  179. 'info_dict': {
  180. 'id': '585890301462791043',
  181. 'title': 'cool diys',
  182. },
  183. 'playlist_count': 8,
  184. }, {
  185. 'url': 'https://www.pinterest.ca/fudohub/videos/',
  186. 'info_dict': {
  187. 'id': '682858430939307450',
  188. 'title': 'VIDEOS',
  189. },
  190. 'playlist_mincount': 365,
  191. 'skip': 'Test with extract_formats=False',
  192. }]
  193. @classmethod
  194. def suitable(cls, url):
  195. return False if PinterestIE.suitable(url) else super().suitable(url)
  196. def _real_extract(self, url):
  197. username, slug = self._match_valid_url(url).groups()
  198. board = self._call_api(
  199. 'Board', slug, {
  200. 'slug': slug,
  201. 'username': username,
  202. })['data']
  203. board_id = board['id']
  204. options = {
  205. 'board_id': board_id,
  206. 'page_size': 250,
  207. }
  208. bookmark = None
  209. entries = []
  210. while True:
  211. if bookmark:
  212. options['bookmarks'] = [bookmark]
  213. board_feed = self._call_api('BoardFeed', board_id, options)
  214. for item in (board_feed.get('data') or []):
  215. if not isinstance(item, dict) or item.get('type') != 'pin':
  216. continue
  217. video_id = item.get('id')
  218. if video_id:
  219. # Some pins may not be available anonymously via pin URL
  220. # video = self._extract_video(item, extract_formats=False)
  221. # video.update({
  222. # '_type': 'url_transparent',
  223. # 'url': 'https://www.pinterest.com/pin/%s/' % video_id,
  224. # })
  225. # entries.append(video)
  226. entries.append(self._extract_video(item))
  227. bookmark = board_feed.get('bookmark')
  228. if not bookmark:
  229. break
  230. return self.playlist_result(
  231. entries, playlist_id=board_id, playlist_title=board.get('name'))