wykop.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268
  1. import json
  2. from .common import InfoExtractor
  3. from ..networking.exceptions import HTTPError
  4. from ..utils import (
  5. ExtractorError,
  6. format_field,
  7. parse_iso8601,
  8. traverse_obj,
  9. url_or_none,
  10. )
  11. class WykopBaseExtractor(InfoExtractor):
  12. def _get_token(self, force_refresh=False):
  13. if not force_refresh:
  14. maybe_cached = self.cache.load('wykop', 'bearer')
  15. if maybe_cached:
  16. return maybe_cached
  17. new_token = traverse_obj(
  18. self._do_call_api('auth', None, 'Downloading anonymous auth token', data={
  19. # hardcoded in frontend
  20. 'key': 'w53947240748',
  21. 'secret': 'd537d9e0a7adc1510842059ae5316419',
  22. }), ('data', 'token'))
  23. self.cache.store('wykop', 'bearer', new_token)
  24. return new_token
  25. def _do_call_api(self, path, video_id, note='Downloading JSON metadata', data=None, headers={}):
  26. if data:
  27. data = json.dumps({'data': data}).encode()
  28. headers['Content-Type'] = 'application/json'
  29. return self._download_json(
  30. f'https://wykop.pl/api/v3/{path}', video_id,
  31. note=note, data=data, headers=headers)
  32. def _call_api(self, path, video_id, note='Downloading JSON metadata'):
  33. token = self._get_token()
  34. for retrying in range(2):
  35. try:
  36. return self._do_call_api(path, video_id, note, headers={'Authorization': f'Bearer {token}'})
  37. except ExtractorError as e:
  38. if not retrying and isinstance(e.cause, HTTPError) and e.cause.status == 403:
  39. token = self._get_token(True)
  40. continue
  41. raise
  42. def _common_data_extract(self, data):
  43. author = traverse_obj(data, ('author', 'username'), expected_type=str)
  44. return {
  45. '_type': 'url_transparent',
  46. 'display_id': data.get('slug'),
  47. 'url': traverse_obj(data,
  48. ('media', 'embed', 'url'), # what gets an iframe embed
  49. ('source', 'url'), # clickable url (dig only)
  50. expected_type=url_or_none),
  51. 'thumbnail': traverse_obj(
  52. data, ('media', 'photo', 'url'), ('media', 'embed', 'thumbnail'), expected_type=url_or_none),
  53. 'uploader': author,
  54. 'uploader_id': author,
  55. 'uploader_url': format_field(author, None, 'https://wykop.pl/ludzie/%s'),
  56. 'timestamp': parse_iso8601(data.get('created_at'), delimiter=' '), # time it got submitted
  57. 'like_count': traverse_obj(data, ('votes', 'up'), expected_type=int),
  58. 'dislike_count': traverse_obj(data, ('votes', 'down'), expected_type=int),
  59. 'comment_count': traverse_obj(data, ('comments', 'count'), expected_type=int),
  60. 'age_limit': 18 if data.get('adult') else 0,
  61. 'tags': data.get('tags'),
  62. }
class WykopDigIE(WykopBaseExtractor):
    # Extractor for link ("dig") submissions: https://wykop.pl/link/<id>/...
    IE_NAME = 'wykop:dig'
    _VALID_URL = r'https?://(?:www\.)?wykop\.pl/link/(?P<id>\d+)'

    _TESTS = [{
        'url': 'https://wykop.pl/link/6912923/najbardziej-zrzedliwy-kot-na-swiecie-i-frozen-planet-ii-i-bbc-earth',
        'info_dict': {
            'id': 'rlSTBvViflc',
            'ext': 'mp4',
            'title': 'Najbardziej zrzędliwy kot na świecie I Frozen Planet II I BBC Earth',
            'display_id': 'najbardziej-zrzedliwy-kot-na-swiecie-i-frozen-planet-ii-i-bbc-earth',
            'description': 'md5:ac0f87dea1cdcb6b0c53f3612a095c87',
            'tags': ['zwierzaczki', 'koty', 'smiesznykotek', 'humor', 'rozrywka', 'ciekawostki'],
            'age_limit': 0,
            'timestamp': 1669154480,
            'release_timestamp': 1669194241,
            'release_date': '20221123',
            'uploader': 'starnak',
            'uploader_id': 'starnak',
            'uploader_url': 'https://wykop.pl/ludzie/starnak',
            'like_count': int,
            'dislike_count': int,
            'comment_count': int,
            'thumbnail': r're:https?://wykop\.pl/cdn/.+',
            'view_count': int,
            'channel': 'BBC Earth',
            'channel_id': 'UCwmZiChSryoWQCZMIQezgTg',
            'channel_url': 'https://www.youtube.com/channel/UCwmZiChSryoWQCZMIQezgTg',
            'categories': ['Pets & Animals'],
            'upload_date': '20220923',
            'duration': 191,
            'channel_follower_count': int,
            'availability': 'public',
            'live_status': 'not_live',
            'playable_in_embed': True,
        },
    }]

    @classmethod
    def suitable(cls, url):
        # Defer to the more specific comment-permalink extractor when both match.
        return cls._match_valid_url(url) and not WykopDigCommentIE.suitable(url)

    def _real_extract(self, url):
        video_id = self._match_id(url)
        data = self._call_api(f'links/{video_id}', video_id)['data']

        return {
            **self._common_data_extract(data),
            'id': video_id,
            'title': data['title'],
            'description': data.get('description'),
            # time it got "digged" to the homepage
            'release_timestamp': parse_iso8601(data.get('published_at'), delimiter=' '),
        }
class WykopDigCommentIE(WykopBaseExtractor):
    # Extractor for comment permalinks under a link ("dig"):
    # https://wykop.pl/link/<dig_id>/<slug>/komentarz/<comment_id>/...
    IE_NAME = 'wykop:dig:comment'
    _VALID_URL = r'https?://(?:www\.)?wykop\.pl/link/(?P<dig_id>\d+)/[^/]+/komentarz/(?P<id>\d+)'

    _TESTS = [{
        'url': 'https://wykop.pl/link/6992589/strollowal-oszusta-przez-ponad-24-minuty-udawal-naiwniaka-i-nagral-rozmowe/komentarz/114540527/podobna-sytuacja-ponizej-ciekawa-dyskusja-z-oszustem-na-sam-koniec-sam-bylem-w-biurze-swiadkiem-podobnej-rozmowy-niemal-zakonczonej-sukcesem-bandyty-g',
        'info_dict': {
            'id': 'u6tEi2FmKZY',
            'ext': 'mp4',
            'title': 'md5:e7c741c5baa7ed6478000caf72865577',
            'display_id': 'md5:45b2d12bd0e262d09cc7cf7abc8412db',
            'description': 'md5:bcec7983429f9c0630f9deb9d3d1ba5e',
            'timestamp': 1674476945,
            'uploader': 'Bartholomew',
            'uploader_id': 'Bartholomew',
            'uploader_url': 'https://wykop.pl/ludzie/Bartholomew',
            'thumbnail': r're:https?://wykop\.pl/cdn/.+',
            'tags': [],
            'availability': 'public',
            'duration': 1838,
            'upload_date': '20230117',
            'categories': ['Entertainment'],
            'view_count': int,
            'like_count': int,
            'dislike_count': int,
            'comment_count': int,
            'channel_follower_count': int,
            'playable_in_embed': True,
            'live_status': 'not_live',
            'age_limit': 0,
            'chapters': 'count:3',
            'channel': 'Poszukiwacze Okazji',
            'channel_id': 'UCzzvJDZThwv06dR4xmzrZBw',
            'channel_url': 'https://www.youtube.com/channel/UCzzvJDZThwv06dR4xmzrZBw',
        },
    }]

    def _real_extract(self, url):
        # Both the parent dig id and the comment id are needed to address
        # the comment in the API.
        dig_id, comment_id = self._search_regex(
            self._VALID_URL, url, 'dig and comment ids', group=('dig_id', 'id'))
        data = self._call_api(f'links/{dig_id}/comments/{comment_id}', comment_id)['data']

        return {
            **self._common_data_extract(data),
            'id': comment_id,
            'title': f"{traverse_obj(data, ('author', 'username'))} - {data.get('content') or ''}",
            'description': data.get('content'),
        }
class WykopPostIE(WykopBaseExtractor):
    # Extractor for microblog posts ("wpis"): https://wykop.pl/wpis/<id>/...
    IE_NAME = 'wykop:post'
    _VALID_URL = r'https?://(?:www\.)?wykop\.pl/wpis/(?P<id>\d+)'

    _TESTS = [{
        'url': 'https://wykop.pl/wpis/68893343/kot-koty-smiesznykotek',
        'info_dict': {
            'id': 'PL8JMjiUPHUhwc9ZlKa_5IFeBwBV8Xe7jI',
            'title': 'PawelW124 - #kot #koty #smiesznykotek',
            'description': '#kot #koty #smiesznykotek',
            'display_id': 'kot-koty-smiesznykotek',
            'tags': ['kot', 'koty', 'smiesznykotek'],
            'uploader': 'PawelW124',
            'uploader_id': 'PawelW124',
            'uploader_url': 'https://wykop.pl/ludzie/PawelW124',
            'timestamp': 1668938142,
            'age_limit': 0,
            'like_count': int,
            'dislike_count': int,
            'thumbnail': r're:https?://wykop\.pl/cdn/.+',
            'comment_count': int,
            'channel': 'Revan',
            'channel_id': 'UCW9T_-uZoiI7ROARQdTDyOw',
            'channel_url': 'https://www.youtube.com/channel/UCW9T_-uZoiI7ROARQdTDyOw',
            'upload_date': '20221120',
            'modified_date': '20220814',
            'availability': 'public',
            'view_count': int,
        },
        'playlist_mincount': 15,
        'params': {
            'flat_playlist': True,
        },
    }]

    @classmethod
    def suitable(cls, url):
        # Defer to the more specific comment-anchor extractor when both match.
        return cls._match_valid_url(url) and not WykopPostCommentIE.suitable(url)

    def _real_extract(self, url):
        video_id = self._match_id(url)
        data = self._call_api(f'entries/{video_id}', video_id)['data']

        return {
            **self._common_data_extract(data),
            'id': video_id,
            'title': f"{traverse_obj(data, ('author', 'username'))} - {data.get('content') or ''}",
            'description': data.get('content'),
        }
class WykopPostCommentIE(WykopBaseExtractor):
    # Extractor for comments on microblog posts, addressed via URL fragment:
    # https://wykop.pl/wpis/<post_id>/<slug>#<comment_id>
    IE_NAME = 'wykop:post:comment'
    _VALID_URL = r'https?://(?:www\.)?wykop\.pl/wpis/(?P<post_id>\d+)/[^/#]+#(?P<id>\d+)'

    _TESTS = [{
        'url': 'https://wykop.pl/wpis/70084873/test-test-test#249303979',
        'info_dict': {
            'id': 'confusedquickarmyant',
            'ext': 'mp4',
            'title': 'tpap - treść komentarza',
            'display_id': 'tresc-komentarza',
            'description': 'treść komentarza',
            'uploader': 'tpap',
            'uploader_id': 'tpap',
            'uploader_url': 'https://wykop.pl/ludzie/tpap',
            'timestamp': 1675349470,
            'upload_date': '20230202',
            'tags': [],
            'duration': 2.12,
            'age_limit': 0,
            'categories': [],
            'view_count': int,
            'like_count': int,
            'dislike_count': int,
            'thumbnail': r're:https?://wykop\.pl/cdn/.+',
        },
    }]

    def _real_extract(self, url):
        # Both the parent post id and the comment id are needed to address
        # the comment in the API.
        post_id, comment_id = self._search_regex(
            self._VALID_URL, url, 'post and comment ids', group=('post_id', 'id'))
        data = self._call_api(f'entries/{post_id}/comments/{comment_id}', comment_id)['data']

        return {
            **self._common_data_extract(data),
            'id': comment_id,
            'title': f"{traverse_obj(data, ('author', 'username'))} - {data.get('content') or ''}",
            'description': data.get('content'),
        }