gettr.py 7.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206
  1. from .common import InfoExtractor
  2. from ..utils import (
  3. ExtractorError,
  4. bool_or_none,
  5. dict_get,
  6. float_or_none,
  7. int_or_none,
  8. str_or_none,
  9. traverse_obj,
  10. try_get,
  11. url_or_none,
  12. urljoin,
  13. )
  14. class GettrBaseIE(InfoExtractor):
  15. _BASE_REGEX = r'https?://(www\.)?gettr\.com/'
  16. _MEDIA_BASE_URL = 'https://media.gettr.com/'
  17. def _call_api(self, path, video_id, *args, **kwargs):
  18. return self._download_json(urljoin('https://api.gettr.com/u/', path), video_id, *args, **kwargs)['result']
  19. class GettrIE(GettrBaseIE):
  20. _VALID_URL = GettrBaseIE._BASE_REGEX + r'post/(?P<id>[a-z0-9]+)'
  21. _TESTS = [{
  22. 'url': 'https://www.gettr.com/post/pcf6uv838f',
  23. 'info_dict': {
  24. 'id': 'pcf6uv838f',
  25. 'title': 'md5:9086a646bbd06c41c4fe8e52b3c93454',
  26. 'description': 'md5:be0577f1e4caadc06de4a002da2bf287',
  27. 'ext': 'mp4',
  28. 'uploader': 'EpochTV',
  29. 'uploader_id': 'epochtv',
  30. 'upload_date': '20210927',
  31. 'thumbnail': r're:^https?://.+/out\.jpg',
  32. 'timestamp': 1632782451.058,
  33. 'duration': 58.5585,
  34. 'tags': ['hornofafrica', 'explorations'],
  35. },
  36. }, {
  37. 'url': 'https://gettr.com/post/p4iahp',
  38. 'info_dict': {
  39. 'id': 'p4iahp',
  40. 'title': 'md5:b03c07883db6fbc1aab88877a6c3b149',
  41. 'description': 'md5:741b7419d991c403196ed2ea7749a39d',
  42. 'ext': 'mp4',
  43. 'uploader': 'Neues Forum Freiheit',
  44. 'uploader_id': 'nf_freiheit',
  45. 'upload_date': '20210718',
  46. 'thumbnail': r're:^https?://.+/out\.jpg',
  47. 'timestamp': 1626594455.017,
  48. 'duration': 23,
  49. 'tags': 'count:12',
  50. },
  51. }, {
  52. # quote post
  53. 'url': 'https://gettr.com/post/pxn5b743a9',
  54. 'only_matching': True,
  55. }, {
  56. # quote with video
  57. 'url': 'https://gettr.com/post/pxtiiz5ca2',
  58. 'only_matching': True,
  59. }, {
  60. # streaming embed
  61. 'url': 'https://gettr.com/post/pxlu8p3b13',
  62. 'only_matching': True,
  63. }, {
  64. # youtube embed
  65. 'url': 'https://gettr.com/post/pv6wp9e24c',
  66. 'only_matching': True,
  67. 'add_ie': ['Youtube'],
  68. }]
  69. def _real_extract(self, url):
  70. post_id = self._match_id(url)
  71. webpage = self._download_webpage(url, post_id)
  72. api_data = self._call_api(f'post/{post_id}?incl="poststats|userinfo"', post_id)
  73. post_data = api_data.get('data')
  74. user_data = try_get(api_data, lambda x: x['aux']['uinf'][post_data['uid']], dict) or {}
  75. vid = post_data.get('vid')
  76. ovid = post_data.get('ovid')
  77. if post_data.get('p_type') == 'stream':
  78. return self.url_result(f'https://gettr.com/streaming/{post_id}', ie='GettrStreaming', video_id=post_id)
  79. if not (ovid or vid):
  80. embed_url = url_or_none(post_data.get('prevsrc'))
  81. shared_post_id = traverse_obj(api_data, ('aux', 'shrdpst', '_id'), ('data', 'rpstIds', 0), expected_type=str)
  82. if embed_url:
  83. return self.url_result(embed_url)
  84. elif shared_post_id:
  85. return self.url_result(f'https://gettr.com/post/{shared_post_id}', ie='Gettr', video_id=shared_post_id)
  86. else:
  87. raise ExtractorError('There\'s no video in this post.')
  88. title = description = str_or_none(
  89. post_data.get('txt') or self._og_search_description(webpage))
  90. uploader = str_or_none(
  91. user_data.get('nickname')
  92. or self._search_regex(r'^(.+?) on GETTR', self._og_search_title(webpage, default=''), 'uploader', fatal=False))
  93. if uploader:
  94. title = f'{uploader} - {title}'
  95. formats, subtitles = self._extract_m3u8_formats_and_subtitles(
  96. urljoin(self._MEDIA_BASE_URL, vid), post_id, 'mp4',
  97. entry_protocol='m3u8_native', m3u8_id='hls', fatal=False) if vid else ([], {})
  98. if ovid:
  99. formats.append({
  100. 'url': urljoin(self._MEDIA_BASE_URL, ovid),
  101. 'format_id': 'ovid',
  102. 'ext': 'mp4',
  103. 'width': int_or_none(post_data.get('vid_wid')),
  104. 'height': int_or_none(post_data.get('vid_hgt')),
  105. })
  106. return {
  107. 'id': post_id,
  108. 'title': title,
  109. 'description': description,
  110. 'formats': formats,
  111. 'subtitles': subtitles,
  112. 'uploader': uploader,
  113. 'uploader_id': str_or_none(
  114. dict_get(user_data, ['_id', 'username'])
  115. or post_data.get('uid')),
  116. 'thumbnail': url_or_none(
  117. urljoin(self._MEDIA_BASE_URL, post_data.get('main'))
  118. or self._html_search_meta(['og:image', 'image'], webpage, 'thumbnail', fatal=False)),
  119. 'timestamp': float_or_none(dict_get(post_data, ['cdate', 'udate']), scale=1000),
  120. 'duration': float_or_none(post_data.get('vid_dur')),
  121. 'tags': post_data.get('htgs'),
  122. }
  123. class GettrStreamingIE(GettrBaseIE):
  124. _VALID_URL = GettrBaseIE._BASE_REGEX + r'streaming/(?P<id>[a-z0-9]+)'
  125. _TESTS = [{
  126. 'url': 'https://gettr.com/streaming/psoiulc122',
  127. 'info_dict': {
  128. 'id': 'psoiulc122',
  129. 'ext': 'mp4',
  130. 'description': 'md5:56bca4b8f48f1743d9fd03d49c723017',
  131. 'view_count': int,
  132. 'uploader': 'Corona Investigative Committee',
  133. 'uploader_id': 'coronacommittee',
  134. 'duration': 5180.184,
  135. 'thumbnail': r're:^https?://.+',
  136. 'title': 'Day 1: Opening Session of the Grand Jury Proceeding',
  137. 'timestamp': 1644080997.164,
  138. 'upload_date': '20220205',
  139. },
  140. }, {
  141. 'url': 'https://gettr.com/streaming/psfmeefcc1',
  142. 'info_dict': {
  143. 'id': 'psfmeefcc1',
  144. 'ext': 'mp4',
  145. 'title': 'Session 90: "The Virus Of Power"',
  146. 'view_count': int,
  147. 'uploader_id': 'coronacommittee',
  148. 'description': 'md5:98986acdf656aa836bf36f9c9704c65b',
  149. 'uploader': 'Corona Investigative Committee',
  150. 'thumbnail': r're:^https?://.+',
  151. 'duration': 21872.507,
  152. 'timestamp': 1643976662.858,
  153. 'upload_date': '20220204',
  154. },
  155. }]
  156. def _real_extract(self, url):
  157. video_id = self._match_id(url)
  158. video_info = self._call_api(f'live/join/{video_id}', video_id, data={})
  159. live_info = video_info['broadcast']
  160. live_url = url_or_none(live_info.get('url'))
  161. formats, subtitles = self._extract_m3u8_formats_and_subtitles(
  162. live_url, video_id, ext='mp4',
  163. entry_protocol='m3u8_native', m3u8_id='hls', fatal=False) if live_url else ([], {})
  164. thumbnails = [{
  165. 'url': urljoin(self._MEDIA_BASE_URL, thumbnail),
  166. } for thumbnail in try_get(video_info, lambda x: x['postData']['imgs'], list) or []]
  167. return {
  168. 'id': video_id,
  169. 'title': try_get(video_info, lambda x: x['postData']['ttl'], str),
  170. 'description': try_get(video_info, lambda x: x['postData']['dsc'], str),
  171. 'formats': formats,
  172. 'subtitles': subtitles,
  173. 'thumbnails': thumbnails,
  174. 'uploader': try_get(video_info, lambda x: x['liveHostInfo']['nickname'], str),
  175. 'uploader_id': try_get(video_info, lambda x: x['liveHostInfo']['_id'], str),
  176. 'view_count': int_or_none(live_info.get('viewsCount')),
  177. 'timestamp': float_or_none(live_info.get('startAt'), scale=1000),
  178. 'duration': float_or_none(live_info.get('duration'), scale=1000),
  179. 'is_live': bool_or_none(live_info.get('isLive')),
  180. }