# youtube_live_chat.py
  1. from __future__ import division, unicode_literals
  2. import json
  3. import time
  4. from .fragment import FragmentFD
  5. from ..compat import compat_urllib_error
  6. from ..utils import (
  7. try_get,
  8. dict_get,
  9. int_or_none,
  10. RegexNotFoundError,
  11. )
  12. from ..extractor.youtube import YoutubeBaseInfoExtractor as YT_BaseIE
  13. class YoutubeLiveChatFD(FragmentFD):
  14. """ Downloads YouTube live chats fragment by fragment """
  15. FD_NAME = 'youtube_live_chat'
  16. def real_download(self, filename, info_dict):
  17. video_id = info_dict['video_id']
  18. self.to_screen('[%s] Downloading live chat' % self.FD_NAME)
  19. fragment_retries = self.params.get('fragment_retries', 0)
  20. test = self.params.get('test', False)
  21. ctx = {
  22. 'filename': filename,
  23. 'live': True,
  24. 'total_frags': None,
  25. }
  26. ie = YT_BaseIE(self.ydl)
  27. start_time = int(time.time() * 1000)
  28. def dl_fragment(url, data=None, headers=None):
  29. http_headers = info_dict.get('http_headers', {})
  30. if headers:
  31. http_headers = http_headers.copy()
  32. http_headers.update(headers)
  33. return self._download_fragment(ctx, url, info_dict, http_headers, data)
  34. def parse_actions_replay(live_chat_continuation):
  35. offset = continuation_id = click_tracking_params = None
  36. processed_fragment = bytearray()
  37. for action in live_chat_continuation.get('actions', []):
  38. if 'replayChatItemAction' in action:
  39. replay_chat_item_action = action['replayChatItemAction']
  40. offset = int(replay_chat_item_action['videoOffsetTimeMsec'])
  41. processed_fragment.extend(
  42. json.dumps(action, ensure_ascii=False).encode('utf-8') + b'\n')
  43. if offset is not None:
  44. continuation = try_get(
  45. live_chat_continuation,
  46. lambda x: x['continuations'][0]['liveChatReplayContinuationData'], dict)
  47. if continuation:
  48. continuation_id = continuation.get('continuation')
  49. click_tracking_params = continuation.get('clickTrackingParams')
  50. self._append_fragment(ctx, processed_fragment)
  51. return continuation_id, offset, click_tracking_params
  52. def try_refresh_replay_beginning(live_chat_continuation):
  53. # choose the second option that contains the unfiltered live chat replay
  54. refresh_continuation = try_get(
  55. live_chat_continuation,
  56. lambda x: x['header']['liveChatHeaderRenderer']['viewSelector']['sortFilterSubMenuRenderer']['subMenuItems'][1]['continuation']['reloadContinuationData'], dict)
  57. if refresh_continuation:
  58. # no data yet but required to call _append_fragment
  59. self._append_fragment(ctx, b'')
  60. refresh_continuation_id = refresh_continuation.get('continuation')
  61. offset = 0
  62. click_tracking_params = refresh_continuation.get('trackingParams')
  63. return refresh_continuation_id, offset, click_tracking_params
  64. return parse_actions_replay(live_chat_continuation)
  65. live_offset = 0
  66. def parse_actions_live(live_chat_continuation):
  67. nonlocal live_offset
  68. continuation_id = click_tracking_params = None
  69. processed_fragment = bytearray()
  70. for action in live_chat_continuation.get('actions', []):
  71. timestamp = self.parse_live_timestamp(action)
  72. if timestamp is not None:
  73. live_offset = timestamp - start_time
  74. # compatibility with replay format
  75. pseudo_action = {
  76. 'replayChatItemAction': {'actions': [action]},
  77. 'videoOffsetTimeMsec': str(live_offset),
  78. 'isLive': True,
  79. }
  80. processed_fragment.extend(
  81. json.dumps(pseudo_action, ensure_ascii=False).encode('utf-8') + b'\n')
  82. continuation_data_getters = [
  83. lambda x: x['continuations'][0]['invalidationContinuationData'],
  84. lambda x: x['continuations'][0]['timedContinuationData'],
  85. ]
  86. continuation_data = try_get(live_chat_continuation, continuation_data_getters, dict)
  87. if continuation_data:
  88. continuation_id = continuation_data.get('continuation')
  89. click_tracking_params = continuation_data.get('clickTrackingParams')
  90. timeout_ms = int_or_none(continuation_data.get('timeoutMs'))
  91. if timeout_ms is not None:
  92. time.sleep(timeout_ms / 1000)
  93. self._append_fragment(ctx, processed_fragment)
  94. return continuation_id, live_offset, click_tracking_params
  95. def download_and_parse_fragment(url, frag_index, request_data=None, headers=None):
  96. count = 0
  97. while count <= fragment_retries:
  98. try:
  99. success, raw_fragment = dl_fragment(url, request_data, headers)
  100. if not success:
  101. return False, None, None, None
  102. try:
  103. data = ie.extract_yt_initial_data(video_id, raw_fragment.decode('utf-8', 'replace'))
  104. except RegexNotFoundError:
  105. data = None
  106. if not data:
  107. data = json.loads(raw_fragment)
  108. live_chat_continuation = try_get(
  109. data,
  110. lambda x: x['continuationContents']['liveChatContinuation'], dict) or {}
  111. if info_dict['protocol'] == 'youtube_live_chat_replay':
  112. if frag_index == 1:
  113. continuation_id, offset, click_tracking_params = try_refresh_replay_beginning(live_chat_continuation)
  114. else:
  115. continuation_id, offset, click_tracking_params = parse_actions_replay(live_chat_continuation)
  116. elif info_dict['protocol'] == 'youtube_live_chat':
  117. continuation_id, offset, click_tracking_params = parse_actions_live(live_chat_continuation)
  118. return True, continuation_id, offset, click_tracking_params
  119. except compat_urllib_error.HTTPError as err:
  120. count += 1
  121. if count <= fragment_retries:
  122. self.report_retry_fragment(err, frag_index, count, fragment_retries)
  123. if count > fragment_retries:
  124. self.report_error('giving up after %s fragment retries' % fragment_retries)
  125. return False, None, None, None
  126. self._prepare_and_start_frag_download(ctx, info_dict)
  127. success, raw_fragment = dl_fragment(info_dict['url'])
  128. if not success:
  129. return False
  130. try:
  131. data = ie.extract_yt_initial_data(video_id, raw_fragment.decode('utf-8', 'replace'))
  132. except RegexNotFoundError:
  133. return False
  134. continuation_id = try_get(
  135. data,
  136. lambda x: x['contents']['twoColumnWatchNextResults']['conversationBar']['liveChatRenderer']['continuations'][0]['reloadContinuationData']['continuation'])
  137. # no data yet but required to call _append_fragment
  138. self._append_fragment(ctx, b'')
  139. ytcfg = ie.extract_ytcfg(video_id, raw_fragment.decode('utf-8', 'replace'))
  140. if not ytcfg:
  141. return False
  142. api_key = try_get(ytcfg, lambda x: x['INNERTUBE_API_KEY'])
  143. innertube_context = try_get(ytcfg, lambda x: x['INNERTUBE_CONTEXT'])
  144. if not api_key or not innertube_context:
  145. return False
  146. visitor_data = try_get(innertube_context, lambda x: x['client']['visitorData'], str)
  147. if info_dict['protocol'] == 'youtube_live_chat_replay':
  148. url = 'https://www.youtube.com/youtubei/v1/live_chat/get_live_chat_replay?key=' + api_key
  149. chat_page_url = 'https://www.youtube.com/live_chat_replay?continuation=' + continuation_id
  150. elif info_dict['protocol'] == 'youtube_live_chat':
  151. url = 'https://www.youtube.com/youtubei/v1/live_chat/get_live_chat?key=' + api_key
  152. chat_page_url = 'https://www.youtube.com/live_chat?continuation=' + continuation_id
  153. frag_index = offset = 0
  154. click_tracking_params = None
  155. while continuation_id is not None:
  156. frag_index += 1
  157. request_data = {
  158. 'context': innertube_context,
  159. 'continuation': continuation_id,
  160. }
  161. if frag_index > 1:
  162. request_data['currentPlayerState'] = {'playerOffsetMs': str(max(offset - 5000, 0))}
  163. if click_tracking_params:
  164. request_data['context']['clickTracking'] = {'clickTrackingParams': click_tracking_params}
  165. headers = ie.generate_api_headers(ytcfg=ytcfg, visitor_data=visitor_data)
  166. headers.update({'content-type': 'application/json'})
  167. fragment_request_data = json.dumps(request_data, ensure_ascii=False).encode('utf-8') + b'\n'
  168. success, continuation_id, offset, click_tracking_params = download_and_parse_fragment(
  169. url, frag_index, fragment_request_data, headers)
  170. else:
  171. success, continuation_id, offset, click_tracking_params = download_and_parse_fragment(
  172. chat_page_url, frag_index)
  173. if not success:
  174. return False
  175. if test:
  176. break
  177. self._finish_frag_download(ctx, info_dict)
  178. return True
  179. @staticmethod
  180. def parse_live_timestamp(action):
  181. action_content = dict_get(
  182. action,
  183. ['addChatItemAction', 'addLiveChatTickerItemAction', 'addBannerToLiveChatCommand'])
  184. if not isinstance(action_content, dict):
  185. return None
  186. item = dict_get(action_content, ['item', 'bannerRenderer'])
  187. if not isinstance(item, dict):
  188. return None
  189. renderer = dict_get(item, [
  190. # text
  191. 'liveChatTextMessageRenderer', 'liveChatPaidMessageRenderer',
  192. 'liveChatMembershipItemRenderer', 'liveChatPaidStickerRenderer',
  193. # ticker
  194. 'liveChatTickerPaidMessageItemRenderer',
  195. 'liveChatTickerSponsorItemRenderer',
  196. # banner
  197. 'liveChatBannerRenderer',
  198. ])
  199. if not isinstance(renderer, dict):
  200. return None
  201. parent_item_getters = [
  202. lambda x: x['showItemEndpoint']['showLiveChatItemEndpoint']['renderer'],
  203. lambda x: x['contents'],
  204. ]
  205. parent_item = try_get(renderer, parent_item_getters, dict)
  206. if parent_item:
  207. renderer = dict_get(parent_item, [
  208. 'liveChatTextMessageRenderer', 'liveChatPaidMessageRenderer',
  209. 'liveChatMembershipItemRenderer', 'liveChatPaidStickerRenderer',
  210. ])
  211. if not isinstance(renderer, dict):
  212. return None
  213. return int_or_none(renderer.get('timestampUsec'), 1000)