naver.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404
  1. import base64
  2. import hashlib
  3. import hmac
  4. import itertools
  5. import json
  6. import re
  7. import time
  8. import urllib.parse
  9. from .common import InfoExtractor
  10. from ..utils import (
  11. ExtractorError,
  12. dict_get,
  13. int_or_none,
  14. join_nonempty,
  15. merge_dicts,
  16. parse_iso8601,
  17. traverse_obj,
  18. try_get,
  19. unified_timestamp,
  20. update_url_query,
  21. url_or_none,
  22. )
  23. class NaverBaseIE(InfoExtractor):
  24. _CAPTION_EXT_RE = r'\.(?:ttml|vtt)'
  25. @staticmethod # NB: Used in WeverseIE
  26. def process_subtitles(vod_data, process_url):
  27. ret = {'subtitles': {}, 'automatic_captions': {}}
  28. for caption in traverse_obj(vod_data, ('captions', 'list', ...)):
  29. caption_url = caption.get('source')
  30. if not caption_url:
  31. continue
  32. type_ = 'automatic_captions' if caption.get('type') == 'auto' else 'subtitles'
  33. lang = caption.get('locale') or join_nonempty('language', 'country', from_dict=caption) or 'und'
  34. if caption.get('type') == 'fan':
  35. lang += '_fan{}'.format(next(i for i in itertools.count(1) if f'{lang}_fan{i}' not in ret[type_]))
  36. ret[type_].setdefault(lang, []).extend({
  37. 'url': sub_url,
  38. 'name': join_nonempty('label', 'fanName', from_dict=caption, delim=' - '),
  39. } for sub_url in process_url(caption_url))
  40. return ret
  41. def _extract_video_info(self, video_id, vid, key):
  42. video_data = self._download_json(
  43. 'http://play.rmcnmv.naver.com/vod/play/v2.0/' + vid,
  44. video_id, query={
  45. 'key': key,
  46. })
  47. meta = video_data['meta']
  48. title = meta['subject']
  49. formats = []
  50. get_list = lambda x: try_get(video_data, lambda y: y[x + 's']['list'], list) or []
  51. def extract_formats(streams, stream_type, query={}):
  52. for stream in streams:
  53. stream_url = stream.get('source')
  54. if not stream_url:
  55. continue
  56. stream_url = update_url_query(stream_url, query)
  57. encoding_option = stream.get('encodingOption', {})
  58. bitrate = stream.get('bitrate', {})
  59. formats.append({
  60. 'format_id': '{}_{}'.format(stream.get('type') or stream_type, dict_get(encoding_option, ('name', 'id'))),
  61. 'url': stream_url,
  62. 'ext': 'mp4',
  63. 'width': int_or_none(encoding_option.get('width')),
  64. 'height': int_or_none(encoding_option.get('height')),
  65. 'vbr': int_or_none(bitrate.get('video')),
  66. 'abr': int_or_none(bitrate.get('audio')),
  67. 'filesize': int_or_none(stream.get('size')),
  68. 'protocol': 'm3u8_native' if stream_type == 'HLS' else None,
  69. })
  70. extract_formats(get_list('video'), 'H264')
  71. for stream_set in video_data.get('streams', []):
  72. query = {}
  73. for param in stream_set.get('keys', []):
  74. query[param['name']] = param['value']
  75. stream_type = stream_set.get('type')
  76. videos = stream_set.get('videos')
  77. if videos:
  78. extract_formats(videos, stream_type, query)
  79. elif stream_type == 'HLS':
  80. stream_url = stream_set.get('source')
  81. if not stream_url:
  82. continue
  83. formats.extend(self._extract_m3u8_formats(
  84. update_url_query(stream_url, query), video_id,
  85. 'mp4', 'm3u8_native', m3u8_id=stream_type, fatal=False))
  86. replace_ext = lambda x, y: re.sub(self._CAPTION_EXT_RE, '.' + y, x)
  87. def get_subs(caption_url):
  88. if re.search(self._CAPTION_EXT_RE, caption_url):
  89. return [
  90. replace_ext(caption_url, 'ttml'),
  91. replace_ext(caption_url, 'vtt'),
  92. ]
  93. return [caption_url]
  94. user = meta.get('user', {})
  95. return {
  96. 'id': video_id,
  97. 'title': title,
  98. 'formats': formats,
  99. 'thumbnail': try_get(meta, lambda x: x['cover']['source']),
  100. 'view_count': int_or_none(meta.get('count')),
  101. 'uploader_id': user.get('id'),
  102. 'uploader': user.get('name'),
  103. 'uploader_url': user.get('url'),
  104. **self.process_subtitles(video_data, get_subs),
  105. }
  106. def _call_api(self, path, video_id):
  107. api_endpoint = f'https://apis.naver.com/now_web2/now_web_api/v1{path}'
  108. key = b'nbxvs5nwNG9QKEWK0ADjYA4JZoujF4gHcIwvoCxFTPAeamq5eemvt5IWAYXxrbYM'
  109. msgpad = int(time.time() * 1000)
  110. md = base64.b64encode(hmac.HMAC(
  111. key, f'{api_endpoint[:255]}{msgpad}'.encode(), digestmod=hashlib.sha1).digest()).decode()
  112. return self._download_json(api_endpoint, video_id=video_id, headers=self.geo_verification_headers(), query={
  113. 'msgpad': msgpad,
  114. 'md': md,
  115. })['result']
  116. class NaverIE(NaverBaseIE):
  117. _VALID_URL = r'https?://(?:m\.)?tv(?:cast)?\.naver\.com/(?:v|embed)/(?P<id>\d+)'
  118. _GEO_BYPASS = False
  119. _TESTS = [{
  120. 'url': 'http://tv.naver.com/v/81652',
  121. 'info_dict': {
  122. 'id': '81652',
  123. 'ext': 'mp4',
  124. 'title': '[9월 모의고사 해설강의][수학_김상희] 수학 A형 16~20번',
  125. 'description': '메가스터디 수학 김상희 선생님이 9월 모의고사 수학A형 16번에서 20번까지 해설강의를 공개합니다.',
  126. 'timestamp': 1378200754,
  127. 'upload_date': '20130903',
  128. 'uploader': '메가스터디, 합격불변의 법칙',
  129. 'uploader_id': 'megastudy',
  130. 'uploader_url': 'https://tv.naver.com/megastudy',
  131. 'view_count': int,
  132. 'like_count': int,
  133. 'comment_count': int,
  134. 'duration': 2118,
  135. 'thumbnail': r're:^https?://.*\.jpg',
  136. },
  137. }, {
  138. 'url': 'http://tv.naver.com/v/395837',
  139. 'md5': '7791205fa89dbed2f5e3eb16d287ff05',
  140. 'info_dict': {
  141. 'id': '395837',
  142. 'ext': 'mp4',
  143. 'title': '9년이 지나도 아픈 기억, 전효성의 아버지',
  144. 'description': 'md5:c76be23e21403a6473d8119678cdb5cb',
  145. 'timestamp': 1432030253,
  146. 'upload_date': '20150519',
  147. 'uploader': '4가지쇼',
  148. 'uploader_id': '4show',
  149. 'uploader_url': 'https://tv.naver.com/4show',
  150. 'view_count': int,
  151. 'like_count': int,
  152. 'comment_count': int,
  153. 'duration': 277,
  154. 'thumbnail': r're:^https?://.*\.jpg',
  155. },
  156. }, {
  157. 'url': 'http://tvcast.naver.com/v/81652',
  158. 'only_matching': True,
  159. }]
  160. def _real_extract(self, url):
  161. video_id = self._match_id(url)
  162. data = self._call_api(f'/clips/{video_id}/play-info', video_id)
  163. vid = traverse_obj(data, ('clip', 'videoId', {str}))
  164. in_key = traverse_obj(data, ('play', 'inKey', {str}))
  165. if not vid or not in_key:
  166. raise ExtractorError('Unable to extract video info')
  167. info = self._extract_video_info(video_id, vid, in_key)
  168. info.update(traverse_obj(data, ('clip', {
  169. 'title': 'title',
  170. 'description': 'description',
  171. 'timestamp': ('firstExposureDatetime', {parse_iso8601}),
  172. 'duration': ('playTime', {int_or_none}),
  173. 'like_count': ('likeItCount', {int_or_none}),
  174. 'view_count': ('playCount', {int_or_none}),
  175. 'comment_count': ('commentCount', {int_or_none}),
  176. 'thumbnail': ('thumbnailImageUrl', {url_or_none}),
  177. 'uploader': 'channelName',
  178. 'uploader_id': 'channelId',
  179. 'uploader_url': ('channelUrl', {url_or_none}),
  180. 'age_limit': ('adultVideo', {lambda x: 19 if x else None}),
  181. })))
  182. return info
  183. class NaverLiveIE(NaverBaseIE):
  184. IE_NAME = 'Naver:live'
  185. _VALID_URL = r'https?://(?:m\.)?tv(?:cast)?\.naver\.com/l/(?P<id>\d+)'
  186. _GEO_BYPASS = False
  187. _TESTS = [{
  188. 'url': 'https://tv.naver.com/l/127062',
  189. 'info_dict': {
  190. 'id': '127062',
  191. 'ext': 'mp4',
  192. 'live_status': 'is_live',
  193. 'channel': '뉴스는 YTN',
  194. 'channel_id': 'ytnnews24',
  195. 'title': 're:^대한민국 24시간 뉴스 채널 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
  196. 'description': 'md5:f938b5956711beab6f882314ffadf4d5',
  197. 'start_time': 1677752280,
  198. 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)',
  199. 'like_count': int,
  200. },
  201. }, {
  202. 'url': 'https://tv.naver.com/l/140535',
  203. 'info_dict': {
  204. 'id': '140535',
  205. 'ext': 'mp4',
  206. 'live_status': 'is_live',
  207. 'channel': 'KBS뉴스',
  208. 'channel_id': 'kbsnews',
  209. 'start_time': 1696867320,
  210. 'title': 're:^언제 어디서나! KBS 뉴스 24 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
  211. 'description': 'md5:6ad419c0bf2f332829bda3f79c295284',
  212. 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)',
  213. 'like_count': int,
  214. },
  215. }, {
  216. 'url': 'https://tv.naver.com/l/54887',
  217. 'only_matching': True,
  218. }]
  219. def _real_extract(self, url):
  220. video_id = self._match_id(url)
  221. data = self._call_api(f'/live-end/normal/{video_id}/play-info?renewLastPlayDate=true', video_id)
  222. status = traverse_obj(data, ('live', 'liveStatus'))
  223. if status == 'CLOSED':
  224. raise ExtractorError('Stream is offline.', expected=True)
  225. elif status != 'OPENED':
  226. raise ExtractorError(f'Unknown status {status!r}')
  227. return {
  228. 'id': video_id,
  229. 'formats': self._extract_m3u8_formats(
  230. traverse_obj(data, ('playbackBody', {json.loads}, 'media', 0, 'path')), video_id, live=True),
  231. **traverse_obj(data, ('live', {
  232. 'title': 'title',
  233. 'channel': 'channelName',
  234. 'channel_id': 'channelId',
  235. 'description': 'description',
  236. 'like_count': (('likeCount', 'likeItCount'), {int_or_none}),
  237. 'thumbnail': ('thumbnailImageUrl', {url_or_none}),
  238. 'start_time': (('startTime', 'startDateTime', 'startYmdt'), {parse_iso8601}),
  239. }), get_all=False),
  240. 'is_live': True,
  241. }
  242. class NaverNowIE(NaverBaseIE):
  243. IE_NAME = 'navernow'
  244. _VALID_URL = r'https?://now\.naver\.com/s/now\.(?P<id>\w+)'
  245. _API_URL = 'https://apis.naver.com/now_web/oldnow_web/v4'
  246. _TESTS = [{
  247. 'url': 'https://now.naver.com/s/now.4759?shareReplayId=26331132#replay=',
  248. 'md5': 'e05854162c21c221481de16b2944a0bc',
  249. 'info_dict': {
  250. 'id': '4759-26331132',
  251. 'title': '아이키X노제\r\n💖꽁냥꽁냥💖(1)',
  252. 'ext': 'mp4',
  253. 'thumbnail': r're:^https?://.*\.jpg',
  254. 'timestamp': 1650369600,
  255. 'upload_date': '20220419',
  256. 'uploader_id': 'now',
  257. 'view_count': int,
  258. 'uploader_url': 'https://now.naver.com/show/4759',
  259. 'uploader': '아이키의 떰즈업',
  260. },
  261. 'params': {
  262. 'noplaylist': True,
  263. },
  264. }, {
  265. 'url': 'https://now.naver.com/s/now.4759?shareHightlight=26601461#highlight=',
  266. 'md5': '9f6118e398aa0f22b2152f554ea7851b',
  267. 'info_dict': {
  268. 'id': '4759-26601461',
  269. 'title': '아이키: 나 리정한테 흔들렸어,,, 질투 폭발하는 노제 여보😾 [아이키의 떰즈업]ㅣ네이버 NOW.',
  270. 'ext': 'mp4',
  271. 'thumbnail': r're:^https?://.*\.jpg',
  272. 'upload_date': '20220504',
  273. 'timestamp': 1651648311,
  274. 'uploader_id': 'now',
  275. 'view_count': int,
  276. 'uploader_url': 'https://now.naver.com/show/4759',
  277. 'uploader': '아이키의 떰즈업',
  278. },
  279. 'params': {
  280. 'noplaylist': True,
  281. },
  282. }, {
  283. 'url': 'https://now.naver.com/s/now.4759',
  284. 'info_dict': {
  285. 'id': '4759',
  286. 'title': '아이키의 떰즈업',
  287. },
  288. 'playlist_mincount': 101,
  289. }, {
  290. 'url': 'https://now.naver.com/s/now.4759?shareReplayId=26331132#replay',
  291. 'info_dict': {
  292. 'id': '4759',
  293. 'title': '아이키의 떰즈업',
  294. },
  295. 'playlist_mincount': 101,
  296. }, {
  297. 'url': 'https://now.naver.com/s/now.4759?shareHightlight=26601461#highlight=',
  298. 'info_dict': {
  299. 'id': '4759',
  300. 'title': '아이키의 떰즈업',
  301. },
  302. 'playlist_mincount': 101,
  303. }, {
  304. 'url': 'https://now.naver.com/s/now.kihyunplay?shareReplayId=30573291#replay',
  305. 'only_matching': True,
  306. }]
  307. def _extract_replay(self, show_id, replay_id):
  308. vod_info = self._download_json(f'{self._API_URL}/shows/now.{show_id}/vod/{replay_id}', replay_id)
  309. in_key = self._download_json(f'{self._API_URL}/shows/now.{show_id}/vod/{replay_id}/inkey', replay_id)['inKey']
  310. return merge_dicts({
  311. 'id': f'{show_id}-{replay_id}',
  312. 'title': traverse_obj(vod_info, ('episode', 'title')),
  313. 'timestamp': unified_timestamp(traverse_obj(vod_info, ('episode', 'start_time'))),
  314. 'thumbnail': vod_info.get('thumbnail_image_url'),
  315. }, self._extract_video_info(replay_id, vod_info['video_id'], in_key))
  316. def _extract_show_replays(self, show_id):
  317. page_size = 15
  318. page = 1
  319. while True:
  320. show_vod_info = self._download_json(
  321. f'{self._API_URL}/vod-shows/now.{show_id}', show_id,
  322. query={'page': page, 'page_size': page_size},
  323. note=f'Downloading JSON vod list for show {show_id} - page {page}',
  324. )['response']['result']
  325. for v in show_vod_info.get('vod_list') or []:
  326. yield self._extract_replay(show_id, v['id'])
  327. if len(show_vod_info.get('vod_list') or []) < page_size:
  328. break
  329. page += 1
  330. def _extract_show_highlights(self, show_id, highlight_id=None):
  331. page_size = 10
  332. page = 1
  333. while True:
  334. highlights_videos = self._download_json(
  335. f'{self._API_URL}/shows/now.{show_id}/highlights/videos/', show_id,
  336. query={'page': page, 'page_size': page_size},
  337. note=f'Downloading JSON highlights for show {show_id} - page {page}')
  338. for highlight in highlights_videos.get('results') or []:
  339. if highlight_id and highlight.get('clip_no') != int(highlight_id):
  340. continue
  341. yield merge_dicts({
  342. 'id': f'{show_id}-{highlight["clip_no"]}',
  343. 'title': highlight.get('title'),
  344. 'timestamp': unified_timestamp(highlight.get('regdate')),
  345. 'thumbnail': highlight.get('thumbnail_url'),
  346. }, self._extract_video_info(highlight['clip_no'], highlight['video_id'], highlight['video_inkey']))
  347. if len(highlights_videos.get('results') or []) < page_size:
  348. break
  349. page += 1
  350. def _extract_highlight(self, show_id, highlight_id):
  351. try:
  352. return next(self._extract_show_highlights(show_id, highlight_id))
  353. except StopIteration:
  354. raise ExtractorError(f'Unable to find highlight {highlight_id} for show {show_id}')
  355. def _real_extract(self, url):
  356. show_id = self._match_id(url)
  357. qs = urllib.parse.parse_qs(urllib.parse.urlparse(url).query)
  358. if not self._yes_playlist(show_id, qs.get('shareHightlight')):
  359. return self._extract_highlight(show_id, qs['shareHightlight'][0])
  360. elif not self._yes_playlist(show_id, qs.get('shareReplayId')):
  361. return self._extract_replay(show_id, qs['shareReplayId'][0])
  362. show_info = self._download_json(
  363. f'{self._API_URL}/shows/now.{show_id}/', show_id,
  364. note=f'Downloading JSON vod list for show {show_id}')
  365. return self.playlist_result(
  366. itertools.chain(self._extract_show_replays(show_id), self._extract_show_highlights(show_id)),
  367. show_id, show_info.get('title'))