# jstream.py
  1. import base64
  2. import json
  3. import re
  4. from .common import InfoExtractor
  5. from ..utils import (
  6. float_or_none,
  7. js_to_json,
  8. remove_start,
  9. )
  10. class JStreamIE(InfoExtractor):
  11. # group "id" only exists for compliance, not directly used in requests
  12. # also all components are mandatory
  13. _VALID_URL = r'jstream:(?P<host>www\d+):(?P<id>(?P<publisher>[a-z0-9]+):(?P<mid>\d+))'
  14. _TESTS = [{
  15. 'url': 'jstream:www50:eqd638pvwx:752',
  16. 'info_dict': {
  17. 'id': 'eqd638pvwx:752',
  18. 'ext': 'mp4',
  19. 'title': '阪神淡路大震災 激震の記録2020年版 解説動画',
  20. 'duration': 672,
  21. 'thumbnail': r're:https?://eqd638pvwx\.eq\.webcdn\.stream\.ne\.jp/.+\.jpg',
  22. },
  23. }]
  24. def _parse_jsonp(self, callback, string, video_id):
  25. return self._search_json(rf'\s*{re.escape(callback)}\s*\(', string, callback, video_id)
  26. def _find_formats(self, video_id, movie_list_hls, host, publisher, subtitles):
  27. for value in movie_list_hls:
  28. text = value.get('text') or ''
  29. if not text.startswith('auto'):
  30. continue
  31. m3u8_id = remove_start(remove_start(text, 'auto'), '_') or None
  32. fmts, subs = self._extract_m3u8_formats_and_subtitles(
  33. f'https://{publisher}.eq.webcdn.stream.ne.jp/{host}/{publisher}/jmc_pub/{value.get("url")}', video_id, 'mp4', m3u8_id=m3u8_id)
  34. self._merge_subtitles(subs, target=subtitles)
  35. yield from fmts
  36. def _real_extract(self, url):
  37. host, publisher, mid, video_id = self._match_valid_url(url).group('host', 'publisher', 'mid', 'id')
  38. video_info_jsonp = self._download_webpage(
  39. f'https://{publisher}.eq.webcdn.stream.ne.jp/{host}/{publisher}/jmc_pub/eq_meta/v1/{mid}.jsonp',
  40. video_id, 'Requesting video info')
  41. video_info = self._parse_jsonp('metaDataResult', video_info_jsonp, video_id)['movie']
  42. subtitles = {}
  43. formats = list(self._find_formats(video_id, video_info.get('movie_list_hls'), host, publisher, subtitles))
  44. self._remove_duplicate_formats(formats)
  45. return {
  46. 'id': video_id,
  47. 'title': video_info.get('title'),
  48. 'duration': float_or_none(video_info.get('duration')),
  49. 'thumbnail': video_info.get('thumbnail_url'),
  50. 'formats': formats,
  51. 'subtitles': subtitles,
  52. }
  53. @classmethod
  54. def _extract_embed_urls(cls, url, webpage):
  55. # check for eligiblity of webpage
  56. # https://support.eq.stream.co.jp/hc/ja/articles/115008388147-%E3%83%97%E3%83%AC%E3%82%A4%E3%83%A4%E3%83%BCAPI%E3%81%AE%E3%82%B5%E3%83%B3%E3%83%97%E3%83%AB%E3%82%B3%E3%83%BC%E3%83%89
  57. script_tag = re.search(r'<script\s*[^>]+?src="https://ssl-cache\.stream\.ne\.jp/(?P<host>www\d+)/(?P<publisher>[a-z0-9]+)/[^"]+?/if\.js"', webpage)
  58. if not script_tag:
  59. return
  60. host, publisher = script_tag.groups()
  61. for m in re.finditer(r'(?s)PlayerFactoryIF\.create\(\s*({[^\}]+?})\s*\)\s*;', webpage):
  62. # TODO: using json.loads here as InfoExtractor._parse_json is not classmethod
  63. info = json.loads(js_to_json(m.group(1)))
  64. mid = base64.b64decode(info.get('m')).decode()
  65. yield f'jstream:{host}:{publisher}:{mid}'