# hls.py 18 KB
  1. import binascii
  2. import io
  3. import re
  4. import urllib.parse
  5. from . import get_suitable_downloader
  6. from .external import FFmpegFD
  7. from .fragment import FragmentFD
  8. from .. import webvtt
  9. from ..dependencies import Cryptodome
  10. from ..utils import (
  11. bug_reports_message,
  12. parse_m3u8_attributes,
  13. remove_start,
  14. traverse_obj,
  15. update_url_query,
  16. urljoin,
  17. )
  18. class HlsFD(FragmentFD):
  19. """
  20. Download segments in a m3u8 manifest. External downloaders can take over
  21. the fragment downloads by supporting the 'm3u8_frag_urls' protocol and
  22. re-defining 'supports_manifest' function
  23. """
  24. FD_NAME = 'hlsnative'
  25. @staticmethod
  26. def _has_drm(manifest): # TODO: https://github.com/yt-dlp/yt-dlp/pull/5039
  27. return bool(re.search('|'.join((
  28. r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://', # Apple FairPlay
  29. r'#EXT-X-(?:SESSION-)?KEY:.*?KEYFORMAT="com\.apple\.streamingkeydelivery"', # Apple FairPlay
  30. r'#EXT-X-(?:SESSION-)?KEY:.*?KEYFORMAT="com\.microsoft\.playready"', # Microsoft PlayReady
  31. r'#EXT-X-FAXS-CM:', # Adobe Flash Access
  32. )), manifest))
  33. @classmethod
  34. def can_download(cls, manifest, info_dict, allow_unplayable_formats=False):
  35. UNSUPPORTED_FEATURES = [
  36. # r'#EXT-X-BYTERANGE', # playlists composed of byte ranges of media files [2]
  37. # Live streams heuristic does not always work (e.g. geo restricted to Germany
  38. # http://hls-geo.daserste.de/i/videoportal/Film/c_620000/622873/format,716451,716457,716450,716458,716459,.mp4.csmil/index_4_av.m3u8?null=0)
  39. # r'#EXT-X-MEDIA-SEQUENCE:(?!0$)', # live streams [3]
  40. # This heuristic also is not correct since segments may not be appended as well.
  41. # Twitch vods of finished streams have EXT-X-PLAYLIST-TYPE:EVENT despite
  42. # no segments will definitely be appended to the end of the playlist.
  43. # r'#EXT-X-PLAYLIST-TYPE:EVENT', # media segments may be appended to the end of
  44. # # event media playlists [4]
  45. # r'#EXT-X-MAP:', # media initialization [5]
  46. # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.2.4
  47. # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.2.2
  48. # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.2
  49. # 4. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.5
  50. # 5. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.2.5
  51. ]
  52. if not allow_unplayable_formats:
  53. UNSUPPORTED_FEATURES += [
  54. r'#EXT-X-KEY:METHOD=(?!NONE|AES-128)', # encrypted streams [1], but not necessarily DRM
  55. ]
  56. def check_results():
  57. yield not info_dict.get('is_live')
  58. for feature in UNSUPPORTED_FEATURES:
  59. yield not re.search(feature, manifest)
  60. if not allow_unplayable_formats:
  61. yield not cls._has_drm(manifest)
  62. return all(check_results())
  63. def real_download(self, filename, info_dict):
  64. man_url = info_dict['url']
  65. self.to_screen(f'[{self.FD_NAME}] Downloading m3u8 manifest')
  66. urlh = self.ydl.urlopen(self._prepare_url(info_dict, man_url))
  67. man_url = urlh.url
  68. s = urlh.read().decode('utf-8', 'ignore')
  69. can_download, message = self.can_download(s, info_dict, self.params.get('allow_unplayable_formats')), None
  70. if can_download:
  71. has_ffmpeg = FFmpegFD.available()
  72. no_crypto = not Cryptodome.AES and '#EXT-X-KEY:METHOD=AES-128' in s
  73. if no_crypto and has_ffmpeg:
  74. can_download, message = False, 'The stream has AES-128 encryption and pycryptodomex is not available'
  75. elif no_crypto:
  76. message = ('The stream has AES-128 encryption and neither ffmpeg nor pycryptodomex are available; '
  77. 'Decryption will be performed natively, but will be extremely slow')
  78. elif info_dict.get('extractor_key') == 'Generic' and re.search(r'(?m)#EXT-X-MEDIA-SEQUENCE:(?!0$)', s):
  79. install_ffmpeg = '' if has_ffmpeg else 'install ffmpeg and '
  80. message = ('Live HLS streams are not supported by the native downloader. If this is a livestream, '
  81. f'please {install_ffmpeg}add "--downloader ffmpeg --hls-use-mpegts" to your command')
  82. if not can_download:
  83. if self._has_drm(s) and not self.params.get('allow_unplayable_formats'):
  84. if info_dict.get('has_drm') and self.params.get('test'):
  85. self.to_screen(f'[{self.FD_NAME}] This format is DRM protected', skip_eol=True)
  86. else:
  87. self.report_error(
  88. 'This format is DRM protected; Try selecting another format with --format or '
  89. 'add --check-formats to automatically fallback to the next best format', tb=False)
  90. return False
  91. message = message or 'Unsupported features have been detected'
  92. fd = FFmpegFD(self.ydl, self.params)
  93. self.report_warning(f'{message}; extraction will be delegated to {fd.get_basename()}')
  94. return fd.real_download(filename, info_dict)
  95. elif message:
  96. self.report_warning(message)
  97. is_webvtt = info_dict['ext'] == 'vtt'
  98. if is_webvtt:
  99. real_downloader = None # Packing the fragments is not currently supported for external downloader
  100. else:
  101. real_downloader = get_suitable_downloader(
  102. info_dict, self.params, None, protocol='m3u8_frag_urls', to_stdout=(filename == '-'))
  103. if real_downloader and not real_downloader.supports_manifest(s):
  104. real_downloader = None
  105. if real_downloader:
  106. self.to_screen(f'[{self.FD_NAME}] Fragment downloads will be delegated to {real_downloader.get_basename()}')
  107. def is_ad_fragment_start(s):
  108. return ((s.startswith('#ANVATO-SEGMENT-INFO') and 'type=ad' in s)
  109. or (s.startswith('#UPLYNK-SEGMENT') and s.endswith(',ad')))
  110. def is_ad_fragment_end(s):
  111. return ((s.startswith('#ANVATO-SEGMENT-INFO') and 'type=master' in s)
  112. or (s.startswith('#UPLYNK-SEGMENT') and s.endswith(',segment')))
  113. fragments = []
  114. media_frags = 0
  115. ad_frags = 0
  116. ad_frag_next = False
  117. for line in s.splitlines():
  118. line = line.strip()
  119. if not line:
  120. continue
  121. if line.startswith('#'):
  122. if is_ad_fragment_start(line):
  123. ad_frag_next = True
  124. elif is_ad_fragment_end(line):
  125. ad_frag_next = False
  126. continue
  127. if ad_frag_next:
  128. ad_frags += 1
  129. continue
  130. media_frags += 1
  131. ctx = {
  132. 'filename': filename,
  133. 'total_frags': media_frags,
  134. 'ad_frags': ad_frags,
  135. }
  136. if real_downloader:
  137. self._prepare_external_frag_download(ctx)
  138. else:
  139. self._prepare_and_start_frag_download(ctx, info_dict)
  140. extra_state = ctx.setdefault('extra_state', {})
  141. format_index = info_dict.get('format_index')
  142. extra_segment_query = None
  143. if extra_param_to_segment_url := info_dict.get('extra_param_to_segment_url'):
  144. extra_segment_query = urllib.parse.parse_qs(extra_param_to_segment_url)
  145. extra_key_query = None
  146. if extra_param_to_key_url := info_dict.get('extra_param_to_key_url'):
  147. extra_key_query = urllib.parse.parse_qs(extra_param_to_key_url)
  148. i = 0
  149. media_sequence = 0
  150. decrypt_info = {'METHOD': 'NONE'}
  151. external_aes_key = traverse_obj(info_dict, ('hls_aes', 'key'))
  152. if external_aes_key:
  153. external_aes_key = binascii.unhexlify(remove_start(external_aes_key, '0x'))
  154. assert len(external_aes_key) in (16, 24, 32), 'Invalid length for HLS AES-128 key'
  155. external_aes_iv = traverse_obj(info_dict, ('hls_aes', 'iv'))
  156. if external_aes_iv:
  157. external_aes_iv = binascii.unhexlify(remove_start(external_aes_iv, '0x').zfill(32))
  158. byte_range = {}
  159. discontinuity_count = 0
  160. frag_index = 0
  161. ad_frag_next = False
  162. for line in s.splitlines():
  163. line = line.strip()
  164. if line:
  165. if not line.startswith('#'):
  166. if format_index and discontinuity_count != format_index:
  167. continue
  168. if ad_frag_next:
  169. continue
  170. frag_index += 1
  171. if frag_index <= ctx['fragment_index']:
  172. continue
  173. frag_url = urljoin(man_url, line)
  174. if extra_segment_query:
  175. frag_url = update_url_query(frag_url, extra_segment_query)
  176. fragments.append({
  177. 'frag_index': frag_index,
  178. 'url': frag_url,
  179. 'decrypt_info': decrypt_info,
  180. 'byte_range': byte_range,
  181. 'media_sequence': media_sequence,
  182. })
  183. media_sequence += 1
  184. elif line.startswith('#EXT-X-MAP'):
  185. if format_index and discontinuity_count != format_index:
  186. continue
  187. if frag_index > 0:
  188. self.report_error(
  189. 'Initialization fragment found after media fragments, unable to download')
  190. return False
  191. frag_index += 1
  192. map_info = parse_m3u8_attributes(line[11:])
  193. frag_url = urljoin(man_url, map_info.get('URI'))
  194. if extra_segment_query:
  195. frag_url = update_url_query(frag_url, extra_segment_query)
  196. if map_info.get('BYTERANGE'):
  197. splitted_byte_range = map_info.get('BYTERANGE').split('@')
  198. sub_range_start = int(splitted_byte_range[1]) if len(splitted_byte_range) == 2 else byte_range['end']
  199. byte_range = {
  200. 'start': sub_range_start,
  201. 'end': sub_range_start + int(splitted_byte_range[0]),
  202. }
  203. fragments.append({
  204. 'frag_index': frag_index,
  205. 'url': frag_url,
  206. 'decrypt_info': decrypt_info,
  207. 'byte_range': byte_range,
  208. 'media_sequence': media_sequence,
  209. })
  210. media_sequence += 1
  211. elif line.startswith('#EXT-X-KEY'):
  212. decrypt_url = decrypt_info.get('URI')
  213. decrypt_info = parse_m3u8_attributes(line[11:])
  214. if decrypt_info['METHOD'] == 'AES-128':
  215. if external_aes_iv:
  216. decrypt_info['IV'] = external_aes_iv
  217. elif 'IV' in decrypt_info:
  218. decrypt_info['IV'] = binascii.unhexlify(decrypt_info['IV'][2:].zfill(32))
  219. if external_aes_key:
  220. decrypt_info['KEY'] = external_aes_key
  221. else:
  222. decrypt_info['URI'] = urljoin(man_url, decrypt_info['URI'])
  223. if extra_key_query or extra_segment_query:
  224. # Fall back to extra_segment_query to key for backwards compat
  225. decrypt_info['URI'] = update_url_query(
  226. decrypt_info['URI'], extra_key_query or extra_segment_query)
  227. if decrypt_url != decrypt_info['URI']:
  228. decrypt_info['KEY'] = None
  229. elif line.startswith('#EXT-X-MEDIA-SEQUENCE'):
  230. media_sequence = int(line[22:])
  231. elif line.startswith('#EXT-X-BYTERANGE'):
  232. splitted_byte_range = line[17:].split('@')
  233. sub_range_start = int(splitted_byte_range[1]) if len(splitted_byte_range) == 2 else byte_range['end']
  234. byte_range = {
  235. 'start': sub_range_start,
  236. 'end': sub_range_start + int(splitted_byte_range[0]),
  237. }
  238. elif is_ad_fragment_start(line):
  239. ad_frag_next = True
  240. elif is_ad_fragment_end(line):
  241. ad_frag_next = False
  242. elif line.startswith('#EXT-X-DISCONTINUITY'):
  243. discontinuity_count += 1
  244. i += 1
  245. # We only download the first fragment during the test
  246. if self.params.get('test', False):
  247. fragments = [fragments[0] if fragments else None]
  248. if real_downloader:
  249. info_dict['fragments'] = fragments
  250. fd = real_downloader(self.ydl, self.params)
  251. # TODO: Make progress updates work without hooking twice
  252. # for ph in self._progress_hooks:
  253. # fd.add_progress_hook(ph)
  254. return fd.real_download(filename, info_dict)
  255. if is_webvtt:
  256. def pack_fragment(frag_content, frag_index):
  257. output = io.StringIO()
  258. adjust = 0
  259. overflow = False
  260. mpegts_last = None
  261. for block in webvtt.parse_fragment(frag_content):
  262. if isinstance(block, webvtt.CueBlock):
  263. extra_state['webvtt_mpegts_last'] = mpegts_last
  264. if overflow:
  265. extra_state['webvtt_mpegts_adjust'] += 1
  266. overflow = False
  267. block.start += adjust
  268. block.end += adjust
  269. dedup_window = extra_state.setdefault('webvtt_dedup_window', [])
  270. ready = []
  271. i = 0
  272. is_new = True
  273. while i < len(dedup_window):
  274. wcue = dedup_window[i]
  275. wblock = webvtt.CueBlock.from_json(wcue)
  276. i += 1
  277. if wblock.hinges(block):
  278. wcue['end'] = block.end
  279. is_new = False
  280. continue
  281. if wblock == block:
  282. is_new = False
  283. continue
  284. if wblock.end > block.start:
  285. continue
  286. ready.append(wblock)
  287. i -= 1
  288. del dedup_window[i]
  289. if is_new:
  290. dedup_window.append(block.as_json)
  291. for block in ready:
  292. block.write_into(output)
  293. # we only emit cues once they fall out of the duplicate window
  294. continue
  295. elif isinstance(block, webvtt.Magic):
  296. # take care of MPEG PES timestamp overflow
  297. if block.mpegts is None:
  298. block.mpegts = 0
  299. extra_state.setdefault('webvtt_mpegts_adjust', 0)
  300. block.mpegts += extra_state['webvtt_mpegts_adjust'] << 33
  301. if block.mpegts < extra_state.get('webvtt_mpegts_last', 0):
  302. overflow = True
  303. block.mpegts += 1 << 33
  304. mpegts_last = block.mpegts
  305. if frag_index == 1:
  306. extra_state['webvtt_mpegts'] = block.mpegts or 0
  307. extra_state['webvtt_local'] = block.local or 0
  308. # XXX: block.local = block.mpegts = None ?
  309. else:
  310. if block.mpegts is not None and block.local is not None:
  311. adjust = (
  312. (block.mpegts - extra_state.get('webvtt_mpegts', 0))
  313. - (block.local - extra_state.get('webvtt_local', 0))
  314. )
  315. continue
  316. elif isinstance(block, webvtt.HeaderBlock):
  317. if frag_index != 1:
  318. # XXX: this should probably be silent as well
  319. # or verify that all segments contain the same data
  320. self.report_warning(bug_reports_message(
  321. f'Discarding a {type(block).__name__} block found in the middle of the stream; '
  322. 'if the subtitles display incorrectly,'))
  323. continue
  324. block.write_into(output)
  325. return output.getvalue().encode()
  326. def fin_fragments():
  327. dedup_window = extra_state.get('webvtt_dedup_window')
  328. if not dedup_window:
  329. return b''
  330. output = io.StringIO()
  331. for cue in dedup_window:
  332. webvtt.CueBlock.from_json(cue).write_into(output)
  333. return output.getvalue().encode()
  334. if len(fragments) == 1:
  335. self.download_and_append_fragments(ctx, fragments, info_dict)
  336. else:
  337. self.download_and_append_fragments(
  338. ctx, fragments, info_dict, pack_func=pack_fragment, finish_func=fin_fragments)
  339. else:
  340. return self.download_and_append_fragments(ctx, fragments, info_dict)