f4m.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427
  1. import base64
  2. import io
  3. import itertools
  4. import struct
  5. import time
  6. import urllib.error
  7. import urllib.parse
  8. from .fragment import FragmentFD
  9. from ..compat import compat_etree_fromstring
  10. from ..utils import fix_xml_ampersands, xpath_text
  11. class DataTruncatedError(Exception):
  12. pass
  13. class FlvReader(io.BytesIO):
  14. """
  15. Reader for Flv files
  16. The file format is documented in https://www.adobe.com/devnet/f4v.html
  17. """
  18. def read_bytes(self, n):
  19. data = self.read(n)
  20. if len(data) < n:
  21. raise DataTruncatedError(
  22. 'FlvReader error: need %d bytes while only %d bytes got' % (
  23. n, len(data)))
  24. return data
  25. # Utility functions for reading numbers and strings
  26. def read_unsigned_long_long(self):
  27. return struct.unpack('!Q', self.read_bytes(8))[0]
  28. def read_unsigned_int(self):
  29. return struct.unpack('!I', self.read_bytes(4))[0]
  30. def read_unsigned_char(self):
  31. return struct.unpack('!B', self.read_bytes(1))[0]
  32. def read_string(self):
  33. res = b''
  34. while True:
  35. char = self.read_bytes(1)
  36. if char == b'\x00':
  37. break
  38. res += char
  39. return res
  40. def read_box_info(self):
  41. """
  42. Read a box and return the info as a tuple: (box_size, box_type, box_data)
  43. """
  44. real_size = size = self.read_unsigned_int()
  45. box_type = self.read_bytes(4)
  46. header_end = 8
  47. if size == 1:
  48. real_size = self.read_unsigned_long_long()
  49. header_end = 16
  50. return real_size, box_type, self.read_bytes(real_size - header_end)
  51. def read_asrt(self):
  52. # version
  53. self.read_unsigned_char()
  54. # flags
  55. self.read_bytes(3)
  56. quality_entry_count = self.read_unsigned_char()
  57. # QualityEntryCount
  58. for i in range(quality_entry_count):
  59. self.read_string()
  60. segment_run_count = self.read_unsigned_int()
  61. segments = []
  62. for i in range(segment_run_count):
  63. first_segment = self.read_unsigned_int()
  64. fragments_per_segment = self.read_unsigned_int()
  65. segments.append((first_segment, fragments_per_segment))
  66. return {
  67. 'segment_run': segments,
  68. }
  69. def read_afrt(self):
  70. # version
  71. self.read_unsigned_char()
  72. # flags
  73. self.read_bytes(3)
  74. # time scale
  75. self.read_unsigned_int()
  76. quality_entry_count = self.read_unsigned_char()
  77. # QualitySegmentUrlModifiers
  78. for i in range(quality_entry_count):
  79. self.read_string()
  80. fragments_count = self.read_unsigned_int()
  81. fragments = []
  82. for i in range(fragments_count):
  83. first = self.read_unsigned_int()
  84. first_ts = self.read_unsigned_long_long()
  85. duration = self.read_unsigned_int()
  86. if duration == 0:
  87. discontinuity_indicator = self.read_unsigned_char()
  88. else:
  89. discontinuity_indicator = None
  90. fragments.append({
  91. 'first': first,
  92. 'ts': first_ts,
  93. 'duration': duration,
  94. 'discontinuity_indicator': discontinuity_indicator,
  95. })
  96. return {
  97. 'fragments': fragments,
  98. }
  99. def read_abst(self):
  100. # version
  101. self.read_unsigned_char()
  102. # flags
  103. self.read_bytes(3)
  104. self.read_unsigned_int() # BootstrapinfoVersion
  105. # Profile,Live,Update,Reserved
  106. flags = self.read_unsigned_char()
  107. live = flags & 0x20 != 0
  108. # time scale
  109. self.read_unsigned_int()
  110. # CurrentMediaTime
  111. self.read_unsigned_long_long()
  112. # SmpteTimeCodeOffset
  113. self.read_unsigned_long_long()
  114. self.read_string() # MovieIdentifier
  115. server_count = self.read_unsigned_char()
  116. # ServerEntryTable
  117. for i in range(server_count):
  118. self.read_string()
  119. quality_count = self.read_unsigned_char()
  120. # QualityEntryTable
  121. for i in range(quality_count):
  122. self.read_string()
  123. # DrmData
  124. self.read_string()
  125. # MetaData
  126. self.read_string()
  127. segments_count = self.read_unsigned_char()
  128. segments = []
  129. for i in range(segments_count):
  130. box_size, box_type, box_data = self.read_box_info()
  131. assert box_type == b'asrt'
  132. segment = FlvReader(box_data).read_asrt()
  133. segments.append(segment)
  134. fragments_run_count = self.read_unsigned_char()
  135. fragments = []
  136. for i in range(fragments_run_count):
  137. box_size, box_type, box_data = self.read_box_info()
  138. assert box_type == b'afrt'
  139. fragments.append(FlvReader(box_data).read_afrt())
  140. return {
  141. 'segments': segments,
  142. 'fragments': fragments,
  143. 'live': live,
  144. }
  145. def read_bootstrap_info(self):
  146. total_size, box_type, box_data = self.read_box_info()
  147. assert box_type == b'abst'
  148. return FlvReader(box_data).read_abst()
  149. def read_bootstrap_info(bootstrap_bytes):
  150. return FlvReader(bootstrap_bytes).read_bootstrap_info()
  151. def build_fragments_list(boot_info):
  152. """ Return a list of (segment, fragment) for each fragment in the video """
  153. res = []
  154. segment_run_table = boot_info['segments'][0]
  155. fragment_run_entry_table = boot_info['fragments'][0]['fragments']
  156. first_frag_number = fragment_run_entry_table[0]['first']
  157. fragments_counter = itertools.count(first_frag_number)
  158. for segment, fragments_count in segment_run_table['segment_run']:
  159. # In some live HDS streams (e.g. Rai), `fragments_count` is
  160. # abnormal and causing out-of-memory errors. It's OK to change the
  161. # number of fragments for live streams as they are updated periodically
  162. if fragments_count == 4294967295 and boot_info['live']:
  163. fragments_count = 2
  164. for _ in range(fragments_count):
  165. res.append((segment, next(fragments_counter)))
  166. if boot_info['live']:
  167. res = res[-2:]
  168. return res
  169. def write_unsigned_int(stream, val):
  170. stream.write(struct.pack('!I', val))
  171. def write_unsigned_int_24(stream, val):
  172. stream.write(struct.pack('!I', val)[1:])
  173. def write_flv_header(stream):
  174. """Writes the FLV header to stream"""
  175. # FLV header
  176. stream.write(b'FLV\x01')
  177. stream.write(b'\x05')
  178. stream.write(b'\x00\x00\x00\x09')
  179. stream.write(b'\x00\x00\x00\x00')
  180. def write_metadata_tag(stream, metadata):
  181. """Writes optional metadata tag to stream"""
  182. SCRIPT_TAG = b'\x12'
  183. FLV_TAG_HEADER_LEN = 11
  184. if metadata:
  185. stream.write(SCRIPT_TAG)
  186. write_unsigned_int_24(stream, len(metadata))
  187. stream.write(b'\x00\x00\x00\x00\x00\x00\x00')
  188. stream.write(metadata)
  189. write_unsigned_int(stream, FLV_TAG_HEADER_LEN + len(metadata))
  190. def remove_encrypted_media(media):
  191. return list(filter(lambda e: 'drmAdditionalHeaderId' not in e.attrib
  192. and 'drmAdditionalHeaderSetId' not in e.attrib,
  193. media))
  194. def _add_ns(prop, ver=1):
  195. return '{http://ns.adobe.com/f4m/%d.0}%s' % (ver, prop)
  196. def get_base_url(manifest):
  197. base_url = xpath_text(
  198. manifest, [_add_ns('baseURL'), _add_ns('baseURL', 2)],
  199. 'base URL', default=None)
  200. if base_url:
  201. base_url = base_url.strip()
  202. return base_url
class F4mFD(FragmentFD):
    """
    A downloader for f4m manifests or AdobeHDS.
    """

    def _get_unencrypted_media(self, doc):
        """Return the media elements of manifest `doc` that can be downloaded.

        An error is reported when the manifest has no media element, when a
        DRM header element lacks an `id` attribute, or when no media is left
        after filtering. Unless the `allow_unplayable_formats` param is set,
        media referencing a DRM header are removed from the result.
        """
        # NOTE(review): the code keeps running after each report_error call
        # if that call returns -- confirm whether report_error raises.
        media = doc.findall(_add_ns('media'))
        if not media:
            self.report_error('No media found')
        if not self.params.get('allow_unplayable_formats'):
            for e in (doc.findall(_add_ns('drmAdditionalHeader'))
                      + doc.findall(_add_ns('drmAdditionalHeaderSet'))):
                # If id attribute is missing it's valid for all media nodes
                # without drmAdditionalHeaderId or drmAdditionalHeaderSetId attribute
                if 'id' not in e.attrib:
                    self.report_error('Missing ID in f4m DRM')
            media = remove_encrypted_media(media)
        if not media:
            self.report_error('Unsupported DRM')
        return media

    def _get_bootstrap_from_url(self, bootstrap_url):
        """Fetch `bootstrap_url` and return its body parsed by read_bootstrap_info."""
        bootstrap = self.ydl.urlopen(bootstrap_url).read()
        return read_bootstrap_info(bootstrap)

    def _update_live_fragments(self, bootstrap_url, latest_fragment):
        """Re-fetch the bootstrap until it lists fragments after `latest_fragment`.

        Makes up to 30 attempts, sleeping 5 seconds after each attempt that
        yields nothing new. Reports an error if every attempt comes back
        empty. Returns the (possibly empty) list of (segment, fragment)
        tuples whose fragment number is greater than `latest_fragment`.
        """
        fragments_list = []
        retries = 30
        while (not fragments_list) and (retries > 0):
            boot_info = self._get_bootstrap_from_url(bootstrap_url)
            fragments_list = build_fragments_list(boot_info)
            # Keep only fragments newer than the last one already handled
            fragments_list = [f for f in fragments_list if f[1] > latest_fragment]
            if not fragments_list:
                # Retry after a while
                time.sleep(5.0)
                retries -= 1

        if not fragments_list:
            self.report_error('Failed to update fragments')

        return fragments_list

    def _parse_bootstrap_node(self, node, base_url):
        """Return (boot_info, bootstrap_url) for the bootstrap element `node`.

        When the node has a `url` attribute it is resolved against
        `base_url` and fetched; otherwise the node text is base64-decoded
        and parsed, and the returned bootstrap_url is None.
        """
        # Sometimes non empty inline bootstrap info can be specified along
        # with bootstrap url attribute (e.g. dummy inline bootstrap info
        # contains whitespace characters in [1]). We will prefer bootstrap
        # url over inline bootstrap info when present.
        # 1. http://live-1-1.rutube.ru/stream/1024/HDS/SD/C2NKsS85HQNckgn5HdEmOQ/1454167650/S-s604419906/move/four/dirs/upper/1024-576p.f4m
        bootstrap_url = node.get('url')
        if bootstrap_url:
            bootstrap_url = urllib.parse.urljoin(
                base_url, bootstrap_url)
            boot_info = self._get_bootstrap_from_url(bootstrap_url)
        else:
            bootstrap_url = None
            bootstrap = base64.b64decode(node.text)
            boot_info = read_bootstrap_info(bootstrap)
        return boot_info, bootstrap_url

    def real_download(self, filename, info_dict):
        """Download the stream described by the f4m manifest at info_dict['url'].

        Fetches and parses the manifest, picks one media entry, then
        downloads its fragments one by one and appends the payload of each
        fragment's 'mdat' box to the output. For live streams the fragment
        list is refreshed from the bootstrap URL whenever it runs out.

        Returns False as soon as a fragment download reports failure,
        otherwise the result of self._finish_frag_download.
        """
        man_url = info_dict['url']
        requested_bitrate = info_dict.get('tbr')
        self.to_screen('[%s] Downloading f4m manifest' % self.FD_NAME)

        urlh = self.ydl.urlopen(self._prepare_url(info_dict, man_url))
        # Use the URL reported by the response from here on
        man_url = urlh.geturl()
        # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
        # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244
        # and https://github.com/ytdl-org/youtube-dl/issues/7823)
        manifest = fix_xml_ampersands(urlh.read().decode('utf-8', 'ignore')).strip()

        doc = compat_etree_fromstring(manifest)
        # Pair every usable media element with its bitrate (-1 when absent)
        formats = [(int(f.attrib.get('bitrate', -1)), f)
                   for f in self._get_unencrypted_media(doc)]
        if requested_bitrate is None or len(formats) == 1:
            # get the best format
            formats = sorted(formats, key=lambda f: f[0])
            rate, media = formats[-1]
        else:
            # NOTE(review): indexing [0] raises IndexError when no media has
            # exactly the requested bitrate.
            rate, media = list(filter(
                lambda f: int(f[0]) == requested_bitrate, formats))[0]

        # Prefer baseURL for relative URLs as per 11.2 of F4M 3.0 spec.
        man_base_url = get_base_url(doc) or man_url

        base_url = urllib.parse.urljoin(man_base_url, media.attrib['url'])
        # NOTE(review): doc.find returns None when the manifest has no
        # bootstrapInfo element, which would fail inside _parse_bootstrap_node.
        bootstrap_node = doc.find(_add_ns('bootstrapInfo'))
        boot_info, bootstrap_url = self._parse_bootstrap_node(
            bootstrap_node, man_base_url)
        live = boot_info['live']
        metadata_node = media.find(_add_ns('metadata'))
        if metadata_node is not None:
            metadata = base64.b64decode(metadata_node.text)
        else:
            metadata = None

        fragments_list = build_fragments_list(boot_info)
        test = self.params.get('test', False)
        if test:
            # We only download the first fragment
            fragments_list = fragments_list[:1]
        total_frags = len(fragments_list)
        # For some akamai manifests we'll need to add a query to the fragment url
        akamai_pv = xpath_text(doc, _add_ns('pv-2.0'))

        ctx = {
            'filename': filename,
            'total_frags': total_frags,
            'live': bool(live),
        }

        # NOTE(review): 'dest_stream', 'complete_frags_downloaded_bytes' and
        # 'fragment_index' are read from ctx below but never set in this
        # method; presumably _prepare_frag_download fills them in -- confirm.
        self._prepare_frag_download(ctx)

        dest_stream = ctx['dest_stream']

        # The FLV header (plus the metadata tag for non-live streams) is only
        # written when no fragment bytes have been downloaded yet.
        if ctx['complete_frags_downloaded_bytes'] == 0:
            write_flv_header(dest_stream)
            if not live:
                write_metadata_tag(dest_stream, metadata)

        base_url_parsed = urllib.parse.urlparse(base_url)

        self._start_frag_download(ctx, info_dict)

        frag_index = 0
        while fragments_list:
            seg_i, frag_i = fragments_list.pop(0)
            frag_index += 1
            # Skip fragments at or below the index already recorded in ctx
            if frag_index <= ctx['fragment_index']:
                continue
            name = 'Seg%d-Frag%d' % (seg_i, frag_i)
            # Build the fragment query from the base URL query, the akamai
            # pv-2.0 value and any extra parameter given in info_dict
            query = []
            if base_url_parsed.query:
                query.append(base_url_parsed.query)
            if akamai_pv:
                query.append(akamai_pv.strip(';'))
            if info_dict.get('extra_param_to_segment_url'):
                query.append(info_dict['extra_param_to_segment_url'])
            url_parsed = base_url_parsed._replace(path=base_url_parsed.path + name, query='&'.join(query))
            try:
                success = self._download_fragment(ctx, url_parsed.geturl(), info_dict)
                if not success:
                    return False
                down_data = self._read_fragment(ctx)
                reader = FlvReader(down_data)
                # Walk the boxes of the fragment until the 'mdat' box is found
                while True:
                    try:
                        _, box_type, box_data = reader.read_box_info()
                    except DataTruncatedError:
                        if test:
                            # In tests, segments may be truncated, and thus
                            # FlvReader may not be able to parse the whole
                            # chunk. If so, write the segment as is
                            # See https://github.com/ytdl-org/youtube-dl/issues/9214
                            dest_stream.write(down_data)
                            break
                        raise
                    if box_type == b'mdat':
                        self._append_fragment(ctx, box_data)
                        break
            except urllib.error.HTTPError as err:
                if live and (err.code == 404 or err.code == 410):
                    # We didn't keep up with the live window. Continue
                    # with the next available fragment.
                    msg = 'Fragment %d unavailable' % frag_i
                    self.report_warning(msg)
                    # Emptying the list triggers the live refresh below
                    fragments_list = []
                else:
                    raise

            # For live streams with a bootstrap URL, refill the list once it
            # is exhausted (never in test mode)
            if not fragments_list and not test and live and bootstrap_url:
                fragments_list = self._update_live_fragments(bootstrap_url, frag_i)
                total_frags += len(fragments_list)
                # Warn when the refreshed list does not continue right after
                # the last handled fragment
                if fragments_list and (fragments_list[0][1] > frag_i + 1):
                    msg = 'Missed %d fragments' % (fragments_list[0][1] - (frag_i + 1))
                    self.report_warning(msg)

        return self._finish_frag_download(ctx, info_dict)