# mhtml.py
  1. import io
  2. import quopri
  3. import re
  4. import uuid
  5. from .fragment import FragmentFD
  6. from ..utils import escapeHTML, formatSeconds, srt_subtitles_timecode, urljoin
  7. from ..version import __version__ as YT_DLP_VERSION
  8. class MhtmlFD(FragmentFD):
  9. _STYLESHEET = """\
  10. html, body {
  11. margin: 0;
  12. padding: 0;
  13. height: 100vh;
  14. }
  15. html {
  16. overflow-y: scroll;
  17. scroll-snap-type: y mandatory;
  18. }
  19. body {
  20. scroll-snap-type: y mandatory;
  21. display: flex;
  22. flex-flow: column;
  23. }
  24. body > figure {
  25. max-width: 100vw;
  26. max-height: 100vh;
  27. scroll-snap-align: center;
  28. }
  29. body > figure > figcaption {
  30. text-align: center;
  31. height: 2.5em;
  32. }
  33. body > figure > img {
  34. display: block;
  35. margin: auto;
  36. max-width: 100%;
  37. max-height: calc(100vh - 5em);
  38. }
  39. """
  40. _STYLESHEET = re.sub(r'\s+', ' ', _STYLESHEET)
  41. _STYLESHEET = re.sub(r'\B \B|(?<=[\w\-]) (?=[^\w\-])|(?<=[^\w\-]) (?=[\w\-])', '', _STYLESHEET)
  42. @staticmethod
  43. def _escape_mime(s):
  44. return '=?utf-8?Q?' + (b''.join(
  45. bytes((b,)) if b >= 0x20 else b'=%02X' % b
  46. for b in quopri.encodestring(s.encode(), header=True)
  47. )).decode('us-ascii') + '?='
  48. def _gen_cid(self, i, fragment, frag_boundary):
  49. return '%u.%s@yt-dlp.github.io.invalid' % (i, frag_boundary)
  50. def _gen_stub(self, *, fragments, frag_boundary, title):
  51. output = io.StringIO()
  52. output.write((
  53. '<!DOCTYPE html>'
  54. '<html>'
  55. '<head>'
  56. '' '<meta name="generator" content="yt-dlp {version}">'
  57. '' '<title>{title}</title>'
  58. '' '<style>{styles}</style>'
  59. '<body>'
  60. ).format(
  61. version=escapeHTML(YT_DLP_VERSION),
  62. styles=self._STYLESHEET,
  63. title=escapeHTML(title)
  64. ))
  65. t0 = 0
  66. for i, frag in enumerate(fragments):
  67. output.write('<figure>')
  68. try:
  69. t1 = t0 + frag['duration']
  70. output.write((
  71. '<figcaption>Slide #{num}: {t0} – {t1} (duration: {duration})</figcaption>'
  72. ).format(
  73. num=i + 1,
  74. t0=srt_subtitles_timecode(t0),
  75. t1=srt_subtitles_timecode(t1),
  76. duration=formatSeconds(frag['duration'], msec=True)
  77. ))
  78. except (KeyError, ValueError, TypeError):
  79. t1 = None
  80. output.write((
  81. '<figcaption>Slide #{num}</figcaption>'
  82. ).format(num=i + 1))
  83. output.write('<img src="cid:{cid}">'.format(
  84. cid=self._gen_cid(i, frag, frag_boundary)))
  85. output.write('</figure>')
  86. t0 = t1
  87. return output.getvalue()
  88. def real_download(self, filename, info_dict):
  89. fragment_base_url = info_dict.get('fragment_base_url')
  90. fragments = info_dict['fragments'][:1] if self.params.get(
  91. 'test', False) else info_dict['fragments']
  92. title = info_dict.get('title', info_dict['format_id'])
  93. origin = info_dict.get('webpage_url', info_dict['url'])
  94. ctx = {
  95. 'filename': filename,
  96. 'total_frags': len(fragments),
  97. }
  98. self._prepare_and_start_frag_download(ctx, info_dict)
  99. extra_state = ctx.setdefault('extra_state', {
  100. 'header_written': False,
  101. 'mime_boundary': str(uuid.uuid4()).replace('-', ''),
  102. })
  103. frag_boundary = extra_state['mime_boundary']
  104. if not extra_state['header_written']:
  105. stub = self._gen_stub(
  106. fragments=fragments,
  107. frag_boundary=frag_boundary,
  108. title=title
  109. )
  110. ctx['dest_stream'].write((
  111. 'MIME-Version: 1.0\r\n'
  112. 'From: <nowhere@yt-dlp.github.io.invalid>\r\n'
  113. 'To: <nowhere@yt-dlp.github.io.invalid>\r\n'
  114. 'Subject: {title}\r\n'
  115. 'Content-type: multipart/related; '
  116. '' 'boundary="{boundary}"; '
  117. '' 'type="text/html"\r\n'
  118. 'X.yt-dlp.Origin: {origin}\r\n'
  119. '\r\n'
  120. '--{boundary}\r\n'
  121. 'Content-Type: text/html; charset=utf-8\r\n'
  122. 'Content-Length: {length}\r\n'
  123. '\r\n'
  124. '{stub}\r\n'
  125. ).format(
  126. origin=origin,
  127. boundary=frag_boundary,
  128. length=len(stub),
  129. title=self._escape_mime(title),
  130. stub=stub
  131. ).encode())
  132. extra_state['header_written'] = True
  133. for i, fragment in enumerate(fragments):
  134. if (i + 1) <= ctx['fragment_index']:
  135. continue
  136. fragment_url = fragment.get('url')
  137. if not fragment_url:
  138. assert fragment_base_url
  139. fragment_url = urljoin(fragment_base_url, fragment['path'])
  140. success = self._download_fragment(ctx, fragment_url, info_dict)
  141. if not success:
  142. continue
  143. frag_content = self._read_fragment(ctx)
  144. mime_type = b'image/jpeg'
  145. if frag_content.startswith(b'\x89PNG\r\n\x1a\n'):
  146. mime_type = b'image/png'
  147. if frag_content.startswith((b'GIF87a', b'GIF89a')):
  148. mime_type = b'image/gif'
  149. if frag_content.startswith(b'RIFF') and frag_content[8:12] == b'WEBP':
  150. mime_type = b'image/webp'
  151. frag_header = io.BytesIO()
  152. frag_header.write(
  153. b'--%b\r\n' % frag_boundary.encode('us-ascii'))
  154. frag_header.write(
  155. b'Content-ID: <%b>\r\n' % self._gen_cid(i, fragment, frag_boundary).encode('us-ascii'))
  156. frag_header.write(
  157. b'Content-type: %b\r\n' % mime_type)
  158. frag_header.write(
  159. b'Content-length: %u\r\n' % len(frag_content))
  160. frag_header.write(
  161. b'Content-location: %b\r\n' % fragment_url.encode('us-ascii'))
  162. frag_header.write(
  163. b'X.yt-dlp.Duration: %f\r\n' % fragment['duration'])
  164. frag_header.write(b'\r\n')
  165. self._append_fragment(
  166. ctx, frag_header.getvalue() + frag_content + b'\r\n')
  167. ctx['dest_stream'].write(
  168. b'--%b--\r\n\r\n' % frag_boundary.encode('us-ascii'))
  169. self._finish_frag_download(ctx, info_dict)
  170. return True