
[extractor] Add dev option `--load-pages`

pukkandan 2 years ago
Parent
Commit
f95b9dee45
4 changed files with 54 additions and 36 deletions
  1. README.md (+1, -0)
  2. yt_dlp/__init__.py (+1, -0)
  3. yt_dlp/extractor/common.py (+48, -36)
  4. yt_dlp/options.py (+4, -0)

README.md (+1, -0)

@@ -2022,6 +2022,7 @@ While these options still work, their use is not recommended since there are oth
 These options are not intended to be used by the end-user
 
     --test                           Download only part of video for testing extractors
+    --load-pages                     Load pages dumped by --write-pages
     --youtube-print-sig-code         For testing youtube signatures
     --allow-unplayable-formats       List unplayable formats also
     --no-allow-unplayable-formats    Default
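
The entry sits in the developer-only options block, and the intended workflow is a debugging round trip: run a URL once with `--write-pages` so every fetched page is dumped to a `<video_id>_<url>.dump` file in the current directory, then re-run the same URL with `--load-pages` so the `_download_...` helpers read those dumps back instead of going to the network (the matching filename scheme is sketched after the common.py hunks below).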

yt_dlp/__init__.py (+1, -0)

@@ -758,6 +758,7 @@ def parse_options(argv=None):
         'verbose': opts.verbose,
         'dump_intermediate_pages': opts.dump_intermediate_pages,
         'write_pages': opts.write_pages,
+        'load_pages': opts.load_pages,
         'test': opts.test,
         'keepvideo': opts.keepvideo,
         'min_filesize': opts.min_filesize,

yt_dlp/extractor/common.py (+48, -36)

@@ -75,7 +75,6 @@ from ..utils import (
     unified_strdate,
     unified_timestamp,
     update_Request,
-    update_url_query,
     url_basename,
     url_or_none,
     urljoin,
@@ -724,6 +723,11 @@ class InfoExtractor:
         else:
             return err.code in variadic(expected_status)
 
+    def _create_request(self, url_or_request, data=None, headers={}, query={}):
+        if not isinstance(url_or_request, compat_urllib_request.Request):
+            url_or_request = sanitized_Request(url_or_request)
+        return update_Request(url_or_request, data=data, headers=headers, query=query)
+
     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
         """
         Return the response handle.
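
The new `_create_request` helper folds the two previous code paths (plain URL string vs an already-built `Request`) into one place, so both `_request_webpage` and the new `--load-pages` logic build identical requests. A rough standalone sketch of the equivalent normalization, using only the standard library instead of yt-dlp's `sanitized_Request`/`update_Request` utilities, could look like this:

    import urllib.parse
    import urllib.request

    # Standalone approximation only; the real helper delegates to
    # yt_dlp.utils.sanitized_Request and update_Request.
    def create_request(url_or_request, data=None, headers={}, query={}):
        if not isinstance(url_or_request, urllib.request.Request):
            # A bare URL string gets wrapped into a Request first
            url_or_request = urllib.request.Request(url_or_request)
        if query:
            # Merge the extra query parameters into the existing URL
            parsed = urllib.parse.urlparse(url_or_request.full_url)
            params = urllib.parse.parse_qsl(parsed.query) + list(query.items())
            url_or_request.full_url = parsed._replace(
                query=urllib.parse.urlencode(params)).geturl()
        if data is not None:
            url_or_request.data = data
        for name, value in headers.items():
            url_or_request.add_header(name, value)
        return url_or_request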
@@ -755,16 +759,8 @@ class InfoExtractor:
             if 'X-Forwarded-For' not in headers:
                 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
 
-        if isinstance(url_or_request, compat_urllib_request.Request):
-            url_or_request = update_Request(
-                url_or_request, data=data, headers=headers, query=query)
-        else:
-            if query:
-                url_or_request = update_url_query(url_or_request, query)
-            if data is not None or headers:
-                url_or_request = sanitized_Request(url_or_request, data, headers)
         try:
-            return self._downloader.urlopen(url_or_request)
+            return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query))
         except network_exceptions as err:
             if isinstance(err, compat_urllib_error.HTTPError):
                 if self.__can_accept_status_code(err, expected_status):
@@ -876,40 +872,44 @@ class InfoExtractor:
                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
                 expected=True)
 
+    def _request_dump_filename(self, url, video_id):
+        basen = f'{video_id}_{url}'
+        trim_length = self.get_param('trim_file_name') or 240
+        if len(basen) > trim_length:
+            h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
+            basen = basen[:trim_length - len(h)] + h
+        filename = sanitize_filename(f'{basen}.dump', restricted=True)
+        # Working around MAX_PATH limitation on Windows (see
+        # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
+        if compat_os_name == 'nt':
+            absfilepath = os.path.abspath(filename)
+            if len(absfilepath) > 259:
+                filename = fR'\\?\{absfilepath}'
+        return filename
+
+    def __decode_webpage(self, webpage_bytes, encoding, headers):
+        if not encoding:
+            encoding = self._guess_encoding_from_content(headers.get('Content-Type', ''), webpage_bytes)
+        try:
+            return webpage_bytes.decode(encoding, 'replace')
+        except LookupError:
+            return webpage_bytes.decode('utf-8', 'replace')
+
     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
-        content_type = urlh.headers.get('Content-Type', '')
         webpage_bytes = urlh.read()
         if prefix is not None:
             webpage_bytes = prefix + webpage_bytes
-        if not encoding:
-            encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
         if self.get_param('dump_intermediate_pages', False):
             self.to_screen('Dumping request to ' + urlh.geturl())
             dump = base64.b64encode(webpage_bytes).decode('ascii')
             self._downloader.to_screen(dump)
-        if self.get_param('write_pages', False):
-            basen = f'{video_id}_{urlh.geturl()}'
-            trim_length = self.get_param('trim_file_name') or 240
-            if len(basen) > trim_length:
-                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
-                basen = basen[:trim_length - len(h)] + h
-            raw_filename = basen + '.dump'
-            filename = sanitize_filename(raw_filename, restricted=True)
-            self.to_screen('Saving request to ' + filename)
-            # Working around MAX_PATH limitation on Windows (see
-            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
-            if compat_os_name == 'nt':
-                absfilepath = os.path.abspath(filename)
-                if len(absfilepath) > 259:
-                    filename = '\\\\?\\' + absfilepath
+        if self.get_param('write_pages'):
+            filename = self._request_dump_filename(urlh.geturl(), video_id)
+            self.to_screen(f'Saving request to {filename}')
             with open(filename, 'wb') as outf:
                 outf.write(webpage_bytes)
 
-        try:
-            content = webpage_bytes.decode(encoding, 'replace')
-        except LookupError:
-            content = webpage_bytes.decode('utf-8', 'replace')
-
+        content = self.__decode_webpage(webpage_bytes, encoding, urlh.headers)
         self.__check_blocked(content)
 
         return content
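
Both `--write-pages` and `--load-pages` now derive the on-disk name through the factored-out `_request_dump_filename`, which is the point of the refactor: the reader can reconstruct exactly the name the writer produced. A self-contained approximation of the scheme, with a crude stand-in for `sanitize_filename` and without the Windows long-path branch, would be:

    import hashlib
    import re

    def sanitize_filename(name):
        # Crude stand-in for yt_dlp.utils.sanitize_filename(..., restricted=True)
        return re.sub(r'[^0-9A-Za-z._-]', '_', name)

    def request_dump_filename(url, video_id, trim_length=240):
        basen = f'{video_id}_{url}'
        if len(basen) > trim_length:
            # Keep long names unique by replacing the tail with an md5 of the full name
            h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
            basen = basen[:trim_length - len(h)] + h
        return sanitize_filename(f'{basen}.dump')

    print(request_dump_filename('https://example.com/watch?v=abc123', 'abc123'))
    # With this stand-in: abc123_https___example.com_watch_v_abc123.dump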
@@ -967,9 +967,21 @@ class InfoExtractor:
             content, urlh = res
             return parse(self, content, video_id, transform_source, fatal), urlh
 
-        def download_content(
-                self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None, *args, **kwargs):
-            args = [url_or_request, video_id, note, errnote, transform_source, *args]
+        def download_content(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None,
+                             fatal=True, encoding=None, data=None, headers={}, query={}, *args, **kwargs):
+            if self.get_param('load_pages'):
+                url_or_request = self._create_request(url_or_request, data, headers, query)
+                filename = self._request_dump_filename(url_or_request.full_url, video_id)
+                self.to_screen(f'Loading request from {filename}')
+                try:
+                    with open(filename, 'rb') as dumpf:
+                        webpage_bytes = dumpf.read()
+                except OSError as e:
+                    self.report_warning(f'Unable to load request from disk: {e}')
+                else:
+                    content = self.__decode_webpage(webpage_bytes, encoding, url_or_request.headers)
+                    return parse(self, content, video_id, transform_source, fatal)
+            args = [url_or_request, video_id, note, errnote, transform_source, fatal, encoding, data, headers, query, *args]
             if parser is None:
                 args.pop(4)  # transform_source
             # The method is fetched by name so subclasses can override _download_..._handle
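
With `--load-pages` set, `download_content` first rebuilds the request via `_create_request` (so `full_url` already carries any `query` parameters), maps it to the dump filename, and parses the file contents instead of fetching. A missing or unreadable dump only produces a warning, after which the call falls through to the normal network path, so partial dumps still work.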

yt_dlp/options.py (+4, -0)

@@ -1154,6 +1154,10 @@ def create_parser():
         '--write-pages',
         action='store_true', dest='write_pages', default=False,
         help='Write downloaded intermediary pages to files in the current directory to debug problems')
+    verbosity.add_option(
+        '--load-pages',
+        action='store_true', dest='load_pages', default=False,
+        help=optparse.SUPPRESS_HELP)
     verbosity.add_option(
         '--youtube-print-sig-code',
         action='store_true', dest='youtube_print_sig_code', default=False,
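
Because the help string is `optparse.SUPPRESS_HELP`, the new flag is hidden from `yt-dlp --help` output; it is documented only in the "not intended to be used by the end-user" section of the README touched above.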