1 year ago · 5f009a094f
--- a/yt_dlp/extractor/_extractors.py
+++ b/yt_dlp/extractor/_extractors.py
@@ -134,8 +134,8 @@ from .arcpublishing import ArcPublishingIE
 
				 from .arkena import ArkenaIE
			
 
				 from .ard import (
			
 
				     ARDBetaMediathekIE,
			
 
				+    ARDMediathekCollectionIE,
			
 
				     ARDIE,
			
 
				-    ARDMediathekIE,
			
 
				 )
			
 
				 from .arte import (
			
 
				     ArteTVIE,
			
--- a/yt_dlp/extractor/ard.py
+++ b/yt_dlp/extractor/ard.py
@@ -1,24 +1,23 @@
 
				-import json
			
 
				 import re
			
 
				+from functools import partial
			
 
				 
			
 
				 from .common import InfoExtractor
			
 
				-from .generic import GenericIE
			
 
				 from ..utils import (
			
 
				+    OnDemandPagedList,
			
 
				     determine_ext,
			
 
				-    ExtractorError,
			
 
				     int_or_none,
			
 
				+    join_nonempty,
			
 
				+    make_archive_id,
			
 
				     parse_duration,
			
 
				-    qualities,
			
 
				+    parse_iso8601,
			
 
				+    remove_start,
			
 
				     str_or_none,
			
 
				-    try_get,
			
 
				     unified_strdate,
			
 
				-    unified_timestamp,
			
 
				-    update_url,
			
 
				     update_url_query,
			
 
				     url_or_none,
			
 
				     xpath_text,
			
 
				 )
			
 
				-from ..compat import compat_etree_fromstring
			
 
				+from ..utils.traversal import traverse_obj
			
 
				 
			
 
				 
			
 
				 class ARDMediathekBaseIE(InfoExtractor):
			
@@ -61,45 +60,6 @@ class ARDMediathekBaseIE(InfoExtractor):
 
				             'subtitles': subtitles,
			
 
				         }
			
 
				 
			
 
				-    def _ARD_extract_episode_info(self, title):
			
 
				-        """Try to extract season/episode data from the title."""
			
 
				-        res = {}
			
 
				-        if not title:
			
 
				-            return res
			
 
				-
			
 
				-        for pattern in [
			
 
				-            # Pattern for title like "Homo sapiens (S06/E07) - Originalversion"
			
 
				-            # from: https://www.ardmediathek.de/one/sendung/doctor-who/Y3JpZDovL3dkci5kZS9vbmUvZG9jdG9yIHdobw
			
 
				-            r'.*(?P<ep_info> \(S(?P<season_number>\d+)/E(?P<episode_number>\d+)\)).*',
			
 
				-            # E.g.: title="Fritjof aus Norwegen (2) (AD)"
			
 
				-            # from: https://www.ardmediathek.de/ard/sammlung/der-krieg-und-ich/68cMkqJdllm639Skj4c7sS/
			
 
				-            r'.*(?P<ep_info> \((?:Folge |Teil )?(?P<episode_number>\d+)(?:/\d+)?\)).*',
			
 
				-            r'.*(?P<ep_info>Folge (?P<episode_number>\d+)(?:\:| -|) )\"(?P<episode>.+)\".*',
			
 
				-            # E.g.: title="Folge 25/42: Symmetrie"
			
 
				-            # from: https://www.ardmediathek.de/ard/video/grips-mathe/folge-25-42-symmetrie/ard-alpha/Y3JpZDovL2JyLmRlL3ZpZGVvLzMyYzI0ZjczLWQ1N2MtNDAxNC05ZmZhLTFjYzRkZDA5NDU5OQ/
			
 
				-            # E.g.: title="Folge 1063 - Vertrauen"
			
 
				-            # from: https://www.ardmediathek.de/ard/sendung/die-fallers/Y3JpZDovL3N3ci5kZS8yMzAyMDQ4/
			
 
				-            r'.*(?P<ep_info>Folge (?P<episode_number>\d+)(?:/\d+)?(?:\:| -|) ).*',
			
 
				-        ]:
			
 
				-            m = re.match(pattern, title)
			
 
				-            if m:
			
 
				-                groupdict = m.groupdict()
			
 
				-                res['season_number'] = int_or_none(groupdict.get('season_number'))
			
 
				-                res['episode_number'] = int_or_none(groupdict.get('episode_number'))
			
 
				-                res['episode'] = str_or_none(groupdict.get('episode'))
			
 
				-                # Build the episode title by removing numeric episode information:
			
 
				-                if groupdict.get('ep_info') and not res['episode']:
			
 
				-                    res['episode'] = str_or_none(
			
 
				-                        title.replace(groupdict.get('ep_info'), ''))
			
 
				-                if res['episode']:
			
 
				-                    res['episode'] = res['episode'].strip()
			
 
				-                break
			
 
				-
			
 
				-        # As a fallback use the whole title as the episode name:
			
 
				-        if not res.get('episode'):
			
 
				-            res['episode'] = title.strip()
			
 
				-        return res
			
 
				-
			
 
				     def _extract_formats(self, media_info, video_id):
			
 
				         type_ = media_info.get('_type')
			
 
				         media_array = media_info.get('_mediaArray', [])
			
@@ -155,138 +115,6 @@ class ARDMediathekBaseIE(InfoExtractor):
 
				         return formats
			
 
				 
			
 
				 
			
 
				-class ARDMediathekIE(ARDMediathekBaseIE):
			
 
				-    IE_NAME = 'ARD:mediathek'
			
 
				-    _VALID_URL = r'^https?://(?:(?:(?:www|classic)\.)?ardmediathek\.de|mediathek\.(?:daserste|rbb-online)\.de|one\.ard\.de)/(?:.*/)(?P<video_id>[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?'
			
 
				-
			
 
				-    _TESTS = [{
			
 
				-        # available till 26.07.2022
			
 
				-        'url': 'http://www.ardmediathek.de/tv/S%C3%9CDLICHT/Was-ist-die-Kunst-der-Zukunft-liebe-Ann/BR-Fernsehen/Video?bcastId=34633636&documentId=44726822',
			
 
				-        'info_dict': {
			
 
				-            'id': '44726822',
			
 
				-            'ext': 'mp4',
			
 
				-            'title': 'Was ist die Kunst der Zukunft, liebe Anna McCarthy?',
			
 
				-            'description': 'md5:4ada28b3e3b5df01647310e41f3a62f5',
			
 
				-            'duration': 1740,
			
 
				-        },
			
 
				-        'params': {
			
 
				-            # m3u8 download
			
 
				-            'skip_download': True,
			
 
				-        }
			
 
				-    }, {
			
 
				-        'url': 'https://one.ard.de/tv/Mord-mit-Aussicht/Mord-mit-Aussicht-6-39-T%C3%B6dliche-Nach/ONE/Video?bcastId=46384294&documentId=55586872',
			
 
				-        'only_matching': True,
			
 
				-    }, {
			
 
				-        # audio
			
 
				-        'url': 'http://www.ardmediathek.de/tv/WDR-H%C3%B6rspiel-Speicher/Tod-eines-Fu%C3%9Fballers/WDR-3/Audio-Podcast?documentId=28488308&bcastId=23074086',
			
 
				-        'only_matching': True,
			
 
				-    }, {
			
 
				-        'url': 'http://mediathek.daserste.de/sendungen_a-z/328454_anne-will/22429276_vertrauen-ist-gut-spionieren-ist-besser-geht',
			
 
				-        'only_matching': True,
			
 
				-    }, {
			
 
				-        # audio
			
 
				-        'url': 'http://mediathek.rbb-online.de/radio/Hörspiel/Vor-dem-Fest/kulturradio/Audio?documentId=30796318&topRessort=radio&bcastId=9839158',
			
 
				-        'only_matching': True,
			
 
				-    }, {
			
 
				-        'url': 'https://classic.ardmediathek.de/tv/Panda-Gorilla-Co/Panda-Gorilla-Co-Folge-274/Das-Erste/Video?bcastId=16355486&documentId=58234698',
			
 
				-        'only_matching': True,
			
 
				-    }]
			
 
				-
			
 
				-    @classmethod
			
 
				-    def suitable(cls, url):
			
 
				-        return False if ARDBetaMediathekIE.suitable(url) else super(ARDMediathekIE, cls).suitable(url)
			
 
				-
			
 
				-    def _real_extract(self, url):
			
 
				-        # determine video id from url
			
 
				-        m = self._match_valid_url(url)
			
 
				-
			
 
				-        document_id = None
			
 
				-
			
 
				-        numid = re.search(r'documentId=([0-9]+)', url)
			
 
				-        if numid:
			
 
				-            document_id = video_id = numid.group(1)
			
 
				-        else:
			
 
				-            video_id = m.group('video_id')
			
 
				-
			
 
				-        webpage = self._download_webpage(url, video_id)
			
 
				-
			
 
				-        ERRORS = (
			
 
				-            ('>Leider liegt eine Störung vor.', 'Video %s is unavailable'),
			
 
				-            ('>Der gewünschte Beitrag ist nicht mehr verfügbar.<',
			
 
				-             'Video %s is no longer available'),
			
 
				-        )
			
 
				-
			
 
				-        for pattern, message in ERRORS:
			
 
				-            if pattern in webpage:
			
 
				-                raise ExtractorError(message % video_id, expected=True)
			
 
				-
			
 
				-        if re.search(r'[\?&]rss($|[=&])', url):
			
 
				-            doc = compat_etree_fromstring(webpage.encode('utf-8'))
			
 
				-            if doc.tag == 'rss':
			
 
				-                return GenericIE()._extract_rss(url, video_id, doc)
			
 
				-
			
 
				-        title = self._og_search_title(webpage, default=None) or self._html_search_regex(
			
 
				-            [r'<h1(?:\s+class="boxTopHeadline")?>(.*?)</h1>',
			
 
				-             r'<meta name="dcterms\.title" content="(.*?)"/>',
			
 
				-             r'<h4 class="headline">(.*?)</h4>',
			
 
				-             r'<title[^>]*>(.*?)</title>'],
			
 
				-            webpage, 'title')
			
 
				-        description = self._og_search_description(webpage, default=None) or self._html_search_meta(
			
 
				-            'dcterms.abstract', webpage, 'description', default=None)
			
 
				-        if description is None:
			
 
				-            description = self._html_search_meta(
			
 
				-                'description', webpage, 'meta description', default=None)
			
 
				-        if description is None:
			
 
				-            description = self._html_search_regex(
			
 
				-                r'<p\s+class="teasertext">(.+?)</p>',
			
 
				-                webpage, 'teaser text', default=None)
			
 
				-
			
 
				-        # Thumbnail is sometimes not present.
			
 
				-        # It is in the mobile version, but that seems to use a different URL
			
 
				-        # structure altogether.
			
 
				-        thumbnail = self._og_search_thumbnail(webpage, default=None)
			
 
				-
			
 
				-        media_streams = re.findall(r'''(?x)
			
 
				-            mediaCollection\.addMediaStream\([0-9]+,\s*[0-9]+,\s*"[^"]*",\s*
			
 
				-            "([^"]+)"''', webpage)
			
 
				-
			
 
				-        if media_streams:
			
 
				-            QUALITIES = qualities(['lo', 'hi', 'hq'])
			
 
				-            formats = []
			
 
				-            for furl in set(media_streams):
			
 
				-                if furl.endswith('.f4m'):
			
 
				-                    fid = 'f4m'
			
 
				-                else:
			
 
				-                    fid_m = re.match(r'.*\.([^.]+)\.[^.]+$', furl)
			
 
				-                    fid = fid_m.group(1) if fid_m else None
			
 
				-                formats.append({
			
 
				-                    'quality': QUALITIES(fid),
			
 
				-                    'format_id': fid,
			
 
				-                    'url': furl,
			
 
				-                })
			
 
				-            info = {
			
 
				-                'formats': formats,
			
 
				-            }
			
 
				-        else:  # request JSON file
			
 
				-            if not document_id:
			
 
				-                video_id = self._search_regex(
			
 
				-                    (r'/play/(?:config|media|sola)/(\d+)', r'contentId["\']\s*:\s*(\d+)'),
			
 
				-                    webpage, 'media id', default=None)
			
 
				-            info = self._extract_media_info(
			
 
				-                'http://www.ardmediathek.de/play/media/%s' % video_id,
			
 
				-                webpage, video_id)
			
 
				-
			
 
				-        info.update({
			
 
				-            'id': video_id,
			
 
				-            'title': title,
			
 
				-            'description': description,
			
 
				-            'thumbnail': thumbnail,
			
 
				-        })
			
 
				-        info.update(self._ARD_extract_episode_info(info['title']))
			
 
				-
			
 
				-        return info
			
 
				-
			
 
				-
			
 
				 class ARDIE(InfoExtractor):
			
 
				     _VALID_URL = r'(?P<mainurl>https?://(?:www\.)?daserste\.de/(?:[^/?#&]+/)+(?P<id>[^/?#&]+))\.html'
			
 
				     _TESTS = [{
			
@@ -399,21 +227,23 @@ class ARDIE(InfoExtractor):
 
				         }
			
 
				 
			
 
				 
			
 
				-class ARDBetaMediathekIE(ARDMediathekBaseIE):
			
 
				+class ARDBetaMediathekIE(InfoExtractor):
			
 
				+    IE_NAME = 'ARDMediathek'
			
 
				     _VALID_URL = r'''(?x)https://
			
 
				         (?:(?:beta|www)\.)?ardmediathek\.de/
			
 
				-        (?:(?P<client>[^/]+)/)?
			
 
				-        (?:player|live|video|(?P<playlist>sendung|serie|sammlung))/
			
 
				-        (?:(?P<display_id>(?(playlist)[^?#]+?|[^?#]+))/)?
			
 
				-        (?P<id>(?(playlist)|Y3JpZDovL)[a-zA-Z0-9]+)
			
 
				-        (?(playlist)/(?P<season>\d+)?/?(?:[?#]|$))'''
			
 
				+        (?:[^/]+/)?
			
 
				+        (?:player|live|video)/
			
 
				+        (?:(?P<display_id>[^?#]+)/)?
			
 
				+        (?P<id>[a-zA-Z0-9]+)
			
 
				+        /?(?:[?#]|$)'''
			
 
				+    _GEO_COUNTRIES = ['DE']
			
 
				 
			
 
				     _TESTS = [{
			
 
				         'url': 'https://www.ardmediathek.de/video/filme-im-mdr/liebe-auf-vier-pfoten/mdr-fernsehen/Y3JpZDovL21kci5kZS9zZW5kdW5nLzI4MjA0MC80MjIwOTEtNDAyNTM0',
			
 
				         'md5': 'b6e8ab03f2bcc6e1f9e6cef25fcc03c4',
			
 
				         'info_dict': {
			
 
				             'display_id': 'filme-im-mdr/liebe-auf-vier-pfoten/mdr-fernsehen',
			
 
				-            'id': '12939099',
			
 
				+            'id': 'Y3JpZDovL21kci5kZS9zZW5kdW5nLzI4MjA0MC80MjIwOTEtNDAyNTM0',
			
 
				             'title': 'Liebe auf vier Pfoten',
			
 
				             'description': r're:^Claudia Schmitt, Anwältin in Salzburg',
			
 
				             'duration': 5222,
			
@@ -422,7 +252,10 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
 
				             'upload_date': '20231130',
			
 
				             'ext': 'mp4',
			
 
				             'episode': 'Liebe auf vier Pfoten',
			
 
				-            'series': 'Filme im MDR'
			
 
				+            'series': 'Filme im MDR',
			
 
				+            'age_limit': 0,
			
 
				+            'channel': 'MDR',
			
 
				+            '_old_archive_ids': ['ardbetamediathek 12939099'],
			
 
				         },
			
 
				     }, {
			
 
				         'url': 'https://www.ardmediathek.de/mdr/video/die-robuste-roswita/Y3JpZDovL21kci5kZS9iZWl0cmFnL2Ntcy84MWMxN2MzZC0wMjkxLTRmMzUtODk4ZS0wYzhlOWQxODE2NGI/',
			
@@ -443,7 +276,7 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
 
				         'url': 'https://www.ardmediathek.de/video/tagesschau-oder-tagesschau-20-00-uhr/das-erste/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhZ2Vzc2NoYXUvZmM4ZDUxMjgtOTE0ZC00Y2MzLTgzNzAtNDZkNGNiZWJkOTll',
			
 
				         'md5': '1e73ded21cb79bac065117e80c81dc88',
			
 
				         'info_dict': {
			
 
				-            'id': '10049223',
			
 
				+            'id': 'Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhZ2Vzc2NoYXUvZmM4ZDUxMjgtOTE0ZC00Y2MzLTgzNzAtNDZkNGNiZWJkOTll',
			
 
				             'ext': 'mp4',
			
 
				             'title': 'tagesschau, 20:00 Uhr',
			
 
				             'timestamp': 1636398000,
			
@@ -454,6 +287,26 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
 
				             'episode': 'tagesschau, 20:00 Uhr',
			
 
				             'series': 'tagesschau',
			
 
				             'thumbnail': 'https://api.ardmediathek.de/image-service/images/urn:ard:image:fbb21142783b0a49?w=960&ch=ee69108ae344f678',
			
 
				+            'channel': 'ARD-Aktuell',
			
 
				+            '_old_archive_ids': ['ardbetamediathek 10049223'],
			
 
				+        },
			
 
				+    }, {
			
 
				+        'url': 'https://www.ardmediathek.de/video/7-tage/7-tage-unter-harten-jungs/hr-fernsehen/N2I2YmM5MzgtNWFlOS00ZGFlLTg2NzMtYzNjM2JlNjk4MDg3',
			
 
				+        'md5': 'c428b9effff18ff624d4f903bda26315',
			
 
				+        'info_dict': {
			
 
				+            'id': 'N2I2YmM5MzgtNWFlOS00ZGFlLTg2NzMtYzNjM2JlNjk4MDg3',
			
 
				+            'ext': 'mp4',
			
 
				+            'duration': 2700,
			
 
				+            'episode': '7 Tage ... unter harten Jungs',
			
 
				+            'description': 'md5:0f215470dcd2b02f59f4bd10c963f072',
			
 
				+            'upload_date': '20231005',
			
 
				+            'timestamp': 1696491171,
			
 
				+            'display_id': '7-tage/7-tage-unter-harten-jungs/hr-fernsehen',
			
 
				+            'series': '7 Tage ...',
			
 
				+            'channel': 'HR',
			
 
				+            'thumbnail': 'https://api.ardmediathek.de/image-service/images/urn:ard:image:f6e6d5ffac41925c?w=960&ch=fa32ba69bc87989a',
			
 
				+            'title': '7 Tage ... unter harten Jungs',
			
 
				+            '_old_archive_ids': ['ardbetamediathek 94834686'],
			
 
				         },
			
 
				     }, {
			
 
				         'url': 'https://beta.ardmediathek.de/ard/video/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE',
			
@@ -470,6 +323,176 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
 
				     }, {
			
 
				         'url': 'https://www.ardmediathek.de/swr/live/Y3JpZDovL3N3ci5kZS8xMzQ4MTA0Mg',
			
 
				         'only_matching': True,
			
 
				+    }, {
			
 
				+        'url': 'https://www.ardmediathek.de/video/coronavirus-update-ndr-info/astrazeneca-kurz-lockdown-und-pims-syndrom-81/ndr/Y3JpZDovL25kci5kZS84NzE0M2FjNi0wMWEwLTQ5ODEtOTE5NS1mOGZhNzdhOTFmOTI/',
			
 
				+        'only_matching': True,
			
 
				+    }]
			
 
				+
			
 
				+    def _extract_episode_info(self, title):
			
 
				+        patterns = [
			
 
				+            # Pattern for title like "Homo sapiens (S06/E07) - Originalversion"
			
 
				+            # from: https://www.ardmediathek.de/one/sendung/doctor-who/Y3JpZDovL3dkci5kZS9vbmUvZG9jdG9yIHdobw
			
 
				+            r'.*(?P<ep_info> \(S(?P<season_number>\d+)/E(?P<episode_number>\d+)\)).*',
			
 
				+            # E.g.: title="Fritjof aus Norwegen (2) (AD)"
			
 
				+            # from: https://www.ardmediathek.de/ard/sammlung/der-krieg-und-ich/68cMkqJdllm639Skj4c7sS/
			
 
				+            r'.*(?P<ep_info> \((?:Folge |Teil )?(?P<episode_number>\d+)(?:/\d+)?\)).*',
			
 
				+            r'.*(?P<ep_info>Folge (?P<episode_number>\d+)(?:\:| -|) )\"(?P<episode>.+)\".*',
			
 
				+            # E.g.: title="Folge 25/42: Symmetrie"
			
 
				+            # from: https://www.ardmediathek.de/ard/video/grips-mathe/folge-25-42-symmetrie/ard-alpha/Y3JpZDovL2JyLmRlL3ZpZGVvLzMyYzI0ZjczLWQ1N2MtNDAxNC05ZmZhLTFjYzRkZDA5NDU5OQ/
			
 
				+            # E.g.: title="Folge 1063 - Vertrauen"
			
 
				+            # from: https://www.ardmediathek.de/ard/sendung/die-fallers/Y3JpZDovL3N3ci5kZS8yMzAyMDQ4/
			
 
				+            r'.*(?P<ep_info>Folge (?P<episode_number>\d+)(?:/\d+)?(?:\:| -|) ).*',
			
 
				+            # As a fallback use the full title
			
 
				+            r'(?P<title>.*)',
			
 
				+        ]
			
 
				+
			
 
				+        return traverse_obj(patterns, (..., {partial(re.match, string=title)}, {
			
 
				+            'season_number': ('season_number', {int_or_none}),
			
 
				+            'episode_number': ('episode_number', {int_or_none}),
			
 
				+            'episode': ((
			
 
				+                ('episode', {str_or_none}),
			
 
				+                ('ep_info', {lambda x: title.replace(x, '')}),
			
 
				+                ('title', {str}),
			
 
				+            ), {str.strip}),
			
 
				+        }), get_all=False)
			
 
				+
			
 
    def _real_extract(self, url):
        """Extract a single on-demand or live item through the page-gateway API.

        Formats and subtitles are read from the player widget's embedded media
        collection; the remaining metadata comes from its 'meta' object and from
        the page data itself.
        """
        video_id, display_id = self._match_valid_url(url).group('id', 'display_id')

        page_data = self._download_json(
            f'https://api.ardmediathek.de/page-gateway/pages/ard/item/{video_id}', video_id, query={
                'embedded': 'false',
                'mcV6': 'true',
            })

        # Fall back to an empty dict: without a player widget the lookups below
        # simply yield nothing instead of raising AttributeError on None
        player_data = traverse_obj(
            page_data, ('widgets', lambda _, v: v['type'] in ('player_ondemand', 'player_live'), {dict}),
            get_all=False) or {}
        is_live = player_data.get('type') == 'player_live'
        media_data = traverse_obj(player_data, ('mediaCollection', 'embedded', {dict}))

        if player_data.get('blockedByFsk'):
            self.raise_no_formats('This video is only available after 22:00', expected=True)

        formats = []
        subtitles = {}
        for stream in traverse_obj(media_data, ('streams', ..., {dict})):
            kind = stream.get('kind')
            # Prioritize main stream over sign language and others
            preference = 1 if kind == 'main' else None
            for media in traverse_obj(stream, ('media', lambda _, v: url_or_none(v['url']))):
                media_url = media['url']

                # The language label is the language code plus any non-standard
                # audio kind; only plain German audio is preferred
                audio_kind = traverse_obj(media, (
                    'audios', 0, 'kind', {str}), default='').replace('standard', '')
                lang_code = traverse_obj(media, ('audios', 0, 'languageCode', {str})) or 'deu'
                lang = join_nonempty(lang_code, audio_kind)
                language_preference = 10 if lang == 'deu' else -10

                if determine_ext(media_url) == 'm3u8':
                    # join_nonempty avoids ids like 'hls-None' when the stream has no kind
                    fmts, subs = self._extract_m3u8_formats_and_subtitles(
                        media_url, video_id, m3u8_id=join_nonempty('hls', kind),
                        preference=preference, fatal=False, live=is_live)
                    for f in fmts:
                        f['language'] = lang
                        f['language_preference'] = language_preference
                    formats.extend(fmts)
                    self._merge_subtitles(subs, target=subtitles)
                else:
                    formats.append({
                        'url': media_url,
                        'format_id': join_nonempty('http', kind),
                        'preference': preference,
                        'language': lang,
                        'language_preference': language_preference,
                        **traverse_obj(media, {
                            'format_note': ('forcedLabel', {str}),
                            'width': ('maxHResolutionPx', {int_or_none}),
                            'height': ('maxVResolutionPx', {int_or_none}),
                            'vcodec': ('videoCodec', {str}),
                        }),
                    })

        for sub in traverse_obj(media_data, ('subtitles', ..., {dict})):
            for sources in traverse_obj(sub, ('sources', lambda _, v: url_or_none(v['url']))):
                subtitles.setdefault(sub.get('languageCode') or 'deu', []).append({
                    'url': sources['url'],
                    'ext': {'webvtt': 'vtt', 'ebutt': 'ttml'}.get(sources.get('kind')),
                })

        age_limit = traverse_obj(page_data, ('fskRating', {lambda x: remove_start(x, 'FSK')}, {int_or_none}))
        # Numeric id that used to be reported as the video id; kept so that
        # existing download archives still recognise the item
        old_id = traverse_obj(page_data, ('tracking', 'atiCustomVars', 'contentId', {str_or_none}))

        return {
            'id': video_id,
            'display_id': display_id,
            'formats': formats,
            'subtitles': subtitles,
            'is_live': is_live,
            'age_limit': age_limit,
            **traverse_obj(media_data, ('meta', {
                'title': 'title',
                'description': 'synopsis',
                'timestamp': ('broadcastedOnDateTime', {parse_iso8601}),
                'series': 'seriesTitle',
                'thumbnail': ('images', 0, 'url', {url_or_none}),
                'duration': ('durationSeconds', {int_or_none}),
                'channel': 'clipSourceName',
            })),
            # The page may lack a title, in which case there is nothing to unpack
            **(self._extract_episode_info(page_data.get('title')) or {}),
            # Do not emit a bogus 'ardbetamediathek None' entry when the old id is missing
            '_old_archive_ids': [make_archive_id(ARDBetaMediathekIE, old_id)] if old_id else None,
        }
			
 
				+
			
 
				+
			
 
				+class ARDMediathekCollectionIE(InfoExtractor):
			
 
				+    _VALID_URL = r'''(?x)https://
			
 
				+        (?:(?:beta|www)\.)?ardmediathek\.de/
			
 
				+        (?:[^/?#]+/)?
			
 
				+        (?P<playlist>sendung|serie|sammlung)/
			
 
				+        (?:(?P<display_id>[^?#]+?)/)?
			
 
				+        (?P<id>[a-zA-Z0-9]+)
			
 
				+        (?:/(?P<season>\d+)(?:/(?P<version>OV|AD))?)?/?(?:[?#]|$)'''
			
 
				+    _GEO_COUNTRIES = ['DE']
			
 
				+
			
 
				+    _TESTS = [{
			
 
				+        'url': 'https://www.ardmediathek.de/serie/quiz/staffel-1-originalversion/Y3JpZDovL3dkci5kZS9vbmUvcXVpeg/1/OV',
			
 
				+        'info_dict': {
			
 
				+            'id': 'Y3JpZDovL3dkci5kZS9vbmUvcXVpeg_1_OV',
			
 
				+            'display_id': 'quiz/staffel-1-originalversion',
			
 
				+            'title': 'Staffel 1 Originalversion',
			
 
				+        },
			
 
				+        'playlist_count': 3,
			
 
				+    }, {
			
 
				+        'url': 'https://www.ardmediathek.de/serie/babylon-berlin/staffel-4-mit-audiodeskription/Y3JpZDovL2Rhc2Vyc3RlLmRlL2JhYnlsb24tYmVybGlu/4/AD',
			
 
				+        'info_dict': {
			
 
				+            'id': 'Y3JpZDovL2Rhc2Vyc3RlLmRlL2JhYnlsb24tYmVybGlu_4_AD',
			
 
				+            'display_id': 'babylon-berlin/staffel-4-mit-audiodeskription',
			
 
				+            'title': 'Staffel 4 mit Audiodeskription',
			
 
				+        },
			
 
				+        'playlist_count': 12,
			
 
				+    }, {
			
 
				+        'url': 'https://www.ardmediathek.de/serie/babylon-berlin/staffel-1/Y3JpZDovL2Rhc2Vyc3RlLmRlL2JhYnlsb24tYmVybGlu/1/',
			
 
				+        'info_dict': {
			
 
				+            'id': 'Y3JpZDovL2Rhc2Vyc3RlLmRlL2JhYnlsb24tYmVybGlu_1',
			
 
				+            'display_id': 'babylon-berlin/staffel-1',
			
 
				+            'title': 'Staffel 1',
			
 
				+        },
			
 
				+        'playlist_count': 8,
			
 
				+    }, {
			
 
				+        'url': 'https://www.ardmediathek.de/sendung/tatort/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydA',
			
 
				+        'info_dict': {
			
 
				+            'id': 'Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydA',
			
 
				+            'display_id': 'tatort',
			
 
				+            'title': 'Tatort',
			
 
				+        },
			
 
				+        'playlist_mincount': 500,
			
 
				+    }, {
			
 
				+        'url': 'https://www.ardmediathek.de/sammlung/die-kirche-bleibt-im-dorf/5eOHzt8XB2sqeFXbIoJlg2',
			
 
				+        'info_dict': {
			
 
				+            'id': '5eOHzt8XB2sqeFXbIoJlg2',
			
 
				+            'display_id': 'die-kirche-bleibt-im-dorf',
			
 
				+            'title': 'Die Kirche bleibt im Dorf',
			
 
				+            'description': 'Die Kirche bleibt im Dorf',
			
 
				+        },
			
 
				+        'playlist_count': 4,
			
 
				     }, {
			
 
				         # playlist of type 'sendung'
			
 
				         'url': 'https://www.ardmediathek.de/ard/sendung/doctor-who/Y3JpZDovL3dkci5kZS9vbmUvZG9jdG9yIHdobw/',
			
@@ -482,196 +505,48 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
 
				         # playlist of type 'sammlung'
			
 
				         'url': 'https://www.ardmediathek.de/ard/sammlung/team-muenster/5JpTzLSbWUAK8184IOvEir/',
			
 
				         'only_matching': True,
			
 
				-    }, {
			
 
				-        'url': 'https://www.ardmediathek.de/video/coronavirus-update-ndr-info/astrazeneca-kurz-lockdown-und-pims-syndrom-81/ndr/Y3JpZDovL25kci5kZS84NzE0M2FjNi0wMWEwLTQ5ODEtOTE5NS1mOGZhNzdhOTFmOTI/',
			
 
				-        'only_matching': True,
			
 
				-    }, {
			
 
				-        'url': 'https://www.ardmediathek.de/ard/player/Y3JpZDovL3dkci5kZS9CZWl0cmFnLWQ2NDJjYWEzLTMwZWYtNGI4NS1iMTI2LTU1N2UxYTcxOGIzOQ/tatort-duo-koeln-leipzig-ihr-kinderlein-kommet',
			
 
				-        'only_matching': True,
			
 
				     }]
			
 
				 
			
 
				-    def _ARD_load_playlist_snippet(self, playlist_id, display_id, client, mode, page_number):
			
 
				-        """ Query the ARD server for playlist information
			
 
				-        and returns the data in "raw" format """
			
 
				-        assert mode in ('sendung', 'serie', 'sammlung')
			
 
				-        if mode in ('sendung', 'serie'):
			
 
				-            graphQL = json.dumps({
			
 
				-                'query': '''{
			
 
				-                    showPage(
			
 
				-                        client: "%s"
			
 
				-                        showId: "%s"
			
 
				-                        pageNumber: %d
			
 
				-                    ) {
			
 
				-                        pagination {
			
 
				-                            pageSize
			
 
				-                            totalElements
			
 
				-                        }
			
 
				-                        teasers {        # Array
			
 
				-                            mediumTitle
			
 
				-                            links { target { id href title } }
			
 
				-                            type
			
 
				-                        }
			
 
				-                    }}''' % (client, playlist_id, page_number),
			
 
				-            }).encode()
			
 
				-        else:  # mode == 'sammlung'
			
 
				-            graphQL = json.dumps({
			
 
				-                'query': '''{
			
 
				-                    morePage(
			
 
				-                        client: "%s"
			
 
				-                        compilationId: "%s"
			
 
				-                        pageNumber: %d
			
 
				-                    ) {
			
 
				-                        widget {
			
 
				-                            pagination {
			
 
				-                                pageSize
			
 
				-                                totalElements
			
 
				-                            }
			
 
				-                            teasers {        # Array
			
 
				-                                mediumTitle
			
 
				-                                links { target { id href title } }
			
 
				-                                type
			
 
				-                            }
			
 
				-                        }
			
 
				-                    }}''' % (client, playlist_id, page_number),
			
 
				-            }).encode()
			
 
				-        # Ressources for ARD graphQL debugging:
			
 
				-        # https://api-test.ardmediathek.de/public-gateway
			
 
				-        show_page = self._download_json(
			
 
				-            'https://api.ardmediathek.de/public-gateway',
			
 
				-            '[Playlist] %s' % display_id,
			
 
				-            data=graphQL,
			
 
				-            headers={'Content-Type': 'application/json'})['data']
			
 
				-        # align the structure of the returned data:
			
 
				-        if mode in ('sendung', 'serie'):
			
 
				-            show_page = show_page['showPage']
			
 
				-        else:  # mode == 'sammlung'
			
 
				-            show_page = show_page['morePage']['widget']
			
 
				-        return show_page
			
 
				-
			
 
				-    def _ARD_extract_playlist(self, url, playlist_id, display_id, client, mode):
			
 
				-        """ Collects all playlist entries and returns them as info dict.
			
 
				-        Supports playlists of mode 'sendung', 'serie', and 'sammlung',
			
 
				-        as well as nested playlists. """
			
 
				-        entries = []
			
 
				-        pageNumber = 0
			
 
				-        while True:  # iterate by pageNumber
			
 
				-            show_page = self._ARD_load_playlist_snippet(
			
 
				-                playlist_id, display_id, client, mode, pageNumber)
			
 
				-            for teaser in show_page['teasers']:  # process playlist items
			
 
				-                if '/compilation/' in teaser['links']['target']['href']:
			
 
				-                    # alternativ cond.: teaser['type'] == "compilation"
			
 
				-                    # => This is an nested compilation, e.g. like:
			
 
				-                    # https://www.ardmediathek.de/ard/sammlung/die-kirche-bleibt-im-dorf/5eOHzt8XB2sqeFXbIoJlg2/
			
 
				-                    link_mode = 'sammlung'
			
 
				-                else:
			
 
				-                    link_mode = 'video'
			
 
				-
			
 
				-                item_url = 'https://www.ardmediathek.de/%s/%s/%s/%s/%s' % (
			
 
				-                    client, link_mode, display_id,
			
 
				-                    # perform HTLM quoting of episode title similar to ARD:
			
 
				-                    re.sub('^-|-$', '',  # remove '-' from begin/end
			
 
				-                           re.sub('[^a-zA-Z0-9]+', '-',  # replace special chars by -
			
 
				-                                  teaser['links']['target']['title'].lower()
			
 
				-                                  .replace('ä', 'ae').replace('ö', 'oe')
			
 
				-                                  .replace('ü', 'ue').replace('ß', 'ss'))),
			
 
				-                    teaser['links']['target']['id'])
			
 
				-                entries.append(self.url_result(
			
 
				-                    item_url,
			
 
				-                    ie=ARDBetaMediathekIE.ie_key()))
			
 
				-
			
 
				-            if (show_page['pagination']['pageSize'] * (pageNumber + 1)
			
 
				-               >= show_page['pagination']['totalElements']):
			
 
				-                # we've processed enough pages to get all playlist entries
			
 
				-                break
			
 
				-            pageNumber = pageNumber + 1
			
 
				-
			
 
				-        return self.playlist_result(entries, playlist_id, playlist_title=display_id)
			
 
				+    _PAGE_SIZE = 100
			
 
				 
			
 
				     def _real_extract(self, url):
			
 
				-        video_id, display_id, playlist_type, client, season_number = self._match_valid_url(url).group(
			
 
				-            'id', 'display_id', 'playlist', 'client', 'season')
			
 
				-        display_id, client = display_id or video_id, client or 'ard'
			
 
				-
			
 
				-        if playlist_type:
			
 
				-            # TODO: Extract only specified season
			
 
				-            return self._ARD_extract_playlist(url, video_id, display_id, client, playlist_type)
			
 
				-
			
 
				-        player_page = self._download_json(
			
 
				-            'https://api.ardmediathek.de/public-gateway',
			
 
				-            display_id, data=json.dumps({
			
 
				-                'query': '''{
			
 
				-  playerPage(client:"%s", clipId: "%s") {
			
 
				-    blockedByFsk
			
 
				-    broadcastedOn
			
 
				-    maturityContentRating
			
 
				-    mediaCollection {
			
 
				-      _duration
			
 
				-      _geoblocked
			
 
				-      _isLive
			
 
				-      _mediaArray {
			
 
				-        _mediaStreamArray {
			
 
				-          _quality
			
 
				-          _server
			
 
				-          _stream
			
 
				-        }
			
 
				-      }
			
 
				-      _previewImage
			
 
				-      _subtitleUrl
			
 
				-      _type
			
 
				-    }
			
 
				-    show {
			
 
				-      title
			
 
				-    }
			
 
				-    image {
			
 
				-      src
			
 
				-    }
			
 
				-    synopsis
			
 
				-    title
			
 
				-    tracking {
			
 
				-      atiCustomVars {
			
 
				-        contentId
			
 
				-      }
			
 
				-    }
			
 
				-  }
			
 
				-}''' % (client, video_id),
			
 
				-            }).encode(), headers={
			
 
				-                'Content-Type': 'application/json'
			
 
				-            })['data']['playerPage']
			
 
				-        title = player_page['title']
			
 
				-        content_id = str_or_none(try_get(
			
 
				-            player_page, lambda x: x['tracking']['atiCustomVars']['contentId']))
			
 
				-        media_collection = player_page.get('mediaCollection') or {}
			
 
				-        if not media_collection and content_id:
			
 
				-            media_collection = self._download_json(
			
 
				-                'https://www.ardmediathek.de/play/media/' + content_id,
			
 
				-                content_id, fatal=False) or {}
			
 
				-        info = self._parse_media_info(
			
 
				-            media_collection, content_id or video_id,
			
 
				-            player_page.get('blockedByFsk'))
			
 
				-        age_limit = None
			
 
				-        description = player_page.get('synopsis')
			
 
				-        maturity_content_rating = player_page.get('maturityContentRating')
			
 
				-        if maturity_content_rating:
			
 
				-            age_limit = int_or_none(maturity_content_rating.lstrip('FSK'))
			
 
				-        if not age_limit and description:
			
 
				-            age_limit = int_or_none(self._search_regex(
			
 
				-                r'\(FSK\s*(\d+)\)\s*$', description, 'age limit', default=None))
			
 
				-        info.update({
			
 
				-            'age_limit': age_limit,
			
 
				-            'display_id': display_id,
			
 
				-            'title': title,
			
 
				-            'description': description,
			
 
				-            'timestamp': unified_timestamp(player_page.get('broadcastedOn')),
			
 
				-            'series': try_get(player_page, lambda x: x['show']['title']),
			
 
				-            'thumbnail': (media_collection.get('_previewImage')
			
 
				-                          or try_get(player_page, lambda x: update_url(x['image']['src'], query=None, fragment=None))
			
 
				-                          or self.get_thumbnail_from_html(display_id, url)),
			
 
				-        })
			
 
				-        info.update(self._ARD_extract_episode_info(info['title']))
			
 
				-        return info
			
 
				-
			
 
				-    def get_thumbnail_from_html(self, display_id, url):
			
 
				-        webpage = self._download_webpage(url, display_id, fatal=False) or ''
			
 
				-        return (
			
 
				-            self._og_search_thumbnail(webpage, default=None)
			
 
				-            or self._html_search_meta('thumbnailUrl', webpage, default=None))
			
 
				+        playlist_id, display_id, playlist_type, season_number, version = self._match_valid_url(url).group(
			
 
				+            'id', 'display_id', 'playlist', 'season', 'version')
			
 
				+
			
 
				+        def call_api(page_num):
			
 
				+            api_path = 'compilations/ard' if playlist_type == 'sammlung' else 'widgets/ard/asset'
			
 
				+            return self._download_json(
			
 
				+                f'https://api.ardmediathek.de/page-gateway/{api_path}/{playlist_id}', playlist_id,
			
 
				+                f'Downloading playlist page {page_num}', query={
			
 
				+                    'pageNumber': page_num,
			
 
				+                    'pageSize': self._PAGE_SIZE,
			
 
				+                    **({
			
 
				+                        'seasoned': 'true',
			
 
				+                        'seasonNumber': season_number,
			
 
				+                        'withOriginalversion': 'true' if version == 'OV' else 'false',
			
 
				+                        'withAudiodescription': 'true' if version == 'AD' else 'false',
			
 
				+                    } if season_number else {}),
			
 
				+                })
			
 
				+
			
 
				+        def fetch_page(page_num):
			
 
				+            for item in traverse_obj(call_api(page_num), ('teasers', ..., {dict})):
			
 
				+                item_id = traverse_obj(item, ('links', 'target', ('urlId', 'id')), 'id', get_all=False)
			
 
				+                if not item_id or item_id == playlist_id:
			
 
				+                    continue
			
 
				+                item_mode = 'sammlung' if item.get('type') == 'compilation' else 'video'
			
 
				+                yield self.url_result(
			
 
				+                    f'https://www.ardmediathek.de/{item_mode}/{item_id}',
			
 
				+                    ie=(ARDMediathekCollectionIE if item_mode == 'sammlung' else ARDBetaMediathekIE),
			
 
				+                    **traverse_obj(item, {
			
 
				+                        'id': ('id', {str}),
			
 
				+                        'title': ('longTitle', {str}),
			
 
				+                        'duration': ('duration', {int_or_none}),
			
 
				+                        'timestamp': ('broadcastedOn', {parse_iso8601}),
			
 
				+                    }))
			
 
				+
			
 
				+        page_data = call_api(0)
			
 
				+        full_id = join_nonempty(playlist_id, season_number, version, delim='_')
			
 
				+
			
 
				+        return self.playlist_result(
			
 
				+            OnDemandPagedList(fetch_page, self._PAGE_SIZE), full_id, display_id=display_id,
			
 
				+            title=page_data.get('title'), description=page_data.get('synopsis'))