|
@@ -1,24 +1,23 @@
|
|
|
-import json
|
|
|
import re
|
|
|
+from functools import partial
|
|
|
|
|
|
from .common import InfoExtractor
|
|
|
-from .generic import GenericIE
|
|
|
from ..utils import (
|
|
|
+ OnDemandPagedList,
|
|
|
determine_ext,
|
|
|
- ExtractorError,
|
|
|
int_or_none,
|
|
|
+ join_nonempty,
|
|
|
+ make_archive_id,
|
|
|
parse_duration,
|
|
|
- qualities,
|
|
|
+ parse_iso8601,
|
|
|
+ remove_start,
|
|
|
str_or_none,
|
|
|
- try_get,
|
|
|
unified_strdate,
|
|
|
- unified_timestamp,
|
|
|
- update_url,
|
|
|
update_url_query,
|
|
|
url_or_none,
|
|
|
xpath_text,
|
|
|
)
|
|
|
-from ..compat import compat_etree_fromstring
|
|
|
+from ..utils.traversal import traverse_obj
|
|
|
|
|
|
|
|
|
class ARDMediathekBaseIE(InfoExtractor):
|
|
@@ -61,45 +60,6 @@ class ARDMediathekBaseIE(InfoExtractor):
|
|
|
'subtitles': subtitles,
|
|
|
}
|
|
|
|
|
|
- def _ARD_extract_episode_info(self, title):
|
|
|
- """Try to extract season/episode data from the title."""
|
|
|
- res = {}
|
|
|
- if not title:
|
|
|
- return res
|
|
|
-
|
|
|
- for pattern in [
|
|
|
- # Pattern for title like "Homo sapiens (S06/E07) - Originalversion"
|
|
|
- # from: https://www.ardmediathek.de/one/sendung/doctor-who/Y3JpZDovL3dkci5kZS9vbmUvZG9jdG9yIHdobw
|
|
|
- r'.*(?P<ep_info> \(S(?P<season_number>\d+)/E(?P<episode_number>\d+)\)).*',
|
|
|
- # E.g.: title="Fritjof aus Norwegen (2) (AD)"
|
|
|
- # from: https://www.ardmediathek.de/ard/sammlung/der-krieg-und-ich/68cMkqJdllm639Skj4c7sS/
|
|
|
- r'.*(?P<ep_info> \((?:Folge |Teil )?(?P<episode_number>\d+)(?:/\d+)?\)).*',
|
|
|
- r'.*(?P<ep_info>Folge (?P<episode_number>\d+)(?:\:| -|) )\"(?P<episode>.+)\".*',
|
|
|
- # E.g.: title="Folge 25/42: Symmetrie"
|
|
|
- # from: https://www.ardmediathek.de/ard/video/grips-mathe/folge-25-42-symmetrie/ard-alpha/Y3JpZDovL2JyLmRlL3ZpZGVvLzMyYzI0ZjczLWQ1N2MtNDAxNC05ZmZhLTFjYzRkZDA5NDU5OQ/
|
|
|
- # E.g.: title="Folge 1063 - Vertrauen"
|
|
|
- # from: https://www.ardmediathek.de/ard/sendung/die-fallers/Y3JpZDovL3N3ci5kZS8yMzAyMDQ4/
|
|
|
- r'.*(?P<ep_info>Folge (?P<episode_number>\d+)(?:/\d+)?(?:\:| -|) ).*',
|
|
|
- ]:
|
|
|
- m = re.match(pattern, title)
|
|
|
- if m:
|
|
|
- groupdict = m.groupdict()
|
|
|
- res['season_number'] = int_or_none(groupdict.get('season_number'))
|
|
|
- res['episode_number'] = int_or_none(groupdict.get('episode_number'))
|
|
|
- res['episode'] = str_or_none(groupdict.get('episode'))
|
|
|
- # Build the episode title by removing numeric episode information:
|
|
|
- if groupdict.get('ep_info') and not res['episode']:
|
|
|
- res['episode'] = str_or_none(
|
|
|
- title.replace(groupdict.get('ep_info'), ''))
|
|
|
- if res['episode']:
|
|
|
- res['episode'] = res['episode'].strip()
|
|
|
- break
|
|
|
-
|
|
|
- # As a fallback use the whole title as the episode name:
|
|
|
- if not res.get('episode'):
|
|
|
- res['episode'] = title.strip()
|
|
|
- return res
|
|
|
-
|
|
|
def _extract_formats(self, media_info, video_id):
|
|
|
type_ = media_info.get('_type')
|
|
|
media_array = media_info.get('_mediaArray', [])
|
|
@@ -155,138 +115,6 @@ class ARDMediathekBaseIE(InfoExtractor):
|
|
|
return formats
|
|
|
|
|
|
|
|
|
-class ARDMediathekIE(ARDMediathekBaseIE):
|
|
|
- IE_NAME = 'ARD:mediathek'
|
|
|
- _VALID_URL = r'^https?://(?:(?:(?:www|classic)\.)?ardmediathek\.de|mediathek\.(?:daserste|rbb-online)\.de|one\.ard\.de)/(?:.*/)(?P<video_id>[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?'
|
|
|
-
|
|
|
- _TESTS = [{
|
|
|
- # available till 26.07.2022
|
|
|
- 'url': 'http://www.ardmediathek.de/tv/S%C3%9CDLICHT/Was-ist-die-Kunst-der-Zukunft-liebe-Ann/BR-Fernsehen/Video?bcastId=34633636&documentId=44726822',
|
|
|
- 'info_dict': {
|
|
|
- 'id': '44726822',
|
|
|
- 'ext': 'mp4',
|
|
|
- 'title': 'Was ist die Kunst der Zukunft, liebe Anna McCarthy?',
|
|
|
- 'description': 'md5:4ada28b3e3b5df01647310e41f3a62f5',
|
|
|
- 'duration': 1740,
|
|
|
- },
|
|
|
- 'params': {
|
|
|
- # m3u8 download
|
|
|
- 'skip_download': True,
|
|
|
- }
|
|
|
- }, {
|
|
|
- 'url': 'https://one.ard.de/tv/Mord-mit-Aussicht/Mord-mit-Aussicht-6-39-T%C3%B6dliche-Nach/ONE/Video?bcastId=46384294&documentId=55586872',
|
|
|
- 'only_matching': True,
|
|
|
- }, {
|
|
|
- # audio
|
|
|
- 'url': 'http://www.ardmediathek.de/tv/WDR-H%C3%B6rspiel-Speicher/Tod-eines-Fu%C3%9Fballers/WDR-3/Audio-Podcast?documentId=28488308&bcastId=23074086',
|
|
|
- 'only_matching': True,
|
|
|
- }, {
|
|
|
- 'url': 'http://mediathek.daserste.de/sendungen_a-z/328454_anne-will/22429276_vertrauen-ist-gut-spionieren-ist-besser-geht',
|
|
|
- 'only_matching': True,
|
|
|
- }, {
|
|
|
- # audio
|
|
|
- 'url': 'http://mediathek.rbb-online.de/radio/Hörspiel/Vor-dem-Fest/kulturradio/Audio?documentId=30796318&topRessort=radio&bcastId=9839158',
|
|
|
- 'only_matching': True,
|
|
|
- }, {
|
|
|
- 'url': 'https://classic.ardmediathek.de/tv/Panda-Gorilla-Co/Panda-Gorilla-Co-Folge-274/Das-Erste/Video?bcastId=16355486&documentId=58234698',
|
|
|
- 'only_matching': True,
|
|
|
- }]
|
|
|
-
|
|
|
- @classmethod
|
|
|
- def suitable(cls, url):
|
|
|
- return False if ARDBetaMediathekIE.suitable(url) else super(ARDMediathekIE, cls).suitable(url)
|
|
|
-
|
|
|
- def _real_extract(self, url):
|
|
|
- # determine video id from url
|
|
|
- m = self._match_valid_url(url)
|
|
|
-
|
|
|
- document_id = None
|
|
|
-
|
|
|
- numid = re.search(r'documentId=([0-9]+)', url)
|
|
|
- if numid:
|
|
|
- document_id = video_id = numid.group(1)
|
|
|
- else:
|
|
|
- video_id = m.group('video_id')
|
|
|
-
|
|
|
- webpage = self._download_webpage(url, video_id)
|
|
|
-
|
|
|
- ERRORS = (
|
|
|
- ('>Leider liegt eine Störung vor.', 'Video %s is unavailable'),
|
|
|
- ('>Der gewünschte Beitrag ist nicht mehr verfügbar.<',
|
|
|
- 'Video %s is no longer available'),
|
|
|
- )
|
|
|
-
|
|
|
- for pattern, message in ERRORS:
|
|
|
- if pattern in webpage:
|
|
|
- raise ExtractorError(message % video_id, expected=True)
|
|
|
-
|
|
|
- if re.search(r'[\?&]rss($|[=&])', url):
|
|
|
- doc = compat_etree_fromstring(webpage.encode('utf-8'))
|
|
|
- if doc.tag == 'rss':
|
|
|
- return GenericIE()._extract_rss(url, video_id, doc)
|
|
|
-
|
|
|
- title = self._og_search_title(webpage, default=None) or self._html_search_regex(
|
|
|
- [r'<h1(?:\s+class="boxTopHeadline")?>(.*?)</h1>',
|
|
|
- r'<meta name="dcterms\.title" content="(.*?)"/>',
|
|
|
- r'<h4 class="headline">(.*?)</h4>',
|
|
|
- r'<title[^>]*>(.*?)</title>'],
|
|
|
- webpage, 'title')
|
|
|
- description = self._og_search_description(webpage, default=None) or self._html_search_meta(
|
|
|
- 'dcterms.abstract', webpage, 'description', default=None)
|
|
|
- if description is None:
|
|
|
- description = self._html_search_meta(
|
|
|
- 'description', webpage, 'meta description', default=None)
|
|
|
- if description is None:
|
|
|
- description = self._html_search_regex(
|
|
|
- r'<p\s+class="teasertext">(.+?)</p>',
|
|
|
- webpage, 'teaser text', default=None)
|
|
|
-
|
|
|
- # Thumbnail is sometimes not present.
|
|
|
- # It is in the mobile version, but that seems to use a different URL
|
|
|
- # structure altogether.
|
|
|
- thumbnail = self._og_search_thumbnail(webpage, default=None)
|
|
|
-
|
|
|
- media_streams = re.findall(r'''(?x)
|
|
|
- mediaCollection\.addMediaStream\([0-9]+,\s*[0-9]+,\s*"[^"]*",\s*
|
|
|
- "([^"]+)"''', webpage)
|
|
|
-
|
|
|
- if media_streams:
|
|
|
- QUALITIES = qualities(['lo', 'hi', 'hq'])
|
|
|
- formats = []
|
|
|
- for furl in set(media_streams):
|
|
|
- if furl.endswith('.f4m'):
|
|
|
- fid = 'f4m'
|
|
|
- else:
|
|
|
- fid_m = re.match(r'.*\.([^.]+)\.[^.]+$', furl)
|
|
|
- fid = fid_m.group(1) if fid_m else None
|
|
|
- formats.append({
|
|
|
- 'quality': QUALITIES(fid),
|
|
|
- 'format_id': fid,
|
|
|
- 'url': furl,
|
|
|
- })
|
|
|
- info = {
|
|
|
- 'formats': formats,
|
|
|
- }
|
|
|
- else: # request JSON file
|
|
|
- if not document_id:
|
|
|
- video_id = self._search_regex(
|
|
|
- (r'/play/(?:config|media|sola)/(\d+)', r'contentId["\']\s*:\s*(\d+)'),
|
|
|
- webpage, 'media id', default=None)
|
|
|
- info = self._extract_media_info(
|
|
|
- 'http://www.ardmediathek.de/play/media/%s' % video_id,
|
|
|
- webpage, video_id)
|
|
|
-
|
|
|
- info.update({
|
|
|
- 'id': video_id,
|
|
|
- 'title': title,
|
|
|
- 'description': description,
|
|
|
- 'thumbnail': thumbnail,
|
|
|
- })
|
|
|
- info.update(self._ARD_extract_episode_info(info['title']))
|
|
|
-
|
|
|
- return info
|
|
|
-
|
|
|
-
|
|
|
class ARDIE(InfoExtractor):
|
|
|
_VALID_URL = r'(?P<mainurl>https?://(?:www\.)?daserste\.de/(?:[^/?#&]+/)+(?P<id>[^/?#&]+))\.html'
|
|
|
_TESTS = [{
|
|
@@ -399,21 +227,23 @@ class ARDIE(InfoExtractor):
|
|
|
}
|
|
|
|
|
|
|
|
|
-class ARDBetaMediathekIE(ARDMediathekBaseIE):
|
|
|
+class ARDBetaMediathekIE(InfoExtractor):
|
|
|
+ IE_NAME = 'ARDMediathek'
|
|
|
_VALID_URL = r'''(?x)https://
|
|
|
(?:(?:beta|www)\.)?ardmediathek\.de/
|
|
|
- (?:(?P<client>[^/]+)/)?
|
|
|
- (?:player|live|video|(?P<playlist>sendung|serie|sammlung))/
|
|
|
- (?:(?P<display_id>(?(playlist)[^?#]+?|[^?#]+))/)?
|
|
|
- (?P<id>(?(playlist)|Y3JpZDovL)[a-zA-Z0-9]+)
|
|
|
- (?(playlist)/(?P<season>\d+)?/?(?:[?#]|$))'''
|
|
|
+ (?:[^/]+/)?
|
|
|
+ (?:player|live|video)/
|
|
|
+ (?:(?P<display_id>[^?#]+)/)?
|
|
|
+ (?P<id>[a-zA-Z0-9]+)
|
|
|
+ /?(?:[?#]|$)'''
|
|
|
+ _GEO_COUNTRIES = ['DE']
|
|
|
|
|
|
_TESTS = [{
|
|
|
'url': 'https://www.ardmediathek.de/video/filme-im-mdr/liebe-auf-vier-pfoten/mdr-fernsehen/Y3JpZDovL21kci5kZS9zZW5kdW5nLzI4MjA0MC80MjIwOTEtNDAyNTM0',
|
|
|
'md5': 'b6e8ab03f2bcc6e1f9e6cef25fcc03c4',
|
|
|
'info_dict': {
|
|
|
'display_id': 'filme-im-mdr/liebe-auf-vier-pfoten/mdr-fernsehen',
|
|
|
- 'id': '12939099',
|
|
|
+ 'id': 'Y3JpZDovL21kci5kZS9zZW5kdW5nLzI4MjA0MC80MjIwOTEtNDAyNTM0',
|
|
|
'title': 'Liebe auf vier Pfoten',
|
|
|
'description': r're:^Claudia Schmitt, Anwältin in Salzburg',
|
|
|
'duration': 5222,
|
|
@@ -422,7 +252,10 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
|
|
|
'upload_date': '20231130',
|
|
|
'ext': 'mp4',
|
|
|
'episode': 'Liebe auf vier Pfoten',
|
|
|
- 'series': 'Filme im MDR'
|
|
|
+ 'series': 'Filme im MDR',
|
|
|
+ 'age_limit': 0,
|
|
|
+ 'channel': 'MDR',
|
|
|
+ '_old_archive_ids': ['ardbetamediathek 12939099'],
|
|
|
},
|
|
|
}, {
|
|
|
'url': 'https://www.ardmediathek.de/mdr/video/die-robuste-roswita/Y3JpZDovL21kci5kZS9iZWl0cmFnL2Ntcy84MWMxN2MzZC0wMjkxLTRmMzUtODk4ZS0wYzhlOWQxODE2NGI/',
|
|
@@ -443,7 +276,7 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
|
|
|
'url': 'https://www.ardmediathek.de/video/tagesschau-oder-tagesschau-20-00-uhr/das-erste/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhZ2Vzc2NoYXUvZmM4ZDUxMjgtOTE0ZC00Y2MzLTgzNzAtNDZkNGNiZWJkOTll',
|
|
|
'md5': '1e73ded21cb79bac065117e80c81dc88',
|
|
|
'info_dict': {
|
|
|
- 'id': '10049223',
|
|
|
+ 'id': 'Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhZ2Vzc2NoYXUvZmM4ZDUxMjgtOTE0ZC00Y2MzLTgzNzAtNDZkNGNiZWJkOTll',
|
|
|
'ext': 'mp4',
|
|
|
'title': 'tagesschau, 20:00 Uhr',
|
|
|
'timestamp': 1636398000,
|
|
@@ -454,6 +287,26 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
|
|
|
'episode': 'tagesschau, 20:00 Uhr',
|
|
|
'series': 'tagesschau',
|
|
|
'thumbnail': 'https://api.ardmediathek.de/image-service/images/urn:ard:image:fbb21142783b0a49?w=960&ch=ee69108ae344f678',
|
|
|
+ 'channel': 'ARD-Aktuell',
|
|
|
+ '_old_archive_ids': ['ardbetamediathek 10049223'],
|
|
|
+ },
|
|
|
+ }, {
|
|
|
+ 'url': 'https://www.ardmediathek.de/video/7-tage/7-tage-unter-harten-jungs/hr-fernsehen/N2I2YmM5MzgtNWFlOS00ZGFlLTg2NzMtYzNjM2JlNjk4MDg3',
|
|
|
+ 'md5': 'c428b9effff18ff624d4f903bda26315',
|
|
|
+ 'info_dict': {
|
|
|
+ 'id': 'N2I2YmM5MzgtNWFlOS00ZGFlLTg2NzMtYzNjM2JlNjk4MDg3',
|
|
|
+ 'ext': 'mp4',
|
|
|
+ 'duration': 2700,
|
|
|
+ 'episode': '7 Tage ... unter harten Jungs',
|
|
|
+ 'description': 'md5:0f215470dcd2b02f59f4bd10c963f072',
|
|
|
+ 'upload_date': '20231005',
|
|
|
+ 'timestamp': 1696491171,
|
|
|
+ 'display_id': '7-tage/7-tage-unter-harten-jungs/hr-fernsehen',
|
|
|
+ 'series': '7 Tage ...',
|
|
|
+ 'channel': 'HR',
|
|
|
+ 'thumbnail': 'https://api.ardmediathek.de/image-service/images/urn:ard:image:f6e6d5ffac41925c?w=960&ch=fa32ba69bc87989a',
|
|
|
+ 'title': '7 Tage ... unter harten Jungs',
|
|
|
+ '_old_archive_ids': ['ardbetamediathek 94834686'],
|
|
|
},
|
|
|
}, {
|
|
|
'url': 'https://beta.ardmediathek.de/ard/video/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE',
|
|
@@ -470,6 +323,176 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
|
|
|
}, {
|
|
|
'url': 'https://www.ardmediathek.de/swr/live/Y3JpZDovL3N3ci5kZS8xMzQ4MTA0Mg',
|
|
|
'only_matching': True,
|
|
|
+ }, {
|
|
|
+ 'url': 'https://www.ardmediathek.de/video/coronavirus-update-ndr-info/astrazeneca-kurz-lockdown-und-pims-syndrom-81/ndr/Y3JpZDovL25kci5kZS84NzE0M2FjNi0wMWEwLTQ5ODEtOTE5NS1mOGZhNzdhOTFmOTI/',
|
|
|
+ 'only_matching': True,
|
|
|
+ }]
|
|
|
+
|
|
|
+ def _extract_episode_info(self, title):
|
|
|
+ patterns = [
|
|
|
+ # Pattern for title like "Homo sapiens (S06/E07) - Originalversion"
|
|
|
+ # from: https://www.ardmediathek.de/one/sendung/doctor-who/Y3JpZDovL3dkci5kZS9vbmUvZG9jdG9yIHdobw
|
|
|
+ r'.*(?P<ep_info> \(S(?P<season_number>\d+)/E(?P<episode_number>\d+)\)).*',
|
|
|
+ # E.g.: title="Fritjof aus Norwegen (2) (AD)"
|
|
|
+ # from: https://www.ardmediathek.de/ard/sammlung/der-krieg-und-ich/68cMkqJdllm639Skj4c7sS/
|
|
|
+ r'.*(?P<ep_info> \((?:Folge |Teil )?(?P<episode_number>\d+)(?:/\d+)?\)).*',
|
|
|
+ r'.*(?P<ep_info>Folge (?P<episode_number>\d+)(?:\:| -|) )\"(?P<episode>.+)\".*',
|
|
|
+ # E.g.: title="Folge 25/42: Symmetrie"
|
|
|
+ # from: https://www.ardmediathek.de/ard/video/grips-mathe/folge-25-42-symmetrie/ard-alpha/Y3JpZDovL2JyLmRlL3ZpZGVvLzMyYzI0ZjczLWQ1N2MtNDAxNC05ZmZhLTFjYzRkZDA5NDU5OQ/
|
|
|
+ # E.g.: title="Folge 1063 - Vertrauen"
|
|
|
+ # from: https://www.ardmediathek.de/ard/sendung/die-fallers/Y3JpZDovL3N3ci5kZS8yMzAyMDQ4/
|
|
|
+ r'.*(?P<ep_info>Folge (?P<episode_number>\d+)(?:/\d+)?(?:\:| -|) ).*',
|
|
|
+ # As a fallback use the full title
|
|
|
+ r'(?P<title>.*)',
|
|
|
+ ]
|
|
|
+
|
|
|
+ return traverse_obj(patterns, (..., {partial(re.match, string=title)}, {
|
|
|
+ 'season_number': ('season_number', {int_or_none}),
|
|
|
+ 'episode_number': ('episode_number', {int_or_none}),
|
|
|
+ 'episode': ((
|
|
|
+ ('episode', {str_or_none}),
|
|
|
+ ('ep_info', {lambda x: title.replace(x, '')}),
|
|
|
+ ('title', {str}),
|
|
|
+ ), {str.strip}),
|
|
|
+ }), get_all=False)
|
|
|
+
|
|
|
+ def _real_extract(self, url):
|
|
|
+ video_id, display_id = self._match_valid_url(url).group('id', 'display_id')
|
|
|
+
|
|
|
+ page_data = self._download_json(
|
|
|
+ f'https://api.ardmediathek.de/page-gateway/pages/ard/item/{video_id}', video_id, query={
|
|
|
+ 'embedded': 'false',
|
|
|
+ 'mcV6': 'true',
|
|
|
+ })
|
|
|
+
|
|
|
+ player_data = traverse_obj(
|
|
|
+ page_data, ('widgets', lambda _, v: v['type'] in ('player_ondemand', 'player_live'), {dict}), get_all=False)
|
|
|
+ is_live = player_data.get('type') == 'player_live'
|
|
|
+ media_data = traverse_obj(player_data, ('mediaCollection', 'embedded', {dict}))
|
|
|
+
|
|
|
+ if player_data.get('blockedByFsk'):
|
|
|
+ self.raise_no_formats('This video is only available after 22:00', expected=True)
|
|
|
+
|
|
|
+ formats = []
|
|
|
+ subtitles = {}
|
|
|
+ for stream in traverse_obj(media_data, ('streams', ..., {dict})):
|
|
|
+ kind = stream.get('kind')
|
|
|
+ # Prioritize main stream over sign language and others
|
|
|
+ preference = 1 if kind == 'main' else None
|
|
|
+ for media in traverse_obj(stream, ('media', lambda _, v: url_or_none(v['url']))):
|
|
|
+ media_url = media['url']
|
|
|
+
|
|
|
+ audio_kind = traverse_obj(media, (
|
|
|
+ 'audios', 0, 'kind', {str}), default='').replace('standard', '')
|
|
|
+ lang_code = traverse_obj(media, ('audios', 0, 'languageCode', {str})) or 'deu'
|
|
|
+ lang = join_nonempty(lang_code, audio_kind)
|
|
|
+ language_preference = 10 if lang == 'deu' else -10
|
|
|
+
|
|
|
+ if determine_ext(media_url) == 'm3u8':
|
|
|
+ fmts, subs = self._extract_m3u8_formats_and_subtitles(
|
|
|
+ media_url, video_id, m3u8_id=f'hls-{kind}', preference=preference, fatal=False, live=is_live)
|
|
|
+ for f in fmts:
|
|
|
+ f['language'] = lang
|
|
|
+ f['language_preference'] = language_preference
|
|
|
+ formats.extend(fmts)
|
|
|
+ self._merge_subtitles(subs, target=subtitles)
|
|
|
+ else:
|
|
|
+ formats.append({
|
|
|
+ 'url': media_url,
|
|
|
+ 'format_id': f'http-{kind}',
|
|
|
+ 'preference': preference,
|
|
|
+ 'language': lang,
|
|
|
+ 'language_preference': language_preference,
|
|
|
+ **traverse_obj(media, {
|
|
|
+ 'format_note': ('forcedLabel', {str}),
|
|
|
+ 'width': ('maxHResolutionPx', {int_or_none}),
|
|
|
+ 'height': ('maxVResolutionPx', {int_or_none}),
|
|
|
+ 'vcodec': ('videoCodec', {str}),
|
|
|
+ }),
|
|
|
+ })
|
|
|
+
|
|
|
+ for sub in traverse_obj(media_data, ('subtitles', ..., {dict})):
|
|
|
+ for sources in traverse_obj(sub, ('sources', lambda _, v: url_or_none(v['url']))):
|
|
|
+ subtitles.setdefault(sub.get('languageCode') or 'deu', []).append({
|
|
|
+ 'url': sources['url'],
|
|
|
+ 'ext': {'webvtt': 'vtt', 'ebutt': 'ttml'}.get(sources.get('kind')),
|
|
|
+ })
|
|
|
+
|
|
|
+ age_limit = traverse_obj(page_data, ('fskRating', {lambda x: remove_start(x, 'FSK')}, {int_or_none}))
|
|
|
+ old_id = traverse_obj(page_data, ('tracking', 'atiCustomVars', 'contentId'))
|
|
|
+
|
|
|
+ return {
|
|
|
+ 'id': video_id,
|
|
|
+ 'display_id': display_id,
|
|
|
+ 'formats': formats,
|
|
|
+ 'subtitles': subtitles,
|
|
|
+ 'is_live': is_live,
|
|
|
+ 'age_limit': age_limit,
|
|
|
+ **traverse_obj(media_data, ('meta', {
|
|
|
+ 'title': 'title',
|
|
|
+ 'description': 'synopsis',
|
|
|
+ 'timestamp': ('broadcastedOnDateTime', {parse_iso8601}),
|
|
|
+ 'series': 'seriesTitle',
|
|
|
+ 'thumbnail': ('images', 0, 'url', {url_or_none}),
|
|
|
+ 'duration': ('durationSeconds', {int_or_none}),
|
|
|
+ 'channel': 'clipSourceName',
|
|
|
+ })),
|
|
|
+ **self._extract_episode_info(page_data.get('title')),
|
|
|
+ '_old_archive_ids': [make_archive_id(ARDBetaMediathekIE, old_id)],
|
|
|
+ }
|
|
|
+
|
|
|
+
|
|
|
+class ARDMediathekCollectionIE(InfoExtractor):
|
|
|
+ _VALID_URL = r'''(?x)https://
|
|
|
+ (?:(?:beta|www)\.)?ardmediathek\.de/
|
|
|
+ (?:[^/?#]+/)?
|
|
|
+ (?P<playlist>sendung|serie|sammlung)/
|
|
|
+ (?:(?P<display_id>[^?#]+?)/)?
|
|
|
+ (?P<id>[a-zA-Z0-9]+)
|
|
|
+ (?:/(?P<season>\d+)(?:/(?P<version>OV|AD))?)?/?(?:[?#]|$)'''
|
|
|
+ _GEO_COUNTRIES = ['DE']
|
|
|
+
|
|
|
+ _TESTS = [{
|
|
|
+ 'url': 'https://www.ardmediathek.de/serie/quiz/staffel-1-originalversion/Y3JpZDovL3dkci5kZS9vbmUvcXVpeg/1/OV',
|
|
|
+ 'info_dict': {
|
|
|
+ 'id': 'Y3JpZDovL3dkci5kZS9vbmUvcXVpeg_1_OV',
|
|
|
+ 'display_id': 'quiz/staffel-1-originalversion',
|
|
|
+ 'title': 'Staffel 1 Originalversion',
|
|
|
+ },
|
|
|
+ 'playlist_count': 3,
|
|
|
+ }, {
|
|
|
+ 'url': 'https://www.ardmediathek.de/serie/babylon-berlin/staffel-4-mit-audiodeskription/Y3JpZDovL2Rhc2Vyc3RlLmRlL2JhYnlsb24tYmVybGlu/4/AD',
|
|
|
+ 'info_dict': {
|
|
|
+ 'id': 'Y3JpZDovL2Rhc2Vyc3RlLmRlL2JhYnlsb24tYmVybGlu_4_AD',
|
|
|
+ 'display_id': 'babylon-berlin/staffel-4-mit-audiodeskription',
|
|
|
+ 'title': 'Staffel 4 mit Audiodeskription',
|
|
|
+ },
|
|
|
+ 'playlist_count': 12,
|
|
|
+ }, {
|
|
|
+ 'url': 'https://www.ardmediathek.de/serie/babylon-berlin/staffel-1/Y3JpZDovL2Rhc2Vyc3RlLmRlL2JhYnlsb24tYmVybGlu/1/',
|
|
|
+ 'info_dict': {
|
|
|
+ 'id': 'Y3JpZDovL2Rhc2Vyc3RlLmRlL2JhYnlsb24tYmVybGlu_1',
|
|
|
+ 'display_id': 'babylon-berlin/staffel-1',
|
|
|
+ 'title': 'Staffel 1',
|
|
|
+ },
|
|
|
+ 'playlist_count': 8,
|
|
|
+ }, {
|
|
|
+ 'url': 'https://www.ardmediathek.de/sendung/tatort/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydA',
|
|
|
+ 'info_dict': {
|
|
|
+ 'id': 'Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydA',
|
|
|
+ 'display_id': 'tatort',
|
|
|
+ 'title': 'Tatort',
|
|
|
+ },
|
|
|
+ 'playlist_mincount': 500,
|
|
|
+ }, {
|
|
|
+ 'url': 'https://www.ardmediathek.de/sammlung/die-kirche-bleibt-im-dorf/5eOHzt8XB2sqeFXbIoJlg2',
|
|
|
+ 'info_dict': {
|
|
|
+ 'id': '5eOHzt8XB2sqeFXbIoJlg2',
|
|
|
+ 'display_id': 'die-kirche-bleibt-im-dorf',
|
|
|
+ 'title': 'Die Kirche bleibt im Dorf',
|
|
|
+ 'description': 'Die Kirche bleibt im Dorf',
|
|
|
+ },
|
|
|
+ 'playlist_count': 4,
|
|
|
}, {
|
|
|
# playlist of type 'sendung'
|
|
|
'url': 'https://www.ardmediathek.de/ard/sendung/doctor-who/Y3JpZDovL3dkci5kZS9vbmUvZG9jdG9yIHdobw/',
|
|
@@ -482,196 +505,48 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
|
|
|
# playlist of type 'sammlung'
|
|
|
'url': 'https://www.ardmediathek.de/ard/sammlung/team-muenster/5JpTzLSbWUAK8184IOvEir/',
|
|
|
'only_matching': True,
|
|
|
- }, {
|
|
|
- 'url': 'https://www.ardmediathek.de/video/coronavirus-update-ndr-info/astrazeneca-kurz-lockdown-und-pims-syndrom-81/ndr/Y3JpZDovL25kci5kZS84NzE0M2FjNi0wMWEwLTQ5ODEtOTE5NS1mOGZhNzdhOTFmOTI/',
|
|
|
- 'only_matching': True,
|
|
|
- }, {
|
|
|
- 'url': 'https://www.ardmediathek.de/ard/player/Y3JpZDovL3dkci5kZS9CZWl0cmFnLWQ2NDJjYWEzLTMwZWYtNGI4NS1iMTI2LTU1N2UxYTcxOGIzOQ/tatort-duo-koeln-leipzig-ihr-kinderlein-kommet',
|
|
|
- 'only_matching': True,
|
|
|
}]
|
|
|
|
|
|
- def _ARD_load_playlist_snippet(self, playlist_id, display_id, client, mode, page_number):
|
|
|
- """ Query the ARD server for playlist information
|
|
|
- and returns the data in "raw" format """
|
|
|
- assert mode in ('sendung', 'serie', 'sammlung')
|
|
|
- if mode in ('sendung', 'serie'):
|
|
|
- graphQL = json.dumps({
|
|
|
- 'query': '''{
|
|
|
- showPage(
|
|
|
- client: "%s"
|
|
|
- showId: "%s"
|
|
|
- pageNumber: %d
|
|
|
- ) {
|
|
|
- pagination {
|
|
|
- pageSize
|
|
|
- totalElements
|
|
|
- }
|
|
|
- teasers { # Array
|
|
|
- mediumTitle
|
|
|
- links { target { id href title } }
|
|
|
- type
|
|
|
- }
|
|
|
- }}''' % (client, playlist_id, page_number),
|
|
|
- }).encode()
|
|
|
- else: # mode == 'sammlung'
|
|
|
- graphQL = json.dumps({
|
|
|
- 'query': '''{
|
|
|
- morePage(
|
|
|
- client: "%s"
|
|
|
- compilationId: "%s"
|
|
|
- pageNumber: %d
|
|
|
- ) {
|
|
|
- widget {
|
|
|
- pagination {
|
|
|
- pageSize
|
|
|
- totalElements
|
|
|
- }
|
|
|
- teasers { # Array
|
|
|
- mediumTitle
|
|
|
- links { target { id href title } }
|
|
|
- type
|
|
|
- }
|
|
|
- }
|
|
|
- }}''' % (client, playlist_id, page_number),
|
|
|
- }).encode()
|
|
|
- # Ressources for ARD graphQL debugging:
|
|
|
- # https://api-test.ardmediathek.de/public-gateway
|
|
|
- show_page = self._download_json(
|
|
|
- 'https://api.ardmediathek.de/public-gateway',
|
|
|
- '[Playlist] %s' % display_id,
|
|
|
- data=graphQL,
|
|
|
- headers={'Content-Type': 'application/json'})['data']
|
|
|
- # align the structure of the returned data:
|
|
|
- if mode in ('sendung', 'serie'):
|
|
|
- show_page = show_page['showPage']
|
|
|
- else: # mode == 'sammlung'
|
|
|
- show_page = show_page['morePage']['widget']
|
|
|
- return show_page
|
|
|
-
|
|
|
- def _ARD_extract_playlist(self, url, playlist_id, display_id, client, mode):
|
|
|
- """ Collects all playlist entries and returns them as info dict.
|
|
|
- Supports playlists of mode 'sendung', 'serie', and 'sammlung',
|
|
|
- as well as nested playlists. """
|
|
|
- entries = []
|
|
|
- pageNumber = 0
|
|
|
- while True: # iterate by pageNumber
|
|
|
- show_page = self._ARD_load_playlist_snippet(
|
|
|
- playlist_id, display_id, client, mode, pageNumber)
|
|
|
- for teaser in show_page['teasers']: # process playlist items
|
|
|
- if '/compilation/' in teaser['links']['target']['href']:
|
|
|
- # alternativ cond.: teaser['type'] == "compilation"
|
|
|
- # => This is an nested compilation, e.g. like:
|
|
|
- # https://www.ardmediathek.de/ard/sammlung/die-kirche-bleibt-im-dorf/5eOHzt8XB2sqeFXbIoJlg2/
|
|
|
- link_mode = 'sammlung'
|
|
|
- else:
|
|
|
- link_mode = 'video'
|
|
|
-
|
|
|
- item_url = 'https://www.ardmediathek.de/%s/%s/%s/%s/%s' % (
|
|
|
- client, link_mode, display_id,
|
|
|
- # perform HTLM quoting of episode title similar to ARD:
|
|
|
- re.sub('^-|-$', '', # remove '-' from begin/end
|
|
|
- re.sub('[^a-zA-Z0-9]+', '-', # replace special chars by -
|
|
|
- teaser['links']['target']['title'].lower()
|
|
|
- .replace('ä', 'ae').replace('ö', 'oe')
|
|
|
- .replace('ü', 'ue').replace('ß', 'ss'))),
|
|
|
- teaser['links']['target']['id'])
|
|
|
- entries.append(self.url_result(
|
|
|
- item_url,
|
|
|
- ie=ARDBetaMediathekIE.ie_key()))
|
|
|
-
|
|
|
- if (show_page['pagination']['pageSize'] * (pageNumber + 1)
|
|
|
- >= show_page['pagination']['totalElements']):
|
|
|
- # we've processed enough pages to get all playlist entries
|
|
|
- break
|
|
|
- pageNumber = pageNumber + 1
|
|
|
-
|
|
|
- return self.playlist_result(entries, playlist_id, playlist_title=display_id)
|
|
|
+ _PAGE_SIZE = 100
|
|
|
|
|
|
def _real_extract(self, url):
|
|
|
- video_id, display_id, playlist_type, client, season_number = self._match_valid_url(url).group(
|
|
|
- 'id', 'display_id', 'playlist', 'client', 'season')
|
|
|
- display_id, client = display_id or video_id, client or 'ard'
|
|
|
-
|
|
|
- if playlist_type:
|
|
|
- # TODO: Extract only specified season
|
|
|
- return self._ARD_extract_playlist(url, video_id, display_id, client, playlist_type)
|
|
|
-
|
|
|
- player_page = self._download_json(
|
|
|
- 'https://api.ardmediathek.de/public-gateway',
|
|
|
- display_id, data=json.dumps({
|
|
|
- 'query': '''{
|
|
|
- playerPage(client:"%s", clipId: "%s") {
|
|
|
- blockedByFsk
|
|
|
- broadcastedOn
|
|
|
- maturityContentRating
|
|
|
- mediaCollection {
|
|
|
- _duration
|
|
|
- _geoblocked
|
|
|
- _isLive
|
|
|
- _mediaArray {
|
|
|
- _mediaStreamArray {
|
|
|
- _quality
|
|
|
- _server
|
|
|
- _stream
|
|
|
- }
|
|
|
- }
|
|
|
- _previewImage
|
|
|
- _subtitleUrl
|
|
|
- _type
|
|
|
- }
|
|
|
- show {
|
|
|
- title
|
|
|
- }
|
|
|
- image {
|
|
|
- src
|
|
|
- }
|
|
|
- synopsis
|
|
|
- title
|
|
|
- tracking {
|
|
|
- atiCustomVars {
|
|
|
- contentId
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
-}''' % (client, video_id),
|
|
|
- }).encode(), headers={
|
|
|
- 'Content-Type': 'application/json'
|
|
|
- })['data']['playerPage']
|
|
|
- title = player_page['title']
|
|
|
- content_id = str_or_none(try_get(
|
|
|
- player_page, lambda x: x['tracking']['atiCustomVars']['contentId']))
|
|
|
- media_collection = player_page.get('mediaCollection') or {}
|
|
|
- if not media_collection and content_id:
|
|
|
- media_collection = self._download_json(
|
|
|
- 'https://www.ardmediathek.de/play/media/' + content_id,
|
|
|
- content_id, fatal=False) or {}
|
|
|
- info = self._parse_media_info(
|
|
|
- media_collection, content_id or video_id,
|
|
|
- player_page.get('blockedByFsk'))
|
|
|
- age_limit = None
|
|
|
- description = player_page.get('synopsis')
|
|
|
- maturity_content_rating = player_page.get('maturityContentRating')
|
|
|
- if maturity_content_rating:
|
|
|
- age_limit = int_or_none(maturity_content_rating.lstrip('FSK'))
|
|
|
- if not age_limit and description:
|
|
|
- age_limit = int_or_none(self._search_regex(
|
|
|
- r'\(FSK\s*(\d+)\)\s*$', description, 'age limit', default=None))
|
|
|
- info.update({
|
|
|
- 'age_limit': age_limit,
|
|
|
- 'display_id': display_id,
|
|
|
- 'title': title,
|
|
|
- 'description': description,
|
|
|
- 'timestamp': unified_timestamp(player_page.get('broadcastedOn')),
|
|
|
- 'series': try_get(player_page, lambda x: x['show']['title']),
|
|
|
- 'thumbnail': (media_collection.get('_previewImage')
|
|
|
- or try_get(player_page, lambda x: update_url(x['image']['src'], query=None, fragment=None))
|
|
|
- or self.get_thumbnail_from_html(display_id, url)),
|
|
|
- })
|
|
|
- info.update(self._ARD_extract_episode_info(info['title']))
|
|
|
- return info
|
|
|
-
|
|
|
- def get_thumbnail_from_html(self, display_id, url):
|
|
|
- webpage = self._download_webpage(url, display_id, fatal=False) or ''
|
|
|
- return (
|
|
|
- self._og_search_thumbnail(webpage, default=None)
|
|
|
- or self._html_search_meta('thumbnailUrl', webpage, default=None))
|
|
|
+ playlist_id, display_id, playlist_type, season_number, version = self._match_valid_url(url).group(
|
|
|
+ 'id', 'display_id', 'playlist', 'season', 'version')
|
|
|
+
|
|
|
+ def call_api(page_num):
|
|
|
+ api_path = 'compilations/ard' if playlist_type == 'sammlung' else 'widgets/ard/asset'
|
|
|
+ return self._download_json(
|
|
|
+ f'https://api.ardmediathek.de/page-gateway/{api_path}/{playlist_id}', playlist_id,
|
|
|
+ f'Downloading playlist page {page_num}', query={
|
|
|
+ 'pageNumber': page_num,
|
|
|
+ 'pageSize': self._PAGE_SIZE,
|
|
|
+ **({
|
|
|
+ 'seasoned': 'true',
|
|
|
+ 'seasonNumber': season_number,
|
|
|
+ 'withOriginalversion': 'true' if version == 'OV' else 'false',
|
|
|
+ 'withAudiodescription': 'true' if version == 'AD' else 'false',
|
|
|
+ } if season_number else {}),
|
|
|
+ })
|
|
|
+
|
|
|
+ def fetch_page(page_num):
|
|
|
+ for item in traverse_obj(call_api(page_num), ('teasers', ..., {dict})):
|
|
|
+ item_id = traverse_obj(item, ('links', 'target', ('urlId', 'id')), 'id', get_all=False)
|
|
|
+ if not item_id or item_id == playlist_id:
|
|
|
+ continue
|
|
|
+ item_mode = 'sammlung' if item.get('type') == 'compilation' else 'video'
|
|
|
+ yield self.url_result(
|
|
|
+ f'https://www.ardmediathek.de/{item_mode}/{item_id}',
|
|
|
+ ie=(ARDMediathekCollectionIE if item_mode == 'sammlung' else ARDBetaMediathekIE),
|
|
|
+ **traverse_obj(item, {
|
|
|
+ 'id': ('id', {str}),
|
|
|
+ 'title': ('longTitle', {str}),
|
|
|
+ 'duration': ('duration', {int_or_none}),
|
|
|
+ 'timestamp': ('broadcastedOn', {parse_iso8601}),
|
|
|
+ }))
|
|
|
+
|
|
|
+ page_data = call_api(0)
|
|
|
+ full_id = join_nonempty(playlist_id, season_number, version, delim='_')
|
|
|
+
|
|
|
+ return self.playlist_result(
|
|
|
+ OnDemandPagedList(fetch_page, self._PAGE_SIZE), full_id, display_id=display_id,
|
|
|
+ title=page_data.get('title'), description=page_data.get('synopsis'))
|