Browse Source

[ie/mlbtv] Fix extractor (#10515)

Closes #10510
Authored by: bashonly
bashonly 7 months ago
parent
commit
f0993391e6
1 changed files with 181 additions and 44 deletions
  1. 181 44
      yt_dlp/extractor/mlb.py

+ 181 - 44
yt_dlp/extractor/mlb.py

@@ -1,16 +1,21 @@
+import json
 import re
-import urllib.parse
+import time
 import uuid
 
 from .common import InfoExtractor
+from ..networking.exceptions import HTTPError
 from ..utils import (
+    ExtractorError,
     determine_ext,
     int_or_none,
     join_nonempty,
+    jwt_decode_hs256,
     parse_duration,
     parse_iso8601,
     try_get,
     url_or_none,
+    urlencode_postdata,
 )
 from ..utils.traversal import traverse_obj
 
@@ -276,81 +281,213 @@ class MLBVideoIE(MLBBaseIE):
 class MLBTVIE(InfoExtractor):
     _VALID_URL = r'https?://(?:www\.)?mlb\.com/tv/g(?P<id>\d{6})'
     _NETRC_MACHINE = 'mlb'
-
     _TESTS = [{
         'url': 'https://www.mlb.com/tv/g661581/vee2eff5f-a7df-4c20-bdb4-7b926fa12638',
         'info_dict': {
             'id': '661581',
             'ext': 'mp4',
             'title': '2022-07-02 - St. Louis Cardinals @ Philadelphia Phillies',
+            'release_date': '20220702',
+            'release_timestamp': 1656792300,
         },
         'params': {
             'skip_download': True,
         },
     }]
+    _GRAPHQL_INIT_QUERY = '''\
+mutation initSession($device: InitSessionInput!, $clientType: ClientType!, $experience: ExperienceTypeInput) {
+    initSession(device: $device, clientType: $clientType, experience: $experience) {
+        deviceId
+        sessionId
+        entitlements {
+            code
+        }
+        location {
+            countryCode
+            regionName
+            zipCode
+            latitude
+            longitude
+        }
+        clientExperience
+        features
+    }
+  }'''
+    _GRAPHQL_PLAYBACK_QUERY = '''\
+mutation initPlaybackSession(
+        $adCapabilities: [AdExperienceType]
+        $mediaId: String!
+        $deviceId: String!
+        $sessionId: String!
+        $quality: PlaybackQuality
+    ) {
+        initPlaybackSession(
+            adCapabilities: $adCapabilities
+            mediaId: $mediaId
+            deviceId: $deviceId
+            sessionId: $sessionId
+            quality: $quality
+        ) {
+            playbackSessionId
+            playback {
+                url
+                token
+                expiration
+                cdn
+            }
+        }
+    }'''
+    _APP_VERSION = '7.8.2'
+    _device_id = None
+    _session_id = None
     _access_token = None
+    _token_expiry = 0
+
+    @property
+    def _api_headers(self):
+        if (self._token_expiry - 120) <= time.time():
+            self.write_debug('Access token has expired; re-logging in')
+            self._perform_login(*self._get_login_info())
+        return {'Authorization': f'Bearer {self._access_token}'}
 
     def _real_initialize(self):
         if not self._access_token:
             self.raise_login_required(
                 'All videos are only available to registered users', method='password')
 
-    def _perform_login(self, username, password):
-        data = f'grant_type=password&username={urllib.parse.quote(username)}&password={urllib.parse.quote(password)}&scope=openid offline_access&client_id=0oa3e1nutA1HLzAKG356'
-        access_token = self._download_json(
-            'https://ids.mlb.com/oauth2/aus1m088yK07noBfh356/v1/token', None,
-            headers={
-                'User-Agent': 'okhttp/3.12.1',
-                'Content-Type': 'application/x-www-form-urlencoded',
-            }, data=data.encode())['access_token']
+    def _set_device_id(self, username):
+        if not self._device_id:
+            self._device_id = self.cache.load(
+                self._NETRC_MACHINE, 'device_ids', default={}).get(username)
+        if self._device_id:
+            return
+        self._device_id = str(uuid.uuid4())
+        self.cache.store(self._NETRC_MACHINE, 'device_ids', {username: self._device_id})
 
-        entitlement = self._download_webpage(
-            f'https://media-entitlement.mlb.com/api/v3/jwt?os=Android&appname=AtBat&did={uuid.uuid4()}', None,
-            headers={
-                'User-Agent': 'okhttp/3.12.1',
-                'Authorization': f'Bearer {access_token}',
-            })
+    def _perform_login(self, username, password):
+        try:
+            self._access_token = self._download_json(
+                'https://ids.mlb.com/oauth2/aus1m088yK07noBfh356/v1/token', None,
+                'Logging in', 'Unable to log in', headers={
+                    'User-Agent': 'okhttp/3.12.1',
+                    'Content-Type': 'application/x-www-form-urlencoded',
+                }, data=urlencode_postdata({
+                    'grant_type': 'password',
+                    'username': username,
+                    'password': password,
+                    'scope': 'openid offline_access',
+                    'client_id': '0oa3e1nutA1HLzAKG356',
+                }))['access_token']
+        except ExtractorError as error:
+            if isinstance(error.cause, HTTPError) and error.cause.status == 400:
+                raise ExtractorError('Invalid username or password', expected=True)
+            raise
+
+        self._token_expiry = traverse_obj(self._access_token, ({jwt_decode_hs256}, 'exp', {int})) or 0
+        self._set_device_id(username)
+
+        self._session_id = self._call_api({
+            'operationName': 'initSession',
+            'query': self._GRAPHQL_INIT_QUERY,
+            'variables': {
+                'device': {
+                    'appVersion': self._APP_VERSION,
+                    'deviceFamily': 'desktop',
+                    'knownDeviceId': self._device_id,
+                    'languagePreference': 'ENGLISH',
+                    'manufacturer': '',
+                    'model': '',
+                    'os': '',
+                    'osVersion': '',
+                },
+                'clientType': 'WEB',
+            },
+        }, None, 'session ID')['data']['initSession']['sessionId']
 
-        data = f'grant_type=urn:ietf:params:oauth:grant-type:token-exchange&subject_token={entitlement}&subject_token_type=urn:ietf:params:oauth:token-type:jwt&platform=android-tv'
-        self._access_token = self._download_json(
-            'https://us.edge.bamgrid.com/token', None,
+    def _call_api(self, data, video_id, description='GraphQL JSON', fatal=True):
+        return self._download_json(
+            'https://media-gateway.mlb.com/graphql', video_id,
+            f'Downloading {description}', f'Unable to download {description}', fatal=fatal,
             headers={
+                **self._api_headers,
                 'Accept': 'application/json',
-                'Authorization': 'Bearer bWxidHYmYW5kcm9pZCYxLjAuMA.6LZMbH2r--rbXcgEabaDdIslpo4RyZrlVfWZhsAgXIk',
-                'Content-Type': 'application/x-www-form-urlencoded',
-            }, data=data.encode())['access_token']
+                'Content-Type': 'application/json',
+                'x-client-name': 'WEB',
+                'x-client-version': self._APP_VERSION,
+            }, data=json.dumps(data, separators=(',', ':')).encode())
+
+    def _extract_formats_and_subtitles(self, broadcast, video_id):
+        feed = traverse_obj(broadcast, ('homeAway', {str.title}))
+        medium = traverse_obj(broadcast, ('type', {str}))
+        language = traverse_obj(broadcast, ('language', {str.lower}))
+        format_id = join_nonempty(feed, medium, language)
+
+        response = self._call_api({
+            'operationName': 'initPlaybackSession',
+            'query': self._GRAPHQL_PLAYBACK_QUERY,
+            'variables': {
+                'adCapabilities': ['GOOGLE_STANDALONE_AD_PODS'],
+                'deviceId': self._device_id,
+                'mediaId': broadcast['mediaId'],
+                'quality': 'PLACEHOLDER',
+                'sessionId': self._session_id,
+            },
+        }, video_id, f'{format_id} broadcast JSON', fatal=False)
+
+        playback = traverse_obj(response, ('data', 'initPlaybackSession', 'playback', {dict}))
+        m3u8_url = traverse_obj(playback, ('url', {url_or_none}))
+        token = traverse_obj(playback, ('token', {str}))
+
+        if not (m3u8_url and token):
+            errors = '; '.join(traverse_obj(response, ('errors', ..., 'message', {str})))
+            if 'not entitled' in errors:
+                raise ExtractorError(errors, expected=True)
+            elif errors:  # Only warn when 'blacked out' since radio formats are available
+                self.report_warning(f'API returned errors for {format_id}: {errors}')
+            else:
+                self.report_warning(f'No formats available for {format_id} broadcast; skipping')
+            return [], {}
+
+        cdn_headers = {'x-cdn-token': token}
+        fmts, subs = self._extract_m3u8_formats_and_subtitles(
+            m3u8_url.replace(f'/{token}/', '/'), video_id, 'mp4',
+            m3u8_id=format_id, fatal=False, headers=cdn_headers)
+        for fmt in fmts:
+            fmt['http_headers'] = cdn_headers
+            fmt.setdefault('format_note', join_nonempty(feed, medium, delim=' '))
+            fmt.setdefault('language', language)
+            if fmt.get('vcodec') == 'none' and fmt['language'] == 'en':
+                fmt['source_preference'] = 10
+
+        return fmts, subs
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
-        airings = self._download_json(
-            f'https://search-api-mlbtv.mlb.com/svc/search/v2/graphql/persisted/query/core/Airings?variables=%7B%22partnerProgramIds%22%3A%5B%22{video_id}%22%5D%2C%22applyEsniMediaRightsLabels%22%3Atrue%7D',
-            video_id)['data']['Airings']
+        metadata = traverse_obj(self._download_json(
+            'https://statsapi.mlb.com/api/v1/schedule', video_id, query={
+                'gamePk': video_id,
+                'hydrate': 'broadcasts(all),statusFlags',
+            }), ('dates', ..., 'games', lambda _, v: str(v['gamePk']) == video_id and v['broadcasts'], any))
+
+        broadcasts = traverse_obj(metadata, (
+            'broadcasts', lambda _, v: v['mediaId'] and v['mediaState']['mediaStateCode'] != 'MEDIA_OFF'))
 
         formats, subtitles = [], {}
-        for airing in traverse_obj(airings, lambda _, v: v['playbackUrls'][0]['href']):
-            format_id = join_nonempty('feedType', 'feedLanguage', from_dict=airing)
-            m3u8_url = traverse_obj(self._download_json(
-                airing['playbackUrls'][0]['href'].format(scenario='browser~csai'), video_id,
-                note=f'Downloading {format_id} stream info JSON',
-                errnote=f'Failed to download {format_id} stream info, skipping',
-                fatal=False, headers={
-                    'Authorization': self._access_token,
-                    'Accept': 'application/vnd.media-service+json; version=2',
-                }), ('stream', 'complete', {url_or_none}))
-            if not m3u8_url:
-                continue
-            f, s = self._extract_m3u8_formats_and_subtitles(
-                m3u8_url, video_id, 'mp4', m3u8_id=format_id, fatal=False)
-            formats.extend(f)
-            self._merge_subtitles(s, target=subtitles)
+        for broadcast in broadcasts:
+            fmts, subs = self._extract_formats_and_subtitles(broadcast, video_id)
+            formats.extend(fmts)
+            self._merge_subtitles(subs, target=subtitles)
 
         return {
             'id': video_id,
-            'title': traverse_obj(airings, (..., 'titles', 0, 'episodeName'), get_all=False),
-            'is_live': traverse_obj(airings, (..., 'mediaConfig', 'productType'), get_all=False) == 'LIVE',
+            'title': join_nonempty(
+                traverse_obj(metadata, ('officialDate', {str})),
+                traverse_obj(metadata, ('teams', ('away', 'home'), 'team', 'name', {str}, all, {' @ '.join})),
+                delim=' - '),
+            'is_live': traverse_obj(broadcasts, (..., 'mediaState', 'mediaStateCode', {str}, any)) == 'MEDIA_ON',
+            'release_timestamp': traverse_obj(metadata, ('gameDate', {parse_iso8601})),
             'formats': formats,
             'subtitles': subtitles,
-            'http_headers': {'Authorization': f'Bearer {self._access_token}'},
         }