# nexx.py
  1. import hashlib
  2. import random
  3. import re
  4. import time
  5. from .common import InfoExtractor
  6. from ..utils import (
  7. ExtractorError,
  8. int_or_none,
  9. parse_duration,
  10. srt_subtitles_timecode,
  11. traverse_obj,
  12. try_get,
  13. urlencode_postdata,
  14. )
  15. class NexxIE(InfoExtractor):
  16. _VALID_URL = r'''(?x)
  17. (?:
  18. https?://api\.nexx(?:\.cloud|cdn\.com)/v3(?:\.\d)?/(?P<domain_id>\d+)/videos/byid/|
  19. nexx:(?:(?P<domain_id_s>\d+):)?|
  20. https?://arc\.nexx\.cloud/api/video/
  21. )
  22. (?P<id>\d+)
  23. '''
  24. _TESTS = [{
  25. # movie
  26. 'url': 'https://api.nexx.cloud/v3/748/videos/byid/128907',
  27. 'md5': '31899fd683de49ad46f4ee67e53e83fe',
  28. 'info_dict': {
  29. 'id': '128907',
  30. 'ext': 'mp4',
  31. 'title': 'Stiftung Warentest',
  32. 'alt_title': 'Wie ein Test abläuft',
  33. 'description': 'md5:d1ddb1ef63de721132abd38639cc2fd2',
  34. 'creator': 'SPIEGEL TV',
  35. 'thumbnail': r're:^https?://.*\.jpg$',
  36. 'duration': 2509,
  37. 'timestamp': 1384264416,
  38. 'upload_date': '20131112',
  39. },
  40. 'skip': 'Spiegel nexx CDNs are now disabled',
  41. }, {
  42. # episode with captions
  43. 'url': 'https://api.nexx.cloud/v3.1/741/videos/byid/1701834',
  44. 'info_dict': {
  45. 'id': '1701834',
  46. 'ext': 'mp4',
  47. 'title': 'Mein Leben mit \'nem TikTok E-Boy 😤',
  48. 'alt_title': 'Mein Leben mit \'nem TikTok E-Boy 😤',
  49. 'description': 'md5:f84f395a881fd143f952c892deab528d',
  50. 'thumbnail': r're:^https?://.*\.jpg$',
  51. 'duration': 770,
  52. 'timestamp': 1595600027,
  53. 'upload_date': '20200724',
  54. 'episode_number': 2,
  55. 'season_number': 2,
  56. 'episode': 'Episode 2',
  57. 'season': 'Season 2',
  58. },
  59. 'params': {
  60. 'skip_download': True,
  61. },
  62. }, {
  63. 'url': 'nexx:741:1269984',
  64. 'md5': 'd5f14e14b592501e51addd5abef95a7f',
  65. 'info_dict': {
  66. 'id': '1269984',
  67. 'ext': 'mp4',
  68. 'title': '1 TAG ohne KLO... wortwörtlich! ?',
  69. 'alt_title': '1 TAG ohne KLO... wortwörtlich! ?',
  70. 'description': 'md5:2016393a31991a900946432ccdd09a6f',
  71. 'thumbnail': r're:^https?://.*\.jpg$',
  72. 'duration': 607,
  73. 'timestamp': 1518614955,
  74. 'upload_date': '20180214',
  75. },
  76. }, {
  77. # free cdn from http://www.spiegel.de/video/eifel-zoo-aufregung-um-ausgebrochene-raubtiere-video-99018031.html
  78. 'url': 'nexx:747:1533779',
  79. 'md5': '6bf6883912b82b7069fb86c2297e9893',
  80. 'info_dict': {
  81. 'id': '1533779',
  82. 'ext': 'mp4',
  83. 'title': 'Aufregung um ausgebrochene Raubtiere',
  84. 'alt_title': 'Eifel-Zoo',
  85. 'description': 'md5:f21375c91c74ad741dcb164c427999d2',
  86. 'thumbnail': r're:^https?://.*\.jpg$',
  87. 'duration': 111,
  88. 'timestamp': 1527874460,
  89. 'upload_date': '20180601',
  90. },
  91. 'skip': 'Spiegel nexx CDNs are now disabled',
  92. }, {
  93. 'url': 'https://api.nexxcdn.com/v3/748/videos/byid/128907',
  94. 'only_matching': True,
  95. }, {
  96. 'url': 'nexx:748:128907',
  97. 'only_matching': True,
  98. }, {
  99. 'url': 'nexx:128907',
  100. 'only_matching': True,
  101. }, {
  102. 'url': 'https://arc.nexx.cloud/api/video/128907.json',
  103. 'only_matching': True,
  104. }]
  105. @staticmethod
  106. def _extract_domain_id(webpage):
  107. mobj = re.search(
  108. r'<script\b[^>]+\bsrc=["\'](?:https?:)?//(?:require|arc)\.nexx(?:\.cloud|cdn\.com)/(?:sdk/)?(?P<id>\d+)',
  109. webpage)
  110. return mobj.group('id') if mobj else None
  111. @classmethod
  112. def _extract_embed_urls(cls, url, webpage):
  113. # Reference:
  114. # 1. https://nx-s.akamaized.net/files/201510/44.pdf
  115. entries = []
  116. # JavaScript Integration
  117. domain_id = NexxIE._extract_domain_id(webpage)
  118. if domain_id:
  119. for video_id in re.findall(
  120. r'(?is)onPLAYReady.+?_play\.(?:init|(?:control\.)?addPlayer)\s*\(.+?\s*,\s*["\']?(\d+)',
  121. webpage):
  122. entries.append(
  123. f'https://api.nexx.cloud/v3/{domain_id}/videos/byid/{video_id}')
  124. # TODO: support more embed formats
  125. return entries
  126. def _handle_error(self, response):
  127. if traverse_obj(response, ('metadata', 'notice'), expected_type=str):
  128. self.report_warning('{} said: {}'.format(self.IE_NAME, response['metadata']['notice']))
  129. status = int_or_none(try_get(
  130. response, lambda x: x['metadata']['status']) or 200)
  131. if 200 <= status < 300:
  132. return
  133. raise ExtractorError(
  134. '{} said: {}'.format(self.IE_NAME, response['metadata']['errorhint']),
  135. expected=True)
  136. def _call_api(self, domain_id, path, video_id, data=None, headers={}):
  137. headers['Content-Type'] = 'application/x-www-form-urlencoded; charset=UTF-8'
  138. result = self._download_json(
  139. f'https://api.nexx.cloud/v3/{domain_id}/{path}', video_id,
  140. f'Downloading {path} JSON', data=urlencode_postdata(data),
  141. headers=headers)
  142. self._handle_error(result)
  143. return result['result']
  144. def _extract_free_formats(self, video, video_id):
  145. stream_data = video['streamdata']
  146. cdn = stream_data['cdnType']
  147. assert cdn == 'free'
  148. video_hash = video['general']['hash']
  149. ps = str(stream_data['originalDomain'])
  150. if stream_data['applyFolderHierarchy'] == 1:
  151. s = ('%04d' % int(video_id))[::-1]
  152. ps += f'/{s[0:2]}/{s[2:4]}'
  153. ps += f'/{video_id}/{video_hash}_'
  154. t = 'http://%s' + ps
  155. fd = stream_data['azureFileDistribution'].split(',')
  156. cdn_provider = stream_data['cdnProvider']
  157. def p0(p):
  158. return f'_{p}' if stream_data['applyAzureStructure'] == 1 else ''
  159. formats = []
  160. if cdn_provider == 'ak':
  161. t += ','
  162. for i in fd:
  163. p = i.split(':')
  164. t += p[1] + p0(int(p[0])) + ','
  165. t += '.mp4.csmil/master.%s'
  166. elif cdn_provider == 'ce':
  167. k = t.split('/')
  168. h = k.pop()
  169. http_base = t = '/'.join(k)
  170. http_base = http_base % stream_data['cdnPathHTTP']
  171. t += '/asset.ism/manifest.%s?dcp_ver=aos4&videostream='
  172. for i in fd:
  173. p = i.split(':')
  174. tbr = int(p[0])
  175. filename = f'{h}{p[1]}{p0(tbr)}.mp4'
  176. f = {
  177. 'url': http_base + '/' + filename,
  178. 'format_id': f'{cdn}-http-{tbr}',
  179. 'tbr': tbr,
  180. }
  181. width_height = p[1].split('x')
  182. if len(width_height) == 2:
  183. f.update({
  184. 'width': int_or_none(width_height[0]),
  185. 'height': int_or_none(width_height[1]),
  186. })
  187. formats.append(f)
  188. a = filename + f':{tbr * 1000}'
  189. t += a + ','
  190. t = t[:-1] + '&audiostream=' + a.split(':')[0]
  191. else:
  192. assert False
  193. if cdn_provider == 'ce':
  194. formats.extend(self._extract_mpd_formats(
  195. t % (stream_data['cdnPathDASH'], 'mpd'), video_id,
  196. mpd_id=f'{cdn}-dash', fatal=False))
  197. formats.extend(self._extract_m3u8_formats(
  198. t % (stream_data['cdnPathHLS'], 'm3u8'), video_id, 'mp4',
  199. entry_protocol='m3u8_native', m3u8_id=f'{cdn}-hls', fatal=False))
  200. return formats
  201. def _extract_3q_formats(self, video, video_id):
  202. stream_data = video['streamdata']
  203. cdn = stream_data['cdnType']
  204. assert cdn == '3q'
  205. q_acc, q_prefix, q_locator, q_hash = stream_data['qAccount'], stream_data['qPrefix'], stream_data['qLocator'], stream_data['qHash']
  206. protection_key = traverse_obj(
  207. video, ('protectiondata', 'key'), expected_type=str)
  208. def get_cdn_shield_base(shield_type=''):
  209. for secure in ('', 's'):
  210. cdn_shield = stream_data.get(f'cdnShield{shield_type}HTTP{secure.upper()}')
  211. if cdn_shield:
  212. return f'http{secure}://{cdn_shield}'
  213. return f'http://sdn-global-{"prog" if shield_type.lower() == "prog" else "streaming"}-cache.3qsdn.com/' + (f's/{protection_key}/' if protection_key else '')
  214. stream_base = get_cdn_shield_base()
  215. formats = []
  216. formats.extend(self._extract_m3u8_formats(
  217. f'{stream_base}{q_acc}/files/{q_prefix}/{q_locator}/{q_acc}-{stream_data.get("qHEVCHash") or q_hash}.ism/manifest.m3u8',
  218. video_id, 'mp4', m3u8_id=f'{cdn}-hls', fatal=False))
  219. formats.extend(self._extract_mpd_formats(
  220. f'{stream_base}{q_acc}/files/{q_prefix}/{q_locator}/{q_acc}-{q_hash}.ism/manifest.mpd',
  221. video_id, mpd_id=f'{cdn}-dash', fatal=False))
  222. progressive_base = get_cdn_shield_base('Prog')
  223. q_references = stream_data.get('qReferences') or ''
  224. fds = q_references.split(',')
  225. for fd in fds:
  226. ss = fd.split(':')
  227. if len(ss) != 3:
  228. continue
  229. tbr = int_or_none(ss[1], scale=1000)
  230. formats.append({
  231. 'url': f'{progressive_base}{q_acc}/uploads/{q_acc}-{ss[2]}.webm',
  232. 'format_id': f'{cdn}-{ss[0]}{f"-{tbr}" if tbr else ""}',
  233. 'tbr': tbr,
  234. })
  235. azure_file_distribution = stream_data.get('azureFileDistribution') or ''
  236. fds = azure_file_distribution.split(',')
  237. for fd in fds:
  238. ss = fd.split(':')
  239. if len(ss) != 3:
  240. continue
  241. tbr = int_or_none(ss[0])
  242. width, height = ss[1].split('x') if len(ss[1].split('x')) == 2 else (None, None)
  243. f = {
  244. 'url': f'{progressive_base}{q_acc}/files/{q_prefix}/{q_locator}/{ss[2]}.mp4',
  245. 'format_id': f'{cdn}-http-{f"-{tbr}" if tbr else ""}',
  246. 'tbr': tbr,
  247. 'width': int_or_none(width),
  248. 'height': int_or_none(height),
  249. }
  250. formats.append(f)
  251. return formats
  252. def _extract_azure_formats(self, video, video_id):
  253. stream_data = video['streamdata']
  254. cdn = stream_data['cdnType']
  255. assert cdn == 'azure'
  256. azure_locator = stream_data['azureLocator']
  257. def get_cdn_shield_base(shield_type='', static=False):
  258. for secure in ('', 's'):
  259. cdn_shield = stream_data.get(f'cdnShield{shield_type}HTTP{secure.upper()}')
  260. if cdn_shield:
  261. return f'http{secure}://{cdn_shield}'
  262. if 'fb' in stream_data['azureAccount']:
  263. prefix = 'df' if static else 'f'
  264. else:
  265. prefix = 'd' if static else 'p'
  266. account = int(stream_data['azureAccount'].replace('nexxplayplus', '').replace('nexxplayfb', ''))
  267. return 'http://nx-%s%02d.akamaized.net/' % (prefix, account)
  268. language = video['general'].get('language_raw') or ''
  269. azure_stream_base = get_cdn_shield_base()
  270. is_ml = ',' in language
  271. azure_manifest_url = '{}{}/{}_src{}.ism/Manifest'.format(
  272. azure_stream_base, azure_locator, video_id, ('_manifest' if is_ml else '')) + '%s'
  273. protection_token = try_get(
  274. video, lambda x: x['protectiondata']['token'], str)
  275. if protection_token:
  276. azure_manifest_url += f'?hdnts={protection_token}'
  277. formats = self._extract_m3u8_formats(
  278. azure_manifest_url % '(format=m3u8-aapl)',
  279. video_id, 'mp4', 'm3u8_native',
  280. m3u8_id=f'{cdn}-hls', fatal=False)
  281. formats.extend(self._extract_mpd_formats(
  282. azure_manifest_url % '(format=mpd-time-csf)',
  283. video_id, mpd_id=f'{cdn}-dash', fatal=False))
  284. formats.extend(self._extract_ism_formats(
  285. azure_manifest_url % '', video_id, ism_id=f'{cdn}-mss', fatal=False))
  286. azure_progressive_base = get_cdn_shield_base('Prog', True)
  287. azure_file_distribution = stream_data.get('azureFileDistribution')
  288. if azure_file_distribution:
  289. fds = azure_file_distribution.split(',')
  290. if fds:
  291. for fd in fds:
  292. ss = fd.split(':')
  293. if len(ss) == 2:
  294. tbr = int_or_none(ss[0])
  295. if tbr:
  296. f = {
  297. 'url': f'{azure_progressive_base}{azure_locator}/{video_id}_src_{ss[1]}_{tbr}.mp4',
  298. 'format_id': f'{cdn}-http-{tbr}',
  299. 'tbr': tbr,
  300. }
  301. width_height = ss[1].split('x')
  302. if len(width_height) == 2:
  303. f.update({
  304. 'width': int_or_none(width_height[0]),
  305. 'height': int_or_none(width_height[1]),
  306. })
  307. formats.append(f)
  308. return formats
  309. def _real_extract(self, url):
  310. mobj = self._match_valid_url(url)
  311. domain_id = mobj.group('domain_id') or mobj.group('domain_id_s')
  312. video_id = mobj.group('id')
  313. video = None
  314. def find_video(result):
  315. if isinstance(result, dict):
  316. return result
  317. elif isinstance(result, list):
  318. vid = int(video_id)
  319. for v in result:
  320. if try_get(v, lambda x: x['general']['ID'], int) == vid:
  321. return v
  322. return None
  323. response = self._download_json(
  324. f'https://arc.nexx.cloud/api/video/{video_id}.json',
  325. video_id, fatal=False)
  326. if response and isinstance(response, dict):
  327. result = response.get('result')
  328. if result:
  329. video = find_video(result)
  330. # not all videos work via arc, e.g. nexx:741:1269984
  331. if not video:
  332. # Reverse engineered from JS code (see getDeviceID function)
  333. device_id = f'{random.randint(1, 4)}:{int(time.time())}:{random.randint(1e4, 99999)}{random.randint(1, 9)}'
  334. result = self._call_api(domain_id, 'session/init', video_id, data={
  335. 'nxp_devh': device_id,
  336. 'nxp_userh': '',
  337. 'precid': '0',
  338. 'playlicense': '0',
  339. 'screenx': '1920',
  340. 'screeny': '1080',
  341. 'playerversion': '6.0.00',
  342. 'gateway': 'html5',
  343. 'adGateway': '',
  344. 'explicitlanguage': 'en-US',
  345. 'addTextTemplates': '1',
  346. 'addDomainData': '1',
  347. 'addAdModel': '1',
  348. }, headers={
  349. 'X-Request-Enable-Auth-Fallback': '1',
  350. })
  351. cid = result['general']['cid']
  352. # As described in [1] X-Request-Token generation algorithm is
  353. # as follows:
  354. # md5( operation + domain_id + domain_secret )
  355. # where domain_secret is a static value that will be given by nexx.tv
  356. # as per [1]. Here is how this "secret" is generated (reversed
  357. # from _play._factory.data.getDomainData function, search for
  358. # domaintoken or enableAPIAccess). So it's actually not static
  359. # and not that much of a secret.
  360. # 1. https://nexxtvstorage.blob.core.windows.net/files/201610/27.pdf
  361. secret = result['device']['domaintoken'][int(device_id[0]):]
  362. secret = secret[0:len(secret) - int(device_id[-1])]
  363. op = 'byid'
  364. # Reversed from JS code for _play.api.call function (search for
  365. # X-Request-Token)
  366. request_token = hashlib.md5(
  367. ''.join((op, domain_id, secret)).encode()).hexdigest()
  368. result = self._call_api(
  369. domain_id, f'videos/{op}/{video_id}', video_id, data={
  370. 'additionalfields': 'language,channel,format,licenseby,slug,fileversion,episode,season',
  371. 'addInteractionOptions': '1',
  372. 'addStatusDetails': '1',
  373. 'addStreamDetails': '1',
  374. 'addFeatures': '1',
  375. # Caption format selection doesn't seem to be enforced?
  376. 'addCaptions': 'vtt',
  377. 'addScenes': '1',
  378. 'addChapters': '1',
  379. 'addHotSpots': '1',
  380. 'addConnectedMedia': 'persons',
  381. 'addBumpers': '1',
  382. }, headers={
  383. 'X-Request-CID': cid,
  384. 'X-Request-Token': request_token,
  385. })
  386. video = find_video(result)
  387. general = video['general']
  388. title = general['title']
  389. cdn = video['streamdata']['cdnType']
  390. if cdn == 'azure':
  391. formats = self._extract_azure_formats(video, video_id)
  392. elif cdn == 'free':
  393. formats = self._extract_free_formats(video, video_id)
  394. elif cdn == '3q':
  395. formats = self._extract_3q_formats(video, video_id)
  396. else:
  397. self.raise_no_formats(f'{cdn} formats are currently not supported', video_id)
  398. subtitles = {}
  399. for sub in video.get('captiondata') or []:
  400. if sub.get('data'):
  401. subtitles.setdefault(sub.get('language', 'en'), []).append({
  402. 'ext': 'srt',
  403. 'data': '\n\n'.join(
  404. f'{i + 1}\n{srt_subtitles_timecode(line["fromms"] / 1000)} --> {srt_subtitles_timecode(line["toms"] / 1000)}\n{line["caption"]}'
  405. for i, line in enumerate(sub['data'])),
  406. 'name': sub.get('language_long') or sub.get('title'),
  407. })
  408. elif sub.get('url'):
  409. subtitles.setdefault(sub.get('language', 'en'), []).append({
  410. 'url': sub['url'],
  411. 'ext': sub.get('format'),
  412. 'name': sub.get('language_long') or sub.get('title'),
  413. })
  414. return {
  415. 'id': video_id,
  416. 'title': title,
  417. 'alt_title': general.get('subtitle'),
  418. 'description': general.get('description'),
  419. 'release_year': int_or_none(general.get('year')),
  420. 'creator': general.get('studio') or general.get('studio_adref') or None,
  421. 'thumbnail': try_get(
  422. video, lambda x: x['imagedata']['thumb'], str),
  423. 'duration': parse_duration(general.get('runtime')),
  424. 'timestamp': int_or_none(general.get('uploaded')),
  425. 'episode_number': traverse_obj(
  426. video, (('episodedata', 'general'), 'episode'), expected_type=int, get_all=False),
  427. 'season_number': traverse_obj(
  428. video, (('episodedata', 'general'), 'season'), expected_type=int, get_all=False),
  429. 'cast': traverse_obj(video, ('connectedmedia', ..., 'title'), expected_type=str),
  430. 'formats': formats,
  431. 'subtitles': subtitles,
  432. }
  433. class NexxEmbedIE(InfoExtractor):
  434. _VALID_URL = r'https?://embed\.nexx(?:\.cloud|cdn\.com)/\d+/(?:video/)?(?P<id>[^/?#&]+)'
  435. # Reference. https://nx-s.akamaized.net/files/201510/44.pdf
  436. _EMBED_REGEX = [r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//embed\.nexx(?:\.cloud|cdn\.com)/\d+/(?:(?!\1).)+)\1']
  437. _TESTS = [{
  438. 'url': 'http://embed.nexx.cloud/748/KC1614647Z27Y7T?autoplay=1',
  439. 'md5': '16746bfc28c42049492385c989b26c4a',
  440. 'info_dict': {
  441. 'id': '161464',
  442. 'ext': 'mp4',
  443. 'title': 'Nervenkitzel Achterbahn',
  444. 'alt_title': 'Karussellbauer in Deutschland',
  445. 'description': 'md5:ffe7b1cc59a01f585e0569949aef73cc',
  446. 'creator': 'SPIEGEL TV',
  447. 'thumbnail': r're:^https?://.*\.jpg$',
  448. 'duration': 2761,
  449. 'timestamp': 1394021479,
  450. 'upload_date': '20140305',
  451. },
  452. 'params': {
  453. 'skip_download': True,
  454. },
  455. }, {
  456. 'url': 'https://embed.nexx.cloud/11888/video/DSRTO7UVOX06S7',
  457. 'only_matching': True,
  458. }]
  459. def _real_extract(self, url):
  460. embed_id = self._match_id(url)
  461. webpage = self._download_webpage(url, embed_id)
  462. return self.url_result(NexxIE._extract_url(webpage), ie=NexxIE.ie_key())