# bbc.py (81 KB)
  1. import functools
  2. import itertools
  3. import json
  4. import re
  5. import urllib.parse
  6. import xml.etree.ElementTree
  7. from .common import InfoExtractor
  8. from ..networking.exceptions import HTTPError
  9. from ..utils import (
  10. ExtractorError,
  11. OnDemandPagedList,
  12. clean_html,
  13. dict_get,
  14. float_or_none,
  15. get_element_by_class,
  16. int_or_none,
  17. join_nonempty,
  18. js_to_json,
  19. parse_duration,
  20. parse_iso8601,
  21. parse_qs,
  22. strip_or_none,
  23. traverse_obj,
  24. try_get,
  25. unescapeHTML,
  26. unified_timestamp,
  27. url_or_none,
  28. urlencode_postdata,
  29. urljoin,
  30. )
  31. class BBCCoUkIE(InfoExtractor):
  32. IE_NAME = 'bbc.co.uk'
  33. IE_DESC = 'BBC iPlayer'
  34. _ID_REGEX = r'(?:[pbml][\da-z]{7}|w[\da-z]{7,14})'
  35. _VALID_URL = rf'''(?x)
  36. https?://
  37. (?:www\.)?bbc\.co\.uk/
  38. (?:
  39. programmes/(?!articles/)|
  40. iplayer(?:/[^/]+)?/(?:episode/|playlist/)|
  41. music/(?:clips|audiovideo/popular)[/#]|
  42. radio/player/|
  43. events/[^/]+/play/[^/]+/
  44. )
  45. (?P<id>{_ID_REGEX})(?!/(?:episodes|broadcasts|clips))
  46. '''
  47. _EMBED_REGEX = [r'setPlaylist\("(?P<url>https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)']
  48. _LOGIN_URL = 'https://account.bbc.com/signin'
  49. _NETRC_MACHINE = 'bbc'
  50. _MEDIA_SELECTOR_URL_TEMPL = 'https://open.live.bbc.co.uk/mediaselector/6/select/version/2.0/mediaset/%s/vpid/%s'
  51. _MEDIA_SETS = [
  52. # Provides HQ HLS streams with even better quality that pc mediaset but fails
  53. # with geolocation in some cases when it's even not geo restricted at all (e.g.
  54. # http://www.bbc.co.uk/programmes/b06bp7lf). Also may fail with selectionunavailable.
  55. 'iptv-all',
  56. 'pc',
  57. ]
  58. _EMP_PLAYLIST_NS = 'http://bbc.co.uk/2008/emp/playlist'
  59. _TESTS = [
  60. {
  61. 'url': 'http://www.bbc.co.uk/programmes/b039g8p7',
  62. 'info_dict': {
  63. 'id': 'b039d07m',
  64. 'ext': 'flv',
  65. 'title': 'Kaleidoscope, Leonard Cohen',
  66. 'description': 'The Canadian poet and songwriter reflects on his musical career.',
  67. },
  68. 'params': {
  69. # rtmp download
  70. 'skip_download': True,
  71. },
  72. },
  73. {
  74. 'url': 'http://www.bbc.co.uk/iplayer/episode/b00yng5w/The_Man_in_Black_Series_3_The_Printed_Name/',
  75. 'info_dict': {
  76. 'id': 'b00yng1d',
  77. 'ext': 'flv',
  78. 'title': 'The Man in Black: Series 3: The Printed Name',
  79. 'description': "Mark Gatiss introduces Nicholas Pierpan's chilling tale of a writer's devilish pact with a mysterious man. Stars Ewan Bailey.",
  80. 'duration': 1800,
  81. },
  82. 'params': {
  83. # rtmp download
  84. 'skip_download': True,
  85. },
  86. 'skip': 'Episode is no longer available on BBC iPlayer Radio',
  87. },
  88. {
  89. 'url': 'http://www.bbc.co.uk/iplayer/episode/b03vhd1f/The_Voice_UK_Series_3_Blind_Auditions_5/',
  90. 'info_dict': {
  91. 'id': 'b00yng1d',
  92. 'ext': 'flv',
  93. 'title': 'The Voice UK: Series 3: Blind Auditions 5',
  94. 'description': 'Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone.',
  95. 'duration': 5100,
  96. },
  97. 'params': {
  98. # rtmp download
  99. 'skip_download': True,
  100. },
  101. 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
  102. },
  103. {
  104. 'url': 'http://www.bbc.co.uk/iplayer/episode/p026c7jt/tomorrows-worlds-the-unearthly-history-of-science-fiction-2-invasion',
  105. 'info_dict': {
  106. 'id': 'b03k3pb7',
  107. 'ext': 'flv',
  108. 'title': "Tomorrow's Worlds: The Unearthly History of Science Fiction",
  109. 'description': '2. Invasion',
  110. 'duration': 3600,
  111. },
  112. 'params': {
  113. # rtmp download
  114. 'skip_download': True,
  115. },
  116. 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
  117. }, {
  118. 'url': 'http://www.bbc.co.uk/programmes/b04v20dw',
  119. 'info_dict': {
  120. 'id': 'b04v209v',
  121. 'ext': 'flv',
  122. 'title': 'Pete Tong, The Essential New Tune Special',
  123. 'description': "Pete has a very special mix - all of 2014's Essential New Tunes!",
  124. 'duration': 10800,
  125. },
  126. 'params': {
  127. # rtmp download
  128. 'skip_download': True,
  129. },
  130. 'skip': 'Episode is no longer available on BBC iPlayer Radio',
  131. }, {
  132. 'url': 'http://www.bbc.co.uk/music/clips/p022h44b',
  133. 'note': 'Audio',
  134. 'info_dict': {
  135. 'id': 'p022h44j',
  136. 'ext': 'flv',
  137. 'title': 'BBC Proms Music Guides, Rachmaninov: Symphonic Dances',
  138. 'description': "In this Proms Music Guide, Andrew McGregor looks at Rachmaninov's Symphonic Dances.",
  139. 'duration': 227,
  140. },
  141. 'params': {
  142. # rtmp download
  143. 'skip_download': True,
  144. },
  145. }, {
  146. 'url': 'http://www.bbc.co.uk/music/clips/p025c0zz',
  147. 'note': 'Video',
  148. 'info_dict': {
  149. 'id': 'p025c103',
  150. 'ext': 'flv',
  151. 'title': 'Reading and Leeds Festival, 2014, Rae Morris - Closer (Live on BBC Three)',
  152. 'description': 'Rae Morris performs Closer for BBC Three at Reading 2014',
  153. 'duration': 226,
  154. },
  155. 'params': {
  156. # rtmp download
  157. 'skip_download': True,
  158. },
  159. }, {
  160. 'url': 'http://www.bbc.co.uk/iplayer/episode/b054fn09/ad/natural-world-20152016-2-super-powered-owls',
  161. 'info_dict': {
  162. 'id': 'p02n76xf',
  163. 'ext': 'flv',
  164. 'title': 'Natural World, 2015-2016: 2. Super Powered Owls',
  165. 'description': 'md5:e4db5c937d0e95a7c6b5e654d429183d',
  166. 'duration': 3540,
  167. },
  168. 'params': {
  169. # rtmp download
  170. 'skip_download': True,
  171. },
  172. 'skip': 'geolocation',
  173. }, {
  174. 'url': 'http://www.bbc.co.uk/iplayer/episode/b05zmgwn/royal-academy-summer-exhibition',
  175. 'info_dict': {
  176. 'id': 'b05zmgw1',
  177. 'ext': 'flv',
  178. 'description': 'Kirsty Wark and Morgan Quaintance visit the Royal Academy as it prepares for its annual artistic extravaganza, meeting people who have come together to make the show unique.',
  179. 'title': 'Royal Academy Summer Exhibition',
  180. 'duration': 3540,
  181. },
  182. 'params': {
  183. # rtmp download
  184. 'skip_download': True,
  185. },
  186. 'skip': 'geolocation',
  187. }, {
  188. # iptv-all mediaset fails with geolocation however there is no geo restriction
  189. # for this programme at all
  190. 'url': 'http://www.bbc.co.uk/programmes/b06rkn85',
  191. 'info_dict': {
  192. 'id': 'b06rkms3',
  193. 'ext': 'flv',
  194. 'title': "Best of the Mini-Mixes 2015: Part 3, Annie Mac's Friday Night - BBC Radio 1",
  195. 'description': "Annie has part three in the Best of the Mini-Mixes 2015, plus the year's Most Played!",
  196. },
  197. 'params': {
  198. # rtmp download
  199. 'skip_download': True,
  200. },
  201. 'skip': 'Now it\'s really geo-restricted',
  202. }, {
  203. # compact player (https://github.com/ytdl-org/youtube-dl/issues/8147)
  204. 'url': 'http://www.bbc.co.uk/programmes/p028bfkf/player',
  205. 'info_dict': {
  206. 'id': 'p028bfkj',
  207. 'ext': 'flv',
  208. 'title': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
  209. 'description': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews',
  210. },
  211. 'params': {
  212. # rtmp download
  213. 'skip_download': True,
  214. },
  215. }, {
  216. 'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4',
  217. 'only_matching': True,
  218. }, {
  219. 'url': 'http://www.bbc.co.uk/music/clips#p02frcc3',
  220. 'only_matching': True,
  221. }, {
  222. 'url': 'http://www.bbc.co.uk/iplayer/cbeebies/episode/b0480276/bing-14-atchoo',
  223. 'only_matching': True,
  224. }, {
  225. 'url': 'http://www.bbc.co.uk/radio/player/p03cchwf',
  226. 'only_matching': True,
  227. }, {
  228. 'url': 'https://www.bbc.co.uk/music/audiovideo/popular#p055bc55',
  229. 'only_matching': True,
  230. }, {
  231. 'url': 'http://www.bbc.co.uk/programmes/w3csv1y9',
  232. 'only_matching': True,
  233. }, {
  234. 'url': 'https://www.bbc.co.uk/programmes/m00005xn',
  235. 'only_matching': True,
  236. }, {
  237. 'url': 'https://www.bbc.co.uk/programmes/w172w4dww1jqt5s',
  238. 'only_matching': True,
  239. }]
  240. def _perform_login(self, username, password):
  241. login_page = self._download_webpage(
  242. self._LOGIN_URL, None, 'Downloading signin page')
  243. login_form = self._hidden_inputs(login_page)
  244. login_form.update({
  245. 'username': username,
  246. 'password': password,
  247. })
  248. post_url = urljoin(self._LOGIN_URL, self._search_regex(
  249. r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page,
  250. 'post url', default=self._LOGIN_URL, group='url'))
  251. response, urlh = self._download_webpage_handle(
  252. post_url, None, 'Logging in', data=urlencode_postdata(login_form),
  253. headers={'Referer': self._LOGIN_URL})
  254. if self._LOGIN_URL in urlh.url:
  255. error = clean_html(get_element_by_class('form-message', response))
  256. if error:
  257. raise ExtractorError(
  258. f'Unable to login: {error}', expected=True)
  259. raise ExtractorError('Unable to log in')
  260. class MediaSelectionError(Exception):
  261. def __init__(self, error_id):
  262. self.id = error_id
  263. def _extract_asx_playlist(self, connection, programme_id):
  264. asx = self._download_xml(connection.get('href'), programme_id, 'Downloading ASX playlist')
  265. return [ref.get('href') for ref in asx.findall('./Entry/ref')]
  266. def _extract_items(self, playlist):
  267. return playlist.findall(f'./{{{self._EMP_PLAYLIST_NS}}}item')
  268. def _extract_medias(self, media_selection):
  269. error = media_selection.get('result')
  270. if error:
  271. raise BBCCoUkIE.MediaSelectionError(error)
  272. return media_selection.get('media') or []
  273. def _extract_connections(self, media):
  274. return media.get('connection') or []
  275. def _get_subtitles(self, media, programme_id):
  276. subtitles = {}
  277. for connection in self._extract_connections(media):
  278. cc_url = url_or_none(connection.get('href'))
  279. if not cc_url:
  280. continue
  281. captions = self._download_xml(
  282. cc_url, programme_id, 'Downloading captions', fatal=False)
  283. if not isinstance(captions, xml.etree.ElementTree.Element):
  284. continue
  285. subtitles['en'] = [
  286. {
  287. 'url': connection.get('href'),
  288. 'ext': 'ttml',
  289. },
  290. ]
  291. break
  292. return subtitles
  293. def _raise_extractor_error(self, media_selection_error):
  294. raise ExtractorError(
  295. f'{self.IE_NAME} returned error: {media_selection_error.id}',
  296. expected=True)
  297. def _download_media_selector(self, programme_id):
  298. last_exception = None
  299. formats, subtitles = [], {}
  300. for media_set in self._MEDIA_SETS:
  301. try:
  302. fmts, subs = self._download_media_selector_url(
  303. self._MEDIA_SELECTOR_URL_TEMPL % (media_set, programme_id), programme_id)
  304. formats.extend(fmts)
  305. if subs:
  306. self._merge_subtitles(subs, target=subtitles)
  307. except BBCCoUkIE.MediaSelectionError as e:
  308. if e.id in ('notukerror', 'geolocation', 'selectionunavailable'):
  309. last_exception = e
  310. continue
  311. self._raise_extractor_error(e)
  312. if last_exception:
  313. if formats or subtitles:
  314. self.report_warning(f'{self.IE_NAME} returned error: {last_exception.id}')
  315. else:
  316. self._raise_extractor_error(last_exception)
  317. return formats, subtitles
  318. def _download_media_selector_url(self, url, programme_id=None):
  319. media_selection = self._download_json(
  320. url, programme_id, 'Downloading media selection JSON',
  321. expected_status=(403, 404))
  322. return self._process_media_selector(media_selection, programme_id)
  323. def _process_media_selector(self, media_selection, programme_id):
  324. formats = []
  325. subtitles = None
  326. urls = []
  327. for media in self._extract_medias(media_selection):
  328. kind = media.get('kind')
  329. if kind in ('video', 'audio'):
  330. bitrate = int_or_none(media.get('bitrate'))
  331. encoding = media.get('encoding')
  332. width = int_or_none(media.get('width'))
  333. height = int_or_none(media.get('height'))
  334. file_size = int_or_none(media.get('media_file_size'))
  335. for connection in self._extract_connections(media):
  336. href = connection.get('href')
  337. if href in urls:
  338. continue
  339. if href:
  340. urls.append(href)
  341. conn_kind = connection.get('kind')
  342. protocol = connection.get('protocol')
  343. supplier = connection.get('supplier')
  344. transfer_format = connection.get('transferFormat')
  345. format_id = supplier or conn_kind or protocol
  346. # ASX playlist
  347. if supplier == 'asx':
  348. for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)):
  349. formats.append({
  350. 'url': ref,
  351. 'format_id': f'ref{i}_{format_id}',
  352. })
  353. elif transfer_format == 'dash':
  354. formats.extend(self._extract_mpd_formats(
  355. href, programme_id, mpd_id=format_id, fatal=False))
  356. elif transfer_format == 'hls':
  357. # TODO: let expected_status be passed into _extract_xxx_formats() instead
  358. try:
  359. fmts = self._extract_m3u8_formats(
  360. href, programme_id, ext='mp4', entry_protocol='m3u8_native',
  361. m3u8_id=format_id, fatal=False)
  362. except ExtractorError as e:
  363. if not (isinstance(e.exc_info[1], HTTPError)
  364. and e.exc_info[1].status in (403, 404)):
  365. raise
  366. fmts = []
  367. formats.extend(fmts)
  368. elif transfer_format == 'hds':
  369. formats.extend(self._extract_f4m_formats(
  370. href, programme_id, f4m_id=format_id, fatal=False))
  371. else:
  372. if not supplier and bitrate:
  373. format_id += f'-{bitrate}'
  374. fmt = {
  375. 'format_id': format_id,
  376. 'filesize': file_size,
  377. }
  378. if kind == 'video':
  379. fmt.update({
  380. 'width': width,
  381. 'height': height,
  382. 'tbr': bitrate,
  383. 'vcodec': encoding,
  384. })
  385. else:
  386. fmt.update({
  387. 'abr': bitrate,
  388. 'acodec': encoding,
  389. 'vcodec': 'none',
  390. })
  391. if protocol in ('http', 'https'):
  392. # Direct link
  393. fmt.update({
  394. 'url': href,
  395. })
  396. elif protocol == 'rtmp':
  397. application = connection.get('application', 'ondemand')
  398. auth_string = connection.get('authString')
  399. identifier = connection.get('identifier')
  400. server = connection.get('server')
  401. fmt.update({
  402. 'url': f'{protocol}://{server}/{application}?{auth_string}',
  403. 'play_path': identifier,
  404. 'app': f'{application}?{auth_string}',
  405. 'page_url': 'http://www.bbc.co.uk',
  406. 'player_url': 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf',
  407. 'rtmp_live': False,
  408. 'ext': 'flv',
  409. })
  410. else:
  411. continue
  412. formats.append(fmt)
  413. elif kind == 'captions':
  414. subtitles = self.extract_subtitles(media, programme_id)
  415. return formats, subtitles
  416. def _download_playlist(self, playlist_id):
  417. try:
  418. playlist = self._download_json(
  419. f'http://www.bbc.co.uk/programmes/{playlist_id}/playlist.json',
  420. playlist_id, 'Downloading playlist JSON')
  421. formats = []
  422. subtitles = {}
  423. for version in playlist.get('allAvailableVersions', []):
  424. smp_config = version['smpConfig']
  425. title = smp_config['title']
  426. description = smp_config['summary']
  427. for item in smp_config['items']:
  428. kind = item['kind']
  429. if kind not in ('programme', 'radioProgramme'):
  430. continue
  431. programme_id = item.get('vpid')
  432. duration = int_or_none(item.get('duration'))
  433. version_formats, version_subtitles = self._download_media_selector(programme_id)
  434. types = version['types']
  435. for f in version_formats:
  436. f['format_note'] = ', '.join(types)
  437. if any('AudioDescribed' in x for x in types):
  438. f['language_preference'] = -10
  439. formats += version_formats
  440. for tag, subformats in (version_subtitles or {}).items():
  441. subtitles.setdefault(tag, []).extend(subformats)
  442. return programme_id, title, description, duration, formats, subtitles
  443. except ExtractorError as ee:
  444. if not (isinstance(ee.cause, HTTPError) and ee.cause.status == 404):
  445. raise
  446. # fallback to legacy playlist
  447. return self._process_legacy_playlist(playlist_id)
  448. def _process_legacy_playlist_url(self, url, display_id):
  449. playlist = self._download_legacy_playlist_url(url, display_id)
  450. return self._extract_from_legacy_playlist(playlist, display_id)
  451. def _process_legacy_playlist(self, playlist_id):
  452. return self._process_legacy_playlist_url(
  453. f'http://www.bbc.co.uk/iplayer/playlist/{playlist_id}', playlist_id)
  454. def _download_legacy_playlist_url(self, url, playlist_id=None):
  455. return self._download_xml(
  456. url, playlist_id, 'Downloading legacy playlist XML')
  457. def _extract_from_legacy_playlist(self, playlist, playlist_id):
  458. no_items = playlist.find(f'./{{{self._EMP_PLAYLIST_NS}}}noItems')
  459. if no_items is not None:
  460. reason = no_items.get('reason')
  461. if reason == 'preAvailability':
  462. msg = f'Episode {playlist_id} is not yet available'
  463. elif reason == 'postAvailability':
  464. msg = f'Episode {playlist_id} is no longer available'
  465. elif reason == 'noMedia':
  466. msg = f'Episode {playlist_id} is not currently available'
  467. else:
  468. msg = f'Episode {playlist_id} is not available: {reason}'
  469. raise ExtractorError(msg, expected=True)
  470. for item in self._extract_items(playlist):
  471. kind = item.get('kind')
  472. if kind not in ('programme', 'radioProgramme'):
  473. continue
  474. title = playlist.find(f'./{{{self._EMP_PLAYLIST_NS}}}title').text
  475. description_el = playlist.find(f'./{{{self._EMP_PLAYLIST_NS}}}summary')
  476. description = description_el.text if description_el is not None else None
  477. def get_programme_id(item):
  478. def get_from_attributes(item):
  479. for p in ('identifier', 'group'):
  480. value = item.get(p)
  481. if value and re.match(r'^[pb][\da-z]{7}$', value):
  482. return value
  483. get_from_attributes(item)
  484. mediator = item.find(f'./{{{self._EMP_PLAYLIST_NS}}}mediator')
  485. if mediator is not None:
  486. return get_from_attributes(mediator)
  487. programme_id = get_programme_id(item)
  488. duration = int_or_none(item.get('duration'))
  489. if programme_id:
  490. formats, subtitles = self._download_media_selector(programme_id)
  491. else:
  492. formats, subtitles = self._process_media_selector(item, playlist_id)
  493. programme_id = playlist_id
  494. return programme_id, title, description, duration, formats, subtitles
  495. def _real_extract(self, url):
  496. group_id = self._match_id(url)
  497. webpage = self._download_webpage(url, group_id, 'Downloading video page')
  498. error = self._search_regex(
  499. r'<div\b[^>]+\bclass=["\'](?:smp|playout)__message delta["\'][^>]*>\s*([^<]+?)\s*<',
  500. webpage, 'error', default=None)
  501. if error:
  502. raise ExtractorError(error, expected=True)
  503. programme_id = None
  504. duration = None
  505. tviplayer = self._search_regex(
  506. r'mediator\.bind\(({.+?})\s*,\s*document\.getElementById',
  507. webpage, 'player', default=None)
  508. if tviplayer:
  509. player = self._parse_json(tviplayer, group_id).get('player', {})
  510. duration = int_or_none(player.get('duration'))
  511. programme_id = player.get('vpid')
  512. if not programme_id:
  513. programme_id = self._search_regex(
  514. rf'"vpid"\s*:\s*"({self._ID_REGEX})"', webpage, 'vpid', fatal=False, default=None)
  515. if programme_id:
  516. formats, subtitles = self._download_media_selector(programme_id)
  517. title = self._og_search_title(webpage, default=None) or self._html_search_regex(
  518. (r'<h2[^>]+id="parent-title"[^>]*>(.+?)</h2>',
  519. r'<div[^>]+class="info"[^>]*>\s*<h1>(.+?)</h1>'), webpage, 'title')
  520. description = self._search_regex(
  521. (r'<p class="[^"]*medium-description[^"]*">([^<]+)</p>',
  522. r'<div[^>]+class="info_+synopsis"[^>]*>([^<]+)</div>'),
  523. webpage, 'description', default=None)
  524. if not description:
  525. description = self._html_search_meta('description', webpage)
  526. else:
  527. programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id)
  528. return {
  529. 'id': programme_id,
  530. 'title': title,
  531. 'description': description,
  532. 'thumbnail': self._og_search_thumbnail(webpage, default=None),
  533. 'duration': duration,
  534. 'formats': formats,
  535. 'subtitles': subtitles,
  536. }
  537. class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE
  538. IE_NAME = 'bbc'
  539. IE_DESC = 'BBC'
  540. _VALID_URL = r'''(?x)
  541. https?://(?:www\.)?(?:
  542. bbc\.(?:com|co\.uk)|
  543. bbcnewsd73hkzno2ini43t4gblxvycyac5aw4gnv7t2rccijh7745uqd\.onion|
  544. bbcweb3hytmzhn5d532owbu6oqadra5z3ar726vq5kgwwn6aucdccrad\.onion
  545. )/(?:[^/]+/)+(?P<id>[^/#?]+)'''
  546. _MEDIA_SETS = [
  547. 'pc',
  548. 'mobile-tablet-main',
  549. ]
  550. _TESTS = [{
  551. # article with multiple videos embedded with data-playable containing vpids
  552. 'url': 'http://www.bbc.com/news/world-europe-32668511',
  553. 'info_dict': {
  554. 'id': 'world-europe-32668511',
  555. 'title': 'Russia stages massive WW2 parade despite Western boycott',
  556. 'description': 'md5:00ff61976f6081841f759a08bf78cc9c',
  557. },
  558. 'playlist_count': 2,
  559. }, {
  560. # article with multiple videos embedded with data-playable (more videos)
  561. 'url': 'http://www.bbc.com/news/business-28299555',
  562. 'info_dict': {
  563. 'id': 'business-28299555',
  564. 'title': 'Farnborough Airshow: Video highlights',
  565. 'description': 'BBC reports and video highlights at the Farnborough Airshow.',
  566. },
  567. 'playlist_count': 9,
  568. 'skip': 'Save time',
  569. }, {
  570. # article with multiple videos embedded with `new SMP()`
  571. # broken
  572. 'url': 'http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460',
  573. 'info_dict': {
  574. 'id': '3662a707-0af9-3149-963f-47bea720b460',
  575. 'title': 'BUGGER',
  576. 'description': r're:BUGGER The recent revelations by the whistleblower Edward Snowden were fascinating. .{211}\.{3}$',
  577. },
  578. 'playlist_count': 18,
  579. }, {
  580. # single video embedded with data-playable containing vpid
  581. 'url': 'http://www.bbc.com/news/world-europe-32041533',
  582. 'info_dict': {
  583. 'id': 'p02mprgb',
  584. 'ext': 'mp4',
  585. 'title': 'Germanwings crash site aerial video',
  586. 'description': r're:(?s)Aerial video showed the site where the Germanwings flight 4U 9525, .{156} BFM TV\.$',
  587. 'duration': 47,
  588. 'timestamp': 1427219242,
  589. 'upload_date': '20150324',
  590. 'thumbnail': 'https://ichef.bbci.co.uk/news/1024/media/images/81879000/jpg/_81879090_81879089.jpg',
  591. },
  592. 'params': {
  593. 'skip_download': True,
  594. },
  595. }, {
  596. # article with single video embedded with data-playable containing XML playlist
  597. # with direct video links as progressiveDownloadUrl (for now these are extracted)
  598. # and playlist with f4m and m3u8 as streamingUrl
  599. 'url': 'http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu',
  600. 'info_dict': {
  601. 'id': '150615_telabyad_kentin_cogu',
  602. 'ext': 'mp4',
  603. 'title': "YPG: Tel Abyad'ın tamamı kontrolümüzde",
  604. 'description': 'md5:33a4805a855c9baf7115fcbde57e7025',
  605. 'timestamp': 1434397334,
  606. 'upload_date': '20150615',
  607. },
  608. 'params': {
  609. 'skip_download': True,
  610. },
  611. 'skip': 'now SIMORGH_DATA with no video',
  612. }, {
  613. # single video embedded with data-playable containing XML playlists (regional section)
  614. 'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw',
  615. 'info_dict': {
  616. 'id': '39275083',
  617. 'display_id': '150619_video_honduras_militares_hospitales_corrupcion_aw',
  618. 'ext': 'mp4',
  619. 'title': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción',
  620. 'description': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción',
  621. 'timestamp': 1434713142,
  622. 'upload_date': '20150619',
  623. 'thumbnail': 'https://a.files.bbci.co.uk/worldservice/live/assets/images/2015/06/19/150619132146_honduras_hsopitales_militares_640x360_aptn_nocredit.jpg',
  624. },
  625. 'params': {
  626. 'skip_download': True,
  627. },
  628. }, {
  629. # single video from video playlist embedded with vxp-playlist-data JSON
  630. 'url': 'http://www.bbc.com/news/video_and_audio/must_see/33376376',
  631. 'info_dict': {
  632. 'id': 'p02w6qjc',
  633. 'ext': 'mp4',
  634. 'title': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
  635. 'duration': 56,
  636. 'description': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''',
  637. },
  638. 'params': {
  639. 'skip_download': True,
  640. },
  641. 'skip': '404 Not Found',
  642. }, {
  643. # single video story with __PWA_PRELOADED_STATE__
  644. 'url': 'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret',
  645. 'info_dict': {
  646. 'id': 'p02q6gc4',
  647. 'ext': 'mp4',
  648. 'title': 'Tasting the spice of life in Jaffna',
  649. 'description': r're:(?s)BBC Travel Show’s Henry Golding explores the city of Jaffna .{151} aftertaste\.$',
  650. 'timestamp': 1646058397,
  651. 'upload_date': '20220228',
  652. 'duration': 255,
  653. 'thumbnail': 'https://ichef.bbci.co.uk/images/ic/1920xn/p02vxvkn.jpg',
  654. },
  655. }, {
  656. # single video story without digitalData
  657. 'url': 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star',
  658. 'info_dict': {
  659. 'id': 'p018zqqg',
  660. 'ext': 'mp4',
  661. 'title': 'Hyundai Santa Fe Sport: Rock star',
  662. 'description': 'md5:b042a26142c4154a6e472933cf20793d',
  663. 'timestamp': 1415867444,
  664. 'upload_date': '20141113',
  665. },
  666. 'skip': 'redirects to TopGear home page',
  667. }, {
  668. # single video embedded with Morph
  669. # TODO: replacement test page
  670. 'url': 'http://www.bbc.co.uk/sport/live/olympics/36895975',
  671. 'info_dict': {
  672. 'id': 'p041vhd0',
  673. 'ext': 'mp4',
  674. 'title': "Nigeria v Japan - Men's First Round",
  675. 'description': 'Live coverage of the first round from Group B at the Amazonia Arena.',
  676. 'duration': 7980,
  677. 'uploader': 'BBC Sport',
  678. 'uploader_id': 'bbc_sport',
  679. },
  680. 'skip': 'Video no longer in page',
  681. }, {
  682. # single video in __INITIAL_DATA__
  683. 'url': 'http://www.bbc.com/sport/0/football/33653409',
  684. 'info_dict': {
  685. 'id': 'p02xycnp',
  686. 'ext': 'mp4',
  687. 'title': 'Ronaldo to Man Utd, Arsenal to spend?',
  688. 'description': r're:(?s)BBC Sport\'s David Ornstein rounds up the latest transfer reports, .{359} here\.$',
  689. 'timestamp': 1437750175,
  690. 'upload_date': '20150724',
  691. 'thumbnail': r're:https?://.+/.+media/images/69320000/png/_69320754_mmgossipcolumnextraaugust18.png',
  692. 'duration': 140,
  693. },
  694. }, {
  695. # article with multiple videos embedded with Morph.setPayload
  696. 'url': 'http://www.bbc.com/sport/0/football/34475836',
  697. 'info_dict': {
  698. 'id': '34475836',
  699. 'title': 'Jurgen Klopp: Furious football from a witty and winning coach',
  700. 'description': 'Fast-paced football, wit, wisdom and a ready smile - why Liverpool fans should come to love new boss Jurgen Klopp.',
  701. },
  702. 'playlist_count': 3,
  703. }, {
  704. # Testing noplaylist
  705. 'url': 'http://www.bbc.com/sport/0/football/34475836',
  706. 'info_dict': {
  707. 'id': 'p034ppnv',
  708. 'ext': 'mp4',
  709. 'title': 'All you need to know about Jurgen Klopp',
  710. 'timestamp': 1444335081,
  711. 'upload_date': '20151008',
  712. 'duration': 122.0,
  713. 'thumbnail': 'https://ichef.bbci.co.uk/onesport/cps/976/cpsprodpb/7542/production/_85981003_klopp.jpg',
  714. },
  715. 'params': {
  716. 'noplaylist': True,
  717. },
  718. }, {
  719. # school report article with single video
  720. 'url': 'http://www.bbc.co.uk/schoolreport/35744779',
  721. 'info_dict': {
  722. 'id': '35744779',
  723. 'title': 'School which breaks down barriers in Jerusalem',
  724. },
  725. 'playlist_count': 1,
  726. 'skip': 'redirects to Young Reporter home page https://www.bbc.co.uk/news/topics/cg41ylwv43pt',
  727. }, {
  728. # single video with playlist URL from weather section
  729. 'url': 'http://www.bbc.com/weather/features/33601775',
  730. 'only_matching': True,
  731. }, {
  732. # custom redirection to www.bbc.com
  733. # also, video with window.__INITIAL_DATA__
  734. 'url': 'http://www.bbc.co.uk/news/science-environment-33661876',
  735. 'info_dict': {
  736. 'id': 'p02xzws1',
  737. 'ext': 'mp4',
  738. 'title': "Pluto may have 'nitrogen glaciers'",
  739. 'description': 'md5:6a95b593f528d7a5f2605221bc56912f',
  740. 'thumbnail': r're:https?://.+/.+\.jpg',
  741. 'timestamp': 1437785037,
  742. 'upload_date': '20150725',
  743. 'duration': 105,
  744. },
  745. }, {
  746. # video with window.__INITIAL_DATA__ and value as JSON string
  747. 'url': 'https://www.bbc.com/news/av/world-europe-59468682',
  748. 'info_dict': {
  749. 'id': 'p0b779gc',
  750. 'ext': 'mp4',
  751. 'title': 'Why France is making this woman a national hero',
  752. 'description': r're:(?s)France is honouring the US-born 20th Century singer and activist Josephine .{208} Second World War.',
  753. 'thumbnail': r're:https?://.+/.+\.jpg',
  754. 'timestamp': 1638215626,
  755. 'upload_date': '20211129',
  756. 'duration': 125,
  757. },
  758. }, {
  759. # video with script id __NEXT_DATA__ and value as JSON string
  760. 'url': 'https://www.bbc.com/news/uk-68546268',
  761. 'info_dict': {
  762. 'id': 'p0hj0lq7',
  763. 'ext': 'mp4',
  764. 'title': 'Nasser Hospital doctor describes his treatment by IDF',
  765. 'description': r're:(?s)Doctor Abu Sabha said he was detained by Israeli forces after .{276} hostages\."$',
  766. 'thumbnail': r're:https?://.+/.+\.jpg',
  767. 'timestamp': 1710188248,
  768. 'upload_date': '20240311',
  769. 'duration': 104,
  770. },
  771. }, {
  772. # single video article embedded with data-media-vpid
  773. 'url': 'http://www.bbc.co.uk/sport/rowing/35908187',
  774. 'only_matching': True,
  775. }, {
  776. # bbcthreeConfig
  777. 'url': 'https://www.bbc.co.uk/bbcthree/clip/73d0bbd0-abc3-4cea-b3c0-cdae21905eb1',
  778. 'info_dict': {
  779. 'id': 'p06556y7',
  780. 'ext': 'mp4',
  781. 'title': 'Things Not To Say to people that live on council estates',
  782. 'description': "From being labelled a 'chav', to the presumption that they're 'scroungers', people who live on council estates encounter all kinds of prejudices and false assumptions about themselves, their families, and their lifestyles. Here, eight people discuss the common statements, misconceptions, and clichés that they're tired of hearing.",
  783. 'duration': 360,
  784. 'thumbnail': r're:https?://.+/.+\.jpg',
  785. },
  786. }, {
  787. # window.__PRELOADED_STATE__
  788. 'url': 'https://www.bbc.co.uk/radio/play/b0b9z4yl',
  789. 'info_dict': {
  790. 'id': 'b0b9z4vz',
  791. 'ext': 'mp4',
  792. 'title': 'Prom 6: An American in Paris and Turangalila',
  793. 'description': 'md5:51cf7d6f5c8553f197e58203bc78dff8',
  794. 'uploader': 'Radio 3',
  795. 'uploader_id': 'bbc_radio_three',
  796. },
  797. 'skip': '404 Not Found',
  798. }, {
  799. 'url': 'http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227',
  800. 'info_dict': {
  801. 'id': 'p06w9tws',
  802. 'ext': 'mp4',
  803. 'title': 'md5:2fabf12a726603193a2879a055f72514',
  804. 'description': 'Learn English words and phrases from this story',
  805. 'thumbnail': 'https://ichef.bbci.co.uk/images/ic/1200x675/p06pq9gk.jpg',
  806. },
  807. 'add_ie': [BBCCoUkIE.ie_key()],
  808. }, {
  809. # BBC Reel
  810. 'url': 'https://www.bbc.com/reel/video/p07c6sb6/how-positive-thinking-is-harming-your-happiness',
  811. 'info_dict': {
  812. 'id': 'p07c6sb9',
  813. 'ext': 'mp4',
  814. 'title': 'The downsides of positive thinking',
  815. 'description': 'The downsides of positive thinking',
  816. 'duration': 235,
  817. 'thumbnail': r're:https?://.+/p07c9dsr\.(?:jpg|webp|png)',
  818. 'upload_date': '20220223',
  819. 'timestamp': 1645632746,
  820. },
  821. }, {
  822. # BBC Sounds
  823. 'url': 'https://www.bbc.co.uk/sounds/play/w3ct5rgx',
  824. 'info_dict': {
  825. 'id': 'p0hrw4nr',
  826. 'ext': 'mp4',
  827. 'title': 'Are our coastlines being washed away?',
  828. 'description': r're:(?s)Around the world, coastlines are constantly changing .{2000,} Images\)$',
  829. 'timestamp': 1713556800,
  830. 'upload_date': '20240419',
  831. 'duration': 1588,
  832. 'thumbnail': 'https://ichef.bbci.co.uk/images/ic/raw/p0hrnxbl.jpg',
  833. 'uploader': 'World Service',
  834. 'uploader_id': 'bbc_world_service',
  835. 'series': 'CrowdScience',
  836. 'chapters': [],
  837. },
  838. }, { # onion routes
  839. 'url': 'https://www.bbcnewsd73hkzno2ini43t4gblxvycyac5aw4gnv7t2rccijh7745uqd.onion/news/av/world-europe-63208576',
  840. 'only_matching': True,
  841. }, {
  842. 'url': 'https://www.bbcweb3hytmzhn5d532owbu6oqadra5z3ar726vq5kgwwn6aucdccrad.onion/sport/av/football/63195681',
  843. 'only_matching': True,
  844. }]
  @classmethod
  def suitable(cls, url):
      """Decide whether this extractor should handle *url*.

      Returns False as soon as any of the extractors listed below reports
      the URL as suitable; otherwise the answer comes from the parent
      class's ``suitable``.
      """
      excluded_extractors = (
          BBCCoUkIE,
          BBCCoUkArticleIE,
          BBCCoUkIPlayerEpisodesIE,
          BBCCoUkIPlayerGroupIE,
          BBCCoUkPlaylistIE,
      )
      # Stop at the first extractor that claims the URL.
      for extractor in excluded_extractors:
          if extractor.suitable(url):
              return False
      return super().suitable(url)
  850. def _extract_from_media_meta(self, media_meta, video_id):
  851. # Direct links to media in media metadata (e.g.
  852. # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
  853. # TODO: there are also f4m and m3u8 streams incorporated in playlist.sxml
  854. source_files = media_meta.get('sourceFiles')
  855. if source_files:
  856. return [{
  857. 'url': f['url'],
  858. 'format_id': format_id,
  859. 'ext': f.get('encoding'),
  860. 'tbr': float_or_none(f.get('bitrate'), 1000),
  861. 'filesize': int_or_none(f.get('filesize')),
  862. } for format_id, f in source_files.items() if f.get('url')], []
  863. programme_id = media_meta.get('externalId')
  864. if programme_id:
  865. return self._download_media_selector(programme_id)
  866. # Process playlist.sxml as legacy playlist
  867. href = media_meta.get('href')
  868. if href:
  869. playlist = self._download_legacy_playlist_url(href)
  870. _, _, _, _, formats, subtitles = self._extract_from_legacy_playlist(playlist, video_id)
  871. return formats, subtitles
  872. return [], []
  873. def _extract_from_playlist_sxml(self, url, playlist_id, timestamp):
  874. programme_id, title, description, duration, formats, subtitles = \
  875. self._process_legacy_playlist_url(url, playlist_id)
  876. return {
  877. 'id': programme_id,
  878. 'title': title,
  879. 'description': description,
  880. 'duration': duration,
  881. 'timestamp': timestamp,
  882. 'formats': formats,
  883. 'subtitles': subtitles,
  884. }
  885. def _real_extract(self, url):
  886. playlist_id = self._match_id(url)
  887. webpage = self._download_webpage(url, playlist_id)
  888. json_ld_info = self._search_json_ld(webpage, playlist_id, default={})
  889. timestamp = json_ld_info.get('timestamp')
  890. playlist_title = json_ld_info.get('title') or re.sub(
  891. r'(.+)\s*-\s*BBC.*?$', r'\1', self._generic_title('', webpage, default='')).strip() or None
  892. playlist_description = json_ld_info.get(
  893. 'description') or self._og_search_description(webpage, default=None)
  894. if not timestamp:
  895. timestamp = parse_iso8601(self._search_regex(
  896. [r'<meta[^>]+property="article:published_time"[^>]+content="([^"]+)"',
  897. r'itemprop="datePublished"[^>]+datetime="([^"]+)"',
  898. r'"datePublished":\s*"([^"]+)'],
  899. webpage, 'date', default=None))
  900. entries = []
  901. # article with multiple videos embedded with playlist.sxml (e.g.
  902. # http://www.bbc.com/sport/0/football/34475836)
  903. playlists = re.findall(r'<param[^>]+name="playlist"[^>]+value="([^"]+)"', webpage)
  904. playlists.extend(re.findall(r'data-media-id="([^"]+/playlist\.sxml)"', webpage))
  905. if playlists:
  906. entries = [
  907. self._extract_from_playlist_sxml(playlist_url, playlist_id, timestamp)
  908. for playlist_url in playlists]
  909. # news article with multiple videos embedded with data-playable
  910. data_playables = re.findall(r'data-playable=(["\'])({.+?})\1', webpage)
  911. if data_playables:
  912. for _, data_playable_json in data_playables:
  913. data_playable = self._parse_json(
  914. unescapeHTML(data_playable_json), playlist_id, fatal=False)
  915. if not data_playable:
  916. continue
  917. settings = data_playable.get('settings', {})
  918. if settings:
  919. # data-playable with video vpid in settings.playlistObject.items (e.g.
  920. # http://www.bbc.com/news/world-us-canada-34473351)
  921. playlist_object = settings.get('playlistObject', {})
  922. if playlist_object:
  923. items = playlist_object.get('items')
  924. if items and isinstance(items, list):
  925. title = playlist_object['title']
  926. description = playlist_object.get('summary')
  927. duration = int_or_none(items[0].get('duration'))
  928. programme_id = items[0].get('vpid')
  929. formats, subtitles = self._download_media_selector(programme_id)
  930. entries.append({
  931. 'id': programme_id,
  932. 'title': title,
  933. 'description': description,
  934. 'timestamp': timestamp,
  935. 'duration': duration,
  936. 'formats': formats,
  937. 'subtitles': subtitles,
  938. })
  939. else:
  940. # data-playable without vpid but with a playlist.sxml URLs
  941. # in otherSettings.playlist (e.g.
  942. # http://www.bbc.com/turkce/multimedya/2015/10/151010_vid_ankara_patlama_ani)
  943. playlist = data_playable.get('otherSettings', {}).get('playlist', {})
  944. if playlist:
  945. entry = None
  946. for key in ('streaming', 'progressiveDownload'):
  947. playlist_url = playlist.get(f'{key}Url')
  948. if not playlist_url:
  949. continue
  950. try:
  951. info = self._extract_from_playlist_sxml(
  952. playlist_url, playlist_id, timestamp)
  953. if not entry:
  954. entry = info
  955. else:
  956. entry['title'] = info['title']
  957. entry['formats'].extend(info['formats'])
  958. except ExtractorError as e:
  959. # Some playlist URL may fail with 500, at the same time
  960. # the other one may work fine (e.g.
  961. # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
  962. if isinstance(e.cause, HTTPError) and e.cause.status == 500:
  963. continue
  964. raise
  965. if entry:
  966. entries.append(entry)
  967. if entries:
  968. return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
  969. # http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227
  970. group_id = self._search_regex(
  971. rf'<div[^>]+\bclass=["\']video["\'][^>]+\bdata-pid=["\']({self._ID_REGEX})',
  972. webpage, 'group id', default=None)
  973. if group_id:
  974. return self.url_result(
  975. f'https://www.bbc.co.uk/programmes/{group_id}', BBCCoUkIE)
  976. # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
  977. programme_id = self._search_regex(
  978. [rf'data-(?:video-player|media)-vpid="({self._ID_REGEX})"',
  979. rf'<param[^>]+name="externalIdentifier"[^>]+value="({self._ID_REGEX})"',
  980. rf'videoId\s*:\s*["\']({self._ID_REGEX})["\']'],
  981. webpage, 'vpid', default=None)
  982. if programme_id:
  983. formats, subtitles = self._download_media_selector(programme_id)
  984. # digitalData may be missing (e.g. http://www.bbc.com/autos/story/20130513-hyundais-rock-star)
  985. digital_data = self._parse_json(
  986. self._search_regex(
  987. r'var\s+digitalData\s*=\s*({.+?});?\n', webpage, 'digital data', default='{}'),
  988. programme_id, fatal=False)
  989. page_info = digital_data.get('page', {}).get('pageInfo', {})
  990. title = page_info.get('pageName') or self._og_search_title(webpage)
  991. description = page_info.get('description') or self._og_search_description(webpage)
  992. timestamp = parse_iso8601(page_info.get('publicationDate')) or timestamp
  993. return {
  994. 'id': programme_id,
  995. 'title': title,
  996. 'description': description,
  997. 'timestamp': timestamp,
  998. 'formats': formats,
  999. 'subtitles': subtitles,
  1000. }
  1001. # bbc reel (e.g. https://www.bbc.com/reel/video/p07c6sb6/how-positive-thinking-is-harming-your-happiness)
  1002. initial_data = self._parse_json(self._html_search_regex(
  1003. r'<script[^>]+id=(["\'])initial-data\1[^>]+data-json=(["\'])(?P<json>(?:(?!\2).)+)',
  1004. webpage, 'initial data', default='{}', group='json'), playlist_id, fatal=False)
  1005. if initial_data:
  1006. init_data = try_get(
  1007. initial_data, lambda x: x['initData']['items'][0], dict) or {}
  1008. smp_data = init_data.get('smpData') or {}
  1009. clip_data = try_get(smp_data, lambda x: x['items'][0], dict) or {}
  1010. version_id = clip_data.get('versionID')
  1011. if version_id:
  1012. title = smp_data['title']
  1013. formats, subtitles = self._download_media_selector(version_id)
  1014. image_url = smp_data.get('holdingImageURL')
  1015. display_date = init_data.get('displayDate')
  1016. topic_title = init_data.get('topicTitle')
  1017. return {
  1018. 'id': version_id,
  1019. 'title': title,
  1020. 'formats': formats,
  1021. 'alt_title': init_data.get('shortTitle'),
  1022. 'thumbnail': image_url.replace('$recipe', 'raw') if image_url else None,
  1023. 'description': smp_data.get('summary') or init_data.get('shortSummary'),
  1024. 'upload_date': display_date.replace('-', '') if display_date else None,
  1025. 'subtitles': subtitles,
  1026. 'duration': int_or_none(clip_data.get('duration')),
  1027. 'categories': [topic_title] if topic_title else None,
  1028. }
  1029. # Morph based embed (e.g. http://www.bbc.co.uk/sport/live/olympics/36895975)
  1030. # Several setPayload calls may be present but the video(s)
  1031. # should be in one that mentions leadMedia or videoData
  1032. morph_payload = self._search_json(
  1033. r'\bMorph\s*\.\s*setPayload\s*\([^,]+,', webpage, 'morph payload', playlist_id,
  1034. contains_pattern=r'{(?s:(?:(?!</script>).)+(?:"leadMedia"|\\"videoData\\")\s*:.+)}',
  1035. default={})
  1036. if morph_payload:
  1037. for lead_media in traverse_obj(morph_payload, (
  1038. 'body', 'components', ..., 'props', 'leadMedia', {dict})):
  1039. programme_id = traverse_obj(lead_media, ('identifiers', ('vpid', 'playablePid'), {str}, any))
  1040. if not programme_id:
  1041. continue
  1042. formats, subtitles = self._download_media_selector(programme_id)
  1043. return {
  1044. 'id': programme_id,
  1045. 'title': lead_media.get('title') or self._og_search_title(webpage),
  1046. **traverse_obj(lead_media, {
  1047. 'description': ('summary', {str}),
  1048. 'duration': ('duration', ('rawDuration', 'formattedDuration', 'spokenDuration'), {parse_duration}),
  1049. 'uploader': ('masterBrand', {str}),
  1050. 'uploader_id': ('mid', {str}),
  1051. }),
  1052. 'formats': formats,
  1053. 'subtitles': subtitles,
  1054. }
  1055. body = self._parse_json(traverse_obj(morph_payload, (
  1056. 'body', 'content', 'article', 'body')), playlist_id, fatal=False)
  1057. for video_data in traverse_obj(body, (lambda _, v: v['videoData']['pid'], 'videoData')):
  1058. if video_data.get('vpid'):
  1059. video_id = video_data['vpid']
  1060. formats, subtitles = self._download_media_selector(video_id)
  1061. entry = {
  1062. 'id': video_id,
  1063. 'formats': formats,
  1064. 'subtitles': subtitles,
  1065. }
  1066. else:
  1067. video_id = video_data['pid']
  1068. entry = self.url_result(
  1069. f'https://www.bbc.co.uk/programmes/{video_id}', BBCCoUkIE,
  1070. video_id, url_transparent=True)
  1071. entry.update({
  1072. 'timestamp': traverse_obj(morph_payload, (
  1073. 'body', 'content', 'article', 'dateTimeInfo', 'dateTime', {parse_iso8601}),
  1074. ),
  1075. **traverse_obj(video_data, {
  1076. 'thumbnail': (('iChefImage', 'image'), {url_or_none}, any),
  1077. 'title': (('title', 'caption'), {str}, any),
  1078. 'duration': ('duration', {parse_duration}),
  1079. }),
  1080. })
  1081. if video_data.get('isLead') and not self._yes_playlist(playlist_id, video_id):
  1082. return entry
  1083. entries.append(entry)
  1084. if entries:
  1085. playlist_title = traverse_obj(morph_payload, (
  1086. 'body', 'content', 'article', 'headline', {str})) or playlist_title
  1087. return self.playlist_result(
  1088. entries, playlist_id, playlist_title, playlist_description)
  1089. # various PRELOADED_STATE JSON
  1090. preload_state = self._search_json(
  1091. r'window\.__(?:PWA_)?PRELOADED_STATE__\s*=', webpage,
  1092. 'preload state', playlist_id, transform_source=js_to_json, default={})
  1093. # PRELOADED_STATE with current programme
  1094. current_programme = traverse_obj(preload_state, ('programmes', 'current', {dict}))
  1095. programme_id = traverse_obj(current_programme, ('id', {str}))
  1096. if programme_id and current_programme.get('type') == 'playable_item':
  1097. title = traverse_obj(current_programme, ('titles', ('tertiary', 'secondary'), {str}, any)) or playlist_title
  1098. formats, subtitles = self._download_media_selector(programme_id)
  1099. return {
  1100. 'id': programme_id,
  1101. 'title': title,
  1102. 'formats': formats,
  1103. **traverse_obj(current_programme, {
  1104. 'description': ('synopses', ('long', 'medium', 'short'), {str}, any),
  1105. 'thumbnail': ('image_url', {lambda u: url_or_none(u.replace('{recipe}', 'raw'))}),
  1106. 'duration': ('duration', 'value', {int_or_none}),
  1107. 'uploader': ('network', 'short_title', {str}),
  1108. 'uploader_id': ('network', 'id', {str}),
  1109. 'timestamp': ((('availability', 'from'), ('release', 'date')), {parse_iso8601}, any),
  1110. 'series': ('titles', 'primary', {str}),
  1111. }),
  1112. 'subtitles': subtitles,
  1113. 'chapters': traverse_obj(preload_state, (
  1114. 'tracklist', 'tracks', lambda _, v: float(v['offset']['start']), {
  1115. 'title': ('titles', {lambda x: join_nonempty(
  1116. 'primary', 'secondary', 'tertiary', delim=' - ', from_dict=x)}),
  1117. 'start_time': ('offset', 'start', {float_or_none}),
  1118. 'end_time': ('offset', 'end', {float_or_none}),
  1119. }),
  1120. ),
  1121. }
  1122. # PWA_PRELOADED_STATE with article video asset
  1123. asset_id = traverse_obj(preload_state, (
  1124. 'entities', 'articles', lambda k, _: k.rsplit('/', 1)[-1] == playlist_id,
  1125. 'assetVideo', 0, {str}, any))
  1126. if asset_id:
  1127. video_id = traverse_obj(preload_state, ('entities', 'videos', asset_id, 'vpid', {str}))
  1128. if video_id:
  1129. article = traverse_obj(preload_state, (
  1130. 'entities', 'articles', lambda _, v: v['assetVideo'][0] == asset_id, any))
  1131. def image_url(image_id):
  1132. return traverse_obj(preload_state, (
  1133. 'entities', 'images', image_id, 'url',
  1134. {lambda u: url_or_none(u.replace('$recipe', 'raw'))}))
  1135. formats, subtitles = self._download_media_selector(video_id)
  1136. return {
  1137. 'id': video_id,
  1138. **traverse_obj(preload_state, ('entities', 'videos', asset_id, {
  1139. 'title': ('title', {str}),
  1140. 'description': (('synopsisLong', 'synopsisMedium', 'synopsisShort'), {str}, any),
  1141. 'thumbnail': (0, {image_url}),
  1142. 'duration': ('duration', {int_or_none}),
  1143. })),
  1144. 'formats': formats,
  1145. 'subtitles': subtitles,
  1146. 'timestamp': traverse_obj(article, ('displayDate', {parse_iso8601})),
  1147. }
  1148. else:
  1149. return self.url_result(
  1150. f'https://www.bbc.co.uk/programmes/{asset_id}', BBCCoUkIE,
  1151. asset_id, playlist_title, display_id=playlist_id,
  1152. description=playlist_description)
  1153. bbc3_config = self._parse_json(
  1154. self._search_regex(
  1155. r'(?s)bbcthreeConfig\s*=\s*({.+?})\s*;\s*<', webpage,
  1156. 'bbcthree config', default='{}'),
  1157. playlist_id, transform_source=js_to_json, fatal=False) or {}
  1158. payload = bbc3_config.get('payload') or {}
  1159. if payload:
  1160. clip = payload.get('currentClip') or {}
  1161. clip_vpid = clip.get('vpid')
  1162. clip_title = clip.get('title')
  1163. if clip_vpid and clip_title:
  1164. formats, subtitles = self._download_media_selector(clip_vpid)
  1165. return {
  1166. 'id': clip_vpid,
  1167. 'title': clip_title,
  1168. 'thumbnail': dict_get(clip, ('poster', 'imageUrl')),
  1169. 'description': clip.get('description'),
  1170. 'duration': parse_duration(clip.get('duration')),
  1171. 'formats': formats,
  1172. 'subtitles': subtitles,
  1173. }
  1174. bbc3_playlist = try_get(
  1175. payload, lambda x: x['content']['bbcMedia']['playlist'],
  1176. dict)
  1177. if bbc3_playlist:
  1178. playlist_title = bbc3_playlist.get('title') or playlist_title
  1179. thumbnail = bbc3_playlist.get('holdingImageURL')
  1180. entries = []
  1181. for bbc3_item in bbc3_playlist['items']:
  1182. programme_id = bbc3_item.get('versionID')
  1183. if not programme_id:
  1184. continue
  1185. formats, subtitles = self._download_media_selector(programme_id)
  1186. entries.append({
  1187. 'id': programme_id,
  1188. 'title': playlist_title,
  1189. 'thumbnail': thumbnail,
  1190. 'timestamp': timestamp,
  1191. 'formats': formats,
  1192. 'subtitles': subtitles,
  1193. })
  1194. return self.playlist_result(
  1195. entries, playlist_id, playlist_title, playlist_description)
  1196. def parse_model(model):
  1197. """Extract single video from model structure"""
  1198. item_id = traverse_obj(model, ('versions', 0, 'versionId', {str}))
  1199. if not item_id:
  1200. return
  1201. formats, subtitles = self._download_media_selector(item_id)
  1202. return {
  1203. 'id': item_id,
  1204. 'formats': formats,
  1205. 'subtitles': subtitles,
  1206. **traverse_obj(model, {
  1207. 'title': ('title', {str}),
  1208. 'thumbnail': ('imageUrl', {lambda u: urljoin(url, u.replace('$recipe', 'raw'))}),
  1209. 'description': ('synopses', ('long', 'medium', 'short'), {str}, {lambda x: x or None}, any),
  1210. 'duration': ('versions', 0, 'duration', {int}),
  1211. 'timestamp': ('versions', 0, 'availableFrom', {functools.partial(int_or_none, scale=1000)}),
  1212. }),
  1213. }
  1214. def is_type(*types):
  1215. return lambda _, v: v['type'] in types
  1216. initial_data = self._search_regex(
  1217. r'window\.__INITIAL_DATA__\s*=\s*("{.+?}")\s*;', webpage,
  1218. 'quoted preload state', default=None)
  1219. if initial_data is None:
  1220. initial_data = self._search_regex(
  1221. r'window\.__INITIAL_DATA__\s*=\s*({.+?})\s*;', webpage,
  1222. 'preload state', default='{}')
  1223. else:
  1224. initial_data = self._parse_json(initial_data or '"{}"', playlist_id, fatal=False)
  1225. initial_data = self._parse_json(initial_data, playlist_id, fatal=False)
  1226. if initial_data:
  1227. for video_data in traverse_obj(initial_data, (
  1228. 'stores', 'article', 'articleBodyContent', is_type('video'))):
  1229. model = traverse_obj(video_data, (
  1230. 'model', 'blocks', is_type('aresMedia'),
  1231. 'model', 'blocks', is_type('aresMediaMetadata'),
  1232. 'model', {dict}, any))
  1233. entry = parse_model(model)
  1234. if entry:
  1235. entries.append(entry)
  1236. if entries:
  1237. return self.playlist_result(
  1238. entries, playlist_id, playlist_title, playlist_description)
  1239. def parse_media(media):
  1240. if not media:
  1241. return
  1242. for item in (try_get(media, lambda x: x['media']['items'], list) or []):
  1243. item_id = item.get('id')
  1244. item_title = item.get('title')
  1245. if not (item_id and item_title):
  1246. continue
  1247. formats, subtitles = self._download_media_selector(item_id)
  1248. item_desc = None
  1249. blocks = try_get(media, lambda x: x['summary']['blocks'], list)
  1250. if blocks:
  1251. summary = []
  1252. for block in blocks:
  1253. text = try_get(block, lambda x: x['model']['text'], str)
  1254. if text:
  1255. summary.append(text)
  1256. if summary:
  1257. item_desc = '\n\n'.join(summary)
  1258. item_time = None
  1259. for meta in try_get(media, lambda x: x['metadata']['items'], list) or []:
  1260. if try_get(meta, lambda x: x['label']) == 'Published':
  1261. item_time = unified_timestamp(meta.get('timestamp'))
  1262. break
  1263. entries.append({
  1264. 'id': item_id,
  1265. 'title': item_title,
  1266. 'thumbnail': item.get('holdingImageUrl'),
  1267. 'formats': formats,
  1268. 'subtitles': subtitles,
  1269. 'timestamp': item_time,
  1270. 'description': strip_or_none(item_desc),
  1271. 'duration': int_or_none(item.get('duration')),
  1272. })
  1273. for resp in traverse_obj(initial_data, ('data', lambda _, v: v['name'])):
  1274. name = resp['name']
  1275. if name == 'media-experience':
  1276. parse_media(try_get(resp, lambda x: x['data']['initialItem']['mediaItem'], dict))
  1277. elif name == 'article':
  1278. for block in traverse_obj(resp, (
  1279. 'data', (None, ('content', 'model')), 'blocks',
  1280. is_type('media', 'video'), 'model', {dict})):
  1281. parse_media(block)
  1282. return self.playlist_result(
  1283. entries, playlist_id, playlist_title, playlist_description)
  1284. # extract from SIMORGH_DATA hydration JSON
  1285. simorgh_data = self._search_json(
  1286. r'window\s*\.\s*SIMORGH_DATA\s*=', webpage,
  1287. 'simorgh data', playlist_id, default={})
  1288. if simorgh_data:
  1289. done = False
  1290. for video_data in traverse_obj(simorgh_data, (
  1291. 'pageData', 'content', 'model', 'blocks', is_type('video', 'legacyMedia'))):
  1292. model = traverse_obj(video_data, (
  1293. 'model', 'blocks', is_type('aresMedia'),
  1294. 'model', 'blocks', is_type('aresMediaMetadata'),
  1295. 'model', {dict}, any))
  1296. if video_data['type'] == 'video':
  1297. entry = parse_model(model)
  1298. else: # legacyMedia: no duration, subtitles
  1299. block_id, entry = traverse_obj(model, ('blockId', {str})), None
  1300. media_data = traverse_obj(simorgh_data, (
  1301. 'pageData', 'promo', 'media',
  1302. {lambda x: x if x['id'] == block_id else None}))
  1303. formats = traverse_obj(media_data, ('playlist', lambda _, v: url_or_none(v['url']), {
  1304. 'url': ('url', {url_or_none}),
  1305. 'ext': ('format', {str}),
  1306. 'tbr': ('bitrate', {functools.partial(int_or_none, scale=1000)}),
  1307. }))
  1308. if formats:
  1309. entry = {
  1310. 'id': block_id,
  1311. 'display_id': playlist_id,
  1312. 'formats': formats,
  1313. 'description': traverse_obj(simorgh_data, ('pageData', 'promo', 'summary', {str})),
  1314. **traverse_obj(model, {
  1315. 'title': ('title', {str}),
  1316. 'thumbnail': ('imageUrl', {lambda u: urljoin(url, u.replace('$recipe', 'raw'))}),
  1317. 'description': ('synopses', ('long', 'medium', 'short'), {str}, any),
  1318. 'timestamp': ('firstPublished', {functools.partial(int_or_none, scale=1000)}),
  1319. }),
  1320. }
  1321. done = True
  1322. if entry:
  1323. entries.append(entry)
  1324. if done:
  1325. break
  1326. if entries:
  1327. return self.playlist_result(
  1328. entries, playlist_id, playlist_title, playlist_description)
  1329. def extract_all(pattern):
  1330. return list(filter(None, (
  1331. self._parse_json(s, playlist_id, fatal=False)
  1332. for s in re.findall(pattern, webpage))))
  1333. # US accessed article with single embedded video (e.g.
  1334. # https://www.bbc.com/news/uk-68546268)
  1335. next_data = traverse_obj(self._search_nextjs_data(webpage, playlist_id, default={}),
  1336. ('props', 'pageProps', 'page'))
  1337. model = traverse_obj(next_data, (
  1338. ..., 'contents', is_type('video'),
  1339. 'model', 'blocks', is_type('media'),
  1340. 'model', 'blocks', is_type('mediaMetadata'),
  1341. 'model', {dict}, any))
  1342. if model and (entry := parse_model(model)):
  1343. if not entry.get('timestamp'):
  1344. entry['timestamp'] = traverse_obj(next_data, (
  1345. ..., 'contents', is_type('timestamp'), 'model',
  1346. 'timestamp', {functools.partial(int_or_none, scale=1000)}, any))
  1347. entries.append(entry)
  1348. return self.playlist_result(
  1349. entries, playlist_id, playlist_title, playlist_description)
  1350. # Multiple video article (e.g.
  1351. # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460)
  1352. EMBED_URL = rf'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+{self._ID_REGEX}(?:\b[^"]+)?'
  1353. entries = []
  1354. for match in extract_all(r'new\s+SMP\(({.+?})\)'):
  1355. embed_url = match.get('playerSettings', {}).get('externalEmbedUrl')
  1356. if embed_url and re.match(EMBED_URL, embed_url):
  1357. entries.append(embed_url)
  1358. entries.extend(re.findall(
  1359. rf'setPlaylist\("({EMBED_URL})"\)', webpage))
  1360. if entries:
  1361. return self.playlist_result(
  1362. [self.url_result(entry_, 'BBCCoUk') for entry_ in entries],
  1363. playlist_id, playlist_title, playlist_description)
  1364. # Multiple video article (e.g. http://www.bbc.com/news/world-europe-32668511)
  1365. medias = extract_all(r"data-media-meta='({[^']+})'")
  1366. if not medias:
  1367. # Single video article (e.g. http://www.bbc.com/news/video_and_audio/international)
  1368. media_asset = self._search_regex(
  1369. r'mediaAssetPage\.init\(\s*({.+?}), "/',
  1370. webpage, 'media asset', default=None)
  1371. if media_asset:
  1372. media_asset_page = self._parse_json(media_asset, playlist_id, fatal=False)
  1373. medias = []
  1374. for video in media_asset_page.get('videos', {}).values():
  1375. medias.extend(video.values())
  1376. if not medias:
  1377. # Multiple video playlist with single `now playing` entry (e.g.
  1378. # http://www.bbc.com/news/video_and_audio/must_see/33767813)
  1379. vxp_playlist = self._parse_json(
  1380. self._search_regex(
  1381. r'<script[^>]+class="vxp-playlist-data"[^>]+type="application/json"[^>]*>([^<]+)</script>',
  1382. webpage, 'playlist data'),
  1383. playlist_id)
  1384. playlist_medias = []
  1385. for item in vxp_playlist:
  1386. media = item.get('media')
  1387. if not media:
  1388. continue
  1389. playlist_medias.append(media)
  1390. # Download single video if found media with asset id matching the video id from URL
  1391. if item.get('advert', {}).get('assetId') == playlist_id:
  1392. medias = [media]
  1393. break
  1394. # Fallback to the whole playlist
  1395. if not medias:
  1396. medias = playlist_medias
  1397. entries = []
  1398. for num, media_meta in enumerate(medias, start=1):
  1399. formats, subtitles = self._extract_from_media_meta(media_meta, playlist_id)
  1400. if not formats and not self.get_param('ignore_no_formats'):
  1401. continue
  1402. video_id = media_meta.get('externalId')
  1403. if not video_id:
  1404. video_id = playlist_id if len(medias) == 1 else f'{playlist_id}-{num}'
  1405. title = media_meta.get('caption')
  1406. if not title:
  1407. title = playlist_title if len(medias) == 1 else f'{playlist_title} - Video {num}'
  1408. duration = int_or_none(media_meta.get('durationInSeconds')) or parse_duration(media_meta.get('duration'))
  1409. images = []
  1410. for image in media_meta.get('images', {}).values():
  1411. images.extend(image.values())
  1412. if 'image' in media_meta:
  1413. images.append(media_meta['image'])
  1414. thumbnails = [{
  1415. 'url': image.get('href'),
  1416. 'width': int_or_none(image.get('width')),
  1417. 'height': int_or_none(image.get('height')),
  1418. } for image in images]
  1419. entries.append({
  1420. 'id': video_id,
  1421. 'title': title,
  1422. 'thumbnails': thumbnails,
  1423. 'duration': duration,
  1424. 'timestamp': timestamp,
  1425. 'formats': formats,
  1426. 'subtitles': subtitles,
  1427. })
  1428. return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
  class BBCCoUkArticleIE(InfoExtractor):
      """Extract the clips embedded in a bbc.co.uk programme article as a playlist."""

      _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/articles/(?P<id>[a-zA-Z0-9]+)'
      IE_NAME = 'bbc.co.uk:article'
      IE_DESC = 'BBC articles'

      _TEST = {
          'url': 'http://www.bbc.co.uk/programmes/articles/3jNQLTMrPlYGTBn0WV6M2MS/not-your-typical-role-model-ada-lovelace-the-19th-century-programmer',
          'info_dict': {
              'id': '3jNQLTMrPlYGTBn0WV6M2MS',
              'title': 'Calculating Ada: The Countess of Computing - Not your typical role model: Ada Lovelace the 19th century programmer - BBC Four',
              'description': 'Hannah Fry reveals some of her surprising discoveries about Ada Lovelace during filming.',
          },
          'playlist_count': 4,
          'add_ie': ['BBCCoUk'],
      }

      def _real_extract(self, url):
          """Download the article page and return a playlist of its clips.

          The playlist id is the article id matched from ``url``; title and
          description come from the page's OpenGraph metadata. Each
          ``resource`` URL found on a ``typeof="Clip"`` div becomes one
          url-type entry.
          """
          playlist_id = self._match_id(url)
          webpage = self._download_webpage(url, playlist_id)

          title = self._og_search_title(webpage)

          # Only strip when a description was actually found: calling
          # .strip() on a missing (None) value would raise AttributeError
          # and abort the extraction over optional metadata.
          # NOTE(review): the OpenGraph description lookup is expected to be
          # non-fatal and to yield None when the tag is absent - confirm
          # against the InfoExtractor helper.
          description = self._og_search_description(webpage)
          if description is not None:
              description = description.strip()

          entries = [self.url_result(programme_url) for programme_url in re.findall(
              r'<div[^>]+typeof="Clip"[^>]+resource="([^"]+)"', webpage)]
          return self.playlist_result(entries, playlist_id, title, description)
  class BBCCoUkPlaylistBaseIE(InfoExtractor):
      """Common pagination logic for bbc.co.uk listing pages.

      The methods here read ``_URL_TEMPLATE``, ``_VIDEO_ID_TEMPLATE`` and
      ``_extract_title_and_description`` from ``self``; none of them is
      defined in this class.
      """

      def _entries(self, webpage, url, playlist_id):
          """Yield a url-type result for every video id found on the listing.

          When ``url`` carries an explicit ``page`` query parameter only the
          already downloaded ``webpage`` is scanned; otherwise the
          "pagination next" link is followed until no such link remains.
          """
          query = urllib.parse.parse_qs(urllib.parse.urlparse(url).query)
          only_this_page = 'page' in query

          id_pattern = self._VIDEO_ID_TEMPLATE % BBCCoUkIE._ID_REGEX
          target_ie = BBCCoUkIE.ie_key()

          # The page passed in counts as page 1, so the first download is page 2.
          page_num = 1
          while True:
              for video_id in re.findall(id_pattern, webpage):
                  yield self.url_result(self._URL_TEMPLATE % video_id, target_ie)

              if only_this_page:
                  return

              next_href = self._search_regex(
                  r'<li[^>]+class=(["\'])pagination_+next\1[^>]*><a[^>]+href=(["\'])(?P<url>(?:(?!\2).)+)\2',
                  webpage, 'next page url', default=None, group='url')
              if not next_href:
                  return

              page_num += 1
              # NOTE(review): page_num is also handed over as the fourth
              # positional argument, exactly as before - confirm that this is
              # the parameter it is meant for.
              webpage = self._download_webpage(
                  urllib.parse.urljoin(url, next_href), playlist_id,
                  f'Downloading page {page_num}', page_num)

      def _real_extract(self, url):
          """Download the listing page and wrap its entries in a playlist result."""
          playlist_id = self._match_id(url)
          webpage = self._download_webpage(url, playlist_id)
          title, description = self._extract_title_and_description(webpage)
          entries = self._entries(webpage, url, playlist_id)
          return self.playlist_result(entries, playlist_id, title, description)
  class BBCCoUkIPlayerPlaylistBaseIE(InfoExtractor):
      """Base class for iPlayer listings that are fetched page by page via ``_call_api``.

      The methods here rely on the following attributes and hooks of
      ``self``, none of which is defined in this class: ``_PAGE_SIZE``,
      ``_DESCRIPTION_KEY``, ``_call_api``, ``_get_elements``,
      ``_get_episode``, ``_get_episode_image``, ``_get_episode_field``,
      ``_get_playlist_data`` and ``_get_playlist_title``.
      """

      _VALID_URL_TMPL = rf'https?://(?:www\.)?bbc\.co\.uk/iplayer/%s/(?P<id>{BBCCoUkIE._ID_REGEX})'

      @staticmethod
      def _get_default(episode, key, default_key='default'):
          """Return ``episode[key][default_key]`` via ``try_get``."""
          def nested_lookup(obj):
              return obj[key][default_key]

          return try_get(episode, nested_lookup)

      def _get_description(self, data):
          """Look up a 'large', 'medium' or 'small' text under ``_DESCRIPTION_KEY`` via ``dict_get``."""
          return dict_get(
              data.get(self._DESCRIPTION_KEY) or {},
              ('large', 'medium', 'small'))

      def _fetch_page(self, programme_id, per_page, series_id, page):
          """Yield url-type entries for one zero-based ``page`` of the listing.

          The API is called with ``page + 1``; elements whose episode has no
          id are skipped.
          """
          api_data = self._call_api(programme_id, per_page, page + 1, series_id)
          for element in self._get_elements(api_data):
              episode = self._get_episode(element)
              episode_id = episode.get('id')
              if not episode_id:
                  continue

              image = self._get_episode_image(episode)
              category = self._get_default(episode, 'labels', 'category')

              yield {
                  '_type': 'url',
                  'id': episode_id,
                  'title': self._get_episode_field(episode, 'subtitle'),
                  'url': 'https://www.bbc.co.uk/iplayer/episode/' + episode_id,
                  # The image URL carries a '{recipe}' placeholder that is
                  # replaced with 'raw'.
                  'thumbnail': image.replace('{recipe}', 'raw') if image else None,
                  'description': self._get_description(episode),
                  'categories': [category] if category else None,
                  'series': self._get_episode_field(episode, 'title'),
                  'ie_key': BBCCoUkIE.ie_key(),
              }

      def _real_extract(self, url):
          """Build the playlist for ``url``.

          With a ``page`` query parameter only that page is fetched (36
          entries per request); without it all pages are exposed through an
          ``OnDemandPagedList`` using ``_PAGE_SIZE``. Playlist title and
          description come from a separate API call.
          """
          pid = self._match_id(url)
          qs = parse_qs(url)

          def first_value(name):
              return qs.get(name, [None])[0]

          series_id = first_value('seriesId')
          page = first_value('page')

          if page:
              fetch_page = functools.partial(self._fetch_page, pid, 36, series_id)
              entries = fetch_page(int(page) - 1)
          else:
              fetch_page = functools.partial(
                  self._fetch_page, pid, self._PAGE_SIZE, series_id)
              entries = OnDemandPagedList(fetch_page, self._PAGE_SIZE)

          playlist_data = self._get_playlist_data(self._call_api(pid, 1))
          playlist_title = self._get_playlist_title(playlist_data)
          playlist_description = self._get_description(playlist_data)
          return self.playlist_result(
              entries, pid, playlist_title, playlist_description)
  class BBCCoUkIPlayerEpisodesIE(BBCCoUkIPlayerPlaylistBaseIE):
      """iPlayer ``/episodes/`` listings, read from the graph.ibl.api.bbc.co.uk endpoint."""

      IE_NAME = 'bbc.co.uk:iplayer:episodes'
      _VALID_URL = BBCCoUkIPlayerPlaylistBaseIE._VALID_URL_TMPL % 'episodes'
      _TESTS = [{
          'url': 'http://www.bbc.co.uk/iplayer/episodes/b05rcz9v',
          'info_dict': {
              'id': 'b05rcz9v',
              'title': 'The Disappearance',
              'description': 'md5:58eb101aee3116bad4da05f91179c0cb',
          },
          'playlist_mincount': 8,
      }, {
          # all seasons
          'url': 'https://www.bbc.co.uk/iplayer/episodes/b094m5t9/doctor-foster',
          'info_dict': {
              'id': 'b094m5t9',
              'title': 'Doctor Foster',
              'description': 'md5:5aa9195fad900e8e14b52acd765a9fd6',
          },
          'playlist_mincount': 10,
      }, {
          # explicit season
          'url': 'https://www.bbc.co.uk/iplayer/episodes/b094m5t9/doctor-foster?seriesId=b094m6nv',
          'info_dict': {
              'id': 'b094m5t9',
              'title': 'Doctor Foster',
              'description': 'md5:5aa9195fad900e8e14b52acd765a9fd6',
          },
          'playlist_mincount': 5,
      }, {
          # all pages
          'url': 'https://www.bbc.co.uk/iplayer/episodes/m0004c4v/beechgrove',
          'info_dict': {
              'id': 'm0004c4v',
              'title': 'Beechgrove',
              'description': 'Gardening show that celebrates Scottish horticulture and growing conditions.',
          },
          'playlist_mincount': 37,
      }, {
          # explicit page
          'url': 'https://www.bbc.co.uk/iplayer/episodes/m0004c4v/beechgrove?page=2',
          'info_dict': {
              'id': 'm0004c4v',
              'title': 'Beechgrove',
              'description': 'Gardening show that celebrates Scottish horticulture and growing conditions.',
          },
          'playlist_mincount': 1,
      }]
      _PAGE_SIZE = 100
      _DESCRIPTION_KEY = 'synopsis'

      def _get_episode_image(self, episode):
          """Return the 'default' entry of the episode's 'image' mapping."""
          return self._get_default(episode, 'image')

      def _get_episode_field(self, episode, field):
          """Return the 'default' entry of the episode's ``field`` mapping."""
          return self._get_default(episode, field)

      @staticmethod
      def _get_elements(data):
          """Return the result list stored under ``data['entities']['results']``."""
          entities = data['entities']
          return entities['results']

      @staticmethod
      def _get_episode(element):
          """Return the element's 'episode' value, or an empty dict when it is missing or falsy."""
          return element.get('episode') or {}

      def _call_api(self, pid, per_page, page=1, series_id=None):
          """Request one page for programme ``pid`` and return ``['data']['programme']``.

          ``series_id``, when given, is sent as the ``sliceId`` variable.
          """
          variables = {
              'id': pid,
              'page': page,
              'perPage': per_page,
          }
          if series_id:
              variables['sliceId'] = series_id

          # NOTE(review): the fixed 'id' presumably names a stored query on the
          # server side - confirm before changing it.
          body = json.dumps({
              'id': '5692d93d5aac8d796a0305e895e61551',
              'variables': variables,
          }).encode()

          response = self._download_json(
              'https://graph.ibl.api.bbc.co.uk/', pid,
              headers={'Content-Type': 'application/json'},
              data=body)
          return response['data']['programme']

      @staticmethod
      def _get_playlist_data(data):
          """Return ``data`` unchanged; the API payload itself holds the playlist metadata."""
          return data

      def _get_playlist_title(self, data):
          """Return the 'default' entry of the playlist's 'title' mapping."""
          return self._get_default(data, 'title')
  class BBCCoUkIPlayerGroupIE(BBCCoUkIPlayerPlaylistBaseIE):
      """iPlayer ``/group/`` listings, read from the ibl.api.bbc.co.uk groups endpoint."""

      IE_NAME = 'bbc.co.uk:iplayer:group'
      _VALID_URL = BBCCoUkIPlayerPlaylistBaseIE._VALID_URL_TMPL % 'group'
      _TESTS = [{
          # Available for over a year unlike 30 days for most other programmes
          'url': 'http://www.bbc.co.uk/iplayer/group/p02tcc32',
          'info_dict': {
              'id': 'p02tcc32',
              'title': 'Bohemian Icons',
              'description': 'md5:683e901041b2fe9ba596f2ab04c4dbe7',
          },
          'playlist_mincount': 10,
      }, {
          # all pages
          'url': 'https://www.bbc.co.uk/iplayer/group/p081d7j7',
          'info_dict': {
              'id': 'p081d7j7',
              'title': 'Music in Scotland',
              'description': 'Perfomances in Scotland and programmes featuring Scottish acts.',
          },
          'playlist_mincount': 47,
      }, {
          # explicit page
          'url': 'https://www.bbc.co.uk/iplayer/group/p081d7j7?page=2',
          'info_dict': {
              'id': 'p081d7j7',
              'title': 'Music in Scotland',
              'description': 'Perfomances in Scotland and programmes featuring Scottish acts.',
          },
          'playlist_mincount': 11,
      }]
      _PAGE_SIZE = 200
      _DESCRIPTION_KEY = 'synopses'

      def _get_episode_image(self, episode):
          """Return the 'standard' entry of the episode's 'images' mapping."""
          return self._get_default(episode, 'images', 'standard')

      def _get_episode_field(self, episode, field):
          """Return ``episode[field]`` directly, or None when the key is absent."""
          return episode.get(field)

      @staticmethod
      def _get_elements(data):
          """Return the list stored under ``data['elements']``."""
          return data['elements']

      @staticmethod
      def _get_episode(element):
          """Return ``element`` itself; here each element already is the episode."""
          return element

      def _call_api(self, pid, per_page, page=1, series_id=None):
          """Request one page of group ``pid`` and return its 'group_episodes' payload.

          ``series_id`` is accepted for signature parity with the sibling
          extractor and is not used here.
          """
          paging = {
              'page': page,
              'per_page': per_page,
          }
          response = self._download_json(
              f'http://ibl.api.bbc.co.uk/ibl/v1/groups/{pid}/episodes',
              pid, query=paging)
          return response['group_episodes']

      @staticmethod
      def _get_playlist_data(data):
          """Return the 'group' mapping that holds the playlist metadata."""
          return data['group']

      def _get_playlist_title(self, data):
          """Return the plain 'title' value of the group, or None when absent."""
          return data.get('title')
  class BBCCoUkPlaylistIE(BBCCoUkPlaylistBaseIE):
      """bbc.co.uk ``/programmes/<id>/(episodes|broadcasts|clips)`` listings."""

      IE_NAME = 'bbc.co.uk:playlist'
      _VALID_URL = rf'https?://(?:www\.)?bbc\.co\.uk/programmes/(?P<id>{BBCCoUkIE._ID_REGEX})/(?:episodes|broadcasts|clips)'
      _URL_TEMPLATE = 'http://www.bbc.co.uk/programmes/%s'
      _VIDEO_ID_TEMPLATE = r'data-pid=["\'](%s)'
      _TESTS = [{
          'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
          'info_dict': {
              'id': 'b05rcz9v',
              'title': 'The Disappearance - Clips - BBC Four',
              'description': 'French thriller serial about a missing teenager.',
          },
          'playlist_mincount': 7,
      }, {
          # multipage playlist, explicit page
          'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips?page=1',
          'info_dict': {
              'id': 'b00mfl7n',
              'title': 'Frozen Planet - Clips - BBC One',
              'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c',
          },
          'playlist_mincount': 24,
      }, {
          # multipage playlist, all pages
          'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips',
          'info_dict': {
              'id': 'b00mfl7n',
              'title': 'Frozen Planet - Clips - BBC One',
              'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c',
          },
          'playlist_mincount': 142,
      }, {
          'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/broadcasts/2016/06',
          'only_matching': True,
      }, {
          'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
          'only_matching': True,
      }, {
          'url': 'http://www.bbc.co.uk/programmes/b055jkys/episodes/player',
          'only_matching': True,
      }]

      def _extract_title_and_description(self, webpage):
          """Return the ``(title, description)`` pair from the page's OpenGraph tags.

          The title lookup is explicitly non-fatal.
          """
          return (
              self._og_search_title(webpage, fatal=False),
              self._og_search_description(webpage),
          )
  1695. 'only_matching': True,
  1696. }]
  1697. def _extract_title_and_description(self, webpage):
  1698. title = self._og_search_title(webpage, fatal=False)
  1699. description = self._og_search_description(webpage)
  1700. return title, description