nfb.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293
  1. from .common import InfoExtractor
  2. from ..utils import (
  3. int_or_none,
  4. join_nonempty,
  5. merge_dicts,
  6. parse_count,
  7. url_or_none,
  8. )
  9. from ..utils.traversal import traverse_obj
  class NFBBaseIE(InfoExtractor):
      """Shared URL prefix and episode-metadata helpers for nfb.ca / onf.ca."""

      # Common URL prefix; captures which of the two domains was used as `site`
      _VALID_URL_BASE = r'https?://(?:www\.)?(?P<site>nfb|onf)\.ca'
      _GEO_COUNTRIES = ['CA']

      def _extract_ep_data(self, webpage, video_id, fatal=False):
          """Return the JSON that follows ``episodesData:`` in *webpage*.

          A falsy search result is replaced by an empty dict, so callers
          always receive a container they can pass on to traverse_obj.
          """
          return self._search_json(
              r'episodesData\s*:', webpage, 'episode data', video_id, fatal=fatal) or {}

      def _extract_ep_info(self, data, video_id, slug=None):
          """Build an info dict for the episode in *data* matching *video_id*.

          An entry matches when *video_id* occurs in its ``embed_url``.
          ``episode_number`` is parsed from *slug* when given, otherwise from
          *video_id*, and is None when neither contains an episode marker.
          If no entry matches, only ``id``, ``title`` and ``episode_number``
          are returned.
          """
          # get_all=False yields the first matching entry, or None when nothing
          # matches; fall back to {} so the dict unpacking below cannot raise
          info = traverse_obj(data, (lambda _, v: video_id in v['embed_url'], {
              'description': ('description', {str}),
              'thumbnail': ('thumbnail_url', {url_or_none}),
              'uploader': ('data_layer', 'episodeMaker', {str}),
              'release_year': ('data_layer', 'episodeYear', {int_or_none}),
              'episode': ('data_layer', 'episodeTitle', {str}),
              'season': ('data_layer', 'seasonTitle', {str}),
              # The season number is parsed out of the season title
              'season_number': ('data_layer', 'seasonTitle', {parse_count}),
              'series': ('data_layer', 'seriesTitle', {str}),
          }), get_all=False) or {}

          return {
              **info,
              'id': video_id,
              'title': join_nonempty('series', 'episode', from_dict=info, delim=' - '),
              'episode_number': int_or_none(self._search_regex(
                  r'[/-]e(?:pisode)?-?(\d+)(?:[/-]|$)', slug or video_id, 'episode number', default=None)),
          }
  class NFBIE(NFBBaseIE):
      """Extractor for single films and series episodes on nfb.ca / onf.ca."""

      IE_NAME = 'nfb'
      IE_DESC = 'nfb.ca and onf.ca films and episodes'
      _VALID_URL = [
          rf'{NFBBaseIE._VALID_URL_BASE}/(?P<type>film)/(?P<id>[^/?#&]+)',
          rf'{NFBBaseIE._VALID_URL_BASE}/(?P<type>series?)/(?P<id>[^/?#&]+/s(?:ea|ai)son\d+/episode\d+)',
      ]
      _TESTS = [{
          'note': 'NFB film',
          'url': 'https://www.nfb.ca/film/trafficopter/',
          'info_dict': {
              'id': 'trafficopter',
              'ext': 'mp4',
              'title': 'Trafficopter',
              'description': 'md5:060228455eb85cf88785c41656776bc0',
              'thumbnail': r're:^https?://.*\.jpg$',
              'uploader': 'Barrie Howells',
              'release_year': 1972,
              'duration': 600.0,
          },
          'params': {'skip_download': 'm3u8'},
      }, {
          'note': 'ONF film',
          'url': 'https://www.onf.ca/film/mal-du-siecle/',
          'info_dict': {
              'id': 'mal-du-siecle',
              'ext': 'mp4',
              'title': 'Le mal du siècle',
              'description': 'md5:1abf774d77569ebe603419f2d344102b',
              'thumbnail': r're:^https?://.*\.jpg$',
              'uploader': 'Catherine Lepage',
              'release_year': 2019,
              'duration': 300.0,
          },
          'params': {'skip_download': 'm3u8'},
      }, {
          'note': 'NFB episode with English title',
          'url': 'https://www.nfb.ca/series/true-north-inside-the-rise-of-toronto-basketball/season1/episode9/',
          'info_dict': {
              'id': 'true-north-episode9-true-north-finale-making-it',
              'ext': 'mp4',
              'title': 'True North: Inside the Rise of Toronto Basketball - Finale: Making It',
              'description': 'We catch up with each player in the midst of their journey as they reflect on their road ahead.',
              'series': 'True North: Inside the Rise of Toronto Basketball',
              'release_year': 2018,
              'season': 'Season 1',
              'season_number': 1,
              'episode': 'Finale: Making It',
              'episode_number': 9,
              'uploader': 'Ryan Sidhoo',
              'thumbnail': r're:^https?://.*\.jpg$',
          },
          'params': {'skip_download': 'm3u8'},
      }, {
          'note': 'ONF episode with French title',
          'url': 'https://www.onf.ca/serie/direction-nord-la-montee-du-basketball-a-toronto/saison1/episode9/',
          'info_dict': {
              'id': 'direction-nord-episode-9',
              'ext': 'mp4',
              'title': 'Direction nord – La montée du basketball à Toronto - Finale : Réussir',
              'description': 'md5:349a57419b71432b97bf6083d92b029d',
              'series': 'Direction nord – La montée du basketball à Toronto',
              'release_year': 2018,
              'season': 'Saison 1',
              'season_number': 1,
              'episode': 'Finale : Réussir',
              'episode_number': 9,
              'uploader': 'Ryan Sidhoo',
              'thumbnail': r're:^https?://.*\.jpg$',
          },
          'params': {'skip_download': 'm3u8'},
      }, {
          'note': 'NFB episode with French title (needs geo-bypass)',
          'url': 'https://www.nfb.ca/series/etoile-du-nord/saison1/episode1/',
          'info_dict': {
              'id': 'etoile-du-nord-episode-1-lobservation',
              'ext': 'mp4',
              'title': 'Étoile du Nord - L\'observation',
              'description': 'md5:161a4617260dee3de70f509b2c9dd21b',
              'series': 'Étoile du Nord',
              'release_year': 2023,
              'season': 'Saison 1',
              'season_number': 1,
              'episode': 'L\'observation',
              'episode_number': 1,
              'uploader': 'Patrick Bossé',
              'thumbnail': r're:^https?://.*\.jpg$',
          },
          'params': {'skip_download': 'm3u8'},
      }, {
          'note': 'ONF episode with English title (needs geo-bypass)',
          'url': 'https://www.onf.ca/serie/north-star/season1/episode1/',
          'info_dict': {
              'id': 'north-star-episode-1-observation',
              'ext': 'mp4',
              'title': 'North Star - Observation',
              'description': 'md5:c727f370839d8a817392b9e3f23655c7',
              'series': 'North Star',
              'release_year': 2023,
              'season': 'Season 1',
              'season_number': 1,
              'episode': 'Observation',
              'episode_number': 1,
              'uploader': 'Patrick Bossé',
              'thumbnail': r're:^https?://.*\.jpg$',
          },
          'params': {'skip_download': 'm3u8'},
      }, {
          'note': 'NFB episode with /film/ URL and English title (needs geo-bypass)',
          'url': 'https://www.nfb.ca/film/north-star-episode-1-observation/',
          'info_dict': {
              'id': 'north-star-episode-1-observation',
              'ext': 'mp4',
              'title': 'North Star - Observation',
              'description': 'md5:c727f370839d8a817392b9e3f23655c7',
              'series': 'North Star',
              'release_year': 2023,
              'season': 'Season 1',
              'season_number': 1,
              'episode': 'Observation',
              'episode_number': 1,
              'uploader': 'Patrick Bossé',
              'thumbnail': r're:^https?://.*\.jpg$',
          },
          'params': {'skip_download': 'm3u8'},
      }, {
          'note': 'ONF episode with /film/ URL and French title (needs geo-bypass)',
          'url': 'https://www.onf.ca/film/etoile-du-nord-episode-1-lobservation/',
          'info_dict': {
              'id': 'etoile-du-nord-episode-1-lobservation',
              'ext': 'mp4',
              'title': 'Étoile du Nord - L\'observation',
              'description': 'md5:161a4617260dee3de70f509b2c9dd21b',
              'series': 'Étoile du Nord',
              'release_year': 2023,
              'season': 'Saison 1',
              'season_number': 1,
              'episode': 'L\'observation',
              'episode_number': 1,
              'uploader': 'Patrick Bossé',
              'thumbnail': r're:^https?://.*\.jpg$',
          },
          'params': {'skip_download': 'm3u8'},
      }, {
          'note': 'Season 2 episode w/o episode num in id, extract from json ld',
          'url': 'https://www.onf.ca/film/liste-des-choses-qui-existent-saison-2-ours',
          'info_dict': {
              'id': 'liste-des-choses-qui-existent-saison-2-ours',
              'ext': 'mp4',
              'title': 'La liste des choses qui existent - L\'ours en peluche',
              'description': 'md5:d5e8d8fc5f3a7385a9cf0f509b37e28a',
              'series': 'La liste des choses qui existent',
              'release_year': 2022,
              'season': 'Saison 2',
              'season_number': 2,
              'episode': 'L\'ours en peluche',
              'episode_number': 12,
              'uploader': 'Francis Papillon',
              'thumbnail': r're:^https?://.*\.jpg$',
          },
          'params': {'skip_download': 'm3u8'},
      }, {
          'note': 'NFB film /embed/player/ page',
          'url': 'https://www.nfb.ca/film/afterlife/embed/player/',
          'info_dict': {
              'id': 'afterlife',
              'ext': 'mp4',
              'title': 'Afterlife',
              'description': 'md5:84951394f594f1fb1e62d9c43242fdf5',
              'release_year': 1978,
              'duration': 420.0,
              'uploader': 'Ishu Patel',
              'thumbnail': r're:^https?://.*\.jpg$',
          },
          'params': {'skip_download': 'm3u8'},
      }]

      def _real_extract(self, url):
          """Extract formats, subtitles and metadata for one film or episode.

          The page is re-requested from a canonical URL built out of the
          matched site, type and slug. Metadata is scraped from the page HTML
          for films and taken from the page's episode data for episodes;
          JSON-LD found in the page fills in any fields still missing.
          """
          site, type_, slug = self._match_valid_url(url).group('site', 'type', 'id')
          # Need to construct the URL since we match /embed/player/ URLs as well
          webpage, urlh = self._download_webpage_handle(f'https://www.{site}.ca/{type_}/{slug}/', slug)
          # type_ can change from film to serie(s) after redirect; new slug may have episode number
          type_, slug = self._match_valid_url(urlh.url).group('type', 'id')

          player_data = self._search_json(
              r'window\.PLAYER_OPTIONS\[[^\]]+\]\s*=', webpage, 'player data', slug)
          video_id = self._match_id(player_data['overlay']['url'])  # overlay url always has unique slug

          formats, subtitles = self._extract_m3u8_formats_and_subtitles(
              player_data['source'], video_id, 'mp4', m3u8_id='hls')

          # Optional "described video" rendition, added at a lower preference
          if dv_source := url_or_none(player_data.get('dvSource')):
              fmts, subs = self._extract_m3u8_formats_and_subtitles(
                  dv_source, video_id, 'mp4', m3u8_id='dv', preference=-2, fatal=False)
              for fmt in fmts:
                  fmt['format_note'] = 'described video'
              formats.extend(fmts)
              self._merge_subtitles(subs, target=subtitles)

          if type_ == 'film':
              info = {
                  'id': video_id,
                  'title': self._html_search_regex(
                      r'["\']nfb_version_title["\']\s*:\s*["\']([^"\']+)',
                      webpage, 'title', default=None),
                  'description': self._html_search_regex(
                      r'<[^>]+\bid=["\']tabSynopsis["\'][^>]*>\s*<p[^>]*>\s*([^<]+)',
                      webpage, 'description', default=None),
                  'thumbnail': url_or_none(player_data.get('poster')),
                  'uploader': self._html_search_regex(
                      r'<[^>]+\bitemprop=["\']director["\'][^>]*>([^<]+)', webpage, 'uploader', default=None),
                  'release_year': int_or_none(self._html_search_regex(
                      r'["\']nfb_version_year["\']\s*:\s*["\']([^"\']+)',
                      webpage, 'release_year', default=None)),
              }
          else:
              # The slug belongs to _extract_ep_info (episode-number detection);
              # it was previously passed as _extract_ep_data's `fatal` argument
              ep_data = self._extract_ep_data(webpage, video_id)
              # Without episode data, rely on the JSON-LD merged in below
              info = self._extract_ep_info(ep_data, video_id, slug) if ep_data else {'id': video_id}

          return merge_dicts({
              'formats': formats,
              'subtitles': subtitles,
          }, info, self._search_json_ld(webpage, video_id, default={}))
  class NFBSeriesIE(NFBBaseIE):
      """Playlist extractor for whole series on nfb.ca / onf.ca."""

      IE_NAME = 'nfb:series'
      IE_DESC = 'nfb.ca and onf.ca series'
      _VALID_URL = rf'{NFBBaseIE._VALID_URL_BASE}/(?P<type>series?)/(?P<id>[^/?#&]+)/?(?:[?#]|$)'
      _TESTS = [{
          'url': 'https://www.nfb.ca/series/true-north-inside-the-rise-of-toronto-basketball/',
          'playlist_mincount': 9,
          'info_dict': {
              'id': 'true-north-inside-the-rise-of-toronto-basketball',
          },
      }, {
          'url': 'https://www.onf.ca/serie/la-liste-des-choses-qui-existent-serie/',
          'playlist_mincount': 26,
          'info_dict': {
              'id': 'la-liste-des-choses-qui-existent-serie',
          },
      }]

      def _entries(self, episodes):
          """Yield a url_result for every episode whose embed URL NFBIE accepts."""
          playable = traverse_obj(episodes, lambda _, v: NFBIE.suitable(v['embed_url']))
          for ep in playable:
              match = NFBIE._match_valid_url(ep['embed_url'])
              # Wrap the single episode in a list so the shared helper can
              # filter it by the id taken from its own embed URL
              metadata = self._extract_ep_info([ep], match.group('id'))
              yield self.url_result(match[0], NFBIE, **metadata)

      def _real_extract(self, url):
          """Return a playlist built from the episode data of the series page."""
          url_match = self._match_valid_url(url)
          site, type_, series_id = url_match.group('site', 'type', 'id')

          # 'serie' URLs use 'saison' for the season path segment
          if type_ == 'serie':
              season_path = 'saison'
          else:
              season_path = 'season'

          # The episode list is read from the page of season 1, episode 1
          first_episode_url = f'https://www.{site}.ca/{type_}/{series_id}/{season_path}1/episode1'
          webpage = self._download_webpage(first_episode_url, series_id)

          episodes = self._extract_ep_data(webpage, series_id, fatal=True)
          entries = self._entries(episodes)
          return self.playlist_result(entries, series_id)