bpb.py 7.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170
  1. import functools
  2. import re
  3. from .common import InfoExtractor
  4. from ..utils import (
  5. clean_html,
  6. extract_attributes,
  7. get_element_text_and_html_by_tag,
  8. get_elements_by_class,
  9. join_nonempty,
  10. js_to_json,
  11. mimetype2ext,
  12. unified_strdate,
  13. url_or_none,
  14. urljoin,
  15. variadic,
  16. )
  17. from ..utils.traversal import traverse_obj
  18. def html_get_element(tag=None, cls=None):
  19. assert tag or cls, 'One of tag or class is required'
  20. if cls:
  21. func = functools.partial(get_elements_by_class, cls, tag=tag)
  22. else:
  23. func = functools.partial(get_element_text_and_html_by_tag, tag)
  24. def html_get_element_wrapper(html):
  25. return variadic(func(html))[0]
  26. return html_get_element_wrapper
  27. class BpbIE(InfoExtractor):
  28. IE_DESC = 'Bundeszentrale für politische Bildung'
  29. _VALID_URL = r'https?://(?:www\.|m\.)?bpb\.de/(?:[^/?#]+/)*(?P<id>\d+)(?:[/?#]|$)'
  30. _TESTS = [{
  31. 'url': 'http://www.bpb.de/mediathek/297/joachim-gauck-zu-1989-und-die-erinnerung-an-die-ddr',
  32. 'info_dict': {
  33. 'id': '297',
  34. 'ext': 'mp4',
  35. 'creator': 'Kooperative Berlin',
  36. 'description': 'md5:f4f75885ba009d3e2b156247a8941ce6',
  37. 'release_date': '20160115',
  38. 'series': 'Interview auf dem Geschichtsforum 1989 | 2009',
  39. 'tags': ['Friedliche Revolution', 'Erinnerungskultur', 'Vergangenheitspolitik', 'DDR 1949 - 1990', 'Freiheitsrecht', 'BStU', 'Deutschland'],
  40. 'thumbnail': 'https://www.bpb.de/cache/images/7/297_teaser_16x9_1240.jpg?8839D',
  41. 'title': 'Joachim Gauck zu 1989 und die Erinnerung an die DDR',
  42. 'uploader': 'Bundeszentrale für politische Bildung',
  43. },
  44. }, {
  45. 'url': 'https://www.bpb.de/mediathek/video/522184/krieg-flucht-und-falschmeldungen-wirstattdesinformation-2/',
  46. 'info_dict': {
  47. 'id': '522184',
  48. 'ext': 'mp4',
  49. 'creator': 'Institute for Strategic Dialogue Germany gGmbH (ISD)',
  50. 'description': 'md5:f83c795ff8f825a69456a9e51fc15903',
  51. 'release_date': '20230621',
  52. 'tags': ['Desinformation', 'Ukraine', 'Russland', 'Geflüchtete'],
  53. 'thumbnail': 'https://www.bpb.de/cache/images/4/522184_teaser_16x9_1240.png?EABFB',
  54. 'title': 'md5:9b01ccdbf58dbf9e5c9f6e771a803b1c',
  55. 'uploader': 'Bundeszentrale für politische Bildung',
  56. },
  57. }, {
  58. 'url': 'https://www.bpb.de/lernen/bewegtbild-und-politische-bildung/webvideo/518789/krieg-flucht-und-falschmeldungen-wirstattdesinformation-1/',
  59. 'info_dict': {
  60. 'id': '518789',
  61. 'ext': 'mp4',
  62. 'creator': 'Institute for Strategic Dialogue Germany gGmbH (ISD)',
  63. 'description': 'md5:85228aed433e84ff0ff9bc582abd4ea8',
  64. 'release_date': '20230302',
  65. 'tags': ['Desinformation', 'Ukraine', 'Russland', 'Geflüchtete'],
  66. 'thumbnail': 'https://www.bpb.de/cache/images/9/518789_teaser_16x9_1240.jpeg?56D0D',
  67. 'title': 'md5:3e956f264bb501f6383f10495a401da4',
  68. 'uploader': 'Bundeszentrale für politische Bildung',
  69. },
  70. }, {
  71. 'url': 'https://www.bpb.de/mediathek/podcasts/apuz-podcast/539727/apuz-20-china/',
  72. 'only_matching': True,
  73. }, {
  74. 'url': 'https://www.bpb.de/mediathek/audio/315813/folge-1-eine-einfuehrung/',
  75. 'info_dict': {
  76. 'id': '315813',
  77. 'ext': 'mp3',
  78. 'creator': 'Axel Schröder',
  79. 'description': 'md5:eda9d1af34e5912efef5baf54fba4427',
  80. 'release_date': '20200921',
  81. 'series': 'Auf Endlagersuche. Der deutsche Weg zu einem sicheren Atommülllager',
  82. 'tags': ['Atomenergie', 'Endlager', 'hoch-radioaktiver Abfall', 'Endlagersuche', 'Atommüll', 'Atomendlager', 'Gorleben', 'Deutschland'],
  83. 'thumbnail': 'https://www.bpb.de/cache/images/3/315813_teaser_16x9_1240.png?92A94',
  84. 'title': 'Folge 1: Eine Einführung',
  85. 'uploader': 'Bundeszentrale für politische Bildung',
  86. },
  87. }, {
  88. 'url': 'https://www.bpb.de/517806/die-weltanschauung-der-neuen-rechten/',
  89. 'info_dict': {
  90. 'id': '517806',
  91. 'ext': 'mp3',
  92. 'creator': 'Bundeszentrale für politische Bildung',
  93. 'description': 'md5:594689600e919912aade0b2871cc3fed',
  94. 'release_date': '20230127',
  95. 'series': 'Vorträge des Fachtags "Modernisierer. Grenzgänger. Anstifter. Sechs Jahrzehnte \'Neue Rechte\'"',
  96. 'tags': ['Rechtsextremismus', 'Konservatismus', 'Konservativismus', 'neue Rechte', 'Rechtspopulismus', 'Schnellroda', 'Deutschland'],
  97. 'thumbnail': 'https://www.bpb.de/cache/images/6/517806_teaser_16x9_1240.png?7A7A0',
  98. 'title': 'Die Weltanschauung der "Neuen Rechten"',
  99. 'uploader': 'Bundeszentrale für politische Bildung',
  100. },
  101. }, {
  102. 'url': 'https://www.bpb.de/mediathek/reihen/zahlen-und-fakten-soziale-situation-filme/520153/zahlen-und-fakten-die-soziale-situation-in-deutschland-migration/',
  103. 'only_matching': True,
  104. }]
  105. _TITLE_RE = re.compile('(?P<title>[^<]*)<[^>]+>(?P<series>[^<]*)')
  106. def _parse_vue_attributes(self, name, string, video_id):
  107. attributes = extract_attributes(self._search_regex(rf'(<{name}(?:"[^"]*?"|[^>])*>)', string, name))
  108. for key, value in attributes.items():
  109. if key.startswith(':'):
  110. attributes[key] = self._parse_json(value, video_id, transform_source=js_to_json, fatal=False)
  111. return attributes
  112. @staticmethod
  113. def _process_source(source):
  114. url = url_or_none(source['src'])
  115. if not url:
  116. return None
  117. source_type = source.get('type', '')
  118. extension = mimetype2ext(source_type)
  119. is_video = source_type.startswith('video')
  120. note = url.rpartition('.')[0].rpartition('_')[2] if is_video else None
  121. return {
  122. 'url': url,
  123. 'ext': extension,
  124. 'vcodec': None if is_video else 'none',
  125. 'quality': 10 if note == 'high' else 0,
  126. 'format_note': note,
  127. 'format_id': join_nonempty(extension, note),
  128. }
  129. def _real_extract(self, url):
  130. video_id = self._match_id(url)
  131. webpage = self._download_webpage(url, video_id)
  132. title_result = traverse_obj(webpage, ({html_get_element(cls='opening-header__title')}, {self._TITLE_RE.match}))
  133. json_lds = list(self._yield_json_ld(webpage, video_id, fatal=False))
  134. return {
  135. 'id': video_id,
  136. 'title': traverse_obj(title_result, ('title', {str.strip})) or None,
  137. # This metadata could be interpreted otherwise, but it fits "series" the most
  138. 'series': traverse_obj(title_result, ('series', {str.strip})) or None,
  139. 'description': join_nonempty(*traverse_obj(webpage, [(
  140. {html_get_element(cls='opening-intro')},
  141. [{html_get_element(tag='bpb-accordion-item')}, {html_get_element(cls='text-content')}],
  142. ), {clean_html}]), delim='\n\n') or None,
  143. 'creator': self._html_search_meta('author', webpage),
  144. 'uploader': self._html_search_meta('publisher', webpage),
  145. 'release_date': unified_strdate(self._html_search_meta('date', webpage)),
  146. 'tags': traverse_obj(json_lds, (..., 'keywords', {lambda x: x.split(',')}, ...)),
  147. **traverse_obj(self._parse_vue_attributes('bpb-player', webpage, video_id), {
  148. 'formats': (':sources', ..., {self._process_source}),
  149. 'thumbnail': ('poster', {lambda x: urljoin(url, x)}),
  150. }),
  151. }