bundestag.py 5.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123
  1. import functools
  2. import re
  3. from .common import InfoExtractor
  4. from ..networking.exceptions import HTTPError
  5. from ..utils import (
  6. ExtractorError,
  7. bug_reports_message,
  8. clean_html,
  9. format_field,
  10. get_element_text_and_html_by_tag,
  11. int_or_none,
  12. url_or_none,
  13. )
  14. from ..utils.traversal import traverse_obj
  class BundestagIE(InfoExtractor):
      """Extractor for Bundestag videos addressed by a numeric video id.

      Matches ``dbtg.tv`` short links and ``bundestag.de/mediathek`` URLs that
      carry a ``videoid`` query parameter.  Formats are gathered from an HLS
      playlist and from the share-data JSON; title and description are taken
      from the mediathek overlay page.
      """

      _VALID_URL = [
          r'https?://dbtg\.tv/[cf]vid/(?P<id>\d+)',
          r'https?://www\.bundestag\.de/mediathek/?\?(?:[^#]+&)?videoid=(?P<id>\d+)',
      ]
      _TESTS = [
          {
              'url': 'https://dbtg.tv/cvid/7605304',
              'info_dict': {
                  'id': '7605304',
                  'ext': 'mp4',
                  'title': '145. Sitzung vom 15.12.2023, TOP 24 Barrierefreiheit',
                  'description': 'md5:321a9dc6bdad201264c0045efc371561',
              },
          },
          {
              'url': 'https://www.bundestag.de/mediathek?videoid=7602120&url=L21lZGlhdGhla292ZXJsYXk=&mod=mediathek',
              'info_dict': {
                  'id': '7602120',
                  'ext': 'mp4',
                  'title': '130. Sitzung vom 18.10.2023, TOP 1 Befragung der Bundesregierung',
                  'description': 'Befragung der Bundesregierung',
              },
          },
          {
              'url': 'https://www.bundestag.de/mediathek?videoid=7604941#url=L21lZGlhdGhla292ZXJsYXk/dmlkZW9pZD03NjA0OTQx&mod=mediathek',
              'only_matching': True,
          },
          {
              'url': 'http://dbtg.tv/fvid/3594346',
              'only_matching': True,
          },
      ]

      # Page queried (with ``videoid``/``view`` parameters) for title and description.
      _OVERLAY_URL = 'https://www.bundestag.de/mediathekoverlay'
      # HLS playlist template; ``{0}`` is replaced by the video id in both places.
      _INSTANCE_FORMAT = 'https://cldf-wzw-od.r53.cdn.tv1.eu/13014bundestagod/_definst_/13014bundestag/ondemand/3777parlamentsfernsehen/archiv/app144277506/145293313/{0}/{0}_playlist.smil/playlist.m3u8'
      # Share-data JSON endpoint; the video id is appended directly.
      _SHARE_URL = 'https://webtv.bundestag.de/player/macros/_x_s-144277506/shareData.json?contentId='
      # Patterns applied to share URLs to pull codec/bitrate/etc. out of the file name.
      _SHARE_AUDIO_REGEX = r'/\d+_(?P<codec>\w+)_(?P<bitrate>\d+)kb_(?P<channels>\w+)_\w+_\d+\.(?P<ext>\w+)'
      _SHARE_VIDEO_REGEX = r'/\d+_(?P<codec>\w+)_(?P<width>\w+)_(?P<height>\w+)_(?P<bitrate>\d+)kb_\w+_\w+_\d+\.(?P<ext>\w+)'

      def _bt_extract_share_formats(self, video_id):
          """Yield format dicts built from the share-data JSON for *video_id*.

          A warning is reported and nothing is yielded when the response's
          ``status.code`` is not the integer 1.  Otherwise every entry whose
          key is a string starting with ``audio`` or ``download`` and whose
          value passes ``url_or_none`` produces one format dict.
          """
          payload = self._download_json(
              f'{self._SHARE_URL}{video_id}', video_id, note='Downloading share format JSON')

          status_code = traverse_obj(payload, ('status', 'code', {int}))
          if status_code != 1:
              api_message = format_field(
                  payload, [('status', 'message', {str})],
                  'Share API response: %s', default='Unknown Share API Error')
              self.report_warning(api_message + bug_reports_message())
              return

          channel_counts = {'mono': 1, 'stereo': 2}

          for format_id, media_url in payload.items():
              # Skip non-format entries such as the status object.
              if not (isinstance(format_id, str) and url_or_none(media_url)):
                  continue

              if format_id.startswith('audio'):
                  parsed = re.search(self._SHARE_AUDIO_REGEX, media_url)
                  audio_fields = traverse_obj(parsed, {
                      'acodec': 'codec',
                      'audio_channels': ('channels', {channel_counts.get}),
                      'abr': ('bitrate', {int_or_none}),
                      'ext': 'ext',
                  })
                  yield {
                      'format_id': format_id,
                      'url': media_url,
                      'vcodec': 'none',
                      **audio_fields,
                  }
              elif format_id.startswith('download'):
                  parsed = re.search(self._SHARE_VIDEO_REGEX, media_url)
                  video_fields = traverse_obj(parsed, {
                      'vcodec': 'codec',
                      'tbr': ('bitrate', {int_or_none}),
                      'width': ('width', {int_or_none}),
                      'height': ('height', {int_or_none}),
                      'ext': 'ext',
                  })
                  yield {
                      'format_id': format_id,
                      'url': media_url,
                      **video_fields,
                  }

      def _real_extract(self, url):
          """Build the info dict (id, formats, title, description) for *url*.

          Raises an expected ``ExtractorError`` when fetching the HLS playlist
          fails with an HTTP 404 cause; any other HLS ``ExtractorError`` is only
          reported as a warning.  The overlay page is requested with
          ``fatal=False``.
          """
          video_id = self._match_id(url)
          formats = []

          try:
              hls_formats = self._extract_m3u8_formats(
                  self._INSTANCE_FORMAT.format(video_id), video_id, m3u8_id='instance')
          except ExtractorError as error:
              if isinstance(error.cause, HTTPError) and error.cause.status == 404:
                  raise ExtractorError('Could not find video id', expected=True)
              self.report_warning(f'Error extracting hls formats: {error}', video_id)
          else:
              formats.extend(hls_formats)

          formats.extend(self._bt_extract_share_formats(video_id))
          if not formats:
              self.raise_no_formats('Could not find suitable formats', video_id=video_id)

          overlay_page = self._download_webpage(
              self._OVERLAY_URL, video_id,
              query={'videoid': video_id, 'view': 'main'},
              note='Downloading metadata overlay', fatal=False)

          # Named callables for the traversal below.
          first_h3 = functools.partial(get_element_text_and_html_by_tag, 'h3')
          first_p = functools.partial(get_element_text_and_html_by_tag, 'p')
          drop_spans = functools.partial(re.sub, r'<span[^>]*>[^<]+</span>', '')

          metadata = traverse_obj(overlay_page, {
              'title': ({first_h3}, 0, {drop_spans}, {clean_html}),
              'description': ({first_p}, 0, {clean_html}),
          })

          return {'id': video_id, 'formats': formats, **metadata}