# tagesschau.py
  1. import re
  2. from .common import InfoExtractor
  3. from ..utils import (
  4. UnsupportedError,
  5. extract_attributes,
  6. int_or_none,
  7. js_to_json,
  8. parse_iso8601,
  9. try_get,
  10. )
  11. class TagesschauIE(InfoExtractor):
  12. _WORKING = False
  13. _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/(?P<path>[^/]+/(?:[^/]+/)*?(?P<id>[^/#?]+?(?:-?[0-9]+)?))(?:~_?[^/#?]+?)?\.html'
  14. _TESTS = [{
  15. 'url': 'http://www.tagesschau.de/multimedia/video/video-102143.html',
  16. 'md5': 'ccb9359bf8c4795836e43759f3408a93',
  17. 'info_dict': {
  18. 'id': 'video-102143-1',
  19. 'ext': 'mp4',
  20. 'title': 'Regierungsumbildung in Athen: Neue Minister in Griechenland vereidigt',
  21. 'duration': 138,
  22. },
  23. }, {
  24. 'url': 'http://www.tagesschau.de/multimedia/sendung/ts-5727.html',
  25. 'md5': '5c15e8f3da049e48829ec9786d835536',
  26. 'info_dict': {
  27. 'id': 'ts-5727-1',
  28. 'ext': 'mp4',
  29. 'title': 'Ganze Sendung',
  30. 'duration': 932,
  31. },
  32. }, {
  33. # exclusive audio
  34. 'url': 'http://www.tagesschau.de/multimedia/audio/audio-29417.html',
  35. 'md5': '4bff8f23504df56a0d86ed312d654182',
  36. 'info_dict': {
  37. 'id': 'audio-29417-1',
  38. 'ext': 'mp3',
  39. 'title': 'EU-Gipfel: Im Verbrennerstreit hat Deutschland maximalen Schaden angerichtet',
  40. },
  41. }, {
  42. 'url': 'http://www.tagesschau.de/inland/bnd-303.html',
  43. 'md5': 'f049fa1698d7564e9ca4c3325108f034',
  44. 'info_dict': {
  45. 'id': 'bnd-303-1',
  46. 'ext': 'mp3',
  47. 'title': 'Das Siegel des Bundesnachrichtendienstes | dpa',
  48. },
  49. }, {
  50. 'url': 'http://www.tagesschau.de/inland/afd-parteitag-135.html',
  51. 'info_dict': {
  52. 'id': 'afd-parteitag-135',
  53. 'title': 'AfD',
  54. },
  55. 'playlist_mincount': 15,
  56. }, {
  57. 'url': 'https://www.tagesschau.de/multimedia/audio/audio-29417~player.html',
  58. 'info_dict': {
  59. 'id': 'audio-29417-1',
  60. 'ext': 'mp3',
  61. 'title': 'EU-Gipfel: Im Verbrennerstreit hat Deutschland maximalen Schaden angerichtet',
  62. },
  63. }, {
  64. 'url': 'https://www.tagesschau.de/multimedia/audio/podcast-11km-327.html',
  65. 'info_dict': {
  66. 'id': 'podcast-11km-327',
  67. 'ext': 'mp3',
  68. 'title': 'Gewalt in der Kita – Wenn Erzieher:innen schweigen',
  69. 'upload_date': '20230322',
  70. 'timestamp': 1679482808,
  71. 'thumbnail': 'https://www.tagesschau.de/multimedia/audio/podcast-11km-329~_v-original.jpg',
  72. 'description': 'md5:dad059931fe4b3693e3656e93a249848',
  73. },
  74. }, {
  75. 'url': 'http://www.tagesschau.de/multimedia/sendung/tsg-3771.html',
  76. 'only_matching': True,
  77. }, {
  78. 'url': 'http://www.tagesschau.de/multimedia/sendung/tt-3827.html',
  79. 'only_matching': True,
  80. }, {
  81. 'url': 'http://www.tagesschau.de/multimedia/sendung/nm-3475.html',
  82. 'only_matching': True,
  83. }, {
  84. 'url': 'http://www.tagesschau.de/multimedia/sendung/weltspiegel-3167.html',
  85. 'only_matching': True,
  86. }, {
  87. 'url': 'http://www.tagesschau.de/multimedia/tsvorzwanzig-959.html',
  88. 'only_matching': True,
  89. }, {
  90. 'url': 'http://www.tagesschau.de/multimedia/sendung/bab/bab-3299~_bab-sendung-209.html',
  91. 'only_matching': True,
  92. }, {
  93. 'url': 'http://www.tagesschau.de/multimedia/video/video-102303~_bab-sendung-211.html',
  94. 'only_matching': True,
  95. }, {
  96. 'url': 'http://www.tagesschau.de/100sekunden/index.html',
  97. 'only_matching': True,
  98. }, {
  99. # playlist article with collapsing sections
  100. 'url': 'http://www.tagesschau.de/wirtschaft/faq-freihandelszone-eu-usa-101.html',
  101. 'only_matching': True,
  102. }]
  103. def _real_extract(self, url):
  104. mobj = self._match_valid_url(url)
  105. video_id = mobj.group('id') or mobj.group('path')
  106. display_id = video_id.lstrip('-')
  107. webpage = self._download_webpage(url, display_id)
  108. title = self._html_search_regex(
  109. r'<span[^>]*class="headline"[^>]*>(.+?)</span>',
  110. webpage, 'title', default=None) or self._og_search_title(webpage, fatal=False)
  111. entries = []
  112. videos = re.findall(r'<div[^>]+>', webpage)
  113. num = 0
  114. for video in videos:
  115. video = extract_attributes(video).get('data-config')
  116. if not video:
  117. continue
  118. video = self._parse_json(video, video_id, transform_source=js_to_json, fatal=False)
  119. video_formats = try_get(video, lambda x: x['mc']['_mediaArray'][0]['_mediaStreamArray'])
  120. if not video_formats:
  121. continue
  122. num += 1
  123. for video_format in video_formats:
  124. media_url = video_format.get('_stream') or ''
  125. formats = []
  126. if media_url.endswith('master.m3u8'):
  127. formats = self._extract_m3u8_formats(media_url, video_id, 'mp4', m3u8_id='hls')
  128. elif media_url.endswith('.mp3'):
  129. formats = [{
  130. 'url': media_url,
  131. 'vcodec': 'none',
  132. }]
  133. if not formats:
  134. continue
  135. entries.append({
  136. 'id': f'{display_id}-{num}',
  137. 'title': try_get(video, lambda x: x['mc']['_title']),
  138. 'duration': int_or_none(try_get(video, lambda x: x['mc']['_duration'])),
  139. 'formats': formats,
  140. })
  141. if not entries:
  142. raise UnsupportedError(url)
  143. if len(entries) > 1:
  144. return self.playlist_result(entries, display_id, title)
  145. return {
  146. 'id': display_id,
  147. 'title': title,
  148. 'thumbnail': self._og_search_thumbnail(webpage),
  149. 'formats': entries[0]['formats'],
  150. 'timestamp': parse_iso8601(self._html_search_meta('date', webpage)),
  151. 'description': self._og_search_description(webpage),
  152. 'duration': entries[0]['duration'],
  153. }