# rheinmaintv.py 4.5 KB

from .common import InfoExtractor
from ..utils import ExtractorError, extract_attributes, merge_dicts, remove_end
  3. class RheinMainTVIE(InfoExtractor):
  4. _VALID_URL = r'https?://(?:www\.)?rheinmaintv\.de/sendungen/(?:[\w-]+/)*(?P<video_id>(?P<display_id>[\w-]+)/vom-\d{2}\.\d{2}\.\d{4}(?:/\d+)?)'
  5. _TESTS = [{
  6. 'url': 'https://www.rheinmaintv.de/sendungen/beitrag-video/auf-dem-weg-zur-deutschen-meisterschaft/vom-07.11.2022/',
  7. 'info_dict': {
  8. 'id': 'auf-dem-weg-zur-deutschen-meisterschaft-vom-07.11.2022',
  9. 'ext': 'ismv', # ismv+isma will be merged into mp4
  10. 'alt_title': 'Auf dem Weg zur Deutschen Meisterschaft',
  11. 'title': 'Auf dem Weg zur Deutschen Meisterschaft',
  12. 'upload_date': '20221108',
  13. 'view_count': int,
  14. 'display_id': 'auf-dem-weg-zur-deutschen-meisterschaft',
  15. 'thumbnail': r're:^https://.+\.jpg',
  16. 'description': 'md5:48c59b74192bc819a9b34af1d5ed1eb9',
  17. 'timestamp': 1667933057,
  18. 'duration': 243.0,
  19. },
  20. 'params': {'skip_download': 'ism'},
  21. }, {
  22. 'url': 'https://www.rheinmaintv.de/sendungen/beitrag-video/formationsgemeinschaft-rhein-main-bei-den-deutschen-meisterschaften/vom-14.11.2022/',
  23. 'info_dict': {
  24. 'id': 'formationsgemeinschaft-rhein-main-bei-den-deutschen-meisterschaften-vom-14.11.2022',
  25. 'ext': 'ismv',
  26. 'title': 'Formationsgemeinschaft Rhein-Main bei den Deutschen Meisterschaften',
  27. 'timestamp': 1668526214,
  28. 'display_id': 'formationsgemeinschaft-rhein-main-bei-den-deutschen-meisterschaften',
  29. 'alt_title': 'Formationsgemeinschaft Rhein-Main bei den Deutschen Meisterschaften',
  30. 'view_count': int,
  31. 'thumbnail': r're:^https://.+\.jpg',
  32. 'duration': 345.0,
  33. 'description': 'md5:9370ba29526984006c2cba1372e5c5a0',
  34. 'upload_date': '20221115',
  35. },
  36. 'params': {'skip_download': 'ism'},
  37. }, {
  38. 'url': 'https://www.rheinmaintv.de/sendungen/beitrag-video/casino-mainz-bei-den-deutschen-meisterschaften/vom-14.11.2022/',
  39. 'info_dict': {
  40. 'id': 'casino-mainz-bei-den-deutschen-meisterschaften-vom-14.11.2022',
  41. 'ext': 'ismv',
  42. 'title': 'Casino Mainz bei den Deutschen Meisterschaften',
  43. 'view_count': int,
  44. 'timestamp': 1668527402,
  45. 'alt_title': 'Casino Mainz bei den Deutschen Meisterschaften',
  46. 'upload_date': '20221115',
  47. 'display_id': 'casino-mainz-bei-den-deutschen-meisterschaften',
  48. 'duration': 348.0,
  49. 'thumbnail': r're:^https://.+\.jpg',
  50. 'description': 'md5:70fc1660eeba96da17199e5bdff4c0aa',
  51. },
  52. 'params': {'skip_download': 'ism'},
  53. }, {
  54. 'url': 'https://www.rheinmaintv.de/sendungen/beitrag-video/bricks4kids/vom-22.06.2022/',
  55. 'only_matching': True,
  56. }]
  57. def _real_extract(self, url):
  58. mobj = self._match_valid_url(url)
  59. display_id = mobj.group('display_id')
  60. video_id = mobj.group('video_id').replace('/', '-')
  61. webpage = self._download_webpage(url, video_id)
  62. source, img = self._search_regex(r'(?s)(?P<source><source[^>]*>)(?P<img><img[^>]*>)',
  63. webpage, 'video', group=('source', 'img'))
  64. source = extract_attributes(source)
  65. img = extract_attributes(img)
  66. raw_json_ld = list(self._yield_json_ld(webpage, video_id))
  67. json_ld = self._json_ld(raw_json_ld, video_id)
  68. json_ld.pop('url', None)
  69. ism_manifest_url = (
  70. source.get('src')
  71. or next(json_ld.get('embedUrl') for json_ld in raw_json_ld if json_ld.get('@type') == 'VideoObject')
  72. )
  73. formats, subtitles = self._extract_ism_formats_and_subtitles(ism_manifest_url, video_id)
  74. return merge_dicts({
  75. 'id': video_id,
  76. 'display_id': display_id,
  77. 'title':
  78. self._html_search_regex(r'<h1><span class="title">([^<]*)</span>',
  79. webpage, 'headline', default=None)
  80. or img.get('title') or json_ld.get('title') or self._og_search_title(webpage)
  81. or remove_end(self._html_extract_title(webpage), ' -'),
  82. 'alt_title': img.get('alt'),
  83. 'description': json_ld.get('description') or self._og_search_description(webpage),
  84. 'formats': formats,
  85. 'subtitles': subtitles,
  86. 'thumbnails': [{'url': img['src']}] if 'src' in img else json_ld.get('thumbnails'),
  87. }, json_ld)