megatvcom.py 6.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164
  1. import re
  2. from .common import InfoExtractor
  3. from ..networking import HEADRequest
  4. from ..utils import (
  5. ExtractorError,
  6. clean_html,
  7. determine_ext,
  8. extract_attributes,
  9. get_element_by_class,
  10. get_element_html_by_id,
  11. parse_qs,
  12. unescapeHTML,
  13. unified_timestamp,
  14. )
  15. class MegaTVComBaseIE(InfoExtractor):
  16. _PLAYER_DIV_ID = 'player_div_id'
  17. def _extract_player_attrs(self, webpage):
  18. player_el = get_element_html_by_id(self._PLAYER_DIV_ID, webpage)
  19. return {
  20. re.sub(r'^data-(?:kwik_)?', '', k): v
  21. for k, v in extract_attributes(player_el).items()
  22. if k not in ('id',)
  23. }
  24. class MegaTVComIE(MegaTVComBaseIE):
  25. IE_NAME = 'megatvcom'
  26. IE_DESC = 'megatv.com videos'
  27. _VALID_URL = r'https?://(?:www\.)?megatv\.com/(?:\d{4}/\d{2}/\d{2}|[^/]+/(?P<id>\d+))/(?P<slug>[^/]+)'
  28. _TESTS = [{
  29. 'url': 'https://www.megatv.com/2021/10/23/egkainia-gia-ti-nea-skini-omega-tou-dimotikou-theatrou-peiraia/',
  30. 'md5': '6546a1a37fff0dd51c9dce5f490b7d7d',
  31. 'info_dict': {
  32. 'id': '520979',
  33. 'ext': 'mp4',
  34. 'title': 'md5:70eef71a9cd2c1ecff7ee428354dded2',
  35. 'description': 'md5:0209fa8d318128569c0d256a5c404db1',
  36. 'timestamp': 1634975747,
  37. 'upload_date': '20211023',
  38. 'display_id': 'egkainia-gia-ti-nea-skini-omega-tou-dimotikou-theatrou-peiraia',
  39. 'thumbnail': 'https://www.megatv.com/wp-content/uploads/2021/10/ΠΕΙΡΑΙΑΣ-1024x450.jpg',
  40. },
  41. }, {
  42. 'url': 'https://www.megatv.com/tvshows/527800/epeisodio-65-12/',
  43. 'md5': 'cba2085d45c1abeb8e7e9b7e1d6c0072',
  44. 'info_dict': {
  45. 'id': '527800',
  46. 'ext': 'mp4',
  47. 'title': 'md5:fc322cb51f682eecfe2f54cd5ab3a157',
  48. 'description': 'md5:b2b7ed3690a78f2a0156eb790fdc00df',
  49. 'timestamp': 1636048859,
  50. 'upload_date': '20211104',
  51. 'display_id': 'epeisodio-65-12',
  52. 'thumbnail': 'https://www.megatv.com/wp-content/uploads/2021/11/16-1-1.jpg',
  53. },
  54. }]
  55. def _real_extract(self, url):
  56. video_id, display_id = self._match_valid_url(url).group('id', 'slug')
  57. _is_article = video_id is None
  58. webpage = self._download_webpage(url, video_id or display_id)
  59. if _is_article:
  60. video_id = self._search_regex(
  61. r'<article[^>]*\sid=["\']Article_(\d+)["\']', webpage, 'article id')
  62. player_attrs = self._extract_player_attrs(webpage)
  63. title = player_attrs.get('label') or self._og_search_title(webpage)
  64. description = get_element_by_class(
  65. 'article-wrapper' if _is_article else 'story_content',
  66. webpage)
  67. description = clean_html(re.sub(r'<script[^>]*>[^<]+</script>', '', description))
  68. if not description:
  69. description = self._og_search_description(webpage)
  70. thumbnail = player_attrs.get('image') or self._og_search_thumbnail(webpage)
  71. timestamp = unified_timestamp(self._html_search_meta(
  72. 'article:published_time', webpage))
  73. source = player_attrs.get('source')
  74. if not source:
  75. raise ExtractorError('No source found', video_id=video_id)
  76. if determine_ext(source) == 'm3u8':
  77. formats, subs = self._extract_m3u8_formats_and_subtitles(source, video_id, 'mp4')
  78. else:
  79. formats, subs = [{'url': source}], {}
  80. if player_attrs.get('subs'):
  81. self._merge_subtitles({'und': [{'url': player_attrs['subs']}]}, target=subs)
  82. return {
  83. 'id': video_id,
  84. 'display_id': display_id,
  85. 'title': title,
  86. 'description': description,
  87. 'thumbnail': thumbnail,
  88. 'timestamp': timestamp,
  89. 'formats': formats,
  90. 'subtitles': subs,
  91. }
  92. class MegaTVComEmbedIE(MegaTVComBaseIE):
  93. IE_NAME = 'megatvcom:embed'
  94. IE_DESC = 'megatv.com embedded videos'
  95. _VALID_URL = r'(?:https?:)?//(?:www\.)?megatv\.com/embed/?\?p=(?P<id>\d+)'
  96. _EMBED_REGEX = [rf'''<iframe[^>]+?src=(?P<_q1>["'])(?P<url>{_VALID_URL})(?P=_q1)''']
  97. _TESTS = [{
  98. 'url': 'https://www.megatv.com/embed/?p=2020520979',
  99. 'md5': '6546a1a37fff0dd51c9dce5f490b7d7d',
  100. 'info_dict': {
  101. 'id': '520979',
  102. 'ext': 'mp4',
  103. 'title': 'md5:70eef71a9cd2c1ecff7ee428354dded2',
  104. 'description': 'md5:0209fa8d318128569c0d256a5c404db1',
  105. 'timestamp': 1634975747,
  106. 'upload_date': '20211023',
  107. 'display_id': 'egkainia-gia-ti-nea-skini-omega-tou-dimotikou-theatrou-peiraia',
  108. 'thumbnail': 'https://www.megatv.com/wp-content/uploads/2021/10/ΠΕΙΡΑΙΑΣ-1024x450.jpg',
  109. },
  110. }, {
  111. 'url': 'https://www.megatv.com/embed/?p=2020534081',
  112. 'md5': '6ac8b3ce4dc6120c802f780a1e6b3812',
  113. 'info_dict': {
  114. 'id': '534081',
  115. 'ext': 'mp4',
  116. 'title': 'md5:062e9d5976ef854d8bdc1f5724d9b2d0',
  117. 'description': 'md5:36dbe4c3762d2ede9513eea8d07f6d52',
  118. 'timestamp': 1636376351,
  119. 'upload_date': '20211108',
  120. 'display_id': 'neo-rekor-stin-timi-tou-ilektrikou-reymatos-pano-apo-ta-200e-i-xondriki-timi-tou-ilektrikou',
  121. 'thumbnail': 'https://www.megatv.com/wp-content/uploads/2021/11/Capture-266.jpg',
  122. },
  123. }]
  124. def _match_canonical_url(self, webpage):
  125. LINK_RE = r'''(?x)
  126. <link(?:
  127. rel=(?P<_q1>["'])(?P<canonical>canonical)(?P=_q1)|
  128. href=(?P<_q2>["'])(?P<href>(?:(?!(?P=_q2)).)+)(?P=_q2)|
  129. [^>]*?
  130. )+>
  131. '''
  132. for mobj in re.finditer(LINK_RE, webpage):
  133. canonical, href = mobj.group('canonical', 'href')
  134. if canonical and href:
  135. return unescapeHTML(href)
  136. def _real_extract(self, url):
  137. video_id = self._match_id(url)
  138. webpage = self._download_webpage(url, video_id)
  139. player_attrs = self._extract_player_attrs(webpage)
  140. canonical_url = player_attrs.get('share_url') or self._match_canonical_url(webpage)
  141. if not canonical_url:
  142. raise ExtractorError('canonical URL not found')
  143. video_id = parse_qs(canonical_url)['p'][0]
  144. # Defer to megatvcom as the metadata extracted from the embeddable page some
  145. # times are slightly different, for the same video
  146. canonical_url = self._request_webpage(
  147. HEADRequest(canonical_url), video_id,
  148. note='Resolve canonical URL',
  149. errnote='Could not resolve canonical URL').url
  150. return self.url_result(canonical_url, MegaTVComIE.ie_key(), video_id)