la7.py 9.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234
  1. import re
  2. from .common import InfoExtractor
  3. from ..networking import HEADRequest
  4. from ..utils import float_or_none, int_or_none, parse_duration, unified_strdate
  5. class LA7IE(InfoExtractor):
  6. IE_NAME = 'la7.it'
  7. _VALID_URL = r'''(?x)https?://(?:
  8. (?:www\.)?la7\.it/([^/]+)/(?:rivedila7|video|news)/|
  9. tg\.la7\.it/repliche-tgla7\?id=
  10. )(?P<id>.+)'''
  11. _TESTS = [{
  12. # single quality video
  13. 'url': 'http://www.la7.it/crozza/video/inccool8-02-10-2015-163722',
  14. 'md5': '8b613ffc0c4bf9b9e377169fc19c214c',
  15. 'info_dict': {
  16. 'id': 'inccool8-02-10-2015-163722',
  17. 'ext': 'mp4',
  18. 'title': 'Inc.Cool8',
  19. 'description': 'Benvenuti nell\'incredibile mondo della INC. COOL. 8. dove “INC.” sta per “Incorporated” “COOL” sta per “fashion” ed Eight sta per il gesto atletico',
  20. 'thumbnail': 're:^https?://.*',
  21. 'upload_date': '20151002',
  22. 'formats': 'count:4',
  23. },
  24. }, {
  25. # multiple quality video
  26. 'url': 'https://www.la7.it/calcio-femminile/news/il-gol-di-lindsey-thomas-fiorentina-vs-milan-serie-a-calcio-femminile-26-11-2022-461736',
  27. 'md5': 'd2370e78f75e8d1238cb3a0db9a2eda3',
  28. 'info_dict': {
  29. 'id': 'il-gol-di-lindsey-thomas-fiorentina-vs-milan-serie-a-calcio-femminile-26-11-2022-461736',
  30. 'ext': 'mp4',
  31. 'title': 'Il gol di Lindsey Thomas | Fiorentina vs Milan | Serie A Calcio Femminile',
  32. 'description': 'Il gol di Lindsey Thomas | Fiorentina vs Milan | Serie A Calcio Femminile',
  33. 'thumbnail': 're:^https?://.*',
  34. 'upload_date': '20221126',
  35. 'formats': 'count:8',
  36. },
  37. }, {
  38. 'url': 'http://www.la7.it/omnibus/rivedila7/omnibus-news-02-07-2016-189077',
  39. 'only_matching': True,
  40. }]
  41. _HOST = 'https://awsvodpkg.iltrovatore.it'
  42. def _generate_mp4_url(self, quality, m3u8_formats):
  43. for f in m3u8_formats:
  44. if f['vcodec'] != 'none' and quality in f['url']:
  45. http_url = f'{self._HOST}{quality}.mp4'
  46. urlh = self._request_webpage(
  47. HEADRequest(http_url), quality,
  48. note='Check filesize', fatal=False)
  49. if urlh:
  50. http_f = f.copy()
  51. del http_f['manifest_url']
  52. http_f.update({
  53. 'format_id': http_f['format_id'].replace('hls-', 'https-'),
  54. 'url': http_url,
  55. 'protocol': 'https',
  56. 'filesize_approx': int_or_none(urlh.headers.get('Content-Length', None)),
  57. })
  58. return http_f
  59. return None
  60. def _real_extract(self, url):
  61. video_id = self._match_id(url)
  62. webpage = self._download_webpage(url, video_id)
  63. if re.search(r'(?i)(drmsupport\s*:\s*true)\s*', webpage):
  64. self.report_drm(video_id)
  65. video_path = self._search_regex(
  66. r'(/content/[\w/,]+?)\.mp4(?:\.csmil)?/master\.m3u8', webpage, 'video_path')
  67. formats = self._extract_mpd_formats(
  68. f'{self._HOST}/local/dash/,{video_path}.mp4.urlset/manifest.mpd',
  69. video_id, mpd_id='dash', fatal=False)
  70. m3u8_formats = self._extract_m3u8_formats(
  71. f'{self._HOST}/local/hls/,{video_path}.mp4.urlset/master.m3u8',
  72. video_id, 'mp4', m3u8_id='hls', fatal=False)
  73. formats.extend(m3u8_formats)
  74. for q in filter(None, video_path.split(',')):
  75. http_f = self._generate_mp4_url(q, m3u8_formats)
  76. if http_f:
  77. formats.append(http_f)
  78. return {
  79. 'id': video_id,
  80. 'title': self._og_search_title(webpage, default=None),
  81. 'description': self._og_search_description(webpage, default=None),
  82. 'thumbnail': self._og_search_thumbnail(webpage, default=None),
  83. 'formats': formats,
  84. 'upload_date': unified_strdate(self._search_regex(r'datetime="(.+?)"', webpage, 'upload_date', fatal=False)),
  85. }
  86. class LA7PodcastEpisodeIE(InfoExtractor):
  87. IE_NAME = 'la7.it:pod:episode'
  88. _VALID_URL = r'https?://(?:www\.)?la7\.it/[^/]+/podcast/([^/]+-)?(?P<id>\d+)'
  89. _TESTS = [{
  90. 'url': 'https://www.la7.it/voicetown/podcast/la-carezza-delle-memoria-di-carlo-verdone-23-03-2021-371497',
  91. 'md5': '7737d4d79b3c1a34b3de3e16297119ed',
  92. 'info_dict': {
  93. 'id': '371497',
  94. 'ext': 'mp3',
  95. 'title': '"La carezza delle memoria" di Carlo Verdone',
  96. 'description': 'md5:5abf07c3c551a687db80af3f9ceb7d52',
  97. 'thumbnail': 'https://www.la7.it/sites/default/files/podcast/371497.jpg',
  98. 'upload_date': '20210323',
  99. },
  100. }, {
  101. # embed url
  102. 'url': 'https://www.la7.it/embed/podcast/371497',
  103. 'only_matching': True,
  104. }, {
  105. # date already in the title
  106. 'url': 'https://www.la7.it/propagandalive/podcast/lintervista-di-diego-bianchi-ad-annalisa-cuzzocrea-puntata-del-1932021-20-03-2021-371130',
  107. 'only_matching': True,
  108. }, {
  109. # title same as show_title
  110. 'url': 'https://www.la7.it/otto-e-mezzo/podcast/otto-e-mezzo-26-03-2021-372340',
  111. 'only_matching': True,
  112. }]
  113. def _extract_info(self, webpage, video_id=None, ppn=None):
  114. if not video_id:
  115. video_id = self._search_regex(
  116. r'data-nid=([\'"])(?P<vid>\d+)\1',
  117. webpage, 'video_id', group='vid')
  118. media_url = self._search_regex(
  119. (r'src\s*:\s*([\'"])(?P<url>\S+?mp3.+?)\1',
  120. r'data-podcast\s*=\s*([\'"])(?P<url>\S+?mp3.+?)\1'),
  121. webpage, 'media_url', group='url')
  122. formats = [{
  123. 'url': media_url,
  124. 'format_id': 'http-mp3',
  125. 'ext': 'mp3',
  126. 'acodec': 'mp3',
  127. 'vcodec': 'none',
  128. }]
  129. title = self._html_search_regex(
  130. (r'<div class="title">(?P<title>.+?)</',
  131. r'<title>(?P<title>[^<]+)</title>',
  132. r'title:\s*([\'"])(?P<title>.+?)\1'),
  133. webpage, 'title', group='title')
  134. description = (
  135. self._html_search_regex(
  136. (r'<div class="description">(.+?)</div>',
  137. r'<div class="description-mobile">(.+?)</div>',
  138. r'<div class="box-txt">([^<]+?)</div>',
  139. r'<div class="field-content"><p>(.+?)</p></div>'),
  140. webpage, 'description', default=None)
  141. or self._html_search_meta('description', webpage))
  142. thumb = self._html_search_regex(
  143. (r'<div class="podcast-image"><img src="(.+?)"></div>',
  144. r'<div class="container-embed"[^<]+url\((.+?)\);">',
  145. r'<div class="field-content"><img src="(.+?)"'),
  146. webpage, 'thumbnail', fatal=False, default=None)
  147. duration = parse_duration(self._html_search_regex(
  148. r'<span class="(?:durata|duration)">([\d:]+)</span>',
  149. webpage, 'duration', fatal=False, default=None))
  150. date = self._html_search_regex(
  151. r'class="data">\s*(?:<span>)?([\d\.]+)\s*</',
  152. webpage, 'date', default=None)
  153. date_alt = self._search_regex(
  154. r'(\d+[\./]\d+[\./]\d+)', title, 'date_alt', default=None)
  155. ppn = ppn or self._search_regex(
  156. r'ppN:\s*([\'"])(?P<ppn>.+?)\1',
  157. webpage, 'ppn', group='ppn', default=None)
  158. # if the date is not in the title
  159. # and title is the same as the show_title
  160. # add the date to the title
  161. if date and not date_alt and ppn and ppn.lower() == title.lower():
  162. title = f'{title} del {date}'
  163. return {
  164. 'id': video_id,
  165. 'title': title,
  166. 'description': description,
  167. 'duration': float_or_none(duration),
  168. 'formats': formats,
  169. 'thumbnail': thumb,
  170. 'upload_date': unified_strdate(date),
  171. }
  172. def _real_extract(self, url):
  173. video_id = self._match_id(url)
  174. webpage = self._download_webpage(url, video_id)
  175. return self._extract_info(webpage, video_id)
  176. class LA7PodcastIE(LA7PodcastEpisodeIE): # XXX: Do not subclass from concrete IE
  177. IE_NAME = 'la7.it:podcast'
  178. _VALID_URL = r'https?://(?:www\.)?la7\.it/(?P<id>[^/]+)/podcast/?(?:$|[#?])'
  179. _TESTS = [{
  180. 'url': 'https://www.la7.it/propagandalive/podcast',
  181. 'info_dict': {
  182. 'id': 'propagandalive',
  183. 'title': 'Propaganda Live',
  184. },
  185. 'playlist_mincount': 10,
  186. }]
  187. def _real_extract(self, url):
  188. playlist_id = self._match_id(url)
  189. webpage = self._download_webpage(url, playlist_id)
  190. title = (
  191. self._html_search_regex(
  192. r'<h1.*?>(.+?)</h1>', webpage, 'title', fatal=False, default=None)
  193. or self._og_search_title(webpage))
  194. ppn = self._search_regex(
  195. r'window\.ppN\s*=\s*([\'"])(?P<ppn>.+?)\1',
  196. webpage, 'ppn', group='ppn', default=None)
  197. entries = []
  198. for episode in re.finditer(
  199. r'<div class="container-podcast-property">([\s\S]+?)(?:</div>\s*){3}',
  200. webpage):
  201. entries.append(self._extract_info(episode.group(1), ppn=ppn))
  202. return self.playlist_result(entries, playlist_id, title)