telegram.py 5.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136
  1. import re
  2. from .common import InfoExtractor
  3. from ..utils import (
  4. clean_html,
  5. format_field,
  6. get_element_by_class,
  7. parse_duration,
  8. parse_qs,
  9. traverse_obj,
  10. unified_timestamp,
  11. update_url_query,
  12. url_basename,
  13. )
  14. class TelegramEmbedIE(InfoExtractor):
  15. IE_NAME = 'telegram:embed'
  16. _VALID_URL = r'https?://t\.me/(?P<channel_id>[^/]+)/(?P<id>\d+)'
  17. _TESTS = [{
  18. 'url': 'https://t.me/europa_press/613',
  19. 'md5': 'dd707708aea958c11a590e8068825f22',
  20. 'info_dict': {
  21. 'id': '613',
  22. 'ext': 'mp4',
  23. 'title': 'md5:6ce2d7e8d56eda16d80607b23db7b252',
  24. 'description': 'md5:6ce2d7e8d56eda16d80607b23db7b252',
  25. 'channel_id': 'europa_press',
  26. 'channel': 'Europa Press ✔',
  27. 'thumbnail': r're:^https?://.+',
  28. 'timestamp': 1635631203,
  29. 'upload_date': '20211030',
  30. 'duration': 61,
  31. },
  32. }, {
  33. # 2-video post
  34. 'url': 'https://t.me/vorposte/29342',
  35. 'info_dict': {
  36. 'id': 'vorposte-29342',
  37. 'title': 'Форпост 29342',
  38. 'description': 'md5:9d92e22169a3e136d5d69df25f82c3dc',
  39. },
  40. 'playlist_count': 2,
  41. 'params': {
  42. 'skip_download': True,
  43. },
  44. }, {
  45. # 2-video post with --no-playlist
  46. 'url': 'https://t.me/vorposte/29343',
  47. 'md5': '1724e96053c18e788c8464038876e245',
  48. 'info_dict': {
  49. 'id': '29343',
  50. 'ext': 'mp4',
  51. 'title': 'md5:9d92e22169a3e136d5d69df25f82c3dc',
  52. 'description': 'md5:9d92e22169a3e136d5d69df25f82c3dc',
  53. 'channel_id': 'vorposte',
  54. 'channel': 'Форпост',
  55. 'thumbnail': r're:^https?://.+',
  56. 'timestamp': 1666384480,
  57. 'upload_date': '20221021',
  58. 'duration': 35,
  59. },
  60. 'params': {
  61. 'noplaylist': True,
  62. },
  63. }, {
  64. # 2-video post with 'single' query param
  65. 'url': 'https://t.me/vorposte/29342?single',
  66. 'md5': 'd20b202f1e41400a9f43201428add18f',
  67. 'info_dict': {
  68. 'id': '29342',
  69. 'ext': 'mp4',
  70. 'title': 'md5:9d92e22169a3e136d5d69df25f82c3dc',
  71. 'description': 'md5:9d92e22169a3e136d5d69df25f82c3dc',
  72. 'channel_id': 'vorposte',
  73. 'channel': 'Форпост',
  74. 'thumbnail': r're:^https?://.+',
  75. 'timestamp': 1666384480,
  76. 'upload_date': '20221021',
  77. 'duration': 33,
  78. },
  79. }]
  80. def _real_extract(self, url):
  81. channel_id, msg_id = self._match_valid_url(url).group('channel_id', 'id')
  82. embed = self._download_webpage(
  83. url, msg_id, query={'embed': '1', 'single': []}, note='Downloading embed frame')
  84. def clean_text(html_class, html):
  85. text = clean_html(get_element_by_class(html_class, html))
  86. return text.replace('\n', ' ') if text else None
  87. description = clean_text('tgme_widget_message_text', embed)
  88. message = {
  89. 'title': description or '',
  90. 'description': description,
  91. 'channel': clean_text('tgme_widget_message_author', embed),
  92. 'channel_id': channel_id,
  93. 'timestamp': unified_timestamp(self._search_regex(
  94. r'<time[^>]*datetime="([^"]*)"', embed, 'timestamp', fatal=False)),
  95. }
  96. videos = []
  97. for video in re.findall(r'<a class="tgme_widget_message_video_player(?s:.+?)</time>', embed):
  98. video_url = self._search_regex(
  99. r'<video[^>]+src="([^"]+)"', video, 'video URL', fatal=False)
  100. webpage_url = self._search_regex(
  101. r'<a class="tgme_widget_message_video_player[^>]+href="([^"]+)"',
  102. video, 'webpage URL', fatal=False)
  103. if not video_url or not webpage_url:
  104. continue
  105. formats = [{
  106. 'url': video_url,
  107. 'ext': 'mp4',
  108. }]
  109. videos.append({
  110. 'id': url_basename(webpage_url),
  111. 'webpage_url': update_url_query(webpage_url, {'single': True}),
  112. 'duration': parse_duration(self._search_regex(
  113. r'<time[^>]+duration[^>]*>([\d:]+)</time>', video, 'duration', fatal=False)),
  114. 'thumbnail': self._search_regex(
  115. r'tgme_widget_message_video_thumb"[^>]+background-image:url\(\'([^\']+)\'\)',
  116. video, 'thumbnail', fatal=False),
  117. 'formats': formats,
  118. **message,
  119. })
  120. playlist_id = None
  121. if len(videos) > 1 and 'single' not in parse_qs(url, keep_blank_values=True):
  122. playlist_id = f'{channel_id}-{msg_id}'
  123. if self._yes_playlist(playlist_id, msg_id):
  124. return self.playlist_result(
  125. videos, playlist_id, format_field(message, 'channel', f'%s {msg_id}'), description)
  126. else:
  127. return traverse_obj(videos, lambda _, x: x['id'] == msg_id, get_all=False)