sejmpl.py 8.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218
  1. import datetime as dt
  2. from .common import InfoExtractor
  3. from .redge import RedCDNLivxIE
  4. from ..utils import (
  5. clean_html,
  6. join_nonempty,
  7. js_to_json,
  8. strip_or_none,
  9. update_url_query,
  10. )
  11. from ..utils.traversal import traverse_obj
  12. def is_dst(date):
  13. last_march = dt.datetime(date.year, 3, 31)
  14. last_october = dt.datetime(date.year, 10, 31)
  15. last_sunday_march = last_march - dt.timedelta(days=last_march.isoweekday() % 7)
  16. last_sunday_october = last_october - dt.timedelta(days=last_october.isoweekday() % 7)
  17. return last_sunday_march.replace(hour=2) <= date <= last_sunday_october.replace(hour=3)
  18. def rfc3339_to_atende(date):
  19. date = dt.datetime.fromisoformat(date)
  20. date = date + dt.timedelta(hours=1 if is_dst(date) else 0)
  21. return int((date.timestamp() - 978307200) * 1000)
  22. class SejmIE(InfoExtractor):
  23. _VALID_URL = (
  24. r'https?://(?:www\.)?sejm\.gov\.pl/[Ss]ejm(?P<term>\d+)\.nsf/transmisje(?:_arch)?\.xsp(?:\?[^#]*)?#(?P<id>[\dA-F]+)',
  25. r'https?://(?:www\.)?sejm\.gov\.pl/[Ss]ejm(?P<term>\d+)\.nsf/transmisje(?:_arch)?\.xsp\?(?:[^#]+&)?unid=(?P<id>[\dA-F]+)',
  26. r'https?://sejm-embed\.redcdn\.pl/[Ss]ejm(?P<term>\d+)\.nsf/VideoFrame\.xsp/(?P<id>[\dA-F]+)',
  27. )
  28. IE_NAME = 'sejm'
  29. _TESTS = [{
  30. # multiple cameras, polish SL iterpreter
  31. 'url': 'https://www.sejm.gov.pl/Sejm10.nsf/transmisje_arch.xsp#6181EF1AD9CEEBB5C1258A6D006452B5',
  32. 'info_dict': {
  33. 'id': '6181EF1AD9CEEBB5C1258A6D006452B5',
  34. 'title': '1. posiedzenie Sejmu X kadencji',
  35. 'duration': 20145,
  36. 'live_status': 'was_live',
  37. 'location': 'Sala Posiedzeń',
  38. },
  39. 'playlist': [{
  40. 'info_dict': {
  41. 'id': 'ENC01-722340000000-722360145000',
  42. 'ext': 'mp4',
  43. 'duration': 20145,
  44. 'title': '1. posiedzenie Sejmu X kadencji - ENC01',
  45. 'live_status': 'was_live',
  46. },
  47. }, {
  48. 'info_dict': {
  49. 'id': 'ENC30-722340000000-722360145000',
  50. 'ext': 'mp4',
  51. 'duration': 20145,
  52. 'title': '1. posiedzenie Sejmu X kadencji - ENC30',
  53. 'live_status': 'was_live',
  54. },
  55. }, {
  56. 'info_dict': {
  57. 'id': 'ENC31-722340000000-722360145000',
  58. 'ext': 'mp4',
  59. 'duration': 20145,
  60. 'title': '1. posiedzenie Sejmu X kadencji - ENC31',
  61. 'live_status': 'was_live',
  62. },
  63. }, {
  64. 'info_dict': {
  65. 'id': 'ENC32-722340000000-722360145000',
  66. 'ext': 'mp4',
  67. 'duration': 20145,
  68. 'title': '1. posiedzenie Sejmu X kadencji - ENC32',
  69. 'live_status': 'was_live',
  70. },
  71. }, {
  72. # sign lang interpreter
  73. 'info_dict': {
  74. 'id': 'Migacz-ENC01-1-722340000000-722360145000',
  75. 'ext': 'mp4',
  76. 'duration': 20145,
  77. 'title': '1. posiedzenie Sejmu X kadencji - Migacz-ENC01',
  78. 'live_status': 'was_live',
  79. },
  80. }],
  81. }, {
  82. 'url': 'https://www.sejm.gov.pl/Sejm8.nsf/transmisje.xsp?unid=9377A9D65518E9A5C125808E002E9FF2',
  83. 'info_dict': {
  84. 'id': '9377A9D65518E9A5C125808E002E9FF2',
  85. 'title': 'Debata "Lepsza Polska: obywatelska"',
  86. 'description': 'KP .Nowoczesna',
  87. 'duration': 8770,
  88. 'live_status': 'was_live',
  89. 'location': 'sala kolumnowa im. Kazimierza Pużaka (bud. C-D)',
  90. },
  91. 'playlist': [{
  92. 'info_dict': {
  93. 'id': 'ENC08-1-503831270000-503840040000',
  94. 'ext': 'mp4',
  95. 'duration': 8770,
  96. 'title': 'Debata "Lepsza Polska: obywatelska" - ENC08',
  97. 'live_status': 'was_live',
  98. },
  99. }],
  100. }, {
  101. # 7th term is very special, since it does not use redcdn livx
  102. 'url': 'https://www.sejm.gov.pl/sejm7.nsf/transmisje_arch.xsp?rok=2015&month=11#A6E6D475ECCC6FE5C1257EF90034817F',
  103. 'info_dict': {
  104. 'id': 'A6E6D475ECCC6FE5C1257EF90034817F',
  105. 'title': 'Konferencja prasowa - Stanowisko SLD ws. składu nowego rządu',
  106. 'description': 'SLD - Biuro Prasowe Klubu',
  107. 'duration': 514,
  108. 'location': 'sala 101/bud. C',
  109. 'live_status': 'was_live',
  110. },
  111. 'playlist': [{
  112. 'info_dict': {
  113. 'id': 'A6E6D475ECCC6FE5C1257EF90034817F',
  114. 'ext': 'mp4',
  115. 'title': 'Konferencja prasowa - Stanowisko SLD ws. składu nowego rządu',
  116. 'duration': 514,
  117. },
  118. }],
  119. }, {
  120. 'url': 'https://sejm-embed.redcdn.pl/Sejm10.nsf/VideoFrame.xsp/FED58EABB97FBD53C1258A7400386492',
  121. 'only_matching': True,
  122. }]
  123. def _real_extract(self, url):
  124. term, video_id = self._match_valid_url(url).group('term', 'id')
  125. frame = self._download_webpage(
  126. f'https://sejm-embed.redcdn.pl/Sejm{term}.nsf/VideoFrame.xsp/{video_id}',
  127. video_id)
  128. # despite it says "transmisje_arch", it works for live streams too!
  129. data = self._download_json(
  130. f'https://www.sejm.gov.pl/Sejm{term}.nsf/transmisje_arch.xsp/json/{video_id}',
  131. video_id)
  132. params = data['params']
  133. title = strip_or_none(data.get('title'))
  134. if data.get('status') == 'VIDEO_ENDED':
  135. live_status = 'was_live'
  136. elif data.get('status') == 'VIDEO_PLAYING':
  137. live_status = 'is_live'
  138. else:
  139. live_status = None
  140. self.report_warning(f'unknown status: {data.get("status")}')
  141. start_time = rfc3339_to_atende(params['start'])
  142. # current streams have a stop time of *expected* end of session, but actual times
  143. # can change during the transmission. setting a stop_time would artificially
  144. # end the stream at that time, while the session actually keeps going.
  145. if live_status == 'was_live':
  146. stop_time = rfc3339_to_atende(params['stop'])
  147. duration = (stop_time - start_time) // 1000
  148. else:
  149. stop_time, duration = None, None
  150. entries = []
  151. def add_entry(file, legacy_file=False):
  152. if not file:
  153. return
  154. file = self._proto_relative_url(file)
  155. if not legacy_file:
  156. file = update_url_query(file, {'startTime': start_time})
  157. if stop_time is not None:
  158. file = update_url_query(file, {'stopTime': stop_time})
  159. stream_id = self._search_regex(r'/o2/sejm/([^/]+)/[^./]+\.livx', file, 'stream id')
  160. common_info = {
  161. 'url': file,
  162. 'duration': duration,
  163. }
  164. if legacy_file:
  165. entries.append({
  166. **common_info,
  167. 'id': video_id,
  168. 'title': title,
  169. })
  170. else:
  171. entries.append({
  172. **common_info,
  173. '_type': 'url_transparent',
  174. 'ie_key': RedCDNLivxIE.ie_key(),
  175. 'id': stream_id,
  176. 'title': join_nonempty(title, stream_id, delim=' - '),
  177. })
  178. cameras = self._search_json(
  179. r'var\s+cameras\s*=', frame, 'camera list', video_id,
  180. contains_pattern=r'\[(?s:.+)\]', transform_source=js_to_json,
  181. fatal=False) or []
  182. for camera_file in traverse_obj(cameras, (..., 'file', {dict})):
  183. if camera_file.get('flv'):
  184. add_entry(camera_file['flv'])
  185. elif camera_file.get('mp4'):
  186. # this is only a thing in 7th term. no streams before, and starting 8th it's redcdn livx
  187. add_entry(camera_file['mp4'], legacy_file=True)
  188. else:
  189. self.report_warning('Unknown camera stream type found')
  190. if params.get('mig'):
  191. add_entry(self._search_regex(r"var sliUrl\s*=\s*'([^']+)'", frame, 'sign language interpreter url', fatal=False))
  192. return {
  193. '_type': 'playlist',
  194. 'entries': entries,
  195. 'id': video_id,
  196. 'title': title,
  197. 'description': clean_html(data.get('desc')) or None,
  198. 'duration': duration,
  199. 'live_status': live_status,
  200. 'location': strip_or_none(data.get('location')),
  201. }