# arcpublishing.py
  1. import re
  2. from .common import InfoExtractor
  3. from ..utils import (
  4. extract_attributes,
  5. int_or_none,
  6. join_nonempty,
  7. parse_iso8601,
  8. try_get,
  9. )
  10. class ArcPublishingIE(InfoExtractor):
  11. _UUID_REGEX = r'[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12}'
  12. _VALID_URL = rf'arcpublishing:(?P<org>[a-z]+):(?P<id>{_UUID_REGEX})'
  13. _TESTS = [{
  14. # https://www.adn.com/politics/2020/11/02/video-senate-candidates-campaign-in-anchorage-on-eve-of-election-day/
  15. 'url': 'arcpublishing:adn:8c99cb6e-b29c-4bc9-9173-7bf9979225ab',
  16. 'only_matching': True,
  17. }, {
  18. # https://www.bostonglobe.com/video/2020/12/30/metro/footage-released-showing-officer-talking-about-striking-protesters-with-car/
  19. 'url': 'arcpublishing:bostonglobe:232b7ae6-7d73-432d-bc0a-85dbf0119ab1',
  20. 'only_matching': True,
  21. }, {
  22. # https://www.actionnewsjax.com/video/live-stream/
  23. 'url': 'arcpublishing:cmg:cfb1cf1b-3ab5-4d1b-86c5-a5515d311f2a',
  24. 'only_matching': True,
  25. }, {
  26. # https://elcomercio.pe/videos/deportes/deporte-total-futbol-peruano-seleccion-peruana-la-valorizacion-de-los-peruanos-en-el-exterior-tras-un-2020-atipico-nnav-vr-video-noticia/
  27. 'url': 'arcpublishing:elcomercio:27a7e1f8-2ec7-4177-874f-a4feed2885b3',
  28. 'only_matching': True,
  29. }, {
  30. # https://www.clickondetroit.com/video/community/2020/05/15/events-surrounding-woodward-dream-cruise-being-canceled/
  31. 'url': 'arcpublishing:gmg:c8793fb2-8d44-4242-881e-2db31da2d9fe',
  32. 'only_matching': True,
  33. }, {
  34. # https://www.wabi.tv/video/2020/12/30/trenton-company-making-equipment-pfizer-covid-vaccine/
  35. 'url': 'arcpublishing:gray:0b0ba30e-032a-4598-8810-901d70e6033e',
  36. 'only_matching': True,
  37. }, {
  38. # https://www.lateja.cr/el-mundo/video-china-aprueba-con-condiciones-su-primera/dfcbfa57-527f-45ff-a69b-35fe71054143/video/
  39. 'url': 'arcpublishing:gruponacion:dfcbfa57-527f-45ff-a69b-35fe71054143',
  40. 'only_matching': True,
  41. }, {
  42. # https://www.fifthdomain.com/video/2018/03/09/is-america-vulnerable-to-a-cyber-attack/
  43. 'url': 'arcpublishing:mco:aa0ca6fe-1127-46d4-b32c-be0d6fdb8055',
  44. 'only_matching': True,
  45. }, {
  46. # https://www.vl.no/kultur/2020/12/09/en-melding-fra-en-lytter-endret-julelista-til-lewi-bergrud/
  47. 'url': 'arcpublishing:mentormedier:47a12084-650b-4011-bfd0-3699b6947b2d',
  48. 'only_matching': True,
  49. }, {
  50. # https://www.14news.com/2020/12/30/whiskey-theft-caught-camera-henderson-liquor-store/
  51. 'url': 'arcpublishing:raycom:b89f61f8-79fa-4c09-8255-e64237119bf7',
  52. 'only_matching': True,
  53. }, {
  54. # https://www.theglobeandmail.com/world/video-ethiopian-woman-who-became-symbol-of-integration-in-italy-killed-on/
  55. 'url': 'arcpublishing:tgam:411b34c1-8701-4036-9831-26964711664b',
  56. 'only_matching': True,
  57. }, {
  58. # https://www.pilotonline.com/460f2931-8130-4719-8ea1-ffcb2d7cb685-132.html
  59. 'url': 'arcpublishing:tronc:460f2931-8130-4719-8ea1-ffcb2d7cb685',
  60. 'only_matching': True,
  61. }]
  62. _POWA_DEFAULTS = [
  63. (['cmg', 'prisa'], '%s-config-prod.api.cdn.arcpublishing.com/video'),
  64. ([
  65. 'adn', 'advancelocal', 'answers', 'bonnier', 'bostonglobe', 'demo',
  66. 'gmg', 'gruponacion', 'infobae', 'mco', 'nzme', 'pmn', 'raycom',
  67. 'spectator', 'tbt', 'tgam', 'tronc', 'wapo', 'wweek',
  68. ], 'video-api-cdn.%s.arcpublishing.com/api'),
  69. ]
  70. @classmethod
  71. def _extract_embed_urls(cls, url, webpage):
  72. entries = []
  73. # https://arcpublishing.atlassian.net/wiki/spaces/POWA/overview
  74. for powa_el in re.findall(rf'(<div[^>]+class="[^"]*\bpowa\b[^"]*"[^>]+data-uuid="{ArcPublishingIE._UUID_REGEX}"[^>]*>)', webpage):
  75. powa = extract_attributes(powa_el) or {}
  76. org = powa.get('data-org')
  77. uuid = powa.get('data-uuid')
  78. if org and uuid:
  79. entries.append(f'arcpublishing:{org}:{uuid}')
  80. return entries
  81. def _real_extract(self, url):
  82. org, uuid = self._match_valid_url(url).groups()
  83. for orgs, tmpl in self._POWA_DEFAULTS:
  84. if org in orgs:
  85. base_api_tmpl = tmpl
  86. break
  87. else:
  88. base_api_tmpl = '%s-prod-cdn.video-api.arcpublishing.com/api'
  89. if org == 'wapo':
  90. org = 'washpost'
  91. video = self._download_json(
  92. 'https://%s/v1/ansvideos/findByUuid' % (base_api_tmpl % org),
  93. uuid, query={'uuid': uuid})[0]
  94. title = video['headlines']['basic']
  95. is_live = video.get('status') == 'live'
  96. urls = []
  97. formats = []
  98. for s in video.get('streams', []):
  99. s_url = s.get('url')
  100. if not s_url or s_url in urls:
  101. continue
  102. urls.append(s_url)
  103. stream_type = s.get('stream_type')
  104. if stream_type == 'smil':
  105. smil_formats = self._extract_smil_formats(
  106. s_url, uuid, fatal=False)
  107. for f in smil_formats:
  108. if f['url'].endswith('/cfx/st'):
  109. f['app'] = 'cfx/st'
  110. if not f['play_path'].startswith('mp4:'):
  111. f['play_path'] = 'mp4:' + f['play_path']
  112. if isinstance(f['tbr'], float):
  113. f['vbr'] = f['tbr'] * 1000
  114. del f['tbr']
  115. f['format_id'] = 'rtmp-%d' % f['vbr']
  116. formats.extend(smil_formats)
  117. elif stream_type in ('ts', 'hls'):
  118. m3u8_formats = self._extract_m3u8_formats(
  119. s_url, uuid, 'mp4', live=is_live, m3u8_id='hls', fatal=False)
  120. if all(f.get('acodec') == 'none' for f in m3u8_formats):
  121. continue
  122. for f in m3u8_formats:
  123. height = f.get('height')
  124. if not height:
  125. continue
  126. vbr = self._search_regex(
  127. r'[_x]%d[_-](\d+)' % height, f['url'], 'vbr', default=None)
  128. if vbr:
  129. f['vbr'] = int(vbr)
  130. formats.extend(m3u8_formats)
  131. else:
  132. vbr = int_or_none(s.get('bitrate'))
  133. formats.append({
  134. 'format_id': join_nonempty(stream_type, vbr),
  135. 'vbr': vbr,
  136. 'width': int_or_none(s.get('width')),
  137. 'height': int_or_none(s.get('height')),
  138. 'filesize': int_or_none(s.get('filesize')),
  139. 'url': s_url,
  140. 'quality': -10,
  141. })
  142. subtitles = {}
  143. for subtitle in (try_get(video, lambda x: x['subtitles']['urls'], list) or []):
  144. subtitle_url = subtitle.get('url')
  145. if subtitle_url:
  146. subtitles.setdefault('en', []).append({'url': subtitle_url})
  147. return {
  148. 'id': uuid,
  149. 'title': title,
  150. 'thumbnail': try_get(video, lambda x: x['promo_image']['url']),
  151. 'description': try_get(video, lambda x: x['subheadlines']['basic']),
  152. 'formats': formats,
  153. 'duration': int_or_none(video.get('duration'), 100),
  154. 'timestamp': parse_iso8601(video.get('created_date')),
  155. 'subtitles': subtitles,
  156. 'is_live': is_live,
  157. }