democracynow.py 2.9 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091
  1. import os.path
  2. import re
  3. import urllib.parse
  4. from .common import InfoExtractor
  5. from ..utils import (
  6. remove_start,
  7. url_basename,
  8. )
  class DemocracynowIE(InfoExtractor):
      """Extractor for democracynow.org pages.

      The page embeds a ``<script type="text/json">`` element; the title,
      media URLs, caption URLs and thumbnail are all read from that JSON.
      """

      _VALID_URL = r'https?://(?:www\.)?democracynow\.org/(?P<id>[^\?]*)'
      IE_NAME = 'democracynow'
      _TESTS = [{
          'url': 'http://www.democracynow.org/shows/2015/7/3',
          'md5': '3757c182d3d84da68f5c8f506c18c196',
          'info_dict': {
              'id': '2015-0703-001',
              'ext': 'mp4',
              'title': 'Daily Show for July 03, 2015',
              'description': 'md5:80eb927244d6749900de6072c7cc2c86',
          },
      }, {
          'url': 'http://www.democracynow.org/2015/7/3/this_flag_comes_down_today_bree',
          'info_dict': {
              'id': '2015-0703-001',
              'ext': 'mp4',
              'title': '"This Flag Comes Down Today": Bree Newsome Scales SC Capitol Flagpole, Takes Down Confederate Flag',
              'description': 'md5:4d2bc4f0d29f5553c2210a4bc7761a21',
          },
          'params': {
              'skip_download': True,
          },
      }]

      def _real_extract(self, url):
          """Build the info dict for a single democracynow.org page.

          Args:
              url: Page URL matching ``_VALID_URL``.

          Returns:
              A dict with ``id``, ``title``, ``description``, ``thumbnail``,
              ``subtitles`` and ``formats``. ``id`` falls back to the URL path
              captured by ``_VALID_URL`` when no media URL is present.

          Raises:
              KeyError: If the embedded JSON has no ``title`` entry.
          """
          display_id = self._match_id(url)
          webpage = self._download_webpage(url, display_id)

          json_data = self._parse_json(self._search_regex(
              r'<script[^>]+type="text/json"[^>]*>\s*({[^>]+})', webpage, 'json'),
              display_id)
          title = json_data['title']

          formats = []
          video_id = None
          for key in ('file', 'audio', 'video', 'high_res_video'):
              media_url = json_data.get(key)
              if not media_url:
                  continue
              # Resolve against the page URL, then drop any query string.
              media_url = re.sub(r'\?.*', '', urllib.parse.urljoin(url, media_url))
              # The first media URL found supplies the id: its file name
              # without extension and without a leading 'dn'.
              video_id = video_id or remove_start(
                  os.path.splitext(url_basename(media_url))[0], 'dn')
              formats.append({
                  'url': media_url,
                  'vcodec': 'none' if key == 'audio' else None,
              })

          default_lang = 'en'
          subtitles = {}

          # chapter_file are not subtitles
          # Require a non-empty value: urljoin() with an empty/None reference
          # returns the base, which would list the page itself as a subtitle.
          caption_file = json_data.get('caption_file')
          if caption_file:
              subtitles.setdefault(default_lang, []).append({
                  'url': urllib.parse.urljoin(url, caption_file),
              })

          # `or []` / `or ''` also cover keys that are present but null.
          for subtitle_item in json_data.get('captions') or []:
              caption_url = subtitle_item.get('url')
              if not caption_url:
                  # Subtitles are optional; skip an unusable entry instead of
                  # failing the whole extraction.
                  continue
              lang = (subtitle_item.get('language') or '').lower() or default_lang
              subtitles.setdefault(lang, []).append({
                  'url': urllib.parse.urljoin(url, caption_url),
              })

          description = self._og_search_description(webpage, default=None)

          return {
              'id': video_id or display_id,
              'title': title,
              'description': description,
              'thumbnail': json_data.get('image'),
              'subtitles': subtitles,
              'formats': formats,
          }