harpodeon.py 2.8 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970
  1. from .common import InfoExtractor
  2. from ..utils import int_or_none
  3. class HarpodeonIE(InfoExtractor):
  4. _VALID_URL = r'https?://(?:www\.)?harpodeon\.com/(?:video|preview)/\w+/(?P<id>\d+)'
  5. _TESTS = [{
  6. 'url': 'https://www.harpodeon.com/video/The_Smoking_Out_of_Bella_Butts/268068288',
  7. 'md5': '727371564a6a9ebccef2073535b5b6bd',
  8. 'skip': 'Free video could become unavailable',
  9. 'info_dict': {
  10. 'id': '268068288',
  11. 'ext': 'mp4',
  12. 'title': 'The Smoking Out of Bella Butts',
  13. 'description': 'md5:47e16bdb41fc8a79c83ab83af11c8b77',
  14. 'creator': 'Vitagraph Company of America',
  15. 'release_year': 1915,
  16. },
  17. }, {
  18. 'url': 'https://www.harpodeon.com/preview/The_Smoking_Out_of_Bella_Butts/268068288',
  19. 'md5': '6dfea5412845f690c7331be703f884db',
  20. 'info_dict': {
  21. 'id': '268068288',
  22. 'ext': 'mp4',
  23. 'title': 'The Smoking Out of Bella Butts',
  24. 'description': 'md5:47e16bdb41fc8a79c83ab83af11c8b77',
  25. 'creator': 'Vitagraph Company of America',
  26. 'release_year': 1915,
  27. },
  28. }, {
  29. 'url': 'https://www.harpodeon.com/preview/Behind_the_Screen/421838710',
  30. 'md5': '7979df9ca04637282cb7d172ab3a9c3b',
  31. 'info_dict': {
  32. 'id': '421838710',
  33. 'ext': 'mp4',
  34. 'title': 'Behind the Screen',
  35. 'description': 'md5:008972a3dc51fba3965ee517d2ba9155',
  36. 'creator': 'Lone Star Corporation',
  37. 'release_year': 1916,
  38. },
  39. }]
  40. def _real_extract(self, url):
  41. video_id = self._match_id(url)
  42. webpage = self._download_webpage(url, video_id)
  43. title, creator, release_year = self._search_regex(
  44. r'''(?x)
  45. <div[^>]+videoInfo[^<]*<h2[^>]*>(?P<title>[^>]+)</h2>
  46. (?:\s*<p[^>]*>\((?P<creator>.+),\s*)?(?P<release_year>\d{4})?''',
  47. webpage, 'title', group=('title', 'creator', 'release_year'),
  48. fatal=False) or (None, None, None)
  49. hp_base = self._html_search_regex(r'hpBase\(\s*["\']([^"\']+)', webpage, 'hp_base')
  50. hp_inject_video, hp_resolution = self._search_regex(
  51. r'''(?x)
  52. hpInjectVideo\([\'\"](?P<hp_inject_video>\w+)[\'\"],
  53. [\'\"](?P<hp_resolution>\d+)[\'\"]''',
  54. webpage, 'hp_inject_video', group=['hp_inject_video', 'hp_resolution'])
  55. return {
  56. 'id': video_id,
  57. 'title': title,
  58. 'url': f'{hp_base}{hp_inject_video}_{hp_resolution}.mp4',
  59. 'http_headers': {'Referer': url},
  60. 'description': self._html_search_meta('description', webpage, fatal=False),
  61. 'creator': creator,
  62. 'release_year': int_or_none(release_year),
  63. }