buzzfeed.py 3.5 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495
  1. import json
  2. import re
  3. from .common import InfoExtractor
  4. from .facebook import FacebookIE
  5. class BuzzFeedIE(InfoExtractor):
  6. _VALID_URL = r'https?://(?:www\.)?buzzfeed\.com/[^?#]*?/(?P<id>[^?#]+)'
  7. _TESTS = [{
  8. 'url': 'http://www.buzzfeed.com/abagg/this-angry-ram-destroys-a-punching-bag-like-a-boss?utm_term=4ldqpia',
  9. 'info_dict': {
  10. 'id': 'this-angry-ram-destroys-a-punching-bag-like-a-boss',
  11. 'title': 'This Angry Ram Destroys A Punching Bag Like A Boss',
  12. 'description': 'Rambro!',
  13. },
  14. 'playlist': [{
  15. 'info_dict': {
  16. 'id': 'aVCR29aE_OQ',
  17. 'ext': 'mp4',
  18. 'title': 'Angry Ram destroys a punching bag..',
  19. 'description': 'md5:c59533190ef23fd4458a5e8c8c872345',
  20. 'upload_date': '20141024',
  21. 'uploader_id': 'Buddhanz1',
  22. 'uploader': 'Angry Ram',
  23. },
  24. }],
  25. }, {
  26. 'url': 'http://www.buzzfeed.com/sheridanwatson/look-at-this-cute-dog-omg?utm_term=4ldqpia',
  27. 'params': {
  28. 'skip_download': True, # Got enough YouTube download tests
  29. },
  30. 'info_dict': {
  31. 'id': 'look-at-this-cute-dog-omg',
  32. 'description': 're:Munchkin the Teddy Bear is back ?!',
  33. 'title': 'You Need To Stop What You\'re Doing And Watching This Dog Walk On A Treadmill',
  34. },
  35. 'playlist': [{
  36. 'info_dict': {
  37. 'id': 'mVmBL8B-In0',
  38. 'ext': 'mp4',
  39. 'title': 're:Munchkin the Teddy Bear gets her exercise',
  40. 'description': 'md5:28faab95cda6e361bcff06ec12fc21d8',
  41. 'upload_date': '20141124',
  42. 'uploader_id': 'CindysMunchkin',
  43. 'uploader': 're:^Munchkin the',
  44. },
  45. }],
  46. }, {
  47. 'url': 'http://www.buzzfeed.com/craigsilverman/the-most-adorable-crash-landing-ever#.eq7pX0BAmK',
  48. 'info_dict': {
  49. 'id': 'the-most-adorable-crash-landing-ever',
  50. 'title': 'Watch This Baby Goose Make The Most Adorable Crash Landing',
  51. 'description': 'This gosling knows how to stick a landing.',
  52. },
  53. 'playlist': [{
  54. 'md5': '763ca415512f91ca62e4621086900a23',
  55. 'info_dict': {
  56. 'id': '971793786185728',
  57. 'ext': 'mp4',
  58. 'title': 'We set up crash pads so that the goslings on our roof would have a safe landi...',
  59. 'uploader': 'Calgary Outdoor Centre-University of Calgary',
  60. },
  61. }],
  62. 'add_ie': ['Facebook'],
  63. }]
  64. def _real_extract(self, url):
  65. playlist_id = self._match_id(url)
  66. webpage = self._download_webpage(url, playlist_id)
  67. all_buckets = re.findall(
  68. r'(?s)<div class="video-embed[^"]*"..*?rel:bf_bucket_data=\'([^\']+)\'',
  69. webpage)
  70. entries = []
  71. for bd_json in all_buckets:
  72. bd = json.loads(bd_json)
  73. video = bd.get('video') or bd.get('progload_video')
  74. if not video:
  75. continue
  76. entries.append(self.url_result(video['url']))
  77. facebook_urls = FacebookIE._extract_embed_urls(url, webpage)
  78. entries.extend([
  79. self.url_result(facebook_url)
  80. for facebook_url in facebook_urls])
  81. return {
  82. '_type': 'playlist',
  83. 'id': playlist_id,
  84. 'title': self._og_search_title(webpage),
  85. 'description': self._og_search_description(webpage),
  86. 'entries': entries,
  87. }