amazon.py 6.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170
  1. import re
  2. from .common import InfoExtractor
  3. from ..utils import (
  4. ExtractorError,
  5. clean_html,
  6. float_or_none,
  7. get_element_by_attribute,
  8. get_element_by_class,
  9. int_or_none,
  10. js_to_json,
  11. traverse_obj,
  12. url_or_none,
  13. )
  14. class AmazonStoreIE(InfoExtractor):
  15. _VALID_URL = r'https?://(?:www\.)?amazon\.(?:[a-z]{2,3})(?:\.[a-z]{2})?/(?:[^/]+/)?(?:dp|gp/product)/(?P<id>[^/&#$?]+)'
  16. _TESTS = [{
  17. 'url': 'https://www.amazon.co.uk/dp/B098XNCHLD/',
  18. 'info_dict': {
  19. 'id': 'B098XNCHLD',
  20. 'title': str,
  21. },
  22. 'playlist_mincount': 1,
  23. 'playlist': [{
  24. 'info_dict': {
  25. 'id': 'A1F83G8C2ARO7P',
  26. 'ext': 'mp4',
  27. 'title': 'mcdodo usb c cable 100W 5a',
  28. 'thumbnail': r're:^https?://.*\.jpg$',
  29. 'duration': 34,
  30. },
  31. }],
  32. 'expected_warnings': ['Unable to extract data'],
  33. }, {
  34. 'url': 'https://www.amazon.in/Sony-WH-1000XM4-Cancelling-Headphones-Bluetooth/dp/B0863TXGM3',
  35. 'info_dict': {
  36. 'id': 'B0863TXGM3',
  37. 'title': str,
  38. },
  39. 'playlist_mincount': 4,
  40. 'expected_warnings': ['Unable to extract data'],
  41. }, {
  42. 'url': 'https://www.amazon.com/dp/B0845NXCXF/',
  43. 'info_dict': {
  44. 'id': 'B0845NXCXF',
  45. 'title': str,
  46. },
  47. 'playlist-mincount': 1,
  48. 'expected_warnings': ['Unable to extract data'],
  49. }, {
  50. 'url': 'https://www.amazon.es/Samsung-Smartphone-s-AMOLED-Quad-c%C3%A1mara-espa%C3%B1ola/dp/B08WX337PQ',
  51. 'info_dict': {
  52. 'id': 'B08WX337PQ',
  53. 'title': str,
  54. },
  55. 'playlist_mincount': 1,
  56. 'expected_warnings': ['Unable to extract data'],
  57. }]
  58. def _real_extract(self, url):
  59. playlist_id = self._match_id(url)
  60. for retry in self.RetryManager():
  61. webpage = self._download_webpage(url, playlist_id)
  62. try:
  63. data_json = self._search_json(
  64. r'var\s?obj\s?=\s?jQuery\.parseJSON\(\'', webpage, 'data', playlist_id,
  65. transform_source=js_to_json)
  66. except ExtractorError as e:
  67. retry.error = e
  68. entries = [{
  69. 'id': video['marketPlaceID'],
  70. 'url': video['url'],
  71. 'title': video.get('title'),
  72. 'thumbnail': video.get('thumbUrl') or video.get('thumb'),
  73. 'duration': video.get('durationSeconds'),
  74. 'height': int_or_none(video.get('videoHeight')),
  75. 'width': int_or_none(video.get('videoWidth')),
  76. } for video in (data_json.get('videos') or []) if video.get('isVideo') and video.get('url')]
  77. return self.playlist_result(entries, playlist_id=playlist_id, playlist_title=data_json.get('title'))
  78. class AmazonReviewsIE(InfoExtractor):
  79. _VALID_URL = r'https?://(?:www\.)?amazon\.(?:[a-z]{2,3})(?:\.[a-z]{2})?/gp/customer-reviews/(?P<id>[^/&#$?]+)'
  80. _TESTS = [{
  81. 'url': 'https://www.amazon.com/gp/customer-reviews/R10VE9VUSY19L3/ref=cm_cr_arp_d_rvw_ttl',
  82. 'info_dict': {
  83. 'id': 'R10VE9VUSY19L3',
  84. 'ext': 'mp4',
  85. 'title': 'Get squad #Suspicious',
  86. 'description': 'md5:7012695052f440a1e064e402d87e0afb',
  87. 'uploader': 'Kimberly Cronkright',
  88. 'average_rating': 1.0,
  89. 'thumbnail': r're:^https?://.*\.jpg$',
  90. },
  91. 'expected_warnings': ['Review body was not found in webpage'],
  92. }, {
  93. 'url': 'https://www.amazon.com/gp/customer-reviews/R10VE9VUSY19L3/ref=cm_cr_arp_d_rvw_ttl?language=es_US',
  94. 'info_dict': {
  95. 'id': 'R10VE9VUSY19L3',
  96. 'ext': 'mp4',
  97. 'title': 'Get squad #Suspicious',
  98. 'description': 'md5:7012695052f440a1e064e402d87e0afb',
  99. 'uploader': 'Kimberly Cronkright',
  100. 'average_rating': 1.0,
  101. 'thumbnail': r're:^https?://.*\.jpg$',
  102. },
  103. 'expected_warnings': ['Review body was not found in webpage'],
  104. }, {
  105. 'url': 'https://www.amazon.in/gp/customer-reviews/RV1CO8JN5VGXV/',
  106. 'info_dict': {
  107. 'id': 'RV1CO8JN5VGXV',
  108. 'ext': 'mp4',
  109. 'title': 'Not sure about its durability',
  110. 'description': 'md5:1a252c106357f0a3109ebf37d2e87494',
  111. 'uploader': 'Shoaib Gulzar',
  112. 'average_rating': 2.0,
  113. 'thumbnail': r're:^https?://.*\.jpg$',
  114. },
  115. 'expected_warnings': ['Review body was not found in webpage'],
  116. }]
  117. def _real_extract(self, url):
  118. video_id = self._match_id(url)
  119. for retry in self.RetryManager():
  120. webpage = self._download_webpage(url, video_id)
  121. review_body = get_element_by_attribute('data-hook', 'review-body', webpage)
  122. if not review_body:
  123. retry.error = ExtractorError('Review body was not found in webpage', expected=True)
  124. formats, subtitles = [], {}
  125. manifest_url = self._search_regex(
  126. r'data-video-url="([^"]+)"', review_body, 'm3u8 url', default=None)
  127. if url_or_none(manifest_url):
  128. fmts, subtitles = self._extract_m3u8_formats_and_subtitles(
  129. manifest_url, video_id, 'mp4', fatal=False)
  130. formats.extend(fmts)
  131. video_url = self._search_regex(
  132. r'<input[^>]+\bvalue="([^"]+)"[^>]+\bclass="video-url"', review_body, 'mp4 url', default=None)
  133. if url_or_none(video_url):
  134. formats.append({
  135. 'url': video_url,
  136. 'ext': 'mp4',
  137. 'format_id': 'http-mp4',
  138. })
  139. if not formats:
  140. self.raise_no_formats('No video found for this customer review', expected=True)
  141. return {
  142. 'id': video_id,
  143. 'title': (clean_html(get_element_by_attribute('data-hook', 'review-title', webpage))
  144. or self._html_extract_title(webpage)),
  145. 'description': clean_html(traverse_obj(re.findall(
  146. r'<span(?:\s+class="cr-original-review-content")?>(.+?)</span>', review_body), -1)),
  147. 'uploader': clean_html(get_element_by_class('a-profile-name', webpage)),
  148. 'average_rating': float_or_none(clean_html(get_element_by_attribute(
  149. 'data-hook', 'review-star-rating', webpage) or '').partition(' ')[0]),
  150. 'thumbnail': self._search_regex(
  151. r'data-thumbnail-url="([^"]+)"', review_body, 'thumbnail', default=None),
  152. 'formats': formats,
  153. 'subtitles': subtitles,
  154. }