thisvid.py 8.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226
  1. import itertools
  2. import re
  3. import urllib.parse
  4. from .common import InfoExtractor
  5. from ..utils import (
  6. clean_html,
  7. get_element_by_class,
  8. int_or_none,
  9. url_or_none,
  10. urljoin,
  11. )
  12. class ThisVidIE(InfoExtractor):
  13. _VALID_URL = r'https?://(?:www\.)?thisvid\.com/(?P<type>videos|embed)/(?P<id>[A-Za-z0-9-]+)'
  14. _TESTS = [{
  15. 'url': 'https://thisvid.com/videos/sitting-on-ball-tight-jeans/',
  16. 'md5': '839becb572995687e11a69dc4358a386',
  17. 'info_dict': {
  18. 'id': '3533241',
  19. 'ext': 'mp4',
  20. 'title': 'Sitting on ball tight jeans',
  21. 'description': 'md5:372353bb995883d1b65fddf507489acd',
  22. 'thumbnail': r're:https?://\w+\.thisvid\.com/(?:[^/]+/)+3533241/preview\.jpg',
  23. 'uploader_id': '150629',
  24. 'uploader': 'jeanslevisjeans',
  25. 'display_id': 'sitting-on-ball-tight-jeans',
  26. 'age_limit': 18,
  27. },
  28. }, {
  29. 'url': 'https://thisvid.com/embed/3533241/',
  30. 'md5': '839becb572995687e11a69dc4358a386',
  31. 'info_dict': {
  32. 'id': '3533241',
  33. 'ext': 'mp4',
  34. 'title': 'Sitting on ball tight jeans',
  35. 'thumbnail': r're:https?://\w+\.thisvid\.com/(?:[^/]+/)+3533241/preview\.jpg',
  36. 'uploader_id': '150629',
  37. 'uploader': 'jeanslevisjeans',
  38. 'display_id': 'sitting-on-ball-tight-jeans',
  39. 'age_limit': 18,
  40. },
  41. }]
  42. def _real_extract(self, url):
  43. main_id, type_ = re.match(self._VALID_URL, url).group('id', 'type')
  44. webpage = self._download_webpage(url, main_id)
  45. title = self._html_search_regex(
  46. r'<title\b[^>]*?>(?:Video:\s+)?(.+?)(?:\s+-\s+ThisVid(?:\.com| tube))?</title>',
  47. webpage, 'title')
  48. if type_ == 'embed':
  49. # look for more metadata
  50. video_alt_url = url_or_none(self._search_regex(
  51. rf'''video_alt_url\s*:\s+'({self._VALID_URL}/)',''',
  52. webpage, 'video_alt_url', default=None))
  53. if video_alt_url and video_alt_url != url:
  54. webpage = self._download_webpage(
  55. video_alt_url, main_id,
  56. note='Redirecting embed to main page', fatal=False) or webpage
  57. video_holder = get_element_by_class('video-holder', webpage) or ''
  58. if '>This video is a private video' in video_holder:
  59. self.raise_login_required(
  60. (clean_html(video_holder) or 'Private video').partition('\n')[0])
  61. uploader = self._html_search_regex(
  62. r'''(?s)<span\b[^>]*>Added by:\s*</span><a\b[^>]+\bclass\s*=\s*["']author\b[^>]+\bhref\s*=\s*["']https://thisvid\.com/members/([0-9]+/.{3,}?)\s*</a>''',
  63. webpage, 'uploader', default='')
  64. uploader = re.split(r'''/["'][^>]*>\s*''', uploader)
  65. if len(uploader) == 2:
  66. # id must be non-empty, uploader could be ''
  67. uploader_id, uploader = uploader
  68. uploader = uploader or None
  69. else:
  70. uploader_id = uploader = None
  71. return self.url_result(
  72. url, ie='Generic', url_transparent=True,
  73. title=title,
  74. age_limit=18,
  75. uploader=uploader,
  76. uploader_id=uploader_id)
  77. class ThisVidPlaylistBaseIE(InfoExtractor):
  78. _PLAYLIST_URL_RE = None
  79. @classmethod
  80. def _find_urls(cls, html):
  81. for m in re.finditer(rf'''<a\b[^>]+\bhref\s*=\s*["'](?P<url>{cls._PLAYLIST_URL_RE}\b)[^>]+>''', html):
  82. yield m.group('url')
  83. def _generate_playlist_entries(self, url, playlist_id, html=None):
  84. page_url = url
  85. for page in itertools.count(1):
  86. if not html:
  87. html = self._download_webpage(
  88. page_url, playlist_id, note=f'Downloading page {page}',
  89. fatal=False) or ''
  90. yield from self._find_urls(html)
  91. next_page = get_element_by_class('pagination-next', html) or ''
  92. if next_page:
  93. # member list page
  94. next_page = urljoin(url, self._search_regex(
  95. r'''<a\b[^>]+\bhref\s*=\s*("|')(?P<url>(?!#)(?:(?!\1).)+)''',
  96. next_page, 'next page link', group='url', default=None))
  97. # in case a member page should have pagination-next with empty link, not just `else:`
  98. if next_page is None:
  99. # playlist page
  100. parsed_url = urllib.parse.urlparse(page_url)
  101. base_path, _, num = parsed_url.path.rpartition('/')
  102. num = int_or_none(num)
  103. if num is None:
  104. base_path, num = parsed_url.path.rstrip('/'), 1
  105. parsed_url = parsed_url._replace(path=f'{base_path}/{num + 1}')
  106. next_page = urllib.parse.urlunparse(parsed_url)
  107. if page_url == next_page:
  108. next_page = None
  109. if not next_page:
  110. return
  111. page_url, html = next_page, None
  112. def _make_playlist_result(self, url):
  113. playlist_id = self._match_id(url)
  114. webpage = self._download_webpage(url, playlist_id)
  115. title = re.split(
  116. r'(?i)\s*\|\s*ThisVid\.com\s*$',
  117. self._og_search_title(webpage, default=None)
  118. or self._html_search_regex(r'(?s)<title\b[^>]*>(.+?)</title', webpage, 'title', fatal=False) or '', maxsplit=1)[0] or None
  119. return self.playlist_from_matches(
  120. self._generate_playlist_entries(url, playlist_id, webpage),
  121. playlist_id=playlist_id, playlist_title=title, ie=ThisVidIE)
  122. class ThisVidMemberIE(ThisVidPlaylistBaseIE):
  123. _VALID_URL = r'https?://thisvid\.com/members/(?P<id>\d+)'
  124. _TESTS = [{
  125. 'url': 'https://thisvid.com/members/2140501/',
  126. 'info_dict': {
  127. 'id': '2140501',
  128. 'title': 'Rafflesia\'s Profile',
  129. },
  130. 'playlist_mincount': 16,
  131. }, {
  132. 'url': 'https://thisvid.com/members/2140501/favourite_videos/',
  133. 'info_dict': {
  134. 'id': '2140501',
  135. 'title': 'Rafflesia\'s Favourite Videos',
  136. },
  137. 'playlist_mincount': 15,
  138. }, {
  139. 'url': 'https://thisvid.com/members/636468/public_videos/',
  140. 'info_dict': {
  141. 'id': '636468',
  142. 'title': 'Happymouth\'s Public Videos',
  143. },
  144. 'playlist_mincount': 196,
  145. }]
  146. _PLAYLIST_URL_RE = ThisVidIE._VALID_URL
  147. def _real_extract(self, url):
  148. return self._make_playlist_result(url)
  149. class ThisVidPlaylistIE(ThisVidPlaylistBaseIE):
  150. _VALID_URL = r'https?://thisvid\.com/playlist/(?P<id>\d+)/video/(?P<video_id>[A-Za-z0-9-]+)'
  151. _TESTS = [{
  152. 'url': 'https://thisvid.com/playlist/6615/video/big-italian-booty-28/',
  153. 'info_dict': {
  154. 'id': '6615',
  155. 'title': 'Underwear Stuff',
  156. },
  157. 'playlist_mincount': 200,
  158. }, {
  159. 'url': 'https://thisvid.com/playlist/6615/video/big-italian-booty-28/',
  160. 'info_dict': {
  161. 'id': '1072387',
  162. 'ext': 'mp4',
  163. 'title': 'Big Italian Booty 28',
  164. 'description': 'md5:1bccf7b13765e18fb27bf764dba7ede2',
  165. 'uploader_id': '367912',
  166. 'uploader': 'Jcmusclefun',
  167. 'age_limit': 18,
  168. 'display_id': 'big-italian-booty-28',
  169. 'thumbnail': r're:https?://\w+\.thisvid\.com/(?:[^/]+/)+1072387/preview\.jpg',
  170. },
  171. 'params': {
  172. 'noplaylist': True,
  173. },
  174. }]
  175. _PLAYLIST_URL_RE = _VALID_URL
  176. def _generate_playlist_entries(self, url, playlist_id, html=None):
  177. for wrapped_url in super()._generate_playlist_entries(url, playlist_id, html):
  178. video_id = re.match(self._VALID_URL, wrapped_url).group('video_id')
  179. yield urljoin(url, f'/videos/{video_id}/')
  180. def _real_extract(self, url):
  181. playlist_id, video_id = self._match_valid_url(url).group('id', 'video_id')
  182. if not self._yes_playlist(playlist_id, video_id):
  183. redirect_url = urljoin(url, f'/videos/{video_id}/')
  184. return self.url_result(redirect_url, ThisVidIE)
  185. result = self._make_playlist_result(url)
  186. # Fix duplicated title (`the title - the title` => `the title`)
  187. title = result['title']
  188. t_len = len(title)
  189. if t_len > 5 and t_len % 2 != 0:
  190. t_len = t_len // 2
  191. if title[t_len] == '-':
  192. first, second = map(str.strip, (title[:t_len], title[t_len + 1:]))
  193. if first and first == second:
  194. result['title'] = first
  195. return result