tube8.py 6.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170
  1. import re
  2. import urllib.parse
  3. from .common import InfoExtractor
  4. from ..aes import aes_decrypt_text
  5. from ..utils import (
  6. determine_ext,
  7. format_field,
  8. int_or_none,
  9. str_to_int,
  10. strip_or_none,
  11. url_or_none,
  12. )
  13. class Tube8IE(InfoExtractor):
  14. _WORKING = False
  15. _VALID_URL = r'https?://(?:www\.)?tube8\.com/(?:[^/]+/)+(?P<display_id>[^/]+)/(?P<id>\d+)'
  16. _EMBED_REGEX = [r'<iframe[^>]+\bsrc=["\'](?P<url>(?:https?:)?//(?:www\.)?tube8\.com/embed/(?:[^/]+/)+\d+)']
  17. _TESTS = [{
  18. 'url': 'http://www.tube8.com/teen/kasia-music-video/229795/',
  19. 'md5': '65e20c48e6abff62ed0c3965fff13a39',
  20. 'info_dict': {
  21. 'id': '229795',
  22. 'display_id': 'kasia-music-video',
  23. 'ext': 'mp4',
  24. 'description': 'hot teen Kasia grinding',
  25. 'uploader': 'unknown',
  26. 'title': 'Kasia music video',
  27. 'age_limit': 18,
  28. 'duration': 230,
  29. 'categories': ['Teen'],
  30. 'tags': ['dancing'],
  31. },
  32. }, {
  33. 'url': 'http://www.tube8.com/shemale/teen/blonde-cd-gets-kidnapped-by-two-blacks-and-punished-for-being-a-slutty-girl/19569151/',
  34. 'only_matching': True,
  35. }]
  36. def _extract_info(self, url, fatal=True):
  37. mobj = self._match_valid_url(url)
  38. video_id = mobj.group('id')
  39. display_id = (mobj.group('display_id')
  40. if 'display_id' in mobj.groupdict()
  41. else None) or mobj.group('id')
  42. webpage = self._download_webpage(
  43. url, display_id, headers={'Cookie': 'age_verified=1'})
  44. formats = []
  45. format_urls = set()
  46. title = None
  47. thumbnail = None
  48. duration = None
  49. encrypted = False
  50. def extract_format(format_url, height=None):
  51. format_url = url_or_none(format_url)
  52. if not format_url or not format_url.startswith(('http', '//')):
  53. return
  54. if format_url in format_urls:
  55. return
  56. format_urls.add(format_url)
  57. tbr = int_or_none(self._search_regex(
  58. r'[/_](\d+)[kK][/_]', format_url, 'tbr', default=None))
  59. if not height:
  60. height = int_or_none(self._search_regex(
  61. r'[/_](\d+)[pP][/_]', format_url, 'height', default=None))
  62. if encrypted:
  63. format_url = aes_decrypt_text(
  64. video_url, title, 32).decode('utf-8')
  65. formats.append({
  66. 'url': format_url,
  67. 'format_id': format_field(height, None, '%dp'),
  68. 'height': height,
  69. 'tbr': tbr,
  70. })
  71. flashvars = self._parse_json(
  72. self._search_regex(
  73. r'flashvars\s*=\s*({.+?});', webpage,
  74. 'flashvars', default='{}'),
  75. display_id, fatal=False)
  76. if flashvars:
  77. title = flashvars.get('video_title')
  78. thumbnail = flashvars.get('image_url')
  79. duration = int_or_none(flashvars.get('video_duration'))
  80. encrypted = flashvars.get('encrypted') is True
  81. for key, value in flashvars.items():
  82. mobj = re.search(r'quality_(\d+)[pP]', key)
  83. if mobj:
  84. extract_format(value, int(mobj.group(1)))
  85. video_url = flashvars.get('video_url')
  86. if video_url and determine_ext(video_url, None):
  87. extract_format(video_url)
  88. video_url = self._html_search_regex(
  89. r'flashvars\.video_url\s*=\s*(["\'])(?P<url>http.+?)\1',
  90. webpage, 'video url', default=None, group='url')
  91. if video_url:
  92. extract_format(urllib.parse.unquote(video_url))
  93. if not formats:
  94. if 'title="This video is no longer available"' in webpage:
  95. self.raise_no_formats(
  96. f'Video {video_id} is no longer available', expected=True)
  97. if not title:
  98. title = self._html_search_regex(
  99. r'<h1[^>]*>([^<]+)', webpage, 'title')
  100. return webpage, {
  101. 'id': video_id,
  102. 'display_id': display_id,
  103. 'title': strip_or_none(title),
  104. 'thumbnail': thumbnail,
  105. 'duration': duration,
  106. 'age_limit': 18,
  107. 'formats': formats,
  108. }
  109. def _real_extract(self, url):
  110. webpage, info = self._extract_info(url)
  111. if not info['title']:
  112. info['title'] = self._html_search_regex(
  113. r'videoTitle\s*=\s*"([^"]+)', webpage, 'title')
  114. description = self._html_search_regex(
  115. r'(?s)Description:</dt>\s*<dd>(.+?)</dd>', webpage, 'description', fatal=False)
  116. uploader = self._html_search_regex(
  117. r'<span class="username">\s*(.+?)\s*<',
  118. webpage, 'uploader', fatal=False)
  119. like_count = int_or_none(self._search_regex(
  120. r'rupVar\s*=\s*"(\d+)"', webpage, 'like count', fatal=False))
  121. dislike_count = int_or_none(self._search_regex(
  122. r'rdownVar\s*=\s*"(\d+)"', webpage, 'dislike count', fatal=False))
  123. view_count = str_to_int(self._search_regex(
  124. r'Views:\s*</dt>\s*<dd>([\d,\.]+)',
  125. webpage, 'view count', fatal=False))
  126. comment_count = str_to_int(self._search_regex(
  127. r'<span id="allCommentsCount">(\d+)</span>',
  128. webpage, 'comment count', fatal=False))
  129. category = self._search_regex(
  130. r'Category:\s*</dt>\s*<dd>\s*<a[^>]+href=[^>]+>([^<]+)',
  131. webpage, 'category', fatal=False)
  132. categories = [category] if category else None
  133. tags_str = self._search_regex(
  134. r'(?s)Tags:\s*</dt>\s*<dd>(.+?)</(?!a)',
  135. webpage, 'tags', fatal=False)
  136. tags = list(re.findall(
  137. r'<a[^>]+href=[^>]+>([^<]+)', tags_str)) if tags_str else None
  138. info.update({
  139. 'description': description,
  140. 'uploader': uploader,
  141. 'view_count': view_count,
  142. 'like_count': like_count,
  143. 'dislike_count': dislike_count,
  144. 'comment_count': comment_count,
  145. 'categories': categories,
  146. 'tags': tags,
  147. })
  148. return info