pornhub.py 31 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823
  1. import functools
  2. import itertools
  3. import math
  4. import operator
  5. import re
  6. from .common import InfoExtractor
  7. from .openload import PhantomJSwrapper
  8. from ..networking import Request
  9. from ..networking.exceptions import HTTPError
  10. from ..utils import (
  11. NO_DEFAULT,
  12. ExtractorError,
  13. clean_html,
  14. determine_ext,
  15. format_field,
  16. int_or_none,
  17. merge_dicts,
  18. orderedSet,
  19. remove_quotes,
  20. remove_start,
  21. str_to_int,
  22. update_url_query,
  23. url_or_none,
  24. urlencode_postdata,
  25. )
  26. class PornHubBaseIE(InfoExtractor):
  27. _NETRC_MACHINE = 'pornhub'
  28. _PORNHUB_HOST_RE = r'(?:(?P<host>pornhub(?:premium)?\.(?:com|net|org))|pornhubvybmsymdol4iibwgwtkpwmeyd6luq2gxajgjzfjvotyt5zhyd\.onion)'
  29. def _download_webpage_handle(self, *args, **kwargs):
  30. def dl(*args, **kwargs):
  31. return super(PornHubBaseIE, self)._download_webpage_handle(*args, **kwargs)
  32. ret = dl(*args, **kwargs)
  33. if not ret:
  34. return ret
  35. webpage, urlh = ret
  36. if any(re.search(p, webpage) for p in (
  37. r'<body\b[^>]+\bonload=["\']go\(\)',
  38. r'document\.cookie\s*=\s*["\']RNKEY=',
  39. r'document\.location\.reload\(true\)')):
  40. url_or_request = args[0]
  41. url = (url_or_request.url
  42. if isinstance(url_or_request, Request)
  43. else url_or_request)
  44. phantom = PhantomJSwrapper(self, required_version='2.0')
  45. phantom.get(url, html=webpage)
  46. webpage, urlh = dl(*args, **kwargs)
  47. return webpage, urlh
  48. def _real_initialize(self):
  49. self._logged_in = False
  50. def _set_age_cookies(self, host):
  51. self._set_cookie(host, 'age_verified', '1')
  52. self._set_cookie(host, 'accessAgeDisclaimerPH', '1')
  53. self._set_cookie(host, 'accessAgeDisclaimerUK', '1')
  54. self._set_cookie(host, 'accessPH', '1')
  55. def _login(self, host):
  56. if self._logged_in:
  57. return
  58. site = host.split('.')[0]
  59. # Both sites pornhub and pornhubpremium have separate accounts
  60. # so there should be an option to provide credentials for both.
  61. # At the same time some videos are available under the same video id
  62. # on both sites so that we have to identify them as the same video.
  63. # For that purpose we have to keep both in the same extractor
  64. # but under different netrc machines.
  65. username, password = self._get_login_info(netrc_machine=site)
  66. if username is None:
  67. return
  68. login_url = 'https://www.{}/{}login'.format(host, 'premium/' if 'premium' in host else '')
  69. login_page = self._download_webpage(
  70. login_url, None, f'Downloading {site} login page')
  71. def is_logged(webpage):
  72. return any(re.search(p, webpage) for p in (
  73. r'id="profileMenuDropdown"',
  74. r'class="ph-icon-logout"'))
  75. if is_logged(login_page):
  76. self._logged_in = True
  77. return
  78. login_form = self._hidden_inputs(login_page)
  79. login_form.update({
  80. 'email': username,
  81. 'password': password,
  82. })
  83. response = self._download_json(
  84. f'https://www.{host}/front/authenticate', None,
  85. f'Logging in to {site}',
  86. data=urlencode_postdata(login_form),
  87. headers={
  88. 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
  89. 'Referer': login_url,
  90. 'X-Requested-With': 'XMLHttpRequest',
  91. })
  92. if response.get('success') == '1':
  93. self._logged_in = True
  94. return
  95. message = response.get('message')
  96. if message is not None:
  97. raise ExtractorError(
  98. f'Unable to login: {message}', expected=True)
  99. raise ExtractorError('Unable to log in')
  100. class PornHubIE(PornHubBaseIE):
  101. IE_DESC = 'PornHub and Thumbzilla'
  102. _VALID_URL = rf'''(?x)
  103. https?://
  104. (?:
  105. (?:[^/]+\.)?
  106. {PornHubBaseIE._PORNHUB_HOST_RE}
  107. /(?:(?:view_video\.php|video/show)\?viewkey=|embed/)|
  108. (?:www\.)?thumbzilla\.com/video/
  109. )
  110. (?P<id>[\da-z]+)
  111. '''
  112. _EMBED_REGEX = [r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?pornhub(?:premium)?\.(?:com|net|org)/embed/[\da-z]+)']
  113. _TESTS = [{
  114. 'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015',
  115. 'md5': 'a6391306d050e4547f62b3f485dd9ba9',
  116. 'info_dict': {
  117. 'id': '648719015',
  118. 'ext': 'mp4',
  119. 'title': 'Seductive Indian beauty strips down and fingers her pink pussy',
  120. 'uploader': 'Babes',
  121. 'upload_date': '20130628',
  122. 'timestamp': 1372447216,
  123. 'duration': 361,
  124. 'view_count': int,
  125. 'like_count': int,
  126. 'dislike_count': int,
  127. 'comment_count': int,
  128. 'age_limit': 18,
  129. 'tags': list,
  130. 'categories': list,
  131. 'cast': list,
  132. },
  133. }, {
  134. # non-ASCII title
  135. 'url': 'http://www.pornhub.com/view_video.php?viewkey=1331683002',
  136. 'info_dict': {
  137. 'id': '1331683002',
  138. 'ext': 'mp4',
  139. 'title': '重庆婷婷女王足交',
  140. 'upload_date': '20150213',
  141. 'timestamp': 1423804862,
  142. 'duration': 1753,
  143. 'view_count': int,
  144. 'like_count': int,
  145. 'dislike_count': int,
  146. 'comment_count': int,
  147. 'age_limit': 18,
  148. 'tags': list,
  149. 'categories': list,
  150. },
  151. 'params': {
  152. 'skip_download': True,
  153. },
  154. 'skip': 'Video has been flagged for verification in accordance with our trust and safety policy',
  155. }, {
  156. # subtitles
  157. 'url': 'https://www.pornhub.com/view_video.php?viewkey=ph5af5fef7c2aa7',
  158. 'info_dict': {
  159. 'id': 'ph5af5fef7c2aa7',
  160. 'ext': 'mp4',
  161. 'title': 'BFFS - Cute Teen Girls Share Cock On the Floor',
  162. 'uploader': 'BFFs',
  163. 'duration': 622,
  164. 'view_count': int,
  165. 'like_count': int,
  166. 'dislike_count': int,
  167. 'comment_count': int,
  168. 'age_limit': 18,
  169. 'tags': list,
  170. 'categories': list,
  171. 'subtitles': {
  172. 'en': [{
  173. 'ext': 'srt',
  174. }],
  175. },
  176. },
  177. 'params': {
  178. 'skip_download': True,
  179. },
  180. 'skip': 'This video has been disabled',
  181. }, {
  182. 'url': 'http://www.pornhub.com/view_video.php?viewkey=ph601dc30bae19a',
  183. 'info_dict': {
  184. 'id': 'ph601dc30bae19a',
  185. 'uploader': 'Projekt Melody',
  186. 'uploader_id': 'projekt-melody',
  187. 'upload_date': '20210205',
  188. 'title': '"Welcome to My Pussy Mansion" - CB Stream (02/03/21)',
  189. 'thumbnail': r're:https?://.+',
  190. },
  191. }, {
  192. 'url': 'http://www.pornhub.com/view_video.php?viewkey=ph557bbb6676d2d',
  193. 'only_matching': True,
  194. }, {
  195. # removed at the request of cam4.com
  196. 'url': 'http://fr.pornhub.com/view_video.php?viewkey=ph55ca2f9760862',
  197. 'only_matching': True,
  198. }, {
  199. # removed at the request of the copyright owner
  200. 'url': 'http://www.pornhub.com/view_video.php?viewkey=788152859',
  201. 'only_matching': True,
  202. }, {
  203. # removed by uploader
  204. 'url': 'http://www.pornhub.com/view_video.php?viewkey=ph572716d15a111',
  205. 'only_matching': True,
  206. }, {
  207. # private video
  208. 'url': 'http://www.pornhub.com/view_video.php?viewkey=ph56fd731fce6b7',
  209. 'only_matching': True,
  210. }, {
  211. 'url': 'https://www.thumbzilla.com/video/ph56c6114abd99a/horny-girlfriend-sex',
  212. 'only_matching': True,
  213. }, {
  214. 'url': 'http://www.pornhub.com/video/show?viewkey=648719015',
  215. 'only_matching': True,
  216. }, {
  217. 'url': 'https://www.pornhub.net/view_video.php?viewkey=203640933',
  218. 'only_matching': True,
  219. }, {
  220. 'url': 'https://www.pornhub.org/view_video.php?viewkey=203640933',
  221. 'only_matching': True,
  222. }, {
  223. 'url': 'https://www.pornhubpremium.com/view_video.php?viewkey=ph5e4acdae54a82',
  224. 'only_matching': True,
  225. }, {
  226. # Some videos are available with the same id on both premium
  227. # and non-premium sites (e.g. this and the following test)
  228. 'url': 'https://www.pornhub.com/view_video.php?viewkey=ph5f75b0f4b18e3',
  229. 'only_matching': True,
  230. }, {
  231. 'url': 'https://www.pornhubpremium.com/view_video.php?viewkey=ph5f75b0f4b18e3',
  232. 'only_matching': True,
  233. }, {
  234. # geo restricted
  235. 'url': 'https://www.pornhub.com/view_video.php?viewkey=ph5a9813bfa7156',
  236. 'only_matching': True,
  237. }, {
  238. 'url': 'http://pornhubvybmsymdol4iibwgwtkpwmeyd6luq2gxajgjzfjvotyt5zhyd.onion/view_video.php?viewkey=ph5a9813bfa7156',
  239. 'only_matching': True,
  240. }]
  241. def _extract_count(self, pattern, webpage, name):
  242. return str_to_int(self._search_regex(pattern, webpage, f'{name} count', default=None))
  243. def _real_extract(self, url):
  244. mobj = self._match_valid_url(url)
  245. host = mobj.group('host') or 'pornhub.com'
  246. video_id = mobj.group('id')
  247. self._login(host)
  248. self._set_age_cookies(host)
  249. def dl_webpage(platform):
  250. self._set_cookie(host, 'platform', platform)
  251. return self._download_webpage(
  252. f'https://www.{host}/view_video.php?viewkey={video_id}',
  253. video_id, f'Downloading {platform} webpage')
  254. webpage = dl_webpage('pc')
  255. error_msg = self._html_search_regex(
  256. (r'(?s)<div[^>]+class=(["\'])(?:(?!\1).)*\b(?:removed|userMessageSection)\b(?:(?!\1).)*\1[^>]*>(?P<error>.+?)</div>',
  257. r'(?s)<section[^>]+class=["\']noVideo["\'][^>]*>(?P<error>.+?)</section>'),
  258. webpage, 'error message', default=None, group='error')
  259. if error_msg:
  260. error_msg = re.sub(r'\s+', ' ', error_msg)
  261. raise ExtractorError(
  262. f'PornHub said: {error_msg}',
  263. expected=True, video_id=video_id)
  264. if any(re.search(p, webpage) for p in (
  265. r'class=["\']geoBlocked["\']',
  266. r'>\s*This content is unavailable in your country')):
  267. self.raise_geo_restricted()
  268. # video_title from flashvars contains whitespace instead of non-ASCII (see
  269. # http://www.pornhub.com/view_video.php?viewkey=1331683002), not relying
  270. # on that anymore.
  271. title = self._html_search_meta(
  272. 'twitter:title', webpage, default=None) or self._html_search_regex(
  273. (r'(?s)<h1[^>]+class=["\']title["\'][^>]*>(?P<title>.+?)</h1>',
  274. r'<div[^>]+data-video-title=(["\'])(?P<title>(?:(?!\1).)+)\1',
  275. r'shareTitle["\']\s*[=:]\s*(["\'])(?P<title>(?:(?!\1).)+)\1'),
  276. webpage, 'title', group='title')
  277. video_urls = []
  278. video_urls_set = set()
  279. subtitles = {}
  280. flashvars = self._parse_json(
  281. self._search_regex(
  282. r'var\s+flashvars_\d+\s*=\s*({.+?});', webpage, 'flashvars', default='{}'),
  283. video_id)
  284. if flashvars:
  285. subtitle_url = url_or_none(flashvars.get('closedCaptionsFile'))
  286. if subtitle_url:
  287. subtitles.setdefault('en', []).append({
  288. 'url': subtitle_url,
  289. 'ext': 'srt',
  290. })
  291. thumbnail = flashvars.get('image_url')
  292. duration = int_or_none(flashvars.get('video_duration'))
  293. media_definitions = flashvars.get('mediaDefinitions')
  294. if isinstance(media_definitions, list):
  295. for definition in media_definitions:
  296. if not isinstance(definition, dict):
  297. continue
  298. video_url = definition.get('videoUrl')
  299. if not video_url or not isinstance(video_url, str):
  300. continue
  301. if video_url in video_urls_set:
  302. continue
  303. video_urls_set.add(video_url)
  304. video_urls.append(
  305. (video_url, int_or_none(definition.get('quality'))))
  306. else:
  307. thumbnail, duration = [None] * 2
  308. def extract_js_vars(webpage, pattern, default=NO_DEFAULT):
  309. assignments = self._search_regex(
  310. pattern, webpage, 'encoded url', default=default)
  311. if not assignments:
  312. return {}
  313. assignments = assignments.split(';')
  314. js_vars = {}
  315. def parse_js_value(inp):
  316. inp = re.sub(r'/\*(?:(?!\*/).)*?\*/', '', inp)
  317. if '+' in inp:
  318. inps = inp.split('+')
  319. return functools.reduce(
  320. operator.concat, map(parse_js_value, inps))
  321. inp = inp.strip()
  322. if inp in js_vars:
  323. return js_vars[inp]
  324. return remove_quotes(inp)
  325. for assn in assignments:
  326. assn = assn.strip()
  327. if not assn:
  328. continue
  329. assn = re.sub(r'var\s+', '', assn)
  330. vname, value = assn.split('=', 1)
  331. js_vars[vname] = parse_js_value(value)
  332. return js_vars
  333. def add_video_url(video_url):
  334. v_url = url_or_none(video_url)
  335. if not v_url:
  336. return
  337. if v_url in video_urls_set:
  338. return
  339. video_urls.append((v_url, None))
  340. video_urls_set.add(v_url)
  341. def parse_quality_items(quality_items):
  342. q_items = self._parse_json(quality_items, video_id, fatal=False)
  343. if not isinstance(q_items, list):
  344. return
  345. for item in q_items:
  346. if isinstance(item, dict):
  347. add_video_url(item.get('url'))
  348. if not video_urls:
  349. FORMAT_PREFIXES = ('media', 'quality', 'qualityItems')
  350. js_vars = extract_js_vars(
  351. webpage, r'(var\s+(?:{})_.+)'.format('|'.join(FORMAT_PREFIXES)),
  352. default=None)
  353. if js_vars:
  354. for key, format_url in js_vars.items():
  355. if key.startswith(FORMAT_PREFIXES[-1]):
  356. parse_quality_items(format_url)
  357. elif any(key.startswith(p) for p in FORMAT_PREFIXES[:2]):
  358. add_video_url(format_url)
  359. if not video_urls and re.search(
  360. r'<[^>]+\bid=["\']lockedPlayer', webpage):
  361. raise ExtractorError(
  362. f'Video {video_id} is locked', expected=True)
  363. if not video_urls:
  364. js_vars = extract_js_vars(
  365. dl_webpage('tv'), r'(var.+?mediastring.+?)</script>')
  366. add_video_url(js_vars['mediastring'])
  367. for mobj in re.finditer(
  368. r'<a[^>]+\bclass=["\']downloadBtn\b[^>]+\bhref=(["\'])(?P<url>(?:(?!\1).)+)\1',
  369. webpage):
  370. video_url = mobj.group('url')
  371. if video_url not in video_urls_set:
  372. video_urls.append((video_url, None))
  373. video_urls_set.add(video_url)
  374. upload_date = None
  375. formats = []
  376. def add_format(format_url, height=None):
  377. ext = determine_ext(format_url)
  378. if ext == 'mpd':
  379. formats.extend(self._extract_mpd_formats(
  380. format_url, video_id, mpd_id='dash', fatal=False))
  381. return
  382. if ext == 'm3u8':
  383. formats.extend(self._extract_m3u8_formats(
  384. format_url, video_id, 'mp4', entry_protocol='m3u8_native',
  385. m3u8_id='hls', fatal=False))
  386. return
  387. if not height:
  388. height = int_or_none(self._search_regex(
  389. r'(?P<height>\d+)[pP]?_\d+[kK]', format_url, 'height',
  390. default=None))
  391. formats.append({
  392. 'url': format_url,
  393. 'format_id': format_field(height, None, '%dp'),
  394. 'height': height,
  395. })
  396. for video_url, height in video_urls:
  397. if not upload_date:
  398. upload_date = self._search_regex(
  399. r'/(\d{6}/\d{2})/', video_url, 'upload data', default=None)
  400. if upload_date:
  401. upload_date = upload_date.replace('/', '')
  402. if '/video/get_media' in video_url:
  403. medias = self._download_json(video_url, video_id, fatal=False)
  404. if isinstance(medias, list):
  405. for media in medias:
  406. if not isinstance(media, dict):
  407. continue
  408. video_url = url_or_none(media.get('videoUrl'))
  409. if not video_url:
  410. continue
  411. height = int_or_none(media.get('quality'))
  412. add_format(video_url, height)
  413. continue
  414. add_format(video_url)
  415. model_profile = self._search_json(
  416. r'var\s+MODEL_PROFILE\s*=', webpage, 'model profile', video_id, fatal=False)
  417. video_uploader = self._html_search_regex(
  418. r'(?s)From:&nbsp;.+?<(?:a\b[^>]+\bhref=["\']/(?:(?:user|channel)s|model|pornstar)/|span\b[^>]+\bclass=["\']username)[^>]+>(.+?)<',
  419. webpage, 'uploader', default=None) or model_profile.get('username')
  420. def extract_vote_count(kind, name):
  421. return self._extract_count(
  422. (rf'<span[^>]+\bclass="votes{kind}"[^>]*>([\d,\.]+)</span>',
  423. rf'<span[^>]+\bclass=["\']votes{kind}["\'][^>]*\bdata-rating=["\'](\d+)'),
  424. webpage, name)
  425. view_count = self._extract_count(
  426. r'<span class="count">([\d,\.]+)</span> [Vv]iews', webpage, 'view')
  427. like_count = extract_vote_count('Up', 'like')
  428. dislike_count = extract_vote_count('Down', 'dislike')
  429. comment_count = self._extract_count(
  430. r'All Comments\s*<span>\(([\d,.]+)\)', webpage, 'comment')
  431. def extract_list(meta_key):
  432. div = self._search_regex(
  433. rf'(?s)<div[^>]+\bclass=["\'].*?\b{meta_key}Wrapper[^>]*>(.+?)</div>',
  434. webpage, meta_key, default=None)
  435. if div:
  436. return [clean_html(x).strip() for x in re.findall(r'(?s)<a[^>]+\bhref=[^>]+>.+?</a>', div)]
  437. info = self._search_json_ld(webpage, video_id, default={})
  438. # description provided in JSON-LD is irrelevant
  439. info['description'] = None
  440. return merge_dicts({
  441. 'id': video_id,
  442. 'uploader': video_uploader,
  443. 'uploader_id': remove_start(model_profile.get('modelProfileLink'), '/model/'),
  444. 'upload_date': upload_date,
  445. 'title': title,
  446. 'thumbnail': thumbnail,
  447. 'duration': duration,
  448. 'view_count': view_count,
  449. 'like_count': like_count,
  450. 'dislike_count': dislike_count,
  451. 'comment_count': comment_count,
  452. 'formats': formats,
  453. 'age_limit': 18,
  454. 'tags': extract_list('tags'),
  455. 'categories': extract_list('categories'),
  456. 'cast': extract_list('pornstars'),
  457. 'subtitles': subtitles,
  458. }, info)
  459. class PornHubPlaylistBaseIE(PornHubBaseIE):
  460. def _extract_page(self, url):
  461. return int_or_none(self._search_regex(
  462. r'\bpage=(\d+)', url, 'page', default=None))
  463. def _extract_entries(self, webpage, host):
  464. # Only process container div with main playlist content skipping
  465. # drop-down menu that uses similar pattern for videos (see
  466. # https://github.com/ytdl-org/youtube-dl/issues/11594).
  467. container = self._search_regex(
  468. r'(?s)(<div[^>]+class=["\']container.+)', webpage,
  469. 'container', default=webpage)
  470. return [
  471. self.url_result(
  472. f'http://www.{host}/{video_url}',
  473. PornHubIE.ie_key(), video_title=title)
  474. for video_url, title in orderedSet(re.findall(
  475. r'href="/?(view_video\.php\?.*\bviewkey=[\da-z]+[^"]*)"[^>]*\s+title="([^"]+)"',
  476. container))
  477. ]
  478. class PornHubUserIE(PornHubPlaylistBaseIE):
  479. _VALID_URL = rf'(?P<url>https?://(?:[^/]+\.)?{PornHubBaseIE._PORNHUB_HOST_RE}/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/?#&]+))(?:[?#&]|/(?!videos)|$)'
  480. _TESTS = [{
  481. 'url': 'https://www.pornhub.com/model/zoe_ph',
  482. 'playlist_mincount': 118,
  483. }, {
  484. 'url': 'https://www.pornhub.com/pornstar/liz-vicious',
  485. 'info_dict': {
  486. 'id': 'liz-vicious',
  487. },
  488. 'playlist_mincount': 118,
  489. }, {
  490. 'url': 'https://www.pornhub.com/users/russianveet69',
  491. 'only_matching': True,
  492. }, {
  493. 'url': 'https://www.pornhub.com/channels/povd',
  494. 'only_matching': True,
  495. }, {
  496. 'url': 'https://www.pornhub.com/model/zoe_ph?abc=1',
  497. 'only_matching': True,
  498. }, {
  499. # Unavailable via /videos page, but available with direct pagination
  500. # on pornstar page (see [1]), requires premium
  501. # 1. https://github.com/ytdl-org/youtube-dl/issues/27853
  502. 'url': 'https://www.pornhubpremium.com/pornstar/sienna-west',
  503. 'only_matching': True,
  504. }, {
  505. # Same as before, multi page
  506. 'url': 'https://www.pornhubpremium.com/pornstar/lily-labeau',
  507. 'only_matching': True,
  508. }, {
  509. 'url': 'https://pornhubvybmsymdol4iibwgwtkpwmeyd6luq2gxajgjzfjvotyt5zhyd.onion/model/zoe_ph',
  510. 'only_matching': True,
  511. }]
  512. def _real_extract(self, url):
  513. mobj = self._match_valid_url(url)
  514. user_id = mobj.group('id')
  515. videos_url = '{}/videos'.format(mobj.group('url'))
  516. self._set_age_cookies(mobj.group('host'))
  517. page = self._extract_page(url)
  518. if page:
  519. videos_url = update_url_query(videos_url, {'page': page})
  520. return self.url_result(
  521. videos_url, ie=PornHubPagedVideoListIE.ie_key(), video_id=user_id)
  522. class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE):
  523. @staticmethod
  524. def _has_more(webpage):
  525. return re.search(
  526. r'''(?x)
  527. <li[^>]+\bclass=["\']page_next|
  528. <link[^>]+\brel=["\']next|
  529. <button[^>]+\bid=["\']moreDataBtn
  530. ''', webpage) is not None
  531. def _entries(self, url, host, item_id):
  532. page = self._extract_page(url)
  533. VIDEOS = '/videos'
  534. def download_page(base_url, num, fallback=False):
  535. note = 'Downloading page {}{}'.format(num, ' (switch to fallback)' if fallback else '')
  536. return self._download_webpage(
  537. base_url, item_id, note, query={'page': num})
  538. def is_404(e):
  539. return isinstance(e.cause, HTTPError) and e.cause.status == 404
  540. base_url = url
  541. has_page = page is not None
  542. first_page = page if has_page else 1
  543. for page_num in (first_page, ) if has_page else itertools.count(first_page):
  544. try:
  545. try:
  546. webpage = download_page(base_url, page_num)
  547. except ExtractorError as e:
  548. # Some sources may not be available via /videos page,
  549. # trying to fallback to main page pagination (see [1])
  550. # 1. https://github.com/ytdl-org/youtube-dl/issues/27853
  551. if is_404(e) and page_num == first_page and VIDEOS in base_url:
  552. base_url = base_url.replace(VIDEOS, '')
  553. webpage = download_page(base_url, page_num, fallback=True)
  554. else:
  555. raise
  556. except ExtractorError as e:
  557. if is_404(e) and page_num != first_page:
  558. break
  559. raise
  560. page_entries = self._extract_entries(webpage, host)
  561. if not page_entries:
  562. break
  563. for e in page_entries:
  564. yield e
  565. if not self._has_more(webpage):
  566. break
  567. def _real_extract(self, url):
  568. mobj = self._match_valid_url(url)
  569. host = mobj.group('host')
  570. item_id = mobj.group('id')
  571. self._login(host)
  572. self._set_age_cookies(host)
  573. return self.playlist_result(self._entries(url, host, item_id), item_id)
  574. class PornHubPagedVideoListIE(PornHubPagedPlaylistBaseIE):
  575. _VALID_URL = rf'https?://(?:[^/]+\.)?{PornHubBaseIE._PORNHUB_HOST_RE}/(?!playlist/)(?P<id>(?:[^/]+/)*[^/?#&]+)'
  576. _TESTS = [{
  577. 'url': 'https://www.pornhub.com/model/zoe_ph/videos',
  578. 'only_matching': True,
  579. }, {
  580. 'url': 'http://www.pornhub.com/users/rushandlia/videos',
  581. 'only_matching': True,
  582. }, {
  583. 'url': 'https://www.pornhub.com/pornstar/jenny-blighe/videos',
  584. 'info_dict': {
  585. 'id': 'pornstar/jenny-blighe/videos',
  586. },
  587. 'playlist_mincount': 149,
  588. }, {
  589. 'url': 'https://www.pornhub.com/pornstar/jenny-blighe/videos?page=3',
  590. 'info_dict': {
  591. 'id': 'pornstar/jenny-blighe/videos',
  592. },
  593. 'playlist_mincount': 40,
  594. }, {
  595. # default sorting as Top Rated Videos
  596. 'url': 'https://www.pornhub.com/channels/povd/videos',
  597. 'info_dict': {
  598. 'id': 'channels/povd/videos',
  599. },
  600. 'playlist_mincount': 293,
  601. }, {
  602. # Top Rated Videos
  603. 'url': 'https://www.pornhub.com/channels/povd/videos?o=ra',
  604. 'only_matching': True,
  605. }, {
  606. # Most Recent Videos
  607. 'url': 'https://www.pornhub.com/channels/povd/videos?o=da',
  608. 'only_matching': True,
  609. }, {
  610. # Most Viewed Videos
  611. 'url': 'https://www.pornhub.com/channels/povd/videos?o=vi',
  612. 'only_matching': True,
  613. }, {
  614. 'url': 'http://www.pornhub.com/users/zoe_ph/videos/public',
  615. 'only_matching': True,
  616. }, {
  617. # Most Viewed Videos
  618. 'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos?o=mv',
  619. 'only_matching': True,
  620. }, {
  621. # Top Rated Videos
  622. 'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos?o=tr',
  623. 'only_matching': True,
  624. }, {
  625. # Longest Videos
  626. 'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos?o=lg',
  627. 'only_matching': True,
  628. }, {
  629. # Newest Videos
  630. 'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos?o=cm',
  631. 'only_matching': True,
  632. }, {
  633. 'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos/paid',
  634. 'only_matching': True,
  635. }, {
  636. 'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos/fanonly',
  637. 'only_matching': True,
  638. }, {
  639. 'url': 'https://www.pornhub.com/video',
  640. 'only_matching': True,
  641. }, {
  642. 'url': 'https://www.pornhub.com/video?page=3',
  643. 'only_matching': True,
  644. }, {
  645. 'url': 'https://www.pornhub.com/video/search?search=123',
  646. 'only_matching': True,
  647. }, {
  648. 'url': 'https://www.pornhub.com/categories/teen',
  649. 'only_matching': True,
  650. }, {
  651. 'url': 'https://www.pornhub.com/categories/teen?page=3',
  652. 'only_matching': True,
  653. }, {
  654. 'url': 'https://www.pornhub.com/hd',
  655. 'only_matching': True,
  656. }, {
  657. 'url': 'https://www.pornhub.com/hd?page=3',
  658. 'only_matching': True,
  659. }, {
  660. 'url': 'https://www.pornhub.com/described-video',
  661. 'only_matching': True,
  662. }, {
  663. 'url': 'https://www.pornhub.com/described-video?page=2',
  664. 'only_matching': True,
  665. }, {
  666. 'url': 'https://www.pornhub.com/video/incategories/60fps-1/hd-porn',
  667. 'only_matching': True,
  668. }, {
  669. 'url': 'https://pornhubvybmsymdol4iibwgwtkpwmeyd6luq2gxajgjzfjvotyt5zhyd.onion/model/zoe_ph/videos',
  670. 'only_matching': True,
  671. }]
  672. @classmethod
  673. def suitable(cls, url):
  674. return (False
  675. if PornHubIE.suitable(url) or PornHubUserIE.suitable(url) or PornHubUserVideosUploadIE.suitable(url)
  676. else super().suitable(url))
  677. class PornHubUserVideosUploadIE(PornHubPagedPlaylistBaseIE):
  678. _VALID_URL = rf'(?P<url>https?://(?:[^/]+\.)?{PornHubBaseIE._PORNHUB_HOST_RE}/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/]+)/videos/upload)'
  679. _TESTS = [{
  680. 'url': 'https://www.pornhub.com/pornstar/jenny-blighe/videos/upload',
  681. 'info_dict': {
  682. 'id': 'jenny-blighe',
  683. },
  684. 'playlist_mincount': 129,
  685. }, {
  686. 'url': 'https://www.pornhub.com/model/zoe_ph/videos/upload',
  687. 'only_matching': True,
  688. }, {
  689. 'url': 'http://pornhubvybmsymdol4iibwgwtkpwmeyd6luq2gxajgjzfjvotyt5zhyd.onion/pornstar/jenny-blighe/videos/upload',
  690. 'only_matching': True,
  691. }]
  692. class PornHubPlaylistIE(PornHubPlaylistBaseIE):
  693. _VALID_URL = rf'(?P<url>https?://(?:[^/]+\.)?{PornHubBaseIE._PORNHUB_HOST_RE}/playlist/(?P<id>[^/?#&]+))'
  694. _TESTS = [{
  695. 'url': 'https://www.pornhub.com/playlist/44121572',
  696. 'info_dict': {
  697. 'id': '44121572',
  698. },
  699. 'playlist_count': 77,
  700. }, {
  701. 'url': 'https://www.pornhub.com/playlist/4667351',
  702. 'only_matching': True,
  703. }, {
  704. 'url': 'https://de.pornhub.com/playlist/4667351',
  705. 'only_matching': True,
  706. }, {
  707. 'url': 'https://de.pornhub.com/playlist/4667351?page=2',
  708. 'only_matching': True,
  709. }]
  710. def _entries(self, url, host, item_id):
  711. webpage = self._download_webpage(url, item_id, 'Downloading page 1')
  712. playlist_id = self._search_regex(r'var\s+playlistId\s*=\s*"([^"]+)"', webpage, 'playlist_id')
  713. video_count = int_or_none(
  714. self._search_regex(r'var\s+itemsCount\s*=\s*([0-9]+)\s*\|\|', webpage, 'video_count'))
  715. token = self._search_regex(r'var\s+token\s*=\s*"([^"]+)"', webpage, 'token')
  716. page_count = math.ceil((video_count - 36) / 40.) + 1
  717. page_entries = self._extract_entries(webpage, host)
  718. def download_page(page_num):
  719. note = f'Downloading page {page_num}'
  720. page_url = f'https://www.{host}/playlist/viewChunked'
  721. return self._download_webpage(page_url, item_id, note, query={
  722. 'id': playlist_id,
  723. 'page': page_num,
  724. 'token': token,
  725. })
  726. for page_num in range(1, page_count + 1):
  727. if page_num > 1:
  728. webpage = download_page(page_num)
  729. page_entries = self._extract_entries(webpage, host)
  730. if not page_entries:
  731. break
  732. yield from page_entries
  733. def _real_extract(self, url):
  734. mobj = self._match_valid_url(url)
  735. host = mobj.group('host')
  736. item_id = mobj.group('id')
  737. self._login(host)
  738. self._set_age_cookies(host)
  739. return self.playlist_result(self._entries(mobj.group('url'), host, item_id), item_id)