duboku.py 8.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246
  1. import base64
  2. import re
  3. import urllib.parse
  4. from .common import InfoExtractor
  5. from ..utils import (
  6. ExtractorError,
  7. clean_html,
  8. extract_attributes,
  9. get_elements_by_class,
  10. int_or_none,
  11. js_to_json,
  12. smuggle_url,
  13. unescapeHTML,
  14. )
  15. def _get_elements_by_tag_and_attrib(html, tag=None, attribute=None, value=None, escape_value=True):
  16. """Return the content of the tag with the specified attribute in the passed HTML document"""
  17. if tag is None:
  18. tag = '[a-zA-Z0-9:._-]+'
  19. if attribute is None:
  20. attribute = ''
  21. else:
  22. attribute = rf'\s+(?P<attribute>{re.escape(attribute)})'
  23. if value is None:
  24. value = ''
  25. else:
  26. value = re.escape(value) if escape_value else value
  27. value = f'=[\'"]?(?P<value>{value})[\'"]?'
  28. retlist = []
  29. for m in re.finditer(rf'''(?xs)
  30. <(?P<tag>{tag})
  31. (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
  32. {attribute}{value}
  33. (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
  34. \s*>
  35. (?P<content>.*?)
  36. </\1>
  37. ''', html):
  38. retlist.append(m)
  39. return retlist
  40. def _get_element_by_tag_and_attrib(html, tag=None, attribute=None, value=None, escape_value=True):
  41. retval = _get_elements_by_tag_and_attrib(html, tag, attribute, value, escape_value)
  42. return retval[0] if retval else None
  43. class DubokuIE(InfoExtractor):
  44. IE_NAME = 'duboku'
  45. IE_DESC = 'www.duboku.io'
  46. _VALID_URL = r'(?:https?://[^/]+\.duboku\.io/vodplay/)(?P<id>[0-9]+-[0-9-]+)\.html.*'
  47. _TESTS = [{
  48. 'url': 'https://w.duboku.io/vodplay/1575-1-1.html',
  49. 'info_dict': {
  50. 'id': '1575-1-1',
  51. 'ext': 'mp4',
  52. 'series': '白色月光',
  53. 'title': 'contains:白色月光',
  54. 'season_number': 1,
  55. 'episode_number': 1,
  56. 'season': 'Season 1',
  57. 'episode_id': '1',
  58. 'season_id': '1',
  59. 'episode': 'Episode 1',
  60. },
  61. 'params': {
  62. 'skip_download': 'm3u8 download',
  63. },
  64. }, {
  65. 'url': 'https://w.duboku.io/vodplay/1588-1-1.html',
  66. 'info_dict': {
  67. 'id': '1588-1-1',
  68. 'ext': 'mp4',
  69. 'series': '亲爱的自己',
  70. 'title': 'contains:第1集',
  71. 'season_number': 1,
  72. 'episode_number': 1,
  73. 'episode': 'Episode 1',
  74. 'season': 'Season 1',
  75. 'episode_id': '1',
  76. 'season_id': '1',
  77. },
  78. 'params': {
  79. 'skip_download': 'm3u8 download',
  80. },
  81. }]
  82. _PLAYER_DATA_PATTERN = r'player_data\s*=\s*(\{\s*(.*)})\s*;?\s*</script'
  83. def _real_extract(self, url):
  84. video_id = self._match_id(url)
  85. temp = video_id.split('-')
  86. series_id = temp[0]
  87. season_id = temp[1]
  88. episode_id = temp[2]
  89. webpage_url = f'https://w.duboku.io/vodplay/{video_id}.html'
  90. webpage_html = self._download_webpage(webpage_url, video_id)
  91. # extract video url
  92. player_data = self._search_regex(
  93. self._PLAYER_DATA_PATTERN, webpage_html, 'player_data')
  94. player_data = self._parse_json(player_data, video_id, js_to_json)
  95. # extract title
  96. temp = get_elements_by_class('title', webpage_html)
  97. series_title = None
  98. title = None
  99. for html in temp:
  100. mobj = re.search(r'<a\s+.*>(.*)</a>', html)
  101. if mobj:
  102. href = extract_attributes(mobj.group(0)).get('href')
  103. if href:
  104. mobj1 = re.search(r'/(\d+)\.html', href)
  105. if mobj1 and mobj1.group(1) == series_id:
  106. series_title = clean_html(mobj.group(0))
  107. series_title = re.sub(r'[\s\r\n\t]+', ' ', series_title)
  108. title = clean_html(html)
  109. title = re.sub(r'[\s\r\n\t]+', ' ', title)
  110. break
  111. data_url = player_data.get('url')
  112. if not data_url:
  113. raise ExtractorError('Cannot find url in player_data')
  114. player_encrypt = player_data.get('encrypt')
  115. if player_encrypt == 1:
  116. data_url = urllib.parse.unquote(data_url)
  117. elif player_encrypt == 2:
  118. data_url = urllib.parse.unquote(base64.b64decode(data_url).decode('ascii'))
  119. # if it is an embedded iframe, maybe it's an external source
  120. headers = {'Referer': webpage_url}
  121. if player_data.get('from') == 'iframe':
  122. # use _type url_transparent to retain the meaningful details
  123. # of the video.
  124. return {
  125. '_type': 'url_transparent',
  126. 'url': smuggle_url(data_url, {'referer': webpage_url}),
  127. 'id': video_id,
  128. 'title': title,
  129. 'series': series_title,
  130. 'season_number': int_or_none(season_id),
  131. 'season_id': season_id,
  132. 'episode_number': int_or_none(episode_id),
  133. 'episode_id': episode_id,
  134. }
  135. formats = self._extract_m3u8_formats(data_url, video_id, 'mp4', headers=headers)
  136. return {
  137. 'id': video_id,
  138. 'title': title,
  139. 'series': series_title,
  140. 'season_number': int_or_none(season_id),
  141. 'season_id': season_id,
  142. 'episode_number': int_or_none(episode_id),
  143. 'episode_id': episode_id,
  144. 'formats': formats,
  145. 'http_headers': headers,
  146. }
  147. class DubokuPlaylistIE(InfoExtractor):
  148. IE_NAME = 'duboku:list'
  149. IE_DESC = 'www.duboku.io entire series'
  150. _VALID_URL = r'(?:https?://[^/]+\.duboku\.io/voddetail/)(?P<id>[0-9]+)\.html.*'
  151. _TESTS = [{
  152. 'url': 'https://w.duboku.io/voddetail/1575.html',
  153. 'info_dict': {
  154. 'id': 'startswith:1575',
  155. 'title': '白色月光',
  156. },
  157. 'playlist_count': 12,
  158. }, {
  159. 'url': 'https://w.duboku.io/voddetail/1554.html',
  160. 'info_dict': {
  161. 'id': 'startswith:1554',
  162. 'title': '以家人之名',
  163. },
  164. 'playlist_mincount': 30,
  165. }]
  166. def _real_extract(self, url):
  167. mobj = self._match_valid_url(url)
  168. if mobj is None:
  169. raise ExtractorError(f'Invalid URL: {url}')
  170. series_id = mobj.group('id')
  171. fragment = urllib.parse.urlparse(url).fragment
  172. webpage_url = f'https://w.duboku.io/voddetail/{series_id}.html'
  173. webpage_html = self._download_webpage(webpage_url, series_id)
  174. # extract title
  175. title = _get_element_by_tag_and_attrib(webpage_html, 'h1', 'class', 'title')
  176. title = unescapeHTML(title.group('content')) if title else None
  177. if not title:
  178. title = self._html_search_meta('keywords', webpage_html)
  179. if not title:
  180. title = _get_element_by_tag_and_attrib(webpage_html, 'title')
  181. title = unescapeHTML(title.group('content')) if title else None
  182. # extract playlists
  183. playlists = {}
  184. for div in _get_elements_by_tag_and_attrib(
  185. webpage_html, attribute='id', value='playlist\\d+', escape_value=False):
  186. playlist_id = div.group('value')
  187. playlist = []
  188. for a in _get_elements_by_tag_and_attrib(
  189. div.group('content'), 'a', 'href', value='[^\'"]+?', escape_value=False):
  190. playlist.append({
  191. 'href': unescapeHTML(a.group('value')),
  192. 'title': unescapeHTML(a.group('content')),
  193. })
  194. playlists[playlist_id] = playlist
  195. # select the specified playlist if url fragment exists
  196. playlist = None
  197. playlist_id = None
  198. if fragment:
  199. playlist = playlists.get(fragment)
  200. playlist_id = fragment
  201. else:
  202. first = next(iter(playlists.items()), None)
  203. if first:
  204. (playlist_id, playlist) = first
  205. if not playlist:
  206. raise ExtractorError(
  207. f'Cannot find {fragment}' if fragment else 'Cannot extract playlist')
  208. # return url results
  209. return self.playlist_result([
  210. self.url_result(
  211. urllib.parse.urljoin('https://w.duboku.io', x['href']),
  212. ie=DubokuIE.ie_key(), video_title=x.get('title'))
  213. for x in playlist], series_id + '#' + playlist_id, title)