googledrive.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343
  1. import re
  2. import urllib.parse
  3. from .common import InfoExtractor
  4. from .youtube import YoutubeIE
  5. from ..utils import (
  6. ExtractorError,
  7. bug_reports_message,
  8. determine_ext,
  9. extract_attributes,
  10. get_element_by_class,
  11. get_element_html_by_id,
  12. int_or_none,
  13. lowercase_escape,
  14. try_get,
  15. update_url_query,
  16. )
  17. class GoogleDriveIE(InfoExtractor):
  18. _VALID_URL = r'''(?x)
  19. https?://
  20. (?:
  21. (?:docs|drive|drive\.usercontent)\.google\.com/
  22. (?:
  23. (?:uc|open|download)\?.*?id=|
  24. file/d/
  25. )|
  26. video\.google\.com/get_player\?.*?docid=
  27. )
  28. (?P<id>[a-zA-Z0-9_-]{28,})
  29. '''
  30. _TESTS = [{
  31. 'url': 'https://drive.google.com/file/d/0ByeS4oOUV-49Zzh4R1J6R09zazQ/edit?pli=1',
  32. 'md5': '5c602afbbf2c1db91831f5d82f678554',
  33. 'info_dict': {
  34. 'id': '0ByeS4oOUV-49Zzh4R1J6R09zazQ',
  35. 'ext': 'mp4',
  36. 'title': 'Big Buck Bunny.mp4',
  37. 'duration': 45,
  38. 'thumbnail': 'https://drive.google.com/thumbnail?id=0ByeS4oOUV-49Zzh4R1J6R09zazQ',
  39. },
  40. }, {
  41. # has itag 50 which is not in YoutubeIE._formats (royalty Free music from 1922)
  42. 'url': 'https://drive.google.com/uc?id=1IP0o8dHcQrIHGgVyp0Ofvx2cGfLzyO1x',
  43. 'md5': '322db8d63dd19788c04050a4bba67073',
  44. 'info_dict': {
  45. 'id': '1IP0o8dHcQrIHGgVyp0Ofvx2cGfLzyO1x',
  46. 'ext': 'mp3',
  47. 'title': 'My Buddy - Henry Burr - Gus Kahn - Walter Donaldson.mp3',
  48. 'duration': 184,
  49. 'thumbnail': 'https://drive.google.com/thumbnail?id=1IP0o8dHcQrIHGgVyp0Ofvx2cGfLzyO1x',
  50. },
  51. }, {
  52. # video can't be watched anonymously due to view count limit reached,
  53. # but can be downloaded (see https://github.com/ytdl-org/youtube-dl/issues/14046)
  54. 'url': 'https://drive.google.com/file/d/0B-vUyvmDLdWDcEt4WjBqcmI2XzQ/view',
  55. 'only_matching': True,
  56. }, {
  57. # video id is longer than 28 characters
  58. 'url': 'https://drive.google.com/file/d/1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ/edit',
  59. 'only_matching': True,
  60. }, {
  61. 'url': 'https://drive.google.com/open?id=0B2fjwgkl1A_CX083Tkowdmt6d28',
  62. 'only_matching': True,
  63. }, {
  64. 'url': 'https://drive.google.com/uc?id=0B2fjwgkl1A_CX083Tkowdmt6d28',
  65. 'only_matching': True,
  66. }, {
  67. 'url': 'https://drive.usercontent.google.com/download?id=0ByeS4oOUV-49Zzh4R1J6R09zazQ',
  68. 'only_matching': True,
  69. }]
  70. _FORMATS_EXT = {
  71. **{k: v['ext'] for k, v in YoutubeIE._formats.items() if v.get('ext')},
  72. '50': 'm4a',
  73. }
  74. _BASE_URL_CAPTIONS = 'https://drive.google.com/timedtext'
  75. _CAPTIONS_ENTRY_TAG = {
  76. 'subtitles': 'track',
  77. 'automatic_captions': 'target',
  78. }
  79. _caption_formats_ext = []
  80. _captions_xml = None
  81. @classmethod
  82. def _extract_embed_urls(cls, url, webpage):
  83. mobj = re.search(
  84. r'<iframe[^>]+src="https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9_-]{28,})',
  85. webpage)
  86. if mobj:
  87. yield 'https://drive.google.com/file/d/{}'.format(mobj.group('id'))
  88. def _download_subtitles_xml(self, video_id, subtitles_id, hl):
  89. if self._captions_xml:
  90. return
  91. self._captions_xml = self._download_xml(
  92. self._BASE_URL_CAPTIONS, video_id, query={
  93. 'id': video_id,
  94. 'vid': subtitles_id,
  95. 'hl': hl,
  96. 'v': video_id,
  97. 'type': 'list',
  98. 'tlangs': '1',
  99. 'fmts': '1',
  100. 'vssids': '1',
  101. }, note='Downloading subtitles XML',
  102. errnote='Unable to download subtitles XML', fatal=False)
  103. if self._captions_xml:
  104. for f in self._captions_xml.findall('format'):
  105. if f.attrib.get('fmt_code') and not f.attrib.get('default'):
  106. self._caption_formats_ext.append(f.attrib['fmt_code'])
  107. def _get_captions_by_type(self, video_id, subtitles_id, caption_type,
  108. origin_lang_code=None):
  109. if not subtitles_id or not caption_type:
  110. return
  111. captions = {}
  112. for caption_entry in self._captions_xml.findall(
  113. self._CAPTIONS_ENTRY_TAG[caption_type]):
  114. caption_lang_code = caption_entry.attrib.get('lang_code')
  115. if not caption_lang_code:
  116. continue
  117. caption_format_data = []
  118. for caption_format in self._caption_formats_ext:
  119. query = {
  120. 'vid': subtitles_id,
  121. 'v': video_id,
  122. 'fmt': caption_format,
  123. 'lang': (caption_lang_code if origin_lang_code is None
  124. else origin_lang_code),
  125. 'type': 'track',
  126. 'name': '',
  127. 'kind': '',
  128. }
  129. if origin_lang_code is not None:
  130. query.update({'tlang': caption_lang_code})
  131. caption_format_data.append({
  132. 'url': update_url_query(self._BASE_URL_CAPTIONS, query),
  133. 'ext': caption_format,
  134. })
  135. captions[caption_lang_code] = caption_format_data
  136. return captions
  137. def _get_subtitles(self, video_id, subtitles_id, hl):
  138. if not subtitles_id or not hl:
  139. return
  140. self._download_subtitles_xml(video_id, subtitles_id, hl)
  141. if not self._captions_xml:
  142. return
  143. return self._get_captions_by_type(video_id, subtitles_id, 'subtitles')
  144. def _get_automatic_captions(self, video_id, subtitles_id, hl):
  145. if not subtitles_id or not hl:
  146. return
  147. self._download_subtitles_xml(video_id, subtitles_id, hl)
  148. if not self._captions_xml:
  149. return
  150. track = self._captions_xml.find('track')
  151. if track is None:
  152. return
  153. origin_lang_code = track.attrib.get('lang_code')
  154. if not origin_lang_code:
  155. return
  156. return self._get_captions_by_type(
  157. video_id, subtitles_id, 'automatic_captions', origin_lang_code)
  158. def _real_extract(self, url):
  159. video_id = self._match_id(url)
  160. video_info = urllib.parse.parse_qs(self._download_webpage(
  161. 'https://drive.google.com/get_video_info',
  162. video_id, 'Downloading video webpage', query={'docid': video_id}))
  163. def get_value(key):
  164. return try_get(video_info, lambda x: x[key][0])
  165. reason = get_value('reason')
  166. title = get_value('title')
  167. formats = []
  168. fmt_stream_map = (get_value('fmt_stream_map') or '').split(',')
  169. fmt_list = (get_value('fmt_list') or '').split(',')
  170. if fmt_stream_map and fmt_list:
  171. resolutions = {}
  172. for fmt in fmt_list:
  173. mobj = re.search(
  174. r'^(?P<format_id>\d+)/(?P<width>\d+)[xX](?P<height>\d+)', fmt)
  175. if mobj:
  176. resolutions[mobj.group('format_id')] = (
  177. int(mobj.group('width')), int(mobj.group('height')))
  178. for fmt_stream in fmt_stream_map:
  179. fmt_stream_split = fmt_stream.split('|')
  180. if len(fmt_stream_split) < 2:
  181. continue
  182. format_id, format_url = fmt_stream_split[:2]
  183. ext = self._FORMATS_EXT.get(format_id)
  184. if not ext:
  185. self.report_warning(f'Unknown format {format_id}{bug_reports_message()}')
  186. f = {
  187. 'url': lowercase_escape(format_url),
  188. 'format_id': format_id,
  189. 'ext': ext,
  190. }
  191. resolution = resolutions.get(format_id)
  192. if resolution:
  193. f.update({
  194. 'width': resolution[0],
  195. 'height': resolution[1],
  196. })
  197. formats.append(f)
  198. source_url = update_url_query(
  199. 'https://drive.usercontent.google.com/download', {
  200. 'id': video_id,
  201. 'export': 'download',
  202. 'confirm': 't',
  203. })
  204. def request_source_file(source_url, kind, data=None):
  205. return self._request_webpage(
  206. source_url, video_id, note=f'Requesting {kind} file',
  207. errnote=f'Unable to request {kind} file', fatal=False, data=data)
  208. urlh = request_source_file(source_url, 'source')
  209. if urlh:
  210. def add_source_format(urlh):
  211. nonlocal title
  212. if not title:
  213. title = self._search_regex(
  214. r'\bfilename="([^"]+)"', urlh.headers.get('Content-Disposition'),
  215. 'title', default=None)
  216. formats.append({
  217. # Use redirect URLs as download URLs in order to calculate
  218. # correct cookies in _calc_cookies.
  219. # Using original URLs may result in redirect loop due to
  220. # google.com's cookies mistakenly used for googleusercontent.com
  221. # redirect URLs (see #23919).
  222. 'url': urlh.url,
  223. 'ext': determine_ext(title, 'mp4').lower(),
  224. 'format_id': 'source',
  225. 'quality': 1,
  226. })
  227. if urlh.headers.get('Content-Disposition'):
  228. add_source_format(urlh)
  229. else:
  230. confirmation_webpage = self._webpage_read_content(
  231. urlh, url, video_id, note='Downloading confirmation page',
  232. errnote='Unable to confirm download', fatal=False)
  233. if confirmation_webpage:
  234. confirmed_source_url = extract_attributes(
  235. get_element_html_by_id('download-form', confirmation_webpage) or '').get('action')
  236. if confirmed_source_url:
  237. urlh = request_source_file(confirmed_source_url, 'confirmed source', data=b'')
  238. if urlh and urlh.headers.get('Content-Disposition'):
  239. add_source_format(urlh)
  240. else:
  241. self.report_warning(
  242. get_element_by_class('uc-error-subcaption', confirmation_webpage)
  243. or get_element_by_class('uc-error-caption', confirmation_webpage)
  244. or 'unable to extract confirmation code')
  245. if not formats and reason:
  246. if title:
  247. self.raise_no_formats(reason, expected=True)
  248. else:
  249. raise ExtractorError(reason, expected=True)
  250. hl = get_value('hl')
  251. subtitles_id = None
  252. ttsurl = get_value('ttsurl')
  253. if ttsurl:
  254. # the video Id for subtitles will be the last value in the ttsurl
  255. # query string
  256. subtitles_id = ttsurl.encode().decode(
  257. 'unicode_escape').split('=')[-1]
  258. self.cookiejar.clear(domain='.google.com', path='/', name='NID')
  259. return {
  260. 'id': video_id,
  261. 'title': title,
  262. 'thumbnail': 'https://drive.google.com/thumbnail?id=' + video_id,
  263. 'duration': int_or_none(get_value('length_seconds')),
  264. 'formats': formats,
  265. 'subtitles': self.extract_subtitles(video_id, subtitles_id, hl),
  266. 'automatic_captions': self.extract_automatic_captions(
  267. video_id, subtitles_id, hl),
  268. }
  269. class GoogleDriveFolderIE(InfoExtractor):
  270. IE_NAME = 'GoogleDrive:Folder'
  271. _VALID_URL = r'https?://(?:docs|drive)\.google\.com/drive/folders/(?P<id>[\w-]{28,})'
  272. _TESTS = [{
  273. 'url': 'https://drive.google.com/drive/folders/1dQ4sx0-__Nvg65rxTSgQrl7VyW_FZ9QI',
  274. 'info_dict': {
  275. 'id': '1dQ4sx0-__Nvg65rxTSgQrl7VyW_FZ9QI',
  276. 'title': 'Forrest',
  277. },
  278. 'playlist_count': 3,
  279. }]
  280. _BOUNDARY = '=====vc17a3rwnndj====='
  281. _REQUEST = "/drive/v2beta/files?openDrive=true&reason=102&syncType=0&errorRecovery=false&q=trashed%20%3D%20false%20and%20'{folder_id}'%20in%20parents&fields=kind%2CnextPageToken%2Citems(kind%2CmodifiedDate%2CmodifiedByMeDate%2ClastViewedByMeDate%2CfileSize%2Cowners(kind%2CpermissionId%2Cid)%2ClastModifyingUser(kind%2CpermissionId%2Cid)%2ChasThumbnail%2CthumbnailVersion%2Ctitle%2Cid%2CresourceKey%2Cshared%2CsharedWithMeDate%2CuserPermission(role)%2CexplicitlyTrashed%2CmimeType%2CquotaBytesUsed%2Ccopyable%2CfileExtension%2CsharingUser(kind%2CpermissionId%2Cid)%2Cspaces%2Cversion%2CteamDriveId%2ChasAugmentedPermissions%2CcreatedDate%2CtrashingUser(kind%2CpermissionId%2Cid)%2CtrashedDate%2Cparents(id)%2CshortcutDetails(targetId%2CtargetMimeType%2CtargetLookupStatus)%2Ccapabilities(canCopy%2CcanDownload%2CcanEdit%2CcanAddChildren%2CcanDelete%2CcanRemoveChildren%2CcanShare%2CcanTrash%2CcanRename%2CcanReadTeamDrive%2CcanMoveTeamDriveItem)%2Clabels(starred%2Ctrashed%2Crestricted%2Cviewed))%2CincompleteSearch&appDataFilter=NO_APP_DATA&spaces=drive&pageToken={page_token}&maxResults=50&supportsTeamDrives=true&includeItemsFromAllDrives=true&corpora=default&orderBy=folder%2Ctitle_natural%20asc&retryCount=0&key={key} HTTP/1.1"
  282. _DATA = f'''--{_BOUNDARY}
  283. content-type: application/http
  284. content-transfer-encoding: binary
  285. GET %s
  286. --{_BOUNDARY}
  287. '''
  288. def _call_api(self, folder_id, key, data, **kwargs):
  289. response = self._download_webpage(
  290. 'https://clients6.google.com/batch/drive/v2beta',
  291. folder_id, data=data.encode(),
  292. headers={
  293. 'Content-Type': 'text/plain;charset=UTF-8;',
  294. 'Origin': 'https://drive.google.com',
  295. }, query={
  296. '$ct': f'multipart/mixed; boundary="{self._BOUNDARY}"',
  297. 'key': key,
  298. }, **kwargs)
  299. return self._search_json('', response, 'api response', folder_id, **kwargs) or {}
  300. def _get_folder_items(self, folder_id, key):
  301. page_token = ''
  302. while page_token is not None:
  303. request = self._REQUEST.format(folder_id=folder_id, page_token=page_token, key=key)
  304. page = self._call_api(folder_id, key, self._DATA % request)
  305. yield from page['items']
  306. page_token = page.get('nextPageToken')
  307. def _real_extract(self, url):
  308. folder_id = self._match_id(url)
  309. webpage = self._download_webpage(url, folder_id)
  310. key = self._search_regex(r'"(\w{39})"', webpage, 'key')
  311. folder_info = self._call_api(folder_id, key, self._DATA % f'/drive/v2beta/files/{folder_id} HTTP/1.1', fatal=False)
  312. return self.playlist_from_matches(
  313. self._get_folder_items(folder_id, key), folder_id, folder_info.get('title'),
  314. ie=GoogleDriveIE, getter=lambda item: f'https://drive.google.com/file/d/{item["id"]}')