sharepoint.py 5.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112
  1. import json
  2. import urllib.parse
  3. from .common import InfoExtractor
  4. from ..utils import determine_ext, int_or_none, url_or_none
  5. from ..utils.traversal import traverse_obj
  6. class SharePointIE(InfoExtractor):
  7. _BASE_URL_RE = r'https?://[\w-]+\.sharepoint\.com/'
  8. _VALID_URL = [
  9. rf'{_BASE_URL_RE}:v:/[a-z]/(?:[^/?#]+/)*(?P<id>[^/?#]{{46}})/?(?:$|[?#])',
  10. rf'{_BASE_URL_RE}(?!:v:)(?:[^/?#]+/)*stream\.aspx\?(?:[^#]+&)?id=(?P<id>[^&#]+)',
  11. ]
  12. _TESTS = [{
  13. 'url': 'https://lut-my.sharepoint.com/:v:/g/personal/juha_eerola_student_lab_fi/EUrAmrktb4ZMhUcY9J2PqMEBD_9x_l0DyYWVgAvp-TTOMw?e=ZpQOOw',
  14. 'md5': '2950821d0d4937a0a76373782093b435',
  15. 'info_dict': {
  16. 'id': '01EQRS7EKKYCNLSLLPQZGIKRYY6SOY7KGB',
  17. 'display_id': 'EUrAmrktb4ZMhUcY9J2PqMEBD_9x_l0DyYWVgAvp-TTOMw',
  18. 'ext': 'mp4',
  19. 'title': 'CmvpJST',
  20. 'duration': 54.567,
  21. 'thumbnail': r're:https://.+/thumbnail',
  22. 'uploader_id': '8dcec565-a956-4b91-95e5-bacfb8bc015f',
  23. },
  24. }, {
  25. 'url': 'https://greaternyace.sharepoint.com/:v:/s/acementornydrive/ETski5eAfNVEoPRZUAyy1wEBpLgVFYWso5bjbZjfBLlPUg?e=PQUfVb',
  26. 'md5': 'c496a01644223273bff12e93e501afd1',
  27. 'info_dict': {
  28. 'id': '01QI4AVTZ3ESFZPAD42VCKB5CZKAGLFVYB',
  29. 'display_id': 'ETski5eAfNVEoPRZUAyy1wEBpLgVFYWso5bjbZjfBLlPUg',
  30. 'ext': 'mp4',
  31. 'title': '930103681233985536',
  32. 'duration': 3797.326,
  33. 'thumbnail': r're:https://.+/thumbnail',
  34. },
  35. }, {
  36. 'url': 'https://lut-my.sharepoint.com/personal/juha_eerola_student_lab_fi/_layouts/15/stream.aspx?id=%2Fpersonal%2Fjuha_eerola_student_lab_fi%2FDocuments%2FM-DL%2FCmvpJST.mp4&ga=1&referrer=StreamWebApp.Web&referrerScenario=AddressBarCopied.view',
  37. 'info_dict': {
  38. 'id': '01EQRS7EKKYCNLSLLPQZGIKRYY6SOY7KGB',
  39. 'display_id': '/personal/juha_eerola_student_lab_fi/Documents/M-DL/CmvpJST.mp4',
  40. 'ext': 'mp4',
  41. 'title': 'CmvpJST',
  42. 'duration': 54.567,
  43. 'thumbnail': r're:https://.+/thumbnail',
  44. 'uploader_id': '8dcec565-a956-4b91-95e5-bacfb8bc015f',
  45. },
  46. 'skip': 'Session cookies needed',
  47. }, {
  48. 'url': 'https://izoobasisschool.sharepoint.com/:v:/g/Eaqleq8COVBIvIPvod0U27oBypC6aWOkk8ptuDpmJ6arHw',
  49. 'only_matching': True,
  50. }, {
  51. 'url': 'https://uskudaredutr-my.sharepoint.com/:v:/g/personal/songul_turkaydin_uskudar_edu_tr/EbTf-VRUIbtGuIN73tx1MuwBCHBOmNcWNqSLw61Fd2_o0g?e=n5Vkof',
  52. 'only_matching': True,
  53. }, {
  54. 'url': 'https://epam-my.sharepoint.com/:v:/p/dzmitry_tamashevich/Ec4ZOs-rATZHjFYZWVxjczEB649FCoYFKDV_x3RxZiWAGA?e=4hswgA',
  55. 'only_matching': True,
  56. }, {
  57. 'url': 'https://microsoft.sharepoint.com/:v:/t/MicrosoftSPARKRecordings-MSFTInternal/EWCyeqByVWBAt8wDvNZdV-UB0BvU5YVbKm0UHgdrUlI6dg?e=QbPck6',
  58. 'only_matching': True,
  59. }]
  60. def _real_extract(self, url):
  61. display_id = urllib.parse.unquote(self._match_id(url))
  62. webpage, urlh = self._download_webpage_handle(url, display_id)
  63. if urllib.parse.urlparse(urlh.url).hostname == 'login.microsoftonline.com':
  64. self.raise_login_required(
  65. 'Session cookies are required for this URL and can be passed '
  66. 'with the --cookies option. The --cookies-from-browser option will not work', method=None)
  67. video_data = self._search_json(r'g_fileInfo\s*=', webpage, 'player config', display_id)
  68. video_id = video_data['VroomItemId']
  69. parsed_url = urllib.parse.urlparse(video_data['.transformUrl'])
  70. base_media_url = urllib.parse.urlunparse(parsed_url._replace(
  71. path=urllib.parse.urljoin(f'{parsed_url.path}/', '../videomanifest'),
  72. query=urllib.parse.urlencode({
  73. **urllib.parse.parse_qs(parsed_url.query),
  74. 'cTag': video_data['.ctag'],
  75. 'action': 'Access',
  76. 'part': 'index',
  77. }, doseq=True)))
  78. # Web player adds more params to the format URLs but we still get all formats without them
  79. formats = self._extract_mpd_formats(
  80. base_media_url, video_id, mpd_id='dash', query={'format': 'dash'}, fatal=False)
  81. for hls_type in ('hls', 'hls-vnext'):
  82. formats.extend(self._extract_m3u8_formats(
  83. base_media_url, video_id, 'mp4', m3u8_id=hls_type,
  84. query={'format': hls_type}, fatal=False, quality=-2))
  85. if video_url := traverse_obj(video_data, ('downloadUrl', {url_or_none})):
  86. formats.append({
  87. 'url': video_url,
  88. 'ext': determine_ext(video_data.get('extension') or video_data.get('name')),
  89. 'quality': 1,
  90. 'format_id': 'source',
  91. 'filesize': int_or_none(video_data.get('size')),
  92. 'vcodec': 'none' if video_data.get('isAudio') is True else None,
  93. })
  94. return {
  95. 'id': video_id,
  96. 'formats': formats,
  97. 'title': video_data.get('title') or video_data.get('displayName'),
  98. 'display_id': display_id,
  99. 'uploader_id': video_data.get('authorId'),
  100. 'duration': traverse_obj(video_data, (
  101. 'MediaServiceFastMetadata', {json.loads}, 'media', 'duration', {lambda x: x / 10000000})),
  102. 'thumbnail': url_or_none(video_data.get('thumbnailUrl')),
  103. }