mixcloud.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376
  1. import base64
  2. import itertools
  3. import urllib.parse
  4. from .common import InfoExtractor
  5. from ..compat import compat_ord
  6. from ..utils import (
  7. ExtractorError,
  8. int_or_none,
  9. parse_iso8601,
  10. strip_or_none,
  11. try_get,
  12. )
  13. class MixcloudBaseIE(InfoExtractor):
  14. def _call_api(self, object_type, object_fields, display_id, username, slug=None):
  15. lookup_key = object_type + 'Lookup'
  16. return self._download_json(
  17. 'https://app.mixcloud.com/graphql', display_id, query={
  18. 'query': '''{
  19. %s(lookup: {username: "%s"%s}) {
  20. %s
  21. }
  22. }''' % (lookup_key, username, f', slug: "{slug}"' if slug else '', object_fields), # noqa: UP031
  23. })['data'][lookup_key]
  24. class MixcloudIE(MixcloudBaseIE):
  25. _VALID_URL = r'https?://(?:(?:www|beta|m)\.)?mixcloud\.com/([^/]+)/(?!stream|uploads|favorites|listens|playlists)([^/]+)'
  26. IE_NAME = 'mixcloud'
  27. _TESTS = [{
  28. 'url': 'http://www.mixcloud.com/dholbach/cryptkeeper/',
  29. 'info_dict': {
  30. 'id': 'dholbach_cryptkeeper',
  31. 'ext': 'm4a',
  32. 'title': 'Cryptkeeper',
  33. 'description': 'After quite a long silence from myself, finally another Drum\'n\'Bass mix with my favourite current dance floor bangers.',
  34. 'uploader': 'Daniel Holbach',
  35. 'uploader_id': 'dholbach',
  36. 'thumbnail': r're:https?://.*\.jpg',
  37. 'view_count': int,
  38. 'timestamp': 1321359578,
  39. 'upload_date': '20111115',
  40. 'uploader_url': 'https://www.mixcloud.com/dholbach/',
  41. 'artist': 'Submorphics & Chino , Telekinesis, Porter Robinson, Enei, Breakage ft Jess Mills',
  42. 'duration': 3723,
  43. 'tags': [],
  44. 'comment_count': int,
  45. 'repost_count': int,
  46. 'like_count': int,
  47. },
  48. 'params': {'skip_download': 'm3u8'},
  49. }, {
  50. 'url': 'http://www.mixcloud.com/gillespeterson/caribou-7-inch-vinyl-mix-chat/',
  51. 'info_dict': {
  52. 'id': 'gillespeterson_caribou-7-inch-vinyl-mix-chat',
  53. 'ext': 'mp3',
  54. 'title': 'Caribou 7 inch Vinyl Mix & Chat',
  55. 'description': 'md5:2b8aec6adce69f9d41724647c65875e8',
  56. 'uploader': 'Gilles Peterson Worldwide',
  57. 'uploader_id': 'gillespeterson',
  58. 'thumbnail': 're:https?://.*',
  59. 'view_count': int,
  60. 'timestamp': 1422987057,
  61. 'upload_date': '20150203',
  62. 'uploader_url': 'https://www.mixcloud.com/gillespeterson/',
  63. 'duration': 2992,
  64. 'tags': [],
  65. 'comment_count': int,
  66. 'repost_count': int,
  67. 'like_count': int,
  68. },
  69. 'params': {'skip_download': '404 playback error on site'},
  70. }, {
  71. 'url': 'https://beta.mixcloud.com/RedLightRadio/nosedrip-15-red-light-radio-01-18-2016/',
  72. 'only_matching': True,
  73. }]
  74. _DECRYPTION_KEY = 'IFYOUWANTTHEARTISTSTOGETPAIDDONOTDOWNLOADFROMMIXCLOUD'
  75. @staticmethod
  76. def _decrypt_xor_cipher(key, ciphertext):
  77. """Encrypt/Decrypt XOR cipher. Both ways are possible because it's XOR."""
  78. return ''.join([
  79. chr(compat_ord(ch) ^ compat_ord(k))
  80. for ch, k in zip(ciphertext, itertools.cycle(key))])
  81. def _real_extract(self, url):
  82. username, slug = self._match_valid_url(url).groups()
  83. username, slug = urllib.parse.unquote(username), urllib.parse.unquote(slug)
  84. track_id = f'{username}_{slug}'
  85. cloudcast = self._call_api('cloudcast', '''audioLength
  86. comments(first: 100) {
  87. edges {
  88. node {
  89. comment
  90. created
  91. user {
  92. displayName
  93. username
  94. }
  95. }
  96. }
  97. totalCount
  98. }
  99. description
  100. favorites {
  101. totalCount
  102. }
  103. featuringArtistList
  104. isExclusive
  105. name
  106. owner {
  107. displayName
  108. url
  109. username
  110. }
  111. picture(width: 1024, height: 1024) {
  112. url
  113. }
  114. plays
  115. publishDate
  116. reposts {
  117. totalCount
  118. }
  119. streamInfo {
  120. dashUrl
  121. hlsUrl
  122. url
  123. }
  124. tags {
  125. tag {
  126. name
  127. }
  128. }
  129. restrictedReason
  130. id''', track_id, username, slug)
  131. if not cloudcast:
  132. raise ExtractorError('Track not found', expected=True)
  133. reason = cloudcast.get('restrictedReason')
  134. if reason == 'tracklist':
  135. raise ExtractorError('Track unavailable in your country due to licensing restrictions', expected=True)
  136. elif reason == 'repeat_play':
  137. raise ExtractorError('You have reached your play limit for this track', expected=True)
  138. elif reason:
  139. raise ExtractorError('Track is restricted', expected=True)
  140. title = cloudcast['name']
  141. stream_info = cloudcast['streamInfo']
  142. formats = []
  143. for url_key in ('url', 'hlsUrl', 'dashUrl'):
  144. format_url = stream_info.get(url_key)
  145. if not format_url:
  146. continue
  147. decrypted = self._decrypt_xor_cipher(
  148. self._DECRYPTION_KEY, base64.b64decode(format_url))
  149. if url_key == 'hlsUrl':
  150. formats.extend(self._extract_m3u8_formats(
  151. decrypted, track_id, 'mp4', entry_protocol='m3u8_native',
  152. m3u8_id='hls', fatal=False))
  153. elif url_key == 'dashUrl':
  154. formats.extend(self._extract_mpd_formats(
  155. decrypted, track_id, mpd_id='dash', fatal=False))
  156. else:
  157. formats.append({
  158. 'format_id': 'http',
  159. 'url': decrypted,
  160. 'vcodec': 'none',
  161. 'downloader_options': {
  162. # Mixcloud starts throttling at >~5M
  163. 'http_chunk_size': 5242880,
  164. },
  165. })
  166. if not formats and cloudcast.get('isExclusive'):
  167. self.raise_login_required(metadata_available=True)
  168. comments = []
  169. for edge in (try_get(cloudcast, lambda x: x['comments']['edges']) or []):
  170. node = edge.get('node') or {}
  171. text = strip_or_none(node.get('comment'))
  172. if not text:
  173. continue
  174. user = node.get('user') or {}
  175. comments.append({
  176. 'author': user.get('displayName'),
  177. 'author_id': user.get('username'),
  178. 'text': text,
  179. 'timestamp': parse_iso8601(node.get('created')),
  180. })
  181. tags = []
  182. for t in cloudcast.get('tags'):
  183. tag = try_get(t, lambda x: x['tag']['name'], str)
  184. if not tag:
  185. tags.append(tag)
  186. get_count = lambda x: int_or_none(try_get(cloudcast, lambda y: y[x]['totalCount']))
  187. owner = cloudcast.get('owner') or {}
  188. return {
  189. 'id': track_id,
  190. 'title': title,
  191. 'formats': formats,
  192. 'description': cloudcast.get('description'),
  193. 'thumbnail': try_get(cloudcast, lambda x: x['picture']['url'], str),
  194. 'uploader': owner.get('displayName'),
  195. 'timestamp': parse_iso8601(cloudcast.get('publishDate')),
  196. 'uploader_id': owner.get('username'),
  197. 'uploader_url': owner.get('url'),
  198. 'duration': int_or_none(cloudcast.get('audioLength')),
  199. 'view_count': int_or_none(cloudcast.get('plays')),
  200. 'like_count': get_count('favorites'),
  201. 'repost_count': get_count('reposts'),
  202. 'comment_count': get_count('comments'),
  203. 'comments': comments,
  204. 'tags': tags,
  205. 'artist': ', '.join(cloudcast.get('featuringArtistList') or []) or None,
  206. }
  207. class MixcloudPlaylistBaseIE(MixcloudBaseIE):
  208. def _get_cloudcast(self, node):
  209. return node
  210. def _get_playlist_title(self, title, slug):
  211. return title
  212. def _real_extract(self, url):
  213. username, slug = self._match_valid_url(url).groups()
  214. username = urllib.parse.unquote(username)
  215. if not slug:
  216. slug = 'uploads'
  217. else:
  218. slug = urllib.parse.unquote(slug)
  219. playlist_id = f'{username}_{slug}'
  220. is_playlist_type = self._ROOT_TYPE == 'playlist'
  221. playlist_type = 'items' if is_playlist_type else slug
  222. list_filter = ''
  223. has_next_page = True
  224. entries = []
  225. while has_next_page:
  226. playlist = self._call_api(
  227. self._ROOT_TYPE, '''%s
  228. %s
  229. %s(first: 100%s) {
  230. edges {
  231. node {
  232. %s
  233. }
  234. }
  235. pageInfo {
  236. endCursor
  237. hasNextPage
  238. }
  239. }''' % (self._TITLE_KEY, self._DESCRIPTION_KEY, playlist_type, list_filter, self._NODE_TEMPLATE), # noqa: UP031
  240. playlist_id, username, slug if is_playlist_type else None)
  241. items = playlist.get(playlist_type) or {}
  242. for edge in items.get('edges', []):
  243. cloudcast = self._get_cloudcast(edge.get('node') or {})
  244. cloudcast_url = cloudcast.get('url')
  245. if not cloudcast_url:
  246. continue
  247. item_slug = try_get(cloudcast, lambda x: x['slug'], str)
  248. owner_username = try_get(cloudcast, lambda x: x['owner']['username'], str)
  249. video_id = f'{owner_username}_{item_slug}' if item_slug and owner_username else None
  250. entries.append(self.url_result(
  251. cloudcast_url, MixcloudIE.ie_key(), video_id))
  252. page_info = items['pageInfo']
  253. has_next_page = page_info['hasNextPage']
  254. list_filter = ', after: "{}"'.format(page_info['endCursor'])
  255. return self.playlist_result(
  256. entries, playlist_id,
  257. self._get_playlist_title(playlist[self._TITLE_KEY], slug),
  258. playlist.get(self._DESCRIPTION_KEY))
  259. class MixcloudUserIE(MixcloudPlaylistBaseIE):
  260. _VALID_URL = r'https?://(?:www\.)?mixcloud\.com/(?P<id>[^/]+)/(?P<type>uploads|favorites|listens|stream)?/?$'
  261. IE_NAME = 'mixcloud:user'
  262. _TESTS = [{
  263. 'url': 'http://www.mixcloud.com/dholbach/',
  264. 'info_dict': {
  265. 'id': 'dholbach_uploads',
  266. 'title': 'Daniel Holbach (uploads)',
  267. 'description': 'md5:a3f468a60ac8c3e1f8616380fc469b2b',
  268. },
  269. 'playlist_mincount': 36,
  270. }, {
  271. 'url': 'http://www.mixcloud.com/dholbach/uploads/',
  272. 'info_dict': {
  273. 'id': 'dholbach_uploads',
  274. 'title': 'Daniel Holbach (uploads)',
  275. 'description': 'md5:a3f468a60ac8c3e1f8616380fc469b2b',
  276. },
  277. 'playlist_mincount': 36,
  278. }, {
  279. 'url': 'http://www.mixcloud.com/dholbach/favorites/',
  280. 'info_dict': {
  281. 'id': 'dholbach_favorites',
  282. 'title': 'Daniel Holbach (favorites)',
  283. 'description': 'md5:a3f468a60ac8c3e1f8616380fc469b2b',
  284. },
  285. # 'params': {
  286. # 'playlist_items': '1-100',
  287. # },
  288. 'playlist_mincount': 396,
  289. }, {
  290. 'url': 'http://www.mixcloud.com/dholbach/listens/',
  291. 'info_dict': {
  292. 'id': 'dholbach_listens',
  293. 'title': 'Daniel Holbach (listens)',
  294. 'description': 'md5:b60d776f0bab534c5dabe0a34e47a789',
  295. },
  296. # 'params': {
  297. # 'playlist_items': '1-100',
  298. # },
  299. 'playlist_mincount': 1623,
  300. 'skip': 'Large list',
  301. }, {
  302. 'url': 'https://www.mixcloud.com/FirstEar/stream/',
  303. 'info_dict': {
  304. 'id': 'FirstEar_stream',
  305. 'title': 'First Ear (stream)',
  306. 'description': 'we maraud for ears',
  307. },
  308. 'playlist_mincount': 269,
  309. }]
  310. _TITLE_KEY = 'displayName'
  311. _DESCRIPTION_KEY = 'biog'
  312. _ROOT_TYPE = 'user'
  313. _NODE_TEMPLATE = '''slug
  314. url
  315. owner { username }'''
  316. def _get_playlist_title(self, title, slug):
  317. return f'{title} ({slug})'
  318. class MixcloudPlaylistIE(MixcloudPlaylistBaseIE):
  319. _VALID_URL = r'https?://(?:www\.)?mixcloud\.com/(?P<user>[^/]+)/playlists/(?P<playlist>[^/]+)/?$'
  320. IE_NAME = 'mixcloud:playlist'
  321. _TESTS = [{
  322. 'url': 'https://www.mixcloud.com/maxvibes/playlists/jazzcat-on-ness-radio/',
  323. 'info_dict': {
  324. 'id': 'maxvibes_jazzcat-on-ness-radio',
  325. 'title': 'Ness Radio sessions',
  326. },
  327. 'playlist_mincount': 59,
  328. }]
  329. _TITLE_KEY = 'name'
  330. _DESCRIPTION_KEY = 'description'
  331. _ROOT_TYPE = 'playlist'
  332. _NODE_TEMPLATE = '''cloudcast {
  333. slug
  334. url
  335. owner { username }
  336. }'''
  337. def _get_cloudcast(self, node):
  338. return node.get('cloudcast') or {}