kuwo.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352
  1. import re
  2. import urllib.parse
  3. from .common import InfoExtractor
  4. from ..utils import (
  5. ExtractorError,
  6. InAdvancePagedList,
  7. clean_html,
  8. get_element_by_id,
  9. remove_start,
  10. )
  11. class KuwoBaseIE(InfoExtractor):
  12. _FORMATS = [
  13. {'format': 'ape', 'ext': 'ape', 'preference': 100},
  14. {'format': 'mp3-320', 'ext': 'mp3', 'br': '320kmp3', 'abr': 320, 'preference': 80},
  15. {'format': 'mp3-192', 'ext': 'mp3', 'br': '192kmp3', 'abr': 192, 'preference': 70},
  16. {'format': 'mp3-128', 'ext': 'mp3', 'br': '128kmp3', 'abr': 128, 'preference': 60},
  17. {'format': 'wma', 'ext': 'wma', 'preference': 20},
  18. {'format': 'aac', 'ext': 'aac', 'abr': 48, 'preference': 10},
  19. ]
  20. def _get_formats(self, song_id, tolerate_ip_deny=False):
  21. formats = []
  22. for file_format in self._FORMATS:
  23. query = {
  24. 'format': file_format['ext'],
  25. 'br': file_format.get('br', ''),
  26. 'rid': f'MUSIC_{song_id}',
  27. 'type': 'convert_url',
  28. 'response': 'url',
  29. }
  30. song_url = self._download_webpage(
  31. 'http://antiserver.kuwo.cn/anti.s',
  32. song_id, note='Download {} url info'.format(file_format['format']),
  33. query=query, headers=self.geo_verification_headers(),
  34. )
  35. if song_url == 'IPDeny' and not tolerate_ip_deny:
  36. raise ExtractorError('This song is blocked in this region', expected=True)
  37. if song_url.startswith(('http://', 'https://')):
  38. formats.append({
  39. 'url': song_url,
  40. 'format_id': file_format['format'],
  41. 'format': file_format['format'],
  42. 'quality': file_format['preference'],
  43. 'abr': file_format.get('abr'),
  44. })
  45. return formats
  46. class KuwoIE(KuwoBaseIE):
  47. _WORKING = False
  48. IE_NAME = 'kuwo:song'
  49. IE_DESC = '酷我音乐'
  50. _VALID_URL = r'https?://(?:www\.)?kuwo\.cn/yinyue/(?P<id>\d+)'
  51. _TESTS = [{
  52. 'url': 'http://www.kuwo.cn/yinyue/635632/',
  53. 'info_dict': {
  54. 'id': '635632',
  55. 'ext': 'ape',
  56. 'title': '爱我别走',
  57. 'creator': '张震岳',
  58. 'upload_date': '20080122',
  59. 'description': 'md5:ed13f58e3c3bf3f7fd9fbc4e5a7aa75c',
  60. },
  61. 'skip': 'this song has been offline because of copyright issues',
  62. }, {
  63. 'url': 'http://www.kuwo.cn/yinyue/6446136/',
  64. 'info_dict': {
  65. 'id': '6446136',
  66. 'ext': 'mp3',
  67. 'title': '心',
  68. 'description': 'md5:5d0e947b242c35dc0eb1d2fce9fbf02c',
  69. 'creator': 'IU',
  70. 'upload_date': '20150518',
  71. },
  72. 'params': {
  73. 'format': 'mp3-320',
  74. },
  75. }, {
  76. 'url': 'http://www.kuwo.cn/yinyue/3197154?catalog=yueku2016',
  77. 'only_matching': True,
  78. }]
  79. def _real_extract(self, url):
  80. song_id = self._match_id(url)
  81. webpage, urlh = self._download_webpage_handle(
  82. url, song_id, note='Download song detail info',
  83. errnote='Unable to get song detail info')
  84. if song_id not in urlh.url or '对不起,该歌曲由于版权问题已被下线,将返回网站首页' in webpage:
  85. raise ExtractorError('this song has been offline because of copyright issues', expected=True)
  86. song_name = self._html_search_regex(
  87. r'<p[^>]+id="lrcName">([^<]+)</p>', webpage, 'song name')
  88. singer_name = remove_start(self._html_search_regex(
  89. r'<a[^>]+href="http://www\.kuwo\.cn/artist/content\?name=([^"]+)">',
  90. webpage, 'singer name', fatal=False), '歌手')
  91. lrc_content = clean_html(get_element_by_id('lrcContent', webpage))
  92. if lrc_content == '暂无': # indicates no lyrics
  93. lrc_content = None
  94. formats = self._get_formats(song_id)
  95. album_id = self._html_search_regex(
  96. r'<a[^>]+href="http://www\.kuwo\.cn/album/(\d+)/"',
  97. webpage, 'album id', fatal=False)
  98. publish_time = None
  99. if album_id is not None:
  100. album_info_page = self._download_webpage(
  101. f'http://www.kuwo.cn/album/{album_id}/', song_id,
  102. note='Download album detail info',
  103. errnote='Unable to get album detail info')
  104. publish_time = self._html_search_regex(
  105. r'发行时间:(\d{4}-\d{2}-\d{2})', album_info_page,
  106. 'publish time', fatal=False)
  107. if publish_time:
  108. publish_time = publish_time.replace('-', '')
  109. return {
  110. 'id': song_id,
  111. 'title': song_name,
  112. 'creator': singer_name,
  113. 'upload_date': publish_time,
  114. 'description': lrc_content,
  115. 'formats': formats,
  116. }
  117. class KuwoAlbumIE(InfoExtractor):
  118. _WORKING = False
  119. IE_NAME = 'kuwo:album'
  120. IE_DESC = '酷我音乐 - 专辑'
  121. _VALID_URL = r'https?://(?:www\.)?kuwo\.cn/album/(?P<id>\d+?)/'
  122. _TEST = {
  123. 'url': 'http://www.kuwo.cn/album/502294/',
  124. 'info_dict': {
  125. 'id': '502294',
  126. 'title': 'Made\xa0Series\xa0《M》',
  127. 'description': 'md5:d463f0d8a0ff3c3ea3d6ed7452a9483f',
  128. },
  129. 'playlist_count': 2,
  130. }
  131. def _real_extract(self, url):
  132. album_id = self._match_id(url)
  133. webpage = self._download_webpage(
  134. url, album_id, note='Download album info',
  135. errnote='Unable to get album info')
  136. album_name = self._html_search_regex(
  137. r'<div[^>]+class="comm"[^<]+<h1[^>]+title="([^"]+)"', webpage,
  138. 'album name')
  139. album_intro = remove_start(
  140. clean_html(get_element_by_id('intro', webpage)),
  141. f'{album_name}简介:')
  142. entries = [
  143. self.url_result(song_url, 'Kuwo') for song_url in re.findall(
  144. r'<p[^>]+class="listen"><a[^>]+href="(http://www\.kuwo\.cn/yinyue/\d+/)"',
  145. webpage)
  146. ]
  147. return self.playlist_result(entries, album_id, album_name, album_intro)
  148. class KuwoChartIE(InfoExtractor):
  149. _WORKING = False
  150. IE_NAME = 'kuwo:chart'
  151. IE_DESC = '酷我音乐 - 排行榜'
  152. _VALID_URL = r'https?://yinyue\.kuwo\.cn/billboard_(?P<id>[^.]+).htm'
  153. _TEST = {
  154. 'url': 'http://yinyue.kuwo.cn/billboard_香港中文龙虎榜.htm',
  155. 'info_dict': {
  156. 'id': '香港中文龙虎榜',
  157. },
  158. 'playlist_mincount': 7,
  159. }
  160. def _real_extract(self, url):
  161. chart_id = self._match_id(url)
  162. webpage = self._download_webpage(
  163. url, chart_id, note='Download chart info',
  164. errnote='Unable to get chart info')
  165. entries = [
  166. self.url_result(song_url, 'Kuwo') for song_url in re.findall(
  167. r'<a[^>]+href="(http://www\.kuwo\.cn/yinyue/\d+)', webpage)
  168. ]
  169. return self.playlist_result(entries, chart_id)
  170. class KuwoSingerIE(InfoExtractor):
  171. _WORKING = False
  172. IE_NAME = 'kuwo:singer'
  173. IE_DESC = '酷我音乐 - 歌手'
  174. _VALID_URL = r'https?://(?:www\.)?kuwo\.cn/mingxing/(?P<id>[^/]+)'
  175. _TESTS = [{
  176. 'url': 'http://www.kuwo.cn/mingxing/bruno+mars/',
  177. 'info_dict': {
  178. 'id': 'bruno+mars',
  179. 'title': 'Bruno\xa0Mars',
  180. },
  181. 'playlist_mincount': 329,
  182. }, {
  183. 'url': 'http://www.kuwo.cn/mingxing/Ali/music.htm',
  184. 'info_dict': {
  185. 'id': 'Ali',
  186. 'title': 'Ali',
  187. },
  188. 'playlist_mincount': 95,
  189. 'skip': 'Regularly stalls travis build', # See https://travis-ci.org/ytdl-org/youtube-dl/jobs/78878540
  190. }]
  191. PAGE_SIZE = 15
  192. def _real_extract(self, url):
  193. singer_id = self._match_id(url)
  194. webpage = self._download_webpage(
  195. url, singer_id, note='Download singer info',
  196. errnote='Unable to get singer info')
  197. singer_name = self._html_search_regex(
  198. r'<h1>([^<]+)</h1>', webpage, 'singer name')
  199. artist_id = self._html_search_regex(
  200. r'data-artistid="(\d+)"', webpage, 'artist id')
  201. page_count = int(self._html_search_regex(
  202. r'data-page="(\d+)"', webpage, 'page count'))
  203. def page_func(page_num):
  204. webpage = self._download_webpage(
  205. 'http://www.kuwo.cn/artist/contentMusicsAjax',
  206. singer_id, note=f'Download song list page #{page_num + 1}',
  207. errnote=f'Unable to get song list page #{page_num + 1}',
  208. query={'artistId': artist_id, 'pn': page_num, 'rn': self.PAGE_SIZE})
  209. return [
  210. self.url_result(urllib.parse.urljoin(url, song_url), 'Kuwo')
  211. for song_url in re.findall(
  212. r'<div[^>]+class="name"><a[^>]+href="(/yinyue/\d+)',
  213. webpage)
  214. ]
  215. entries = InAdvancePagedList(page_func, page_count, self.PAGE_SIZE)
  216. return self.playlist_result(entries, singer_id, singer_name)
  217. class KuwoCategoryIE(InfoExtractor):
  218. _WORKING = False
  219. IE_NAME = 'kuwo:category'
  220. IE_DESC = '酷我音乐 - 分类'
  221. _VALID_URL = r'https?://yinyue\.kuwo\.cn/yy/cinfo_(?P<id>\d+?).htm'
  222. _TEST = {
  223. 'url': 'http://yinyue.kuwo.cn/yy/cinfo_86375.htm',
  224. 'info_dict': {
  225. 'id': '86375',
  226. 'title': '八十年代精选',
  227. 'description': '这些都是属于八十年代的回忆!',
  228. },
  229. 'playlist_mincount': 24,
  230. }
  231. def _real_extract(self, url):
  232. category_id = self._match_id(url)
  233. webpage = self._download_webpage(
  234. url, category_id, note='Download category info',
  235. errnote='Unable to get category info')
  236. category_name = self._html_search_regex(
  237. r'<h1[^>]+title="([^<>]+?)">[^<>]+?</h1>', webpage, 'category name')
  238. category_desc = remove_start(
  239. get_element_by_id('intro', webpage).strip(),
  240. f'{category_name}简介:')
  241. if category_desc == '暂无':
  242. category_desc = None
  243. jsonm = self._parse_json(self._html_search_regex(
  244. r'var\s+jsonm\s*=\s*([^;]+);', webpage, 'category songs'), category_id)
  245. entries = [
  246. self.url_result('http://www.kuwo.cn/yinyue/{}/'.format(song['musicrid']), 'Kuwo')
  247. for song in jsonm['musiclist']
  248. ]
  249. return self.playlist_result(entries, category_id, category_name, category_desc)
  250. class KuwoMvIE(KuwoBaseIE):
  251. _WORKING = False
  252. IE_NAME = 'kuwo:mv'
  253. IE_DESC = '酷我音乐 - MV'
  254. _VALID_URL = r'https?://(?:www\.)?kuwo\.cn/mv/(?P<id>\d+?)/'
  255. _TEST = {
  256. 'url': 'http://www.kuwo.cn/mv/6480076/',
  257. 'info_dict': {
  258. 'id': '6480076',
  259. 'ext': 'mp4',
  260. 'title': 'My HouseMV',
  261. 'creator': '2PM',
  262. },
  263. # In this video, music URLs (anti.s) are blocked outside China and
  264. # USA, while the MV URL (mvurl) is available globally, so force the MV
  265. # URL for consistent results in different countries
  266. 'params': {
  267. 'format': 'mv',
  268. },
  269. }
  270. _FORMATS = [
  271. *KuwoBaseIE._FORMATS,
  272. {'format': 'mkv', 'ext': 'mkv', 'preference': 250},
  273. {'format': 'mp4', 'ext': 'mp4', 'preference': 200}]
  274. def _real_extract(self, url):
  275. song_id = self._match_id(url)
  276. webpage = self._download_webpage(
  277. url, song_id, note=f'Download mv detail info: {song_id}',
  278. errnote=f'Unable to get mv detail info: {song_id}')
  279. mobj = re.search(
  280. r'<h1[^>]+title="(?P<song>[^"]+)">[^<]+<span[^>]+title="(?P<singer>[^"]+)"',
  281. webpage)
  282. if mobj:
  283. song_name = mobj.group('song')
  284. singer_name = mobj.group('singer')
  285. else:
  286. raise ExtractorError('Unable to find song or singer names')
  287. formats = self._get_formats(song_id, tolerate_ip_deny=True)
  288. mv_url = self._download_webpage(
  289. f'http://www.kuwo.cn/yy/st/mvurl?rid=MUSIC_{song_id}',
  290. song_id, note=f'Download {song_id} MV URL')
  291. formats.append({
  292. 'url': mv_url,
  293. 'format_id': 'mv',
  294. })
  295. return {
  296. 'id': song_id,
  297. 'title': song_name,
  298. 'creator': singer_name,
  299. 'formats': formats,
  300. }