tiktok.py 68 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565
  1. import functools
  2. import itertools
  3. import json
  4. import random
  5. import re
  6. import string
  7. import time
  8. import urllib.parse
  9. import uuid
  10. from .common import InfoExtractor
  11. from ..networking import HEADRequest
  12. from ..utils import (
  13. ExtractorError,
  14. UnsupportedError,
  15. UserNotLive,
  16. determine_ext,
  17. filter_dict,
  18. format_field,
  19. int_or_none,
  20. join_nonempty,
  21. merge_dicts,
  22. mimetype2ext,
  23. parse_qs,
  24. qualities,
  25. remove_start,
  26. srt_subtitles_timecode,
  27. str_or_none,
  28. traverse_obj,
  29. try_call,
  30. try_get,
  31. url_or_none,
  32. urlencode_postdata,
  33. )
  34. class TikTokBaseIE(InfoExtractor):
  # Template for profile URLs; filled with a handle, user id or sec_uid by the parsers
  _UPLOADER_URL_FORMAT = 'https://www.tiktok.com/@%s'
  # Web host whose cookie jar is checked for the ``sid_tt`` cookie
  _WEBPAGE_HOST = 'https://www.tiktok.com/'
  # Resolution labels handed to qualities() when ranking formats parsed from URL keys
  QUALITIES = ('360p', '540p', '720p', '1080p')
  # Fallback app parameters; _get_next_app_info() lets extractor arguments of
  # the same name override every key except 'iid'
  _APP_INFO_DEFAULTS = {
      # unique "install id"
      'iid': None,
      # TikTok (KR/PH/TW/TH/VN) = trill, TikTok (rest of world) = musical_ly, Douyin = aweme
      'app_name': 'musical_ly',
      'app_version': '35.1.3',
      'manifest_app_version': '2023501030',
      # "app id": aweme = 1128, trill = 1180, musical_ly = 1233, universal = 0
      'aid': '0',
  }
  # List of app-info dicts not yet tried; None until _get_next_app_info() builds it
  _APP_INFO_POOL = None
  # App-info dict currently used to build API queries
  _APP_INFO = None
  # User-Agent string derived from the current app info
  _APP_USER_AGENT = None
  @functools.cached_property
  def _KNOWN_APP_INFO(self):
      """Return the ``app_info`` extractor arguments.

      When none are given, the fallback is a single blank entry if a device ID
      was configured, and an empty list otherwise.
      """
      if self._KNOWN_DEVICE_ID:
          # If we have a genuine device ID, we may not need any IID
          fallback = ['']
      else:
          fallback = []
      return self._configuration_arg('app_info', fallback, ie_key=TikTokIE)
  @functools.cached_property
  def _KNOWN_DEVICE_ID(self):
      """Return the first ``device_id`` extractor argument, or None if unset."""
      device_ids = self._configuration_arg('device_id', [None], ie_key=TikTokIE)
      return device_ids[0]
  @functools.cached_property
  def _DEVICE_ID(self):
      """Return the device ID used in API queries.

      The configured device ID wins; otherwise a random integer from a fixed
      range is generated and returned as a string.
      """
      if self._KNOWN_DEVICE_ID:
          return self._KNOWN_DEVICE_ID
      return str(random.randint(7250000000000000000, 7351147085025500000))
  @functools.cached_property
  def _API_HOSTNAME(self):
      """Return the API host: the ``api_hostname`` extractor argument or the built-in default."""
      hostnames = self._configuration_arg(
          'api_hostname', ['api16-normal-c-useast1a.tiktokv.com'], ie_key=TikTokIE)
      return hostnames[0]
  66. def _get_next_app_info(self):
  67. if self._APP_INFO_POOL is None:
  68. defaults = {
  69. key: self._configuration_arg(key, [default], ie_key=TikTokIE)[0]
  70. for key, default in self._APP_INFO_DEFAULTS.items()
  71. if key != 'iid'
  72. }
  73. self._APP_INFO_POOL = [
  74. {**defaults, **dict(
  75. (k, v) for k, v in zip(self._APP_INFO_DEFAULTS, app_info.split('/')) if v
  76. )} for app_info in self._KNOWN_APP_INFO
  77. ]
  78. if not self._APP_INFO_POOL:
  79. return False
  80. self._APP_INFO = self._APP_INFO_POOL.pop(0)
  81. app_name = self._APP_INFO['app_name']
  82. version = self._APP_INFO['manifest_app_version']
  83. if app_name == 'musical_ly':
  84. package = f'com.zhiliaoapp.musically/{version}'
  85. else: # trill, aweme
  86. package = f'com.ss.android.ugc.{app_name}/{version}'
  87. self._APP_USER_AGENT = f'{package} (Linux; U; Android 13; en_US; Pixel 7; Build/TD1A.220804.031; Cronet/58.0.2991.0)'
  88. return True
  89. @staticmethod
  90. def _create_url(user_id, video_id):
  91. return f'https://www.tiktok.com/@{user_id or "_"}/video/{video_id}'
  def _get_sigi_state(self, webpage, display_id):
      """Return the JSON found after the SIGI_STATE / sigi-persisted-data script tag.

      An empty dict is passed as the default for when nothing is found.
      """
      sigi_script_re = r'<script[^>]+\bid="(?:SIGI_STATE|sigi-persisted-data)"[^>]*>'
      return self._search_json(
          sigi_script_re, webpage, 'sigi state', display_id,
          end_pattern=r'</script>', default={})
  def _get_universal_data(self, webpage, display_id):
      """Return the ``__DEFAULT_SCOPE__`` dict of the universal rehydration JSON.

      Falls back to an empty dict when the script tag or the key is missing.
      """
      rehydration_data = self._search_json(
          r'<script[^>]+\bid="__UNIVERSAL_DATA_FOR_REHYDRATION__"[^>]*>', webpage,
          'universal data', display_id, end_pattern=r'</script>', default={})
      default_scope = traverse_obj(rehydration_data, ('__DEFAULT_SCOPE__', {dict}))
      return default_scope or {}
  def _call_api_impl(self, ep, video_id, query=None, data=None, headers=None, fatal=True,
                     note='Downloading API JSON', errnote='Unable to download API page'):
      """Perform a single request against ``/aweme/v1/<ep>/`` on the API host.

      Sets a fresh random ``odin_tt`` cookie for the API host and copies the
      ``sid_tt`` cookie over from the web host when one is present. Caller
      supplied ``headers`` override the default User-Agent/Accept headers.
      Returns whatever ``_download_json`` returns.
      """
      odin_tt = ''.join(random.choices('0123456789abcdef', k=160))
      self._set_cookie(self._API_HOSTNAME, 'odin_tt', odin_tt)

      session_cookie = self._get_cookies(self._WEBPAGE_HOST).get('sid_tt')
      if session_cookie:
          self._set_cookie(self._API_HOSTNAME, 'sid_tt', session_cookie.value)

      request_headers = {
          'User-Agent': self._APP_USER_AGENT,
          'Accept': 'application/json',
      }
      request_headers.update(headers or {})

      return self._download_json(
          f'https://{self._API_HOSTNAME}/aweme/v1/{ep}/', video_id=video_id,
          fatal=fatal, note=note, errnote=errnote, headers=request_headers,
          query=query, data=data)
  def _build_api_query(self, query):
      """Return ``query`` extended with the device/app parameters sent to the API.

      Entries of ``query`` are unpacked first, so any key that is also listed
      below is replaced by the value built here. ``self._APP_INFO`` must
      already be a dict, since it is indexed directly.
      """
      # NOTE(review): filter_dict presumably drops None-valued entries such as
      # an unset 'iid' -- confirm against utils.filter_dict
      return filter_dict({
          **query,
          'device_platform': 'android',
          'os': 'android',
          'ssmix': 'a',
          # current time in milliseconds
          '_rticket': int(time.time() * 1000),
          'cdid': str(uuid.uuid4()),
          'channel': 'googleplay',
          'aid': self._APP_INFO['aid'],
          'app_name': self._APP_INFO['app_name'],
          # each dotted version component zero-padded to two digits, e.g. '35.1.3' -> '350103'
          'version_code': ''.join(f'{int(v):02d}' for v in self._APP_INFO['app_version'].split('.')),
          'version_name': self._APP_INFO['app_version'],
          'manifest_version_code': self._APP_INFO['manifest_app_version'],
          'update_version_code': self._APP_INFO['manifest_app_version'],
          'ab_version': self._APP_INFO['app_version'],
          'resolution': '1080*2400',
          'dpi': 420,
          'device_type': 'Pixel 7',
          'device_brand': 'Google',
          'language': 'en',
          'os_api': '29',
          'os_version': '13',
          'ac': 'wifi',
          'is_pad': '0',
          'current_region': 'US',
          'app_type': 'normal',
          'sys_region': 'US',
          # a random point between 1 and 13 days (86400..1123200 seconds) in the past
          'last_install_time': int(time.time()) - random.randint(86400, 1123200),
          'timezone_name': 'America/New_York',
          'residence': 'US',
          'app_language': 'en',
          'timezone_offset': '-14400',
          'host_abi': 'armeabi-v7a',
          'locale': 'en',
          'ac2': 'wifi5g',
          'uoo': '1',
          'carrier_region': 'US',
          'op_region': 'US',
          'build_number': self._APP_INFO['app_version'],
          'region': 'US',
          # current time in seconds
          'ts': int(time.time()),
          # None unless the selected app info carries an install id
          'iid': self._APP_INFO.get('iid'),
          'device_id': self._DEVICE_ID,
          # 16 random hexadecimal characters, regenerated for every query
          'openudid': ''.join(random.choices('0123456789abcdef', k=16)),
      })
  def _call_api(self, ep, video_id, query=None, data=None, headers=None, fatal=True,
                note='Downloading API JSON', errnote='Unable to download API page'):
      """Call an API endpoint, rotating through the app-info pool on failure.

      A request is retried with the next app info only when it failed with a
      JSON decode error at position 0; any other ExtractorError propagates.

      Returns the result of ``_call_api_impl``, or None (after a warning) when
      ``fatal`` is false and no usable app info is left.
      Raises ExtractorError when ``fatal`` is true and no app info is available
      or the pool runs out.
      """
      if not (self._APP_INFO or self._get_next_app_info()):
          message = 'No working app info is available'
          if not fatal:
              self.report_warning(message)
              return
          raise ExtractorError(message, expected=True)

      max_tries = len(self._APP_INFO_POOL) + 1  # _APP_INFO_POOL + _APP_INFO
      for count in itertools.count(1):
          self.write_debug(str(self._APP_INFO))
          real_query = self._build_api_query(query or {})
          try:
              return self._call_api_impl(
                  ep, video_id, query=real_query, data=data, headers=headers,
                  fatal=fatal, note=note, errnote=errnote)
          except ExtractorError as e:
              failed_at_start = isinstance(e.cause, json.JSONDecodeError) and e.cause.pos == 0
              if not failed_at_start:
                  raise
              message = str(e.cause or e.msg)
              if self._get_next_app_info():
                  self.report_warning(f'{message}. Retrying... (attempt {count} of {max_tries})')
                  continue
              # Pool exhausted: give up according to the caller's fatality preference
              if fatal:
                  raise
              self.report_warning(message)
              return
  def _extract_aweme_app(self, aweme_id):
      """Fetch one aweme through the ``multi/aweme/detail`` endpoint and parse it.

      Raises ExtractorError when the response has no dict at ``aweme_details[0]``.
      """
      payload = urlencode_postdata({
          'aweme_ids': f'[{aweme_id}]',
          'request_source': '0',
      })
      response = self._call_api(
          'multi/aweme/detail', aweme_id, data=payload, headers={'X-Argus': ''})
      aweme_detail = traverse_obj(response, ('aweme_details', 0, {dict}))
      if aweme_detail:
          return self._parse_aweme_video_app(aweme_detail)
      raise ExtractorError('Unable to extract aweme detail info', video_id=aweme_id)
  def _extract_web_data_and_status(self, url, video_id, fatal=True):
      """Download a video webpage and return ``(video_data, status)``.

      The embedded data is looked up in three places, in this order: universal
      rehydration data, SIGI state, then next.js data. ``status`` is the
      ``statusCode`` found alongside it (0 when absent).

      ``({}, -1)`` is returned when the download failed, when the request was
      redirected to ``/login``, or when no embedded data was found and
      ``fatal`` is false. With ``fatal`` true, the latter two cases raise.
      """
      page = self._download_webpage_handle(
          url, video_id, fatal=fatal, headers={'User-Agent': 'Mozilla/5.0'})
      if page is False:
          return {}, -1
      webpage, urlh = page

      if urllib.parse.urlparse(urlh.url).path == '/login':
          message = 'TikTok is requiring login for access to this content'
          if fatal:
              self.raise_login_required(message)
          self.report_warning(f'{message}. {self._login_hint()}')
          return {}, -1

      universal_data = self._get_universal_data(webpage, video_id)
      if universal_data:
          self.write_debug('Found universal data for rehydration')
          status = traverse_obj(universal_data, ('webapp.video-detail', 'statusCode', {int})) or 0
          video_data = traverse_obj(universal_data, ('webapp.video-detail', 'itemInfo', 'itemStruct', {dict}))
          return video_data, status

      sigi_data = self._get_sigi_state(webpage, video_id)
      if sigi_data:
          self.write_debug('Found sigi state data')
          status = traverse_obj(sigi_data, ('VideoPage', 'statusCode', {int})) or 0
          video_data = traverse_obj(sigi_data, ('ItemModule', video_id, {dict}))
          return video_data, status

      next_data = self._search_nextjs_data(webpage, video_id, default={})
      if next_data:
          self.write_debug('Found next.js data')
          status = traverse_obj(next_data, ('props', 'pageProps', 'statusCode', {int})) or 0
          video_data = traverse_obj(next_data, ('props', 'pageProps', 'itemInfo', 'itemStruct', {dict}))
          return video_data, status

      if fatal:
          raise ExtractorError('Unable to extract webpage video data')
      return {}, -1
  def _get_subtitles(self, aweme_detail, aweme_id, user_name):
      """Collect subtitles for a video, trying three sources in order.

      1. Auto-caption JSON linked from ``interaction_stickers`` (converted to SRT).
      2. ``video.cla_info.caption_infos`` entries (linked by URL).
      3. ``video.subtitleInfos`` entries; when ``user_name`` is truthy the
         webpage is downloaded first and its data replaces ``aweme_detail``.

      A later source is consulted only while no subtitles have been found.
      Returns a dict mapping language codes to lists of subtitle dicts.
      """
      # TODO: Extract text positioning info
      subtitles = {}
      # aweme/detail endpoint subs
      captions_info = traverse_obj(
          aweme_detail, ('interaction_stickers', ..., 'auto_video_caption_info', 'auto_captions', ...), expected_type=dict)
      for caption in captions_info:
          caption_url = traverse_obj(caption, ('url', 'url_list', ...), expected_type=url_or_none, get_all=False)
          if not caption_url:
              continue
          caption_json = self._download_json(
              caption_url, aweme_id, note='Downloading captions', errnote='Unable to download captions', fatal=False)
          # The download is best-effort (fatal=False), so a malformed payload
          # must be skipped instead of aborting the extraction
          cues = [
              cue for cue in traverse_obj(caption_json, ('utterances', ..., {dict}))
              if cue.get('text')
              and isinstance(cue.get('start_time'), (int, float))
              and isinstance(cue.get('end_time'), (int, float))
          ]
          if not cues:
              # Nothing usable: leave room for the fallback sources below
              continue
          subtitles.setdefault(caption.get('language') or 'en', []).append({
              'ext': 'srt',
              # Number the cues after filtering so the SRT counter stays sequential
              'data': '\n\n'.join(
                  f'{index}\n{srt_subtitles_timecode(cue["start_time"] / 1000)} --> {srt_subtitles_timecode(cue["end_time"] / 1000)}\n{cue["text"]}'
                  for index, cue in enumerate(cues, 1)),
          })
      # feed endpoint subs
      if not subtitles:
          for caption in traverse_obj(aweme_detail, ('video', 'cla_info', 'caption_infos', ...), expected_type=dict):
              if not caption.get('url'):
                  continue
              subtitles.setdefault(caption.get('lang') or 'en', []).append({
                  'ext': remove_start(caption.get('caption_format'), 'web'),
                  'url': caption['url'],
              })
      # webpage subs
      if not subtitles:
          if user_name:  # only _parse_aweme_video_app needs to extract the webpage here
              aweme_detail, _ = self._extract_web_data_and_status(
                  self._create_url(user_name, aweme_id), aweme_id, fatal=False)
          for caption in traverse_obj(aweme_detail, ('video', 'subtitleInfos', lambda _, v: v['Url'])):
              subtitles.setdefault(caption.get('LanguageCodeName') or 'en', []).append({
                  'ext': remove_start(caption.get('Format'), 'web'),
                  'url': caption['Url'],
              })
      return subtitles
  def _parse_url_key(self, url_key):
      """Split a URL key into format metadata and its resolution label.

      Returns ``(format_dict, res)`` where ``res`` is a string such as
      ``'720p'``; ``({}, None)`` is returned when the key does not match.
      The codec name ``bytevc1`` is reported as ``h265``.
      """
      matched = self._search_regex(
          r'v[^_]+_(?P<id>(?P<codec>[^_]+)_(?P<res>\d+p)_(?P<bitrate>\d+))', url_key,
          'url key', default=(None, None, None, None), group=('id', 'codec', 'res', 'bitrate'))
      format_id, codec, res, bitrate = matched
      if not format_id:
          return {}, None

      vcodec = 'h265' if codec == 'bytevc1' else codec
      rank_quality = qualities(self.QUALITIES)
      parsed = {
          'format_id': format_id,
          'vcodec': vcodec,
          'tbr': int_or_none(bitrate, scale=1000) or None,
          'quality': rank_quality(res),
      }
      return parsed, res
  def _parse_aweme_video_app(self, aweme_detail):
      """Convert an app-API ``aweme_detail`` dict into an info dict.

      Formats are gathered from ``play_addr``, ``download_addr``,
      ``play_addr_h264``, ``play_addr_bytevc1`` and every ``bit_rate`` entry of
      ``aweme_detail['video']``. Thumbnails, music, author and statistics
      metadata are added alongside.

      Raises KeyError when ``aweme_detail`` lacks ``aweme_id`` or ``video``.
      """
      aweme_id = aweme_detail['aweme_id']
      video_info = aweme_detail['video']
      # Width/height seen per resolution label, shared between all addresses
      known_resolutions = {}

      def audio_meta(url):
          """Return audio-only overrides for mp3 / '-music-' URLs, else an empty dict."""
          ext = determine_ext(url, default_ext='m4a')
          return {
              'format_note': 'Music track',
              'ext': ext,
              'acodec': 'aac' if ext == 'm4a' else ext,
              'vcodec': 'none',
              'width': None,
              'height': None,
          } if ext == 'mp3' or '-music-' in url else {}

      def extract_addr(addr, add_meta=None):
          """Return one format dict per URL in ``addr['url_list']``."""
          # Work on a private copy: the setdefault() below must neither leak
          # into the caller's dict nor into a default shared between calls
          add_meta = dict(add_meta or {})
          parsed_meta, res = self._parse_url_key(addr.get('url_key', ''))
          is_bytevc2 = parsed_meta.get('vcodec') == 'bytevc2'
          if res:
              known_resolutions.setdefault(res, {}).setdefault('height', int_or_none(addr.get('height')))
              known_resolutions[res].setdefault('width', int_or_none(addr.get('width')))
              parsed_meta.update(known_resolutions.get(res, {}))
              add_meta.setdefault('height', int_or_none(res[:-1]))
          return [{
              'url': url,
              'filesize': int_or_none(addr.get('data_size')),
              'ext': 'mp4',
              'acodec': 'aac',
              'source_preference': -2 if 'aweme/v1' in url else -1,  # Downloads from API might get blocked
              **add_meta, **parsed_meta,
              # bytevc2 is bytedance's own custom h266/vvc codec, as-of-yet unplayable
              'preference': -100 if is_bytevc2 else -1,
              'format_note': join_nonempty(
                  add_meta.get('format_note'), '(API)' if 'aweme/v1' in url else None,
                  '(UNPLAYABLE)' if is_bytevc2 else None, delim=' '),
              **audio_meta(url),
          } for url in addr.get('url_list') or []]

      # Hack: Add direct video links first to prioritize them when removing duplicate formats
      formats = []
      width = int_or_none(video_info.get('width'))
      height = int_or_none(video_info.get('height'))
      # Falls back to 0.5625 when width/height are unusable
      ratio = try_call(lambda: width / height) or 0.5625
      if video_info.get('play_addr'):
          formats.extend(extract_addr(video_info['play_addr'], {
              'format_id': 'play_addr',
              'format_note': 'Direct video',
              'vcodec': 'h265' if traverse_obj(
                  video_info, 'is_bytevc1', 'is_h265') else 'h264',  # TODO: Check for "direct iOS" videos, like https://www.tiktok.com/@cookierun_dev/video/7039716639834656002
              'width': width,
              'height': height,
          }))
      if video_info.get('download_addr'):
          download_addr = video_info['download_addr']
          dl_width = int_or_none(download_addr.get('width'))
          formats.extend(extract_addr(download_addr, {
              'format_id': 'download_addr',
              'format_note': 'Download video%s' % (', watermarked' if video_info.get('has_watermark') else ''),
              'vcodec': 'h264',
              'width': dl_width,
              'height': try_call(lambda: int(dl_width / ratio)),  # download_addr['height'] is wrong
              'preference': -2 if video_info.get('has_watermark') else -1,
          }))
      if video_info.get('play_addr_h264'):
          formats.extend(extract_addr(video_info['play_addr_h264'], {
              'format_id': 'play_addr_h264',
              'format_note': 'Direct video',
              'vcodec': 'h264',
          }))
      if video_info.get('play_addr_bytevc1'):
          formats.extend(extract_addr(video_info['play_addr_bytevc1'], {
              'format_id': 'play_addr_bytevc1',
              'format_note': 'Direct video',
              'vcodec': 'h265',
          }))

      # 'bit_rate' may be present but null, hence ``or []`` rather than a .get() default
      for bitrate in video_info.get('bit_rate') or []:
          if bitrate.get('play_addr'):
              formats.extend(extract_addr(bitrate['play_addr'], {
                  'format_id': bitrate.get('gear_name'),
                  'format_note': 'Playback video',
                  'tbr': try_get(bitrate, lambda x: x['bit_rate'] / 1000),
                  'vcodec': 'h265' if traverse_obj(
                      bitrate, 'is_bytevc1', 'is_h265') else 'h264',
                  'fps': bitrate.get('FPS'),
              }))

      self._remove_duplicate_formats(formats)
      # Copy the web session cookie to every media host
      auth_cookie = self._get_cookies(self._WEBPAGE_HOST).get('sid_tt')
      if auth_cookie:
          for f in formats:
              self._set_cookie(urllib.parse.urlparse(f['url']).hostname, 'sid_tt', auth_cookie.value)

      thumbnails = []
      for cover_id in ('cover', 'ai_dynamic_cover', 'animated_cover', 'ai_dynamic_cover_bak',
                       'origin_cover', 'dynamic_cover'):
          for cover_url in traverse_obj(video_info, (cover_id, 'url_list', ...)):
              thumbnails.append({
                  'id': cover_id,
                  'url': cover_url,
              })

      stats_info = aweme_detail.get('statistics') or {}
      music_info = aweme_detail.get('music') or {}
      labels = traverse_obj(aweme_detail, ('hybrid_label', ..., 'text'), expected_type=str)

      contained_music_track = traverse_obj(
          music_info, ('matched_song', 'title'), ('matched_pgc_sound', 'title'), expected_type=str)
      contained_music_author = traverse_obj(
          music_info, ('matched_song', 'author'), ('matched_pgc_sound', 'author'), 'author', expected_type=str)

      # A title of the form 'original sound - <owner handle>' carries no track
      # information, so prefer the matched song/sound in that case
      is_generic_og_trackname = music_info.get('is_original_sound') and music_info.get('title') == 'original sound - {}'.format(music_info.get('owner_handle'))
      if is_generic_og_trackname:
          music_track, music_author = contained_music_track or 'original sound', contained_music_author
      else:
          music_track, music_author = music_info.get('title'), traverse_obj(music_info, ('author', {str}))

      author_info = traverse_obj(aweme_detail, ('author', {
          'uploader': ('unique_id', {str}),
          'uploader_id': ('uid', {str_or_none}),
          'channel': ('nickname', {str}),
          'channel_id': ('sec_uid', {str}),
      }))

      return {
          'id': aweme_id,
          **traverse_obj(aweme_detail, {
              'title': ('desc', {str}),
              'description': ('desc', {str}),
              'timestamp': ('create_time', {int_or_none}),
          }),
          **traverse_obj(stats_info, {
              'view_count': 'play_count',
              'like_count': 'digg_count',
              'repost_count': 'share_count',
              'comment_count': 'comment_count',
          }, expected_type=int_or_none),
          **author_info,
          'channel_url': format_field(author_info, 'channel_id', self._UPLOADER_URL_FORMAT, default=None),
          'uploader_url': format_field(
              author_info, ['uploader', 'uploader_id'], self._UPLOADER_URL_FORMAT, default=None),
          'track': music_track,
          'album': str_or_none(music_info.get('album')) or None,
          'artists': re.split(r'(?:, | & )', music_author) if music_author else None,
          'formats': formats,
          'subtitles': self.extract_subtitles(
              aweme_detail, aweme_id, traverse_obj(author_info, 'uploader', 'uploader_id', 'channel_id')),
          'thumbnails': thumbnails,
          # Video durations are divided by 1000; the music duration is used as-is
          'duration': (traverse_obj(video_info, (
              (None, 'download_addr'), 'duration', {functools.partial(int_or_none, scale=1000)}, any))
              or traverse_obj(music_info, ('duration', {int_or_none}))),
          'availability': self._availability(
              is_private='Private' in labels,
              needs_subscription='Friends only' in labels,
              is_unlisted='Followers only' in labels),
          '_format_sort_fields': ('quality', 'codec', 'size', 'br'),
      }
  def _extract_web_formats(self, aweme_detail):
      """Build the format list from webpage item data.

      Uses ``video.bitrateInfo`` entries, then ``video.playAddr`` and the
      download addresses. Formats whose URL lacks ``unwatermarked`` are noted
      as watermarked. When no format is found at all, ``music.playUrl`` is
      offered as a single audio-only format.
      """
      COMMON_FORMAT_INFO = {
          'ext': 'mp4',
          'vcodec': 'h264',
          'acodec': 'aac',
      }
      video_info = traverse_obj(aweme_detail, ('video', {dict})) or {}
      play_width = int_or_none(video_info.get('width'))
      play_height = int_or_none(video_info.get('height'))
      ratio = try_call(lambda: play_width / play_height) or 0.5625
      formats = []

      for bitrate_info in traverse_obj(video_info, ('bitrateInfo', lambda _, v: v['PlayAddr']['UrlList'])):
          url_key = traverse_obj(bitrate_info, ('PlayAddr', 'UrlKey', {str})) or ''
          format_info, res = self._parse_url_key(url_key)

          # bytevc2 is bytedance's own custom h266/vvc codec, as-of-yet unplayable
          is_bytevc2 = format_info.get('vcodec') == 'bytevc2'
          format_info['format_note'] = 'UNPLAYABLE' if is_bytevc2 else None
          format_info['preference'] = -100 if is_bytevc2 else -1
          format_info['filesize'] = traverse_obj(bitrate_info, ('PlayAddr', 'DataSize', {int_or_none}))

          dimension = int(res[:-1]) if res else None
          if dimension:
              if dimension == 540:  # '540p' is actually 576p
                  dimension = 576
              if ratio < 1:  # portrait: res/dimension is width
                  y = int(dimension / ratio)
                  format_info['width'] = dimension
                  format_info['height'] = y - (y % 2)
              else:  # landscape: res/dimension is height
                  x = int(dimension * ratio)
                  format_info['width'] = x + (x % 2)
                  format_info['height'] = dimension

          formats.extend({
              **COMMON_FORMAT_INFO,
              **format_info,
              'url': self._proto_relative_url(video_url),
          } for video_url in traverse_obj(bitrate_info, ('PlayAddr', 'UrlList', ..., {url_or_none})))

      # We don't have res string for play formats, but need quality for sorting & de-duplication
      play_quality = traverse_obj(formats, (lambda _, v: v['width'] == play_width, 'quality', any))

      formats.extend({
          **COMMON_FORMAT_INFO,
          'format_id': 'play',
          'url': self._proto_relative_url(play_url),
          'width': play_width,
          'height': play_height,
          'quality': play_quality,
      } for play_url in traverse_obj(video_info, ('playAddr', ((..., 'src'), None), {url_or_none})))

      formats.extend({
          **COMMON_FORMAT_INFO,
          'format_id': 'download',
          'url': self._proto_relative_url(download_url),
      } for download_url in traverse_obj(video_info, (('downloadAddr', ('download', 'url')), {url_or_none})))

      self._remove_duplicate_formats(formats)

      for watermarked in traverse_obj(formats, lambda _, v: 'unwatermarked' not in v['url']):
          note = join_nonempty(watermarked.get('format_note'), 'watermarked', delim=', ')
          preference = watermarked.get('preference') or -2
          watermarked['format_note'] = note
          watermarked['preference'] = preference

      # Is it a slideshow with only audio for download?
      if not formats and traverse_obj(aweme_detail, ('music', 'playUrl', {url_or_none})):
          audio_url = aweme_detail['music']['playUrl']
          ext = traverse_obj(parse_qs(audio_url), (
              'mime_type', -1, {lambda x: x.replace('_', '/')}, {mimetype2ext})) or 'm4a'
          formats.append({
              'format_id': 'audio',
              'url': self._proto_relative_url(audio_url),
              'ext': ext,
              'acodec': 'aac' if ext == 'm4a' else ext,
              'vcodec': 'none',
          })

      return formats
  def _parse_aweme_video_web(self, aweme_detail, webpage_url, video_id, extract_flat=False):
      """Convert webpage item data into an info dict.

      With ``extract_flat`` true, formats and subtitles are not extracted and
      both fields are None. The video's own duration, when non-zero, replaces
      the music duration because it is merged later.
      """
      author_info = traverse_obj(aweme_detail, (('authorInfo', 'author', None), {
          'channel': ('nickname', {str}),
          'channel_id': (('authorSecId', 'secUid'), {str}),
          'uploader': (('uniqueId', 'author'), {str}),
          'uploader_id': (('authorId', 'uid', 'id'), {str_or_none}),
      }), get_all=False)

      if extract_flat:
          formats = subtitles = None
      else:
          formats = self._extract_web_formats(aweme_detail)
          subtitles = self.extract_subtitles(aweme_detail, video_id, None)

      music_meta = traverse_obj(aweme_detail, ('music', {
          'track': ('title', {str}),
          'album': ('album', {str}, {lambda x: x or None}),
          'artists': ('authorName', {str}, {lambda x: re.split(r'(?:, | & )', x) if x else None}),
          'duration': ('duration', {int_or_none}),
      }))
      video_meta = traverse_obj(aweme_detail, {
          'title': ('desc', {str}),
          'description': ('desc', {str}),
          # audio-only slideshows have a video duration of 0 and an actual audio duration
          'duration': ('video', 'duration', {int_or_none}, {lambda x: x or None}),
          'timestamp': ('createTime', {int_or_none}),
      })
      counters = traverse_obj(aweme_detail, ('stats', {
          'view_count': 'playCount',
          'like_count': 'diggCount',
          'repost_count': 'shareCount',
          'comment_count': 'commentCount',
      }), expected_type=int_or_none)
      thumbnails = traverse_obj(aweme_detail, (
          (None, 'video'), ('thumbnail', 'cover', 'dynamicCover', 'originCover'), {
              'url': ({url_or_none}, {self._proto_relative_url}),
          },
      ))

      # Merge order matters: later entries override earlier ones
      return {
          'id': video_id,
          'formats': formats,
          'subtitles': subtitles,
          'http_headers': {'Referer': webpage_url},
          **author_info,
          'channel_url': format_field(author_info, 'channel_id', self._UPLOADER_URL_FORMAT, default=None),
          'uploader_url': format_field(
              author_info, ['uploader', 'uploader_id'], self._UPLOADER_URL_FORMAT, default=None),
          **music_meta,
          **video_meta,
          **counters,
          'thumbnails': thumbnails,
      }
  543. class TikTokIE(TikTokBaseIE):
  544. _VALID_URL = r'https?://www\.tiktok\.com/(?:embed|@(?P<user_id>[\w\.-]+)?/video)/(?P<id>\d+)'
  545. _EMBED_REGEX = [rf'<(?:script|iframe)[^>]+\bsrc=(["\'])(?P<url>{_VALID_URL})']
  546. _TESTS = [{
  547. 'url': 'https://www.tiktok.com/@leenabhushan/video/6748451240264420610',
  548. 'md5': '736bb7a466c6f0a6afeb597da1e6f5b7',
  549. 'info_dict': {
  550. 'id': '6748451240264420610',
  551. 'ext': 'mp4',
  552. 'title': '#jassmanak #lehanga #leenabhushan',
  553. 'description': '#jassmanak #lehanga #leenabhushan',
  554. 'duration': 13,
  555. 'height': 1024,
  556. 'width': 576,
  557. 'uploader': 'leenabhushan',
  558. 'uploader_id': '6691488002098119685',
  559. 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAA_Eb4t1vodM1IuTy_cvp9CY22RAb59xqrO0Xtz9CYQJvgXaDvZxYnZYRzDWhhgJmy',
  560. 'creator': 'facestoriesbyleenabh',
  561. 'thumbnail': r're:^https?://[\w\/\.\-]+(~[\w\-]+\.image)?',
  562. 'upload_date': '20191016',
  563. 'timestamp': 1571246252,
  564. 'view_count': int,
  565. 'like_count': int,
  566. 'repost_count': int,
  567. 'comment_count': int,
  568. 'artist': 'Ysrbeats',
  569. 'album': 'Lehanga',
  570. 'track': 'Lehanga',
  571. },
  572. 'skip': '404 Not Found',
  573. }, {
  574. 'url': 'https://www.tiktok.com/@patroxofficial/video/6742501081818877190?langCountry=en',
  575. 'md5': 'f21112672ee4ce05ca390fb6522e1b6f',
  576. 'info_dict': {
  577. 'id': '6742501081818877190',
  578. 'ext': 'mp4',
  579. 'title': 'md5:5e2a23877420bb85ce6521dbee39ba94',
  580. 'description': 'md5:5e2a23877420bb85ce6521dbee39ba94',
  581. 'duration': 27,
  582. 'height': 1024,
  583. 'width': 576,
  584. 'uploader': 'patrox',
  585. 'uploader_id': '18702747',
  586. 'uploader_url': 'https://www.tiktok.com/@patrox',
  587. 'channel_url': 'https://www.tiktok.com/@MS4wLjABAAAAiFnldaILebi5heDoVU6bn4jBWWycX6-9U3xuNPqZ8Ws',
  588. 'channel_id': 'MS4wLjABAAAAiFnldaILebi5heDoVU6bn4jBWWycX6-9U3xuNPqZ8Ws',
  589. 'channel': 'patroX',
  590. 'thumbnail': r're:^https?://[\w\/\.\-]+(~[\w\-]+\.image)?',
  591. 'upload_date': '20190930',
  592. 'timestamp': 1569860870,
  593. 'view_count': int,
  594. 'like_count': int,
  595. 'repost_count': int,
  596. 'comment_count': int,
  597. 'artists': ['Evan Todd', 'Jessica Keenan Wynn', 'Alice Lee', 'Barrett Wilbert Weed', 'Jon Eidson'],
  598. 'track': 'Big Fun',
  599. },
  600. }, {
  601. # Banned audio, was available on the app, now works with web too
  602. 'url': 'https://www.tiktok.com/@barudakhb_/video/6984138651336838402',
  603. 'info_dict': {
  604. 'id': '6984138651336838402',
  605. 'ext': 'mp4',
  606. 'title': 'Balas @yolaaftwsr hayu yu ? #SquadRandom_ 🔥',
  607. 'description': 'Balas @yolaaftwsr hayu yu ? #SquadRandom_ 🔥',
  608. 'uploader': 'barudakhb_',
  609. 'channel': 'md5:29f238c49bc0c176cb3cef1a9cea9fa6',
  610. 'uploader_id': '6974687867511718913',
  611. 'uploader_url': 'https://www.tiktok.com/@barudakhb_',
  612. 'channel_url': 'https://www.tiktok.com/@MS4wLjABAAAAbhBwQC-R1iKoix6jDFsF-vBdfx2ABoDjaZrM9fX6arU3w71q3cOWgWuTXn1soZ7d',
  613. 'channel_id': 'MS4wLjABAAAAbhBwQC-R1iKoix6jDFsF-vBdfx2ABoDjaZrM9fX6arU3w71q3cOWgWuTXn1soZ7d',
  614. 'track': 'Boka Dance',
  615. 'artists': ['md5:29f238c49bc0c176cb3cef1a9cea9fa6'],
  616. 'timestamp': 1626121503,
  617. 'duration': 18,
  618. 'thumbnail': r're:^https?://[\w\/\.\-]+(~[\w\-]+\.image)?',
  619. 'upload_date': '20210712',
  620. 'view_count': int,
  621. 'like_count': int,
  622. 'repost_count': int,
  623. 'comment_count': int,
  624. },
  625. }, {
  626. # Sponsored video, only available with feed workaround
  627. 'url': 'https://www.tiktok.com/@MS4wLjABAAAATh8Vewkn0LYM7Fo03iec3qKdeCUOcBIouRk1mkiag6h3o_pQu_dUXvZ2EZlGST7_/video/7042692929109986561',
  628. 'info_dict': {
  629. 'id': '7042692929109986561',
  630. 'ext': 'mp4',
  631. 'title': 'Slap and Run!',
  632. 'description': 'Slap and Run!',
  633. 'uploader': 'user440922249',
  634. 'channel': 'Slap And Run',
  635. 'uploader_id': '7036055384943690754',
  636. 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAATh8Vewkn0LYM7Fo03iec3qKdeCUOcBIouRk1mkiag6h3o_pQu_dUXvZ2EZlGST7_',
  637. 'channel_id': 'MS4wLjABAAAATh8Vewkn0LYM7Fo03iec3qKdeCUOcBIouRk1mkiag6h3o_pQu_dUXvZ2EZlGST7_',
  638. 'track': 'Promoted Music',
  639. 'timestamp': 1639754738,
  640. 'duration': 30,
  641. 'thumbnail': r're:^https?://[\w\/\.\-]+(~[\w\-]+\.image)?',
  642. 'upload_date': '20211217',
  643. 'view_count': int,
  644. 'like_count': int,
  645. 'repost_count': int,
  646. 'comment_count': int,
  647. },
  648. 'skip': 'This video is unavailable',
  649. }, {
  650. # Video without title and description
  651. 'url': 'https://www.tiktok.com/@pokemonlife22/video/7059698374567611694',
  652. 'info_dict': {
  653. 'id': '7059698374567611694',
  654. 'ext': 'mp4',
  655. 'title': 'TikTok video #7059698374567611694',
  656. 'description': '',
  657. 'uploader': 'pokemonlife22',
  658. 'channel': 'Pokemon',
  659. 'uploader_id': '6820838815978423302',
  660. 'uploader_url': 'https://www.tiktok.com/@pokemonlife22',
  661. 'channel_url': 'https://www.tiktok.com/@MS4wLjABAAAA0tF1nBwQVVMyrGu3CqttkNgM68Do1OXUFuCY0CRQk8fEtSVDj89HqoqvbSTmUP2W',
  662. 'channel_id': 'MS4wLjABAAAA0tF1nBwQVVMyrGu3CqttkNgM68Do1OXUFuCY0CRQk8fEtSVDj89HqoqvbSTmUP2W',
  663. 'track': 'original sound',
  664. 'timestamp': 1643714123,
  665. 'duration': 6,
  666. 'thumbnail': r're:^https?://[\w\/\.\-]+(~[\w\-]+\.image)?',
  667. 'upload_date': '20220201',
  668. 'artists': ['Pokemon'],
  669. 'view_count': int,
  670. 'like_count': int,
  671. 'repost_count': int,
  672. 'comment_count': int,
  673. },
  674. }, {
  675. # hydration JSON is sent in a <script> element
  676. 'url': 'https://www.tiktok.com/@denidil6/video/7065799023130643713',
  677. 'info_dict': {
  678. 'id': '7065799023130643713',
  679. 'ext': 'mp4',
  680. 'title': '#denidil#денидил',
  681. 'description': '#denidil#денидил',
  682. 'uploader': 'denidil6',
  683. 'uploader_id': '7046664115636405250',
  684. 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAAsvMSzFdQ4ikl3uR2TEJwMBbB2yZh2Zxwhx-WCo3rbDpAharE3GQCrFuJArI3C8QJ',
  685. 'artist': 'Holocron Music',
  686. 'album': 'Wolf Sounds (1 Hour) Enjoy the Company of the Animal That Is the Majestic King of the Night',
  687. 'track': 'Wolf Sounds (1 Hour) Enjoy the Company of the Animal That Is the Majestic King of the Night',
  688. 'timestamp': 1645134536,
  689. 'duration': 26,
  690. 'upload_date': '20220217',
  691. 'view_count': int,
  692. 'like_count': int,
  693. 'repost_count': int,
  694. 'comment_count': int,
  695. },
  696. 'skip': 'This video is unavailable',
  697. }, {
  698. # slideshow audio-only mp3 format
  699. 'url': 'https://www.tiktok.com/@_le_cannibale_/video/7139980461132074283',
  700. 'info_dict': {
  701. 'id': '7139980461132074283',
  702. 'ext': 'mp3',
  703. 'title': 'TikTok video #7139980461132074283',
  704. 'description': '',
  705. 'channel': 'Antaura',
  706. 'uploader': '_le_cannibale_',
  707. 'uploader_id': '6604511138619654149',
  708. 'uploader_url': 'https://www.tiktok.com/@_le_cannibale_',
  709. 'channel_url': 'https://www.tiktok.com/@MS4wLjABAAAAoShJqaw_5gvy48y3azFeFcT4jeyKWbB0VVYasOCt2tTLwjNFIaDcHAM4D-QGXFOP',
  710. 'channel_id': 'MS4wLjABAAAAoShJqaw_5gvy48y3azFeFcT4jeyKWbB0VVYasOCt2tTLwjNFIaDcHAM4D-QGXFOP',
  711. 'artists': ['nathan !'],
  712. 'track': 'grahamscott canon',
  713. 'duration': 10,
  714. 'upload_date': '20220905',
  715. 'timestamp': 1662406249,
  716. 'view_count': int,
  717. 'like_count': int,
  718. 'repost_count': int,
  719. 'comment_count': int,
  720. 'thumbnail': r're:^https://.+\.(?:webp|jpe?g)',
  721. },
  722. }, {
  723. # only available via web
  724. 'url': 'https://www.tiktok.com/@moxypatch/video/7206382937372134662',
  725. 'md5': '4cdefa501ac8ac20bf04986e10916fea',
  726. 'info_dict': {
  727. 'id': '7206382937372134662',
  728. 'ext': 'mp4',
  729. 'title': 'md5:1d95c0b96560ca0e8a231af4172b2c0a',
  730. 'description': 'md5:1d95c0b96560ca0e8a231af4172b2c0a',
  731. 'channel': 'MoxyPatch',
  732. 'uploader': 'moxypatch',
  733. 'uploader_id': '7039142049363379205',
  734. 'uploader_url': 'https://www.tiktok.com/@moxypatch',
  735. 'channel_url': 'https://www.tiktok.com/@MS4wLjABAAAAFhqKnngMHJSsifL0w1vFOP5kn3Ndo1ODp0XuIBkNMBCkALTvwILdpu12g3pTtL4V',
  736. 'channel_id': 'MS4wLjABAAAAFhqKnngMHJSsifL0w1vFOP5kn3Ndo1ODp0XuIBkNMBCkALTvwILdpu12g3pTtL4V',
  737. 'artists': ['your worst nightmare'],
  738. 'track': 'original sound',
  739. 'upload_date': '20230303',
  740. 'timestamp': 1677866781,
  741. 'duration': 10,
  742. 'view_count': int,
  743. 'like_count': int,
  744. 'repost_count': int,
  745. 'comment_count': int,
  746. 'thumbnail': r're:^https://.+',
  747. 'thumbnails': 'count:3',
  748. },
  749. 'expected_warnings': ['Unable to find video in feed'],
  750. }, {
  751. # 1080p format
  752. 'url': 'https://www.tiktok.com/@tatemcrae/video/7107337212743830830', # FIXME: Web can only get audio
  753. 'md5': '982512017a8a917124d5a08c8ae79621',
  754. 'info_dict': {
  755. 'id': '7107337212743830830',
  756. 'ext': 'mp4',
  757. 'title': 'new music video 4 don’t come backkkk🧸🖤 i hope u enjoy !! @musicontiktok',
  758. 'description': 'new music video 4 don’t come backkkk🧸🖤 i hope u enjoy !! @musicontiktok',
  759. 'uploader': 'tatemcrae',
  760. 'uploader_id': '86328792343818240',
  761. 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAA-0bQT0CqebTRr6I4IkYvMDMKSRSJHLNPBo5HrSklJwyA2psXLSZG5FP-LMNpHnJd',
  762. 'channel_id': 'MS4wLjABAAAA-0bQT0CqebTRr6I4IkYvMDMKSRSJHLNPBo5HrSklJwyA2psXLSZG5FP-LMNpHnJd',
  763. 'channel': 'tate mcrae',
  764. 'artists': ['tate mcrae'],
  765. 'track': 'original sound',
  766. 'upload_date': '20220609',
  767. 'timestamp': 1654805899,
  768. 'duration': 150,
  769. 'view_count': int,
  770. 'like_count': int,
  771. 'repost_count': int,
  772. 'comment_count': int,
  773. 'thumbnail': r're:^https://.+\.webp',
  774. },
  775. 'skip': 'Unavailable via feed API, only audio available via web',
  776. }, {
  777. # Slideshow, audio-only m4a format
  778. 'url': 'https://www.tiktok.com/@hara_yoimiya/video/7253412088251534594',
  779. 'md5': '2ff8fe0174db2dbf49c597a7bef4e47d',
  780. 'info_dict': {
  781. 'id': '7253412088251534594',
  782. 'ext': 'm4a',
  783. 'title': 'я ред флаг простите #переписка #щитпост #тревожныйтиппривязанности #рекомендации ',
  784. 'description': 'я ред флаг простите #переписка #щитпост #тревожныйтиппривязанности #рекомендации ',
  785. 'uploader': 'hara_yoimiya',
  786. 'uploader_id': '6582536342634676230',
  787. 'uploader_url': 'https://www.tiktok.com/@hara_yoimiya',
  788. 'channel_url': 'https://www.tiktok.com/@MS4wLjABAAAAIAlDxriiPWLE-p8p1R_0Bx8qWKfi-7zwmGhzU8Mv25W8sNxjfIKrol31qTczzuLB',
  789. 'channel_id': 'MS4wLjABAAAAIAlDxriiPWLE-p8p1R_0Bx8qWKfi-7zwmGhzU8Mv25W8sNxjfIKrol31qTczzuLB',
  790. 'channel': 'лампочка(!)',
  791. 'artists': ['Øneheart'],
  792. 'album': 'watching the stars',
  793. 'track': 'watching the stars',
  794. 'duration': 60,
  795. 'upload_date': '20230708',
  796. 'timestamp': 1688816612,
  797. 'view_count': int,
  798. 'like_count': int,
  799. 'comment_count': int,
  800. 'repost_count': int,
  801. 'thumbnail': r're:^https://.+\.(?:webp|jpe?g)',
  802. },
  803. }, {
  804. # Auto-captions available
  805. 'url': 'https://www.tiktok.com/@hankgreen1/video/7047596209028074758',
  806. 'only_matching': True,
  807. }]
  808. def _real_extract(self, url):
  809. video_id, user_id = self._match_valid_url(url).group('id', 'user_id')
  810. if self._KNOWN_APP_INFO:
  811. try:
  812. return self._extract_aweme_app(video_id)
  813. except ExtractorError as e:
  814. e.expected = True
  815. self.report_warning(f'{e}; trying with webpage')
  816. url = self._create_url(user_id, video_id)
  817. video_data, status = self._extract_web_data_and_status(url, video_id)
  818. if video_data and status == 0:
  819. return self._parse_aweme_video_web(video_data, url, video_id)
  820. elif status == 10216:
  821. raise ExtractorError('This video is private', expected=True)
  822. raise ExtractorError(f'Video not available, status code {status}', video_id=video_id)
  823. class TikTokUserIE(TikTokBaseIE):
  824. IE_NAME = 'tiktok:user'
  825. _VALID_URL = r'(?:tiktokuser:|https?://(?:www\.)?tiktok\.com/@)(?P<id>[\w.-]+)/?(?:$|[#?])'
  826. _TESTS = [{
  827. 'url': 'https://tiktok.com/@corgibobaa?lang=en',
  828. 'playlist_mincount': 45,
  829. 'info_dict': {
  830. 'id': 'MS4wLjABAAAAepiJKgwWhulvCpSuUVsp7sgVVsFJbbNaLeQ6OQ0oAJERGDUIXhb2yxxHZedsItgT',
  831. 'title': 'corgibobaa',
  832. },
  833. }, {
  834. 'url': 'https://www.tiktok.com/@6820838815978423302',
  835. 'playlist_mincount': 5,
  836. 'info_dict': {
  837. 'id': 'MS4wLjABAAAA0tF1nBwQVVMyrGu3CqttkNgM68Do1OXUFuCY0CRQk8fEtSVDj89HqoqvbSTmUP2W',
  838. 'title': '6820838815978423302',
  839. },
  840. }, {
  841. 'url': 'https://www.tiktok.com/@meme',
  842. 'playlist_mincount': 593,
  843. 'info_dict': {
  844. 'id': 'MS4wLjABAAAAiKfaDWeCsT3IHwY77zqWGtVRIy9v4ws1HbVi7auP1Vx7dJysU_hc5yRiGywojRD6',
  845. 'title': 'meme',
  846. },
  847. }, {
  848. 'url': 'tiktokuser:MS4wLjABAAAAM3R2BtjzVT-uAtstkl2iugMzC6AtnpkojJbjiOdDDrdsTiTR75-8lyWJCY5VvDrZ',
  849. 'playlist_mincount': 31,
  850. 'info_dict': {
  851. 'id': 'MS4wLjABAAAAM3R2BtjzVT-uAtstkl2iugMzC6AtnpkojJbjiOdDDrdsTiTR75-8lyWJCY5VvDrZ',
  852. },
  853. }]
  854. _USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:115.0) Gecko/20100101 Firefox/115.0'
  855. _API_BASE_URL = 'https://www.tiktok.com/api/creator/item_list/'
  856. def _build_web_query(self, sec_uid, cursor):
  857. return {
  858. 'aid': '1988',
  859. 'app_language': 'en',
  860. 'app_name': 'tiktok_web',
  861. 'browser_language': 'en-US',
  862. 'browser_name': 'Mozilla',
  863. 'browser_online': 'true',
  864. 'browser_platform': 'Win32',
  865. 'browser_version': '5.0 (Windows)',
  866. 'channel': 'tiktok_web',
  867. 'cookie_enabled': 'true',
  868. 'count': '15',
  869. 'cursor': cursor,
  870. 'device_id': self._DEVICE_ID,
  871. 'device_platform': 'web_pc',
  872. 'focus_state': 'true',
  873. 'from_page': 'user',
  874. 'history_len': '2',
  875. 'is_fullscreen': 'false',
  876. 'is_page_visible': 'true',
  877. 'language': 'en',
  878. 'os': 'windows',
  879. 'priority_region': '',
  880. 'referer': '',
  881. 'region': 'US',
  882. 'screen_height': '1080',
  883. 'screen_width': '1920',
  884. 'secUid': sec_uid,
  885. 'type': '1', # pagination type: 0 == oldest-to-newest, 1 == newest-to-oldest
  886. 'tz_name': 'UTC',
  887. 'verifyFp': f'verify_{"".join(random.choices(string.hexdigits, k=7))}',
  888. 'webcast_language': 'en',
  889. }
  890. def _entries(self, sec_uid, user_name):
  891. display_id = user_name or sec_uid
  892. seen_ids = set()
  893. cursor = int(time.time() * 1E3)
  894. for page in itertools.count(1):
  895. response = self._download_json(
  896. self._API_BASE_URL, display_id, f'Downloading page {page}',
  897. query=self._build_web_query(sec_uid, cursor), headers={'User-Agent': self._USER_AGENT})
  898. for video in traverse_obj(response, ('itemList', lambda _, v: v['id'])):
  899. video_id = video['id']
  900. if video_id in seen_ids:
  901. continue
  902. seen_ids.add(video_id)
  903. webpage_url = self._create_url(display_id, video_id)
  904. yield self.url_result(
  905. webpage_url, TikTokIE,
  906. **self._parse_aweme_video_web(video, webpage_url, video_id, extract_flat=True))
  907. old_cursor = cursor
  908. cursor = traverse_obj(
  909. response, ('itemList', -1, 'createTime', {lambda x: int(x * 1E3)}))
  910. if not cursor or old_cursor == cursor:
  911. # User may not have posted within this ~1 week lookback, so manually adjust cursor
  912. cursor = old_cursor - 7 * 86_400_000
  913. # In case 'hasMorePrevious' is wrong, break if we have gone back before TikTok existed
  914. if cursor < 1472706000000 or not traverse_obj(response, 'hasMorePrevious'):
  915. break
  916. def _get_sec_uid(self, user_url, user_name, msg):
  917. webpage = self._download_webpage(
  918. user_url, user_name, fatal=False, headers={'User-Agent': 'Mozilla/5.0'},
  919. note=f'Downloading {msg} webpage', errnote=f'Unable to download {msg} webpage') or ''
  920. return (traverse_obj(self._get_universal_data(webpage, user_name),
  921. ('webapp.user-detail', 'userInfo', 'user', 'secUid', {str}))
  922. or traverse_obj(self._get_sigi_state(webpage, user_name),
  923. ('LiveRoom', 'liveRoomUserInfo', 'user', 'secUid', {str}),
  924. ('UserModule', 'users', ..., 'secUid', {str}, any)))
  925. def _real_extract(self, url):
  926. user_name, sec_uid = self._match_id(url), None
  927. if mobj := re.fullmatch(r'MS4wLjABAAAA[\w-]{64}', user_name):
  928. user_name, sec_uid = None, mobj.group(0)
  929. else:
  930. sec_uid = (self._get_sec_uid(self._UPLOADER_URL_FORMAT % user_name, user_name, 'user')
  931. or self._get_sec_uid(self._UPLOADER_URL_FORMAT % f'{user_name}/live', user_name, 'live'))
  932. if not sec_uid:
  933. webpage = self._download_webpage(
  934. f'https://www.tiktok.com/embed/@{user_name}', user_name,
  935. note='Downloading user embed page', fatal=False) or ''
  936. data = traverse_obj(self._search_json(
  937. r'<script[^>]+\bid=[\'"]__FRONTITY_CONNECT_STATE__[\'"][^>]*>',
  938. webpage, 'data', user_name, default={}),
  939. ('source', 'data', f'/embed/@{user_name}', {dict}))
  940. for aweme_id in traverse_obj(data, ('videoList', ..., 'id', {str})):
  941. webpage_url = self._create_url(user_name, aweme_id)
  942. video_data, _ = self._extract_web_data_and_status(webpage_url, aweme_id, fatal=False)
  943. sec_uid = self._parse_aweme_video_web(
  944. video_data, webpage_url, aweme_id, extract_flat=True).get('channel_id')
  945. if sec_uid:
  946. break
  947. if not sec_uid:
  948. raise ExtractorError(
  949. 'Unable to extract secondary user ID. If you are able to get the channel_id '
  950. 'from a video posted by this user, try using "tiktokuser:channel_id" as the '
  951. 'input URL (replacing `channel_id` with its actual value)', expected=True)
  952. return self.playlist_result(self._entries(sec_uid, user_name), sec_uid, user_name)
  953. class TikTokBaseListIE(TikTokBaseIE): # XXX: Conventionally, base classes should end with BaseIE/InfoExtractor
  954. def _entries(self, list_id, display_id):
  955. query = {
  956. self._QUERY_NAME: list_id,
  957. 'cursor': 0,
  958. 'count': 20,
  959. 'type': 5,
  960. 'device_id': self._DEVICE_ID,
  961. }
  962. for page in itertools.count(1):
  963. for retry in self.RetryManager():
  964. try:
  965. post_list = self._call_api(
  966. self._API_ENDPOINT, display_id, query=query,
  967. note=f'Downloading video list page {page}',
  968. errnote='Unable to download video list')
  969. except ExtractorError as e:
  970. if isinstance(e.cause, json.JSONDecodeError) and e.cause.pos == 0:
  971. retry.error = e
  972. continue
  973. raise
  974. for video in post_list.get('aweme_list', []):
  975. yield {
  976. **self._parse_aweme_video_app(video),
  977. 'extractor_key': TikTokIE.ie_key(),
  978. 'extractor': 'TikTok',
  979. 'webpage_url': f'https://tiktok.com/@_/video/{video["aweme_id"]}',
  980. }
  981. if not post_list.get('has_more'):
  982. break
  983. query['cursor'] = post_list['cursor']
  984. def _real_extract(self, url):
  985. list_id = self._match_id(url)
  986. return self.playlist_result(self._entries(list_id, list_id), list_id)
class TikTokSoundIE(TikTokBaseListIE):
    # List extractor for `/music/<slug>-<id>` URLs; the trailing digits of the
    # slug are captured as the list ID.
    IE_NAME = 'tiktok:sound'
    _VALID_URL = r'https?://(?:www\.)?tiktok\.com/music/[\w\.-]+-(?P<id>[\d]+)[/?#&]?'
    # This extractor is flagged as not working
    _WORKING = False
    # Query key under which TikTokBaseListIE._entries sends the list ID
    _QUERY_NAME = 'music_id'
    # Endpoint that TikTokBaseListIE._entries passes to _call_api
    _API_ENDPOINT = 'music/aweme'
    _TESTS = [{
        'url': 'https://www.tiktok.com/music/Build-a-Btch-6956990112127585029?lang=en',
        'playlist_mincount': 100,
        'info_dict': {
            'id': '6956990112127585029',
        },
        'expected_warnings': ['Retrying'],
    }, {
        # Actual entries are less than listed video count
        'url': 'https://www.tiktok.com/music/jiefei-soap-remix-7036843036118469381',
        'playlist_mincount': 2182,
        'info_dict': {
            'id': '7036843036118469381',
        },
        'expected_warnings': ['Retrying'],
    }]
class TikTokEffectIE(TikTokBaseListIE):
    # List extractor for `/sticker/<slug>-<id>` URLs; the trailing digits of
    # the slug are captured as the list ID.
    IE_NAME = 'tiktok:effect'
    _VALID_URL = r'https?://(?:www\.)?tiktok\.com/sticker/[\w\.-]+-(?P<id>[\d]+)[/?#&]?'
    # This extractor is flagged as not working
    _WORKING = False
    # Query key under which TikTokBaseListIE._entries sends the list ID
    _QUERY_NAME = 'sticker_id'
    # Endpoint that TikTokBaseListIE._entries passes to _call_api
    _API_ENDPOINT = 'sticker/aweme'
    _TESTS = [{
        'url': 'https://www.tiktok.com/sticker/MATERIAL-GWOOORL-1258156',
        'playlist_mincount': 100,
        'info_dict': {
            'id': '1258156',
        },
        'expected_warnings': ['Retrying'],
    }, {
        # Different entries between mobile and web, depending on region
        'url': 'https://www.tiktok.com/sticker/Elf-Friend-479565',
        'only_matching': True,
    }]
  1027. class TikTokTagIE(TikTokBaseListIE):
  1028. IE_NAME = 'tiktok:tag'
  1029. _VALID_URL = r'https?://(?:www\.)?tiktok\.com/tag/(?P<id>[^/?#&]+)'
  1030. _WORKING = False
  1031. _QUERY_NAME = 'ch_id'
  1032. _API_ENDPOINT = 'challenge/aweme'
  1033. _TESTS = [{
  1034. 'url': 'https://tiktok.com/tag/hello2018',
  1035. 'playlist_mincount': 39,
  1036. 'info_dict': {
  1037. 'id': '46294678',
  1038. 'title': 'hello2018',
  1039. },
  1040. 'expected_warnings': ['Retrying'],
  1041. }, {
  1042. 'url': 'https://tiktok.com/tag/fypシ?is_copy_url=0&is_from_webapp=v1',
  1043. 'only_matching': True,
  1044. }]
  1045. def _real_extract(self, url):
  1046. display_id = self._match_id(url)
  1047. webpage = self._download_webpage(url, display_id, headers={
  1048. 'User-Agent': 'facebookexternalhit/1.1 (+http://www.facebook.com/externalhit_uatext.php)',
  1049. })
  1050. tag_id = self._html_search_regex(r'snssdk\d*://challenge/detail/(\d+)', webpage, 'tag ID')
  1051. return self.playlist_result(self._entries(tag_id, display_id), tag_id, display_id)
  1052. class TikTokCollectionIE(TikTokBaseIE):
  1053. IE_NAME = 'tiktok:collection'
  1054. _VALID_URL = r'https?://www\.tiktok\.com/@(?P<user_id>[\w.-]+)/collection/(?P<title>[^/?#]+)-(?P<id>\d+)/?(?:[?#]|$)'
  1055. _TESTS = [{
  1056. # playlist should have exactly 9 videos
  1057. 'url': 'https://www.tiktok.com/@imanoreotwe/collection/count-test-7371330159376370462',
  1058. 'info_dict': {
  1059. 'id': '7371330159376370462',
  1060. 'title': 'imanoreotwe-count-test',
  1061. },
  1062. 'playlist_count': 9,
  1063. }, {
  1064. # tests returning multiple pages of a large collection
  1065. 'url': 'https://www.tiktok.com/@imanoreotwe/collection/%F0%9F%98%82-7111887189571160875',
  1066. 'info_dict': {
  1067. 'id': '7111887189571160875',
  1068. 'title': 'imanoreotwe-%F0%9F%98%82',
  1069. },
  1070. 'playlist_mincount': 100,
  1071. }]
  1072. _API_BASE_URL = 'https://www.tiktok.com/api/collection/item_list/'
  1073. _PAGE_COUNT = 30
  1074. def _build_web_query(self, collection_id, cursor):
  1075. return {
  1076. 'aid': '1988',
  1077. 'collectionId': collection_id,
  1078. 'count': self._PAGE_COUNT,
  1079. 'cursor': cursor,
  1080. 'sourceType': '113',
  1081. }
  1082. def _entries(self, collection_id):
  1083. cursor = 0
  1084. for page in itertools.count(1):
  1085. response = self._download_json(
  1086. self._API_BASE_URL, collection_id, f'Downloading page {page}',
  1087. query=self._build_web_query(collection_id, cursor))
  1088. for video in traverse_obj(response, ('itemList', lambda _, v: v['id'])):
  1089. video_id = video['id']
  1090. author = traverse_obj(video, ('author', ('uniqueId', 'secUid', 'id'), {str}, any)) or '_'
  1091. webpage_url = self._create_url(author, video_id)
  1092. yield self.url_result(
  1093. webpage_url, TikTokIE,
  1094. **self._parse_aweme_video_web(video, webpage_url, video_id, extract_flat=True))
  1095. if not traverse_obj(response, 'hasMore'):
  1096. break
  1097. cursor += self._PAGE_COUNT
  1098. def _real_extract(self, url):
  1099. collection_id, title, user_name = self._match_valid_url(url).group('id', 'title', 'user_id')
  1100. return self.playlist_result(
  1101. self._entries(collection_id), collection_id, '-'.join((user_name, title)))
  1102. class DouyinIE(TikTokBaseIE):
  1103. _VALID_URL = r'https?://(?:www\.)?douyin\.com/video/(?P<id>[0-9]+)'
  1104. _TESTS = [{
  1105. 'url': 'https://www.douyin.com/video/6961737553342991651',
  1106. 'md5': '9ecce7bc5b302601018ecb2871c63a75',
  1107. 'info_dict': {
  1108. 'id': '6961737553342991651',
  1109. 'ext': 'mp4',
  1110. 'title': '#杨超越 小小水手带你去远航❤️',
  1111. 'description': '#杨超越 小小水手带你去远航❤️',
  1112. 'uploader': '6897520xka',
  1113. 'uploader_id': '110403406559',
  1114. 'uploader_url': 'https://www.douyin.com/user/MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98',
  1115. 'channel_id': 'MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98',
  1116. 'channel': '杨超越',
  1117. 'duration': 19,
  1118. 'timestamp': 1620905839,
  1119. 'upload_date': '20210513',
  1120. 'track': '@杨超越创作的原声',
  1121. 'artists': ['杨超越'],
  1122. 'view_count': int,
  1123. 'like_count': int,
  1124. 'repost_count': int,
  1125. 'comment_count': int,
  1126. 'thumbnail': r're:https?://.+\.jpe?g',
  1127. },
  1128. }, {
  1129. 'url': 'https://www.douyin.com/video/6982497745948921092',
  1130. 'md5': '15c5e660b7048af3707304e3cc02bbb5',
  1131. 'info_dict': {
  1132. 'id': '6982497745948921092',
  1133. 'ext': 'mp4',
  1134. 'title': '这个夏日和小羊@杨超越 一起遇见白色幻想',
  1135. 'description': '这个夏日和小羊@杨超越 一起遇见白色幻想',
  1136. 'uploader': '0731chaoyue',
  1137. 'uploader_id': '408654318141572',
  1138. 'uploader_url': 'https://www.douyin.com/user/MS4wLjABAAAAZJpnglcjW2f_CMVcnqA_6oVBXKWMpH0F8LIHuUu8-lA',
  1139. 'channel_id': 'MS4wLjABAAAAZJpnglcjW2f_CMVcnqA_6oVBXKWMpH0F8LIHuUu8-lA',
  1140. 'channel': '杨超越工作室',
  1141. 'duration': 42,
  1142. 'timestamp': 1625739481,
  1143. 'upload_date': '20210708',
  1144. 'track': '@杨超越工作室创作的原声',
  1145. 'artists': ['杨超越工作室'],
  1146. 'view_count': int,
  1147. 'like_count': int,
  1148. 'repost_count': int,
  1149. 'comment_count': int,
  1150. 'thumbnail': r're:https?://.+\.jpe?g',
  1151. },
  1152. }, {
  1153. 'url': 'https://www.douyin.com/video/6953975910773099811',
  1154. 'md5': '0e6443758b8355db9a3c34864a4276be',
  1155. 'info_dict': {
  1156. 'id': '6953975910773099811',
  1157. 'ext': 'mp4',
  1158. 'title': '#一起看海 出现在你的夏日里',
  1159. 'description': '#一起看海 出现在你的夏日里',
  1160. 'uploader': '6897520xka',
  1161. 'uploader_id': '110403406559',
  1162. 'uploader_url': 'https://www.douyin.com/user/MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98',
  1163. 'channel_id': 'MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98',
  1164. 'channel': '杨超越',
  1165. 'duration': 17,
  1166. 'timestamp': 1619098692,
  1167. 'upload_date': '20210422',
  1168. 'track': '@杨超越创作的原声',
  1169. 'artists': ['杨超越'],
  1170. 'view_count': int,
  1171. 'like_count': int,
  1172. 'repost_count': int,
  1173. 'comment_count': int,
  1174. 'thumbnail': r're:https?://.+\.jpe?g',
  1175. },
  1176. }, {
  1177. 'url': 'https://www.douyin.com/video/6950251282489675042',
  1178. 'md5': 'b4db86aec367ef810ddd38b1737d2fed',
  1179. 'info_dict': {
  1180. 'id': '6950251282489675042',
  1181. 'ext': 'mp4',
  1182. 'title': '哈哈哈,成功了哈哈哈哈哈哈',
  1183. 'uploader': '杨超越',
  1184. 'upload_date': '20210412',
  1185. 'timestamp': 1618231483,
  1186. 'uploader_id': '110403406559',
  1187. 'view_count': int,
  1188. 'like_count': int,
  1189. 'repost_count': int,
  1190. 'comment_count': int,
  1191. },
  1192. 'skip': 'No longer available',
  1193. }, {
  1194. 'url': 'https://www.douyin.com/video/6963263655114722595',
  1195. 'md5': '1440bcf59d8700f8e014da073a4dfea8',
  1196. 'info_dict': {
  1197. 'id': '6963263655114722595',
  1198. 'ext': 'mp4',
  1199. 'title': '#哪个爱豆的105度最甜 换个角度看看我哈哈',
  1200. 'description': '#哪个爱豆的105度最甜 换个角度看看我哈哈',
  1201. 'uploader': '6897520xka',
  1202. 'uploader_id': '110403406559',
  1203. 'uploader_url': 'https://www.douyin.com/user/MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98',
  1204. 'channel_id': 'MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98',
  1205. 'channel': '杨超越',
  1206. 'duration': 15,
  1207. 'timestamp': 1621261163,
  1208. 'upload_date': '20210517',
  1209. 'track': '@杨超越创作的原声',
  1210. 'artists': ['杨超越'],
  1211. 'view_count': int,
  1212. 'like_count': int,
  1213. 'repost_count': int,
  1214. 'comment_count': int,
  1215. 'thumbnail': r're:https?://.+\.jpe?g',
  1216. },
  1217. }]
  1218. _UPLOADER_URL_FORMAT = 'https://www.douyin.com/user/%s'
  1219. _WEBPAGE_HOST = 'https://www.douyin.com/'
  1220. def _real_extract(self, url):
  1221. video_id = self._match_id(url)
  1222. detail = traverse_obj(self._download_json(
  1223. 'https://www.douyin.com/aweme/v1/web/aweme/detail/', video_id,
  1224. 'Downloading web detail JSON', 'Failed to download web detail JSON',
  1225. query={'aweme_id': video_id}, fatal=False), ('aweme_detail', {dict}))
  1226. if not detail:
  1227. # TODO: Run verification challenge code to generate signature cookies
  1228. raise ExtractorError(
  1229. 'Fresh cookies (not necessarily logged in) are needed',
  1230. expected=not self._get_cookies(self._WEBPAGE_HOST).get('s_v_web_id'))
  1231. return self._parse_aweme_video_app(detail)
  1232. class TikTokVMIE(InfoExtractor):
  1233. _VALID_URL = r'https?://(?:(?:vm|vt)\.tiktok\.com|(?:www\.)tiktok\.com/t)/(?P<id>\w+)'
  1234. IE_NAME = 'vm.tiktok'
  1235. _TESTS = [{
  1236. 'url': 'https://www.tiktok.com/t/ZTRC5xgJp',
  1237. 'info_dict': {
  1238. 'id': '7170520270497680683',
  1239. 'ext': 'mp4',
  1240. 'title': 'md5:c64f6152330c2efe98093ccc8597871c',
  1241. 'uploader_id': '6687535061741700102',
  1242. 'upload_date': '20221127',
  1243. 'view_count': int,
  1244. 'like_count': int,
  1245. 'comment_count': int,
  1246. 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAAObqu3WCTXxmw2xwZ3iLEHnEecEIw7ks6rxWqOqOhaPja9BI7gqUQnjw8_5FSoDXX',
  1247. 'album': 'Wave of Mutilation: Best of Pixies',
  1248. 'thumbnail': r're:https://.+\.webp.*',
  1249. 'duration': 5,
  1250. 'timestamp': 1669516858,
  1251. 'repost_count': int,
  1252. 'artist': 'Pixies',
  1253. 'track': 'Where Is My Mind?',
  1254. 'description': 'md5:c64f6152330c2efe98093ccc8597871c',
  1255. 'uploader': 'sigmachaddeus',
  1256. 'creator': 'SigmaChad',
  1257. },
  1258. }, {
  1259. 'url': 'https://vm.tiktok.com/ZTR45GpSF/',
  1260. 'info_dict': {
  1261. 'id': '7106798200794926362',
  1262. 'ext': 'mp4',
  1263. 'title': 'md5:edc3e7ea587847f8537468f2fe51d074',
  1264. 'uploader_id': '6997695878846268418',
  1265. 'upload_date': '20220608',
  1266. 'view_count': int,
  1267. 'like_count': int,
  1268. 'comment_count': int,
  1269. 'thumbnail': r're:https://.+\.webp.*',
  1270. 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAAdZ_NcPPgMneaGrW0hN8O_J_bwLshwNNERRF5DxOw2HKIzk0kdlLrR8RkVl1ksrMO',
  1271. 'duration': 29,
  1272. 'timestamp': 1654680400,
  1273. 'repost_count': int,
  1274. 'artist': 'Akihitoko',
  1275. 'track': 'original sound',
  1276. 'description': 'md5:edc3e7ea587847f8537468f2fe51d074',
  1277. 'uploader': 'akihitoko1',
  1278. 'creator': 'Akihitoko',
  1279. },
  1280. }, {
  1281. 'url': 'https://vt.tiktok.com/ZSe4FqkKd',
  1282. 'only_matching': True,
  1283. }]
  1284. def _real_extract(self, url):
  1285. new_url = self._request_webpage(
  1286. HEADRequest(url), self._match_id(url), headers={'User-Agent': 'facebookexternalhit/1.1'}).url
  1287. if self.suitable(new_url): # Prevent infinite loop in case redirect fails
  1288. raise UnsupportedError(new_url)
  1289. return self.url_result(new_url)
  1290. class TikTokLiveIE(TikTokBaseIE):
  1291. _VALID_URL = r'''(?x)https?://(?:
  1292. (?:www\.)?tiktok\.com/@(?P<uploader>[\w.-]+)/live|
  1293. m\.tiktok\.com/share/live/(?P<id>\d+)
  1294. )'''
  1295. IE_NAME = 'tiktok:live'
  1296. _TESTS = [{
  1297. 'url': 'https://www.tiktok.com/@weathernewslive/live',
  1298. 'info_dict': {
  1299. 'id': '7210809319192726273',
  1300. 'ext': 'mp4',
  1301. 'title': r're:ウェザーニュースLiVE[\d\s:-]*',
  1302. 'creator': 'ウェザーニュースLiVE',
  1303. 'uploader': 'weathernewslive',
  1304. 'uploader_id': '6621496731283095554',
  1305. 'uploader_url': 'https://www.tiktok.com/@weathernewslive',
  1306. 'live_status': 'is_live',
  1307. 'concurrent_view_count': int,
  1308. },
  1309. 'params': {'skip_download': 'm3u8'},
  1310. }, {
  1311. 'url': 'https://www.tiktok.com/@pilarmagenta/live',
  1312. 'info_dict': {
  1313. 'id': '7209423610325322522',
  1314. 'ext': 'mp4',
  1315. 'title': str,
  1316. 'creator': 'Pilarmagenta',
  1317. 'uploader': 'pilarmagenta',
  1318. 'uploader_id': '6624846890674683909',
  1319. 'uploader_url': 'https://www.tiktok.com/@pilarmagenta',
  1320. 'live_status': 'is_live',
  1321. 'concurrent_view_count': int,
  1322. },
  1323. 'skip': 'Livestream',
  1324. }, {
  1325. 'url': 'https://m.tiktok.com/share/live/7209423610325322522/?language=en',
  1326. 'only_matching': True,
  1327. }, {
  1328. 'url': 'https://www.tiktok.com/@iris04201/live',
  1329. 'only_matching': True,
  1330. }]
  1331. def _call_api(self, url, param, room_id, uploader, key=None):
  1332. response = traverse_obj(self._download_json(
  1333. url, room_id, fatal=False, query={
  1334. 'aid': '1988',
  1335. param: room_id,
  1336. }), (key, {dict}), default={})
  1337. # status == 2 if live else 4
  1338. if int_or_none(response.get('status')) == 2:
  1339. return response
  1340. # If room_id is obtained via mobile share URL and cannot be refreshed, do not wait for live
  1341. elif not uploader:
  1342. raise ExtractorError('This livestream has ended', expected=True)
  1343. raise UserNotLive(video_id=uploader)
  def _real_extract(self, url):
      """Build the info dict (formats and metadata) for a TikTok livestream URL.

      The room ID is taken, in order of preference, from the webpage's SIGI
      state, from a `snssdk…://live?room_id=` link in the webpage, or from the
      URL itself. Formats are then collected from the webcast room info
      endpoint; if none of them has the `mp4` extension, the live detail
      endpoint is queried as a fallback.

      @param url  livestream URL matched by this extractor
      @returns    info dict with `id`, `uploader`, `uploader_url`, `is_live`,
                  `formats`, `_format_sort_fields` and whatever metadata could
                  be read from the API response
      @raises UserNotLive  if no room ID could be determined; `_call_api` may
                           also raise when the room is not live
      """
      mobj = self._match_valid_url(url)
      uploader, room_id = mobj.group('uploader', 'id')
      display_id = uploader or room_id

      # The page download is only allowed to fail when the URL already supplied a room ID
      webpage = self._download_webpage(
          url, display_id, headers={'User-Agent': 'Mozilla/5.0'}, fatal=not room_id)

      if webpage:
          sigi_state = self._get_sigi_state(webpage, display_id)
          room_id = (
              traverse_obj(sigi_state, ('UserModule', 'users', ..., 'roomId', {str_or_none}), get_all=False)
              or self._search_regex(r'snssdk\d*://live\?room_id=(\d+)', webpage, 'room ID', default=None)
              or room_id)
          if not uploader:
              uploader = traverse_obj(
                  sigi_state, ('LiveRoom', 'liveRoomUserInfo', 'user', 'uniqueId'),
                  ('UserModule', 'users', ..., 'uniqueId'), get_all=False, expected_type=str)

      if not room_id:
          raise UserNotLive(video_id=uploader)

      formats = []
      live_info = self._call_api(
          'https://webcast.tiktok.com/webcast/room/info', 'room_id', room_id, uploader, key='data')

      # Quality names ordered from worst to best; 'ORIGION' is reproduced exactly as written
      get_quality = qualities(('SD1', 'ld', 'SD2', 'sd', 'HD1', 'hd', 'FULL_HD1', 'uhd', 'ORIGION', 'origin'))
      # Several response fields hold JSON encoded as a string and need a second parse
      parse_inner = lambda x: self._parse_json(x, None)

      # Formats described by the SDK stream data, keyed by quality name
      sdk_streams = traverse_obj(live_info, (
          'stream_url', 'live_core_sdk_data', 'pull_data', 'stream_data',
          {parse_inner}, 'data', {dict}), default={})
      for quality, sdk_stream in sdk_streams.items():
          sdk_params = traverse_obj(sdk_stream, ('main', 'sdk_params', {parse_inner}, {
              'vcodec': ('VCodec', {str}),
              'tbr': ('vbitrate', {lambda x: int_or_none(x, 1000)}),
              'resolution': ('resolution', {lambda x: re.match(r'(?i)\d+x\d+|\d+p', x).group().lower()}),
          }))

          for kind, ext in (('flv', 'flv'), ('hls', 'mp4')):
              sdk_url = traverse_obj(sdk_stream, ('main', kind, {url_or_none}))
              if not sdk_url:
                  continue
              fmt = {'url': sdk_url, 'ext': ext}
              if kind == 'hls':
                  fmt['protocol'] = 'm3u8_native'
              fmt.update({
                  'format_id': f'{kind}-{quality}',
                  'quality': get_quality(quality),
                  **sdk_params,
              })
              formats.append(fmt)

      def get_vcodec(*path):
          # The *_pull_url_params values are JSON strings carrying the codec name
          return traverse_obj(live_info, (
              'stream_url', *path, {parse_inner}, 'VCodec', {str}))

      # Single pull URLs for HLS and RTMP
      for kind in ('hls', 'rtmp'):
          pull_url = traverse_obj(live_info, ('stream_url', f'{kind}_pull_url', {url_or_none}))
          if not pull_url:
              continue
          is_hls = kind == 'hls'
          formats.append({
              'url': pull_url,
              'ext': 'mp4' if is_hls else 'flv',
              'protocol': 'm3u8_native' if is_hls else 'https',
              'format_id': f'{kind}-pull',
              'vcodec': get_vcodec(f'{kind}_pull_url_params'),
              'quality': get_quality('ORIGION'),
          })

      # FLV pull URLs, one per format ID
      flv_pull_urls = traverse_obj(live_info, ('stream_url', 'flv_pull_url', {dict}), default={})
      for f_id, f_url in flv_pull_urls.items():
          if url_or_none(f_url):
              formats.append({
                  'url': f_url,
                  'ext': 'flv',
                  'format_id': f'flv-{f_id}'.lower(),
                  'vcodec': get_vcodec('flv_pull_url_params', f_id),
                  'quality': get_quality(f_id),
              })

      # If uploader is a guest on another's livestream, primary endpoint will not have m3u8 URLs
      if not any(fmt['ext'] == 'mp4' for fmt in formats):
          detail = self._call_api(
              'https://www.tiktok.com/api/live/detail/', 'roomID', room_id, uploader, key='LiveRoomInfo')
          live_info = merge_dicts(live_info, detail)
          fallback_url = live_info.get('liveUrl')
          if url_or_none(fallback_url):
              formats.append({
                  'url': fallback_url,
                  'ext': 'mp4',
                  'protocol': 'm3u8_native',
                  'format_id': 'hls-fallback',
                  'vcodec': 'h264',
                  'quality': get_quality('origin'),
              })

      if not uploader:
          uploader = traverse_obj(live_info, ('ownerInfo', 'uniqueId'), ('owner', 'display_id'))

      # Each field lists the key paths used by the two endpoints; the first match wins
      metadata = traverse_obj(live_info, {
          'title': 'title',
          'uploader_id': (('ownerInfo', 'owner'), 'id', {str_or_none}),
          'creator': (('ownerInfo', 'owner'), 'nickname'),
          'concurrent_view_count': (('user_count', ('liveRoomStats', 'userCount')), {int_or_none}),
      }, get_all=False)

      return {
          'id': room_id,
          'uploader': uploader,
          'uploader_url': format_field(uploader, None, self._UPLOADER_URL_FORMAT) or None,
          'is_live': True,
          'formats': formats,
          '_format_sort_fields': ('quality', 'ext'),
          **metadata,
      }