From 8a5e7e7974a5288056907205765a31eb063e262b Mon Sep 17 00:00:00 2001 From: Bradley <15671567+bradleyhodges@users.noreply.github.com> Date: Thu, 22 Jan 2026 11:41:40 +0800 Subject: [PATCH 1/4] [ie/tiktok] Enable app-based extraction; avoid brittle webpage extraction failures --- yt_dlp/extractor/tiktok.py | 39 +++++++++++++++++++++++++++++--------- 1 file changed, 30 insertions(+), 9 deletions(-) diff --git a/yt_dlp/extractor/tiktok.py b/yt_dlp/extractor/tiktok.py index 02ec2b2f45..ac7e4fd724 100644 --- a/yt_dlp/extractor/tiktok.py +++ b/yt_dlp/extractor/tiktok.py @@ -56,7 +56,7 @@ class TikTokBaseIE(InfoExtractor): @functools.cached_property def _KNOWN_APP_INFO(self): # If we have a genuine device ID, we may not need any IID - default = [''] if self._KNOWN_DEVICE_ID else [] + default = [''] # enable app-based extraction out of the box return self._configuration_arg('app_info', default, ie_key=TikTokIE) @functools.cached_property @@ -68,9 +68,24 @@ class TikTokBaseIE(InfoExtractor): return self._KNOWN_DEVICE_ID or str(random.randint(7250000000000000000, 7325099899999994577)) @functools.cached_property - def _API_HOSTNAME(self): + def _IID(self): + # Install ID (iid) used by the mobile API. When not explicitly provided via extractor-args, + # generate a plausible value so the app-based fallback works out of the box. + return str(random.randint(10 ** 18, 10 ** 19 - 1)) + + @functools.cached_property + def _API_HOSTNAMES(self): return self._configuration_arg( - 'api_hostname', ['api16-normal-c-useast1a.tiktokv.com'], ie_key=TikTokIE)[0] + 'api_hostname', [ + 'api16-normal-c-useast1a.tiktokv.com', + 'api22-normal-c-useast1a.tiktokv.com', + 'api19-normal-c-useast1a.tiktokv.com', + 'api-h2.tiktokv.com', + ], ie_key=TikTokIE) + + @functools.cached_property + def _API_HOSTNAME(self): + return self._API_HOSTNAMES[0] def _get_next_app_info(self): if self._APP_INFO_POOL is None: @@ -89,6 +104,7 @@ class TikTokBaseIE(InfoExtractor): return False self._APP_INFO = self._APP_INFO_POOL.pop(0) + self._APP_INFO.setdefault('iid', self._IID) app_name = self._APP_INFO['app_name'] version = self._APP_INFO['manifest_app_version'] @@ -116,13 +132,14 @@ class TikTokBaseIE(InfoExtractor): ('__DEFAULT_SCOPE__', {dict})) or {} def _call_api_impl(self, ep, video_id, query=None, data=None, headers=None, fatal=True, - note='Downloading API JSON', errnote='Unable to download API page'): - self._set_cookie(self._API_HOSTNAME, 'odin_tt', ''.join(random.choices('0123456789abcdef', k=160))) + note='Downloading API JSON', errnote='Unable to download API page', api_hostname=None): + api_hostname = api_hostname or self._API_HOSTNAME + self._set_cookie(api_hostname, 'odin_tt', ''.join(random.choices('0123456789abcdef', k=160))) webpage_cookies = self._get_cookies(self._WEBPAGE_HOST) if webpage_cookies.get('sid_tt'): - self._set_cookie(self._API_HOSTNAME, 'sid_tt', webpage_cookies['sid_tt'].value) + self._set_cookie(api_hostname, 'sid_tt', webpage_cookies['sid_tt'].value) return self._download_json( - f'https://{self._API_HOSTNAME}/aweme/v1/{ep}/', video_id=video_id, + f'https://{api_hostname}/aweme/v1/{ep}/', video_id=video_id, fatal=fatal, note=note, errnote=errnote, headers={ 'User-Agent': self._APP_USER_AGENT, 'Accept': 'application/json', @@ -171,7 +188,7 @@ class TikTokBaseIE(InfoExtractor): 'build_number': self._APP_INFO['app_version'], 'region': 'US', 'ts': int(time.time()), - 'iid': self._APP_INFO.get('iid'), + 'iid': self._APP_INFO.get('iid') or self._IID, 'device_id': self._DEVICE_ID, 'openudid': ''.join(random.choices('0123456789abcdef', k=16)), }) @@ -186,14 +203,18 @@ class TikTokBaseIE(InfoExtractor): self.report_warning(message) return + api_hostnames = self._API_HOSTNAMES or [self._API_HOSTNAME] + max_tries = len(self._APP_INFO_POOL) + 1 # _APP_INFO_POOL + _APP_INFO for count in itertools.count(1): + api_hostname = api_hostnames[(count - 1) % len(api_hostnames)] + self.write_debug(f'Using API hostname: {api_hostname}') self.write_debug(str(self._APP_INFO)) real_query = self._build_api_query(query or {}) try: return self._call_api_impl( ep, video_id, query=real_query, data=data, headers=headers, - fatal=fatal, note=note, errnote=errnote) + fatal=fatal, note=note, errnote=errnote, api_hostname=api_hostname) except ExtractorError as e: if isinstance(e.cause, json.JSONDecodeError) and e.cause.pos == 0: message = str(e.cause or e.msg) From 6b12fffeb7848768ae0615e28ec70969d46f4d79 Mon Sep 17 00:00:00 2001 From: Bradley <15671567+bradleyhodges@users.noreply.github.com> Date: Thu, 22 Jan 2026 12:05:05 +0800 Subject: [PATCH 2/4] [ie/tiktok] Initialize cookies early; add retry logic; detect bot blockpages; implement oEmbed fallback --- yt_dlp/extractor/tiktok.py | 345 ++++++++++++++++++++++++++++++++++--- 1 file changed, 320 insertions(+), 25 deletions(-) diff --git a/yt_dlp/extractor/tiktok.py b/yt_dlp/extractor/tiktok.py index ac7e4fd724..67bb89fe99 100644 --- a/yt_dlp/extractor/tiktok.py +++ b/yt_dlp/extractor/tiktok.py @@ -37,6 +37,7 @@ from ..utils import ( class TikTokBaseIE(InfoExtractor): _UPLOADER_URL_FORMAT = 'https://www.tiktok.com/@%s' _WEBPAGE_HOST = 'https://www.tiktok.com/' + _OEMBED_API = 'https://www.tiktok.com/oembed' QUALITIES = ('360p', '540p', '720p', '1080p') _APP_INFO_DEFAULTS = { @@ -47,11 +48,12 @@ class TikTokBaseIE(InfoExtractor): 'app_version': '35.1.3', 'manifest_app_version': '2023501030', # "app id": aweme = 1128, trill = 1180, musical_ly = 1233, universal = 0 - 'aid': '0', + 'aid': '1233', } _APP_INFO_POOL = None _APP_INFO = None _APP_USER_AGENT = None + _cookies_initialized = False @functools.cached_property def _KNOWN_APP_INFO(self): @@ -131,6 +133,51 @@ class TikTokBaseIE(InfoExtractor): 'universal data', display_id, end_pattern=r'', default={}), ('__DEFAULT_SCOPE__', {dict})) or {} + def _initialize_cookies(self, video_id): + """Pre-initialize cookies by making a request to establish a session.""" + if self._cookies_initialized: + return + + # Make a lightweight request to get session cookies + try: + self._request_webpage( + 'https://www.tiktok.com/', video_id, + note='Initializing session', errnote=False, + headers={'Accept': 'text/html'}, fatal=False) + except Exception: + pass # Ignore failures, cookies are optional + self._cookies_initialized = True + + def _get_oembed_data(self, url, video_id): + """Fetch video metadata from TikTok's oEmbed API.""" + try: + return self._download_json( + self._OEMBED_API, video_id, + note='Downloading oEmbed data', + errnote='Unable to download oEmbed data', + query={'url': url}, fatal=False) + except ExtractorError: + return None + + def _is_blocked_response(self, webpage, urlh=None): + """Check if the response indicates a blocked/error page from TikTok.""" + if not webpage: + return True + # Check for very small responses (error pages) + if len(webpage) < 1000: + return True + # Check for system error indicators in content + if 'x-tt-system-error' in webpage.lower() or '__NEXT_DATA__' not in webpage: + # Check if we have the expected data structures + if not any(marker in webpage for marker in [ + '__UNIVERSAL_DATA_FOR_REHYDRATION__', + 'SIGI_STATE', + 'sigi-persisted-data', + '__NEXT_DATA__', + ]): + return True + return False + def _call_api_impl(self, ep, video_id, query=None, data=None, headers=None, fatal=True, note='Downloading API JSON', errnote='Unable to download API page', api_hostname=None): api_hostname = api_hostname or self._API_HOSTNAME @@ -241,34 +288,74 @@ class TikTokBaseIE(InfoExtractor): def _extract_web_data_and_status(self, url, video_id, fatal=True): video_data, status = {}, -1 - res = self._download_webpage_handle(url, video_id, fatal=fatal, impersonate=True) - if res is False: - return video_data, status + # Initialize cookies first for better success rate + self._initialize_cookies(video_id) - webpage, urlh = res - if urllib.parse.urlparse(urlh.url).path == '/login': - message = 'TikTok is requiring login for access to this content' - if fatal: - self.raise_login_required(message) - self.report_warning(f'{message}. {self._login_hint()}') - return video_data, status + # Try with impersonation first, then fall back to other methods + max_retries = 3 + for attempt in range(max_retries): + res = self._download_webpage_handle( + url, video_id, fatal=False, impersonate=True, + note=f'Downloading webpage{f" (attempt {attempt + 1})" if attempt else ""}') + if res is False: + if attempt < max_retries - 1: + self.write_debug(f'Webpage download failed, retrying ({attempt + 1}/{max_retries})') + time.sleep(1 + random.random()) + continue + break - if universal_data := self._get_universal_data(webpage, video_id): - self.write_debug('Found universal data for rehydration') - status = traverse_obj(universal_data, ('webapp.video-detail', 'statusCode', {int})) or 0 - video_data = traverse_obj(universal_data, ('webapp.video-detail', 'itemInfo', 'itemStruct', {dict})) + webpage, urlh = res - elif sigi_data := self._get_sigi_state(webpage, video_id): - self.write_debug('Found sigi state data') - status = traverse_obj(sigi_data, ('VideoPage', 'statusCode', {int})) or 0 - video_data = traverse_obj(sigi_data, ('ItemModule', video_id, {dict})) + # Check for login redirect + if urllib.parse.urlparse(urlh.url).path == '/login': + message = 'TikTok is requiring login for access to this content' + if fatal: + self.raise_login_required(message) + self.report_warning(f'{message}. {self._login_hint()}') + return video_data, status - elif next_data := self._search_nextjs_data(webpage, video_id, default={}): - self.write_debug('Found next.js data') - status = traverse_obj(next_data, ('props', 'pageProps', 'statusCode', {int})) or 0 - video_data = traverse_obj(next_data, ('props', 'pageProps', 'itemInfo', 'itemStruct', {dict})) + # Check if response is blocked/error page + if self._is_blocked_response(webpage, urlh): + self.write_debug(f'Received blocked/minimal response (attempt {attempt + 1})') + if attempt < max_retries - 1: + time.sleep(1.5 + random.random()) + continue + # On final attempt, still try to parse what we got + self.write_debug('All attempts returned blocked responses, trying to parse anyway') - elif fatal: + # Try to extract data from the webpage + if universal_data := self._get_universal_data(webpage, video_id): + self.write_debug('Found universal data for rehydration') + status = traverse_obj(universal_data, ('webapp.video-detail', 'statusCode', {int})) or 0 + video_data = traverse_obj(universal_data, ('webapp.video-detail', 'itemInfo', 'itemStruct', {dict})) + if video_data: + break + + if sigi_data := self._get_sigi_state(webpage, video_id): + self.write_debug('Found sigi state data') + status = traverse_obj(sigi_data, ('VideoPage', 'statusCode', {int})) or 0 + video_data = traverse_obj(sigi_data, ('ItemModule', video_id, {dict})) + if video_data: + break + + if next_data := self._search_nextjs_data(webpage, video_id, default={}): + self.write_debug('Found next.js data') + status = traverse_obj(next_data, ('props', 'pageProps', 'statusCode', {int})) or 0 + video_data = traverse_obj(next_data, ('props', 'pageProps', 'itemInfo', 'itemStruct', {dict})) + if video_data: + break + + # If no data found and more retries available, continue + if attempt < max_retries - 1: + self.write_debug('No video data found in response, retrying') + time.sleep(1 + random.random()) + continue + + # If still no data, try the embed page as a last resort + if not video_data: + video_data, status = self._try_extract_from_embed(url, video_id) + + if not video_data and fatal: raise ExtractorError('Unable to extract webpage video data') if not traverse_obj(video_data, ('video', {dict})) and traverse_obj(video_data, ('isContentClassified', {bool})): @@ -279,6 +366,41 @@ class TikTokBaseIE(InfoExtractor): return video_data, status + def _try_extract_from_embed(self, url, video_id): + """Try to extract video data from the embed page.""" + video_data, status = {}, -1 + + try: + embed_url = f'https://www.tiktok.com/embed/v2/{video_id}' + embed_page = self._download_webpage( + embed_url, video_id, note='Downloading embed page', + errnote='Unable to download embed page', fatal=False) + + if embed_page: + # Try to find video data in the embed page + if frontity_data := self._search_json( + r']+\bid=[\'"]__FRONTITY_CONNECT_STATE__[\'"][^>]*>', + embed_page, 'frontity data', video_id, default={}): + self.write_debug('Found frontity data in embed page') + video_data = traverse_obj(frontity_data, ( + 'source', 'data', ..., 'itemInfo', 'itemStruct', {dict}), get_all=False) + if video_data: + status = 0 + + if not video_data: + # Try alternative embed data structure + if embed_data := self._search_json( + r']+\bdata-testid=[\'"]__UNIVERSAL_DATA__[\'"][^>]*>', + embed_page, 'embed data', video_id, default={}, end_pattern=r''): + video_data = traverse_obj(embed_data, ('itemInfo', 'itemStruct', {dict})) + if video_data: + status = 0 + + except Exception as e: + self.write_debug(f'Embed extraction failed: {e}') + + return video_data, status + def _get_subtitles(self, aweme_detail, aweme_id, user_name): # TODO: Extract text positioning info @@ -931,7 +1053,7 @@ class TikTokIE(TikTokBaseIE): self.report_warning(f'{e}; trying with webpage') url = self._create_url(user_id, video_id) - video_data, status = self._extract_web_data_and_status(url, video_id) + video_data, status = self._extract_web_data_and_status(url, video_id, fatal=False) if video_data and status == 0: return self._parse_aweme_video_web(video_data, url, video_id) @@ -941,8 +1063,181 @@ class TikTokIE(TikTokBaseIE): 'You do not have permission to view this post. Log into an account that has access') elif status == 10204: raise ExtractorError('Your IP address is blocked from accessing this post', expected=True) + + # Fallback to oEmbed API for basic metadata + self.write_debug('Trying oEmbed API fallback') + oembed_data = self._get_oembed_data(url, video_id) + + if oembed_data: + # oEmbed doesn't provide direct video URLs, but gives us metadata + # We can construct a minimal result and try to get video from thumbnail patterns + self.write_debug('Got oEmbed data, attempting video extraction') + result = self._extract_from_oembed(oembed_data, url, video_id) + if result and result.get('formats'): + return result + # If we got metadata but no formats, report what we know + if result: + self.report_warning('Could not extract video formats, but metadata was retrieved') + result['formats'] = [] + return result + + if status == -1: + raise ExtractorError( + 'Unable to extract video data. TikTok may be blocking automated access. ' + 'Try using --cookies-from-browser to pass your browser cookies.', expected=True) raise ExtractorError(f'Video not available, status code {status}', video_id=video_id) + def _extract_from_oembed(self, oembed_data, url, video_id): + """Extract video info from oEmbed data.""" + if not oembed_data: + return None + + thumbnail_url = oembed_data.get('thumbnail_url') + formats = [] + + # Try to extract video URL from thumbnail URL pattern + # TikTok thumbnail URLs sometimes contain patterns that can be modified to get video URLs + if thumbnail_url: + # The thumbnail URL contains similar path structure to video URLs + # Try common video URL patterns based on thumbnail + video_patterns = self._try_video_urls_from_thumbnail(thumbnail_url, video_id) + for video_url in video_patterns: + formats.append({ + 'url': video_url, + 'ext': 'mp4', + 'format_id': 'oembed', + 'format_note': 'From oEmbed thumbnail pattern', + }) + + # Try the embed page for actual video URLs + if not formats: + embed_formats = self._try_extract_formats_from_embed(video_id) + formats.extend(embed_formats) + + # Extract author info from oEmbed + author_url = oembed_data.get('author_url', '') + uploader = None + if author_url: + uploader_match = re.search(r'@([\w.-]+)', author_url) + if uploader_match: + uploader = uploader_match.group(1) + + return { + 'id': video_id, + 'title': oembed_data.get('title') or f'TikTok video #{video_id}', + 'description': oembed_data.get('title'), + 'uploader': uploader or oembed_data.get('author_name'), + 'uploader_url': oembed_data.get('author_url'), + 'thumbnail': thumbnail_url, + 'thumbnails': [{'url': thumbnail_url}] if thumbnail_url else None, + 'formats': formats, + 'http_headers': {'Referer': url}, + } + + def _try_video_urls_from_thumbnail(self, thumbnail_url, video_id): + """Try to derive video URLs from thumbnail URL patterns.""" + # TikTok CDN patterns - thumbnails and videos often share similar base URLs + # This is a heuristic approach + return [] # Conservative: don't generate potentially broken URLs + + def _try_extract_formats_from_embed(self, video_id): + """Try to extract video formats from the embed page.""" + formats = [] + try: + embed_url = f'https://www.tiktok.com/embed/v2/{video_id}' + embed_page = self._download_webpage( + embed_url, video_id, note='Downloading embed page for formats', + errnote=False, fatal=False) + + if not embed_page: + return formats + + # Try to extract video data from FRONTITY_CONNECT_STATE + frontity_data = self._search_json( + r']+\bid=[\'"]__FRONTITY_CONNECT_STATE__[\'"][^>]*>', + embed_page, 'frontity data', video_id, default={}, end_pattern=r'') + + if frontity_data: + # Get the item struct for video info + item_struct = traverse_obj(frontity_data, ( + 'source', 'data', ..., 'itemInfo', 'itemStruct', {dict}), get_all=False) + + if item_struct: + # Use the existing _extract_web_formats method with the proper data structure + formats = self._extract_web_formats(item_struct) + if formats: + return formats + + # Fallback: try to extract URLs manually from the video structure + video_info = traverse_obj(item_struct, ('video', {dict})) or {} + play_width = int_or_none(video_info.get('width')) + play_height = int_or_none(video_info.get('height')) + + # Extract play URLs + for play_url in traverse_obj(video_info, ('playAddr', ((..., 'src'), None), {url_or_none})): + formats.append({ + 'url': self._proto_relative_url(play_url), + 'ext': 'mp4', + 'format_id': 'play', + 'format_note': 'From embed page', + 'vcodec': 'h264', + 'acodec': 'aac', + 'width': play_width, + 'height': play_height, + }) + + # Extract download URLs + for dl_url in traverse_obj(video_info, (('downloadAddr', ('download', 'url')), {url_or_none})): + formats.append({ + 'url': self._proto_relative_url(dl_url), + 'ext': 'mp4', + 'format_id': 'download', + 'format_note': 'From embed page, watermarked', + 'vcodec': 'h264', + 'acodec': 'aac', + 'preference': -2, + }) + + # Also try regex patterns for video URLs in the page + if not formats: + video_url_candidates = set() + # Look for URLs in escaped JSON format + video_url_patterns = [ + # TikTok CDN video URLs (exclude audio patterns) + r'(https?://v\d+[a-z]?\.tiktokcdn\.com/[^"\'<>\s\\]+)', + r'(https?://v\d+[a-z]?-[a-z]+\.tiktokcdn\.com/[^"\'<>\s\\]+)', + # Escaped URLs in JSON + r'"(?:playAddr|src)"["\']?\s*:\s*"(https?:[^"]+)"', + ] + for pattern in video_url_patterns: + for video_url in re.findall(pattern, embed_page, re.IGNORECASE): + # Clean up the URL + video_url = video_url.replace('\\u002F', '/').replace('\\/', '/').replace('\\u0026', '&') + # Filter out audio-only URLs + if 'audio_mpeg' in video_url or 'mime_type=audio' in video_url: + continue + # Only accept video URLs + if video_url and ('tiktokcdn' in video_url or 'bytedance' in video_url): + if 'video' in video_url or 'mime_type=video' in video_url or 'video_mp4' in video_url: + video_url_candidates.add(video_url) + + for i, video_url in enumerate(video_url_candidates): + formats.append({ + 'url': video_url, + 'ext': 'mp4', + 'format_id': f'embed_{i}', + 'format_note': 'From embed page (CDN)', + 'vcodec': 'h264', + 'acodec': 'aac', + }) + + self._remove_duplicate_formats(formats) + + except Exception as e: + self.write_debug(f'Embed format extraction failed: {e}') + + return formats + class TikTokUserIE(TikTokBaseIE): IE_NAME = 'tiktok:user' From 5560dd40106ffac596f95fc632983db24bf51ea1 Mon Sep 17 00:00:00 2001 From: Bradley <15671567+bradleyhodges@users.noreply.github.com> Date: Thu, 22 Jan 2026 12:06:49 +0800 Subject: [PATCH 3/4] [ie/tiktok] address linter complaint (replace try with `contextlib.suppress(Exception)`) --- yt_dlp/extractor/tiktok.py | 44 ++++++-------------------------------- 1 file changed, 6 insertions(+), 38 deletions(-) diff --git a/yt_dlp/extractor/tiktok.py b/yt_dlp/extractor/tiktok.py index 67bb89fe99..d5713df080 100644 --- a/yt_dlp/extractor/tiktok.py +++ b/yt_dlp/extractor/tiktok.py @@ -1,3 +1,4 @@ +import contextlib import functools import itertools import json @@ -58,7 +59,7 @@ class TikTokBaseIE(InfoExtractor): @functools.cached_property def _KNOWN_APP_INFO(self): # If we have a genuine device ID, we may not need any IID - default = [''] # enable app-based extraction out of the box + default = [''] return self._configuration_arg('app_info', default, ie_key=TikTokIE) @functools.cached_property @@ -71,8 +72,6 @@ class TikTokBaseIE(InfoExtractor): @functools.cached_property def _IID(self): - # Install ID (iid) used by the mobile API. When not explicitly provided via extractor-args, - # generate a plausible value so the app-based fallback works out of the box. return str(random.randint(10 ** 18, 10 ** 19 - 1)) @functools.cached_property @@ -139,13 +138,11 @@ class TikTokBaseIE(InfoExtractor): return # Make a lightweight request to get session cookies - try: + with contextlib.suppress(Exception): self._request_webpage( 'https://www.tiktok.com/', video_id, note='Initializing session', errnote=False, headers={'Accept': 'text/html'}, fatal=False) - except Exception: - pass # Ignore failures, cookies are optional self._cookies_initialized = True def _get_oembed_data(self, url, video_id): @@ -163,12 +160,9 @@ class TikTokBaseIE(InfoExtractor): """Check if the response indicates a blocked/error page from TikTok.""" if not webpage: return True - # Check for very small responses (error pages) if len(webpage) < 1000: return True - # Check for system error indicators in content if 'x-tt-system-error' in webpage.lower() or '__NEXT_DATA__' not in webpage: - # Check if we have the expected data structures if not any(marker in webpage for marker in [ '__UNIVERSAL_DATA_FOR_REHYDRATION__', 'SIGI_STATE', @@ -291,7 +285,7 @@ class TikTokBaseIE(InfoExtractor): # Initialize cookies first for better success rate self._initialize_cookies(video_id) - # Try with impersonation first, then fall back to other methods + # First try with impersonation, then fall back to other methods if no joy max_retries = 3 for attempt in range(max_retries): res = self._download_webpage_handle( @@ -306,7 +300,6 @@ class TikTokBaseIE(InfoExtractor): webpage, urlh = res - # Check for login redirect if urllib.parse.urlparse(urlh.url).path == '/login': message = 'TikTok is requiring login for access to this content' if fatal: @@ -314,16 +307,13 @@ class TikTokBaseIE(InfoExtractor): self.report_warning(f'{message}. {self._login_hint()}') return video_data, status - # Check if response is blocked/error page if self._is_blocked_response(webpage, urlh): self.write_debug(f'Received blocked/minimal response (attempt {attempt + 1})') if attempt < max_retries - 1: time.sleep(1.5 + random.random()) continue - # On final attempt, still try to parse what we got self.write_debug('All attempts returned blocked responses, trying to parse anyway') - # Try to extract data from the webpage if universal_data := self._get_universal_data(webpage, video_id): self.write_debug('Found universal data for rehydration') status = traverse_obj(universal_data, ('webapp.video-detail', 'statusCode', {int})) or 0 @@ -345,7 +335,6 @@ class TikTokBaseIE(InfoExtractor): if video_data: break - # If no data found and more retries available, continue if attempt < max_retries - 1: self.write_debug('No video data found in response, retrying') time.sleep(1 + random.random()) @@ -377,7 +366,6 @@ class TikTokBaseIE(InfoExtractor): errnote='Unable to download embed page', fatal=False) if embed_page: - # Try to find video data in the embed page if frontity_data := self._search_json( r']+\bid=[\'"]__FRONTITY_CONNECT_STATE__[\'"][^>]*>', embed_page, 'frontity data', video_id, default={}): @@ -388,7 +376,6 @@ class TikTokBaseIE(InfoExtractor): status = 0 if not video_data: - # Try alternative embed data structure if embed_data := self._search_json( r']+\bdata-testid=[\'"]__UNIVERSAL_DATA__[\'"][^>]*>', embed_page, 'embed data', video_id, default={}, end_pattern=r''): @@ -1064,18 +1051,14 @@ class TikTokIE(TikTokBaseIE): elif status == 10204: raise ExtractorError('Your IP address is blocked from accessing this post', expected=True) - # Fallback to oEmbed API for basic metadata self.write_debug('Trying oEmbed API fallback') oembed_data = self._get_oembed_data(url, video_id) if oembed_data: - # oEmbed doesn't provide direct video URLs, but gives us metadata - # We can construct a minimal result and try to get video from thumbnail patterns self.write_debug('Got oEmbed data, attempting video extraction') result = self._extract_from_oembed(oembed_data, url, video_id) if result and result.get('formats'): return result - # If we got metadata but no formats, report what we know if result: self.report_warning('Could not extract video formats, but metadata was retrieved') result['formats'] = [] @@ -1095,11 +1078,7 @@ class TikTokIE(TikTokBaseIE): thumbnail_url = oembed_data.get('thumbnail_url') formats = [] - # Try to extract video URL from thumbnail URL pattern - # TikTok thumbnail URLs sometimes contain patterns that can be modified to get video URLs if thumbnail_url: - # The thumbnail URL contains similar path structure to video URLs - # Try common video URL patterns based on thumbnail video_patterns = self._try_video_urls_from_thumbnail(thumbnail_url, video_id) for video_url in video_patterns: formats.append({ @@ -1109,12 +1088,10 @@ class TikTokIE(TikTokBaseIE): 'format_note': 'From oEmbed thumbnail pattern', }) - # Try the embed page for actual video URLs if not formats: embed_formats = self._try_extract_formats_from_embed(video_id) formats.extend(embed_formats) - # Extract author info from oEmbed author_url = oembed_data.get('author_url', '') uploader = None if author_url: @@ -1136,9 +1113,7 @@ class TikTokIE(TikTokBaseIE): def _try_video_urls_from_thumbnail(self, thumbnail_url, video_id): """Try to derive video URLs from thumbnail URL patterns.""" - # TikTok CDN patterns - thumbnails and videos often share similar base URLs - # This is a heuristic approach - return [] # Conservative: don't generate potentially broken URLs + return [] # Don't generate potentially broken URLs def _try_extract_formats_from_embed(self, video_id): """Try to extract video formats from the embed page.""" @@ -1152,28 +1127,24 @@ class TikTokIE(TikTokBaseIE): if not embed_page: return formats - # Try to extract video data from FRONTITY_CONNECT_STATE frontity_data = self._search_json( r']+\bid=[\'"]__FRONTITY_CONNECT_STATE__[\'"][^>]*>', embed_page, 'frontity data', video_id, default={}, end_pattern=r'') if frontity_data: - # Get the item struct for video info item_struct = traverse_obj(frontity_data, ( 'source', 'data', ..., 'itemInfo', 'itemStruct', {dict}), get_all=False) if item_struct: - # Use the existing _extract_web_formats method with the proper data structure formats = self._extract_web_formats(item_struct) if formats: return formats - # Fallback: try to extract URLs manually from the video structure + # Try to extract URLs manually from the video structure (fallback) video_info = traverse_obj(item_struct, ('video', {dict})) or {} play_width = int_or_none(video_info.get('width')) play_height = int_or_none(video_info.get('height')) - # Extract play URLs for play_url in traverse_obj(video_info, ('playAddr', ((..., 'src'), None), {url_or_none})): formats.append({ 'url': self._proto_relative_url(play_url), @@ -1186,7 +1157,6 @@ class TikTokIE(TikTokBaseIE): 'height': play_height, }) - # Extract download URLs for dl_url in traverse_obj(video_info, (('downloadAddr', ('download', 'url')), {url_or_none})): formats.append({ 'url': self._proto_relative_url(dl_url), @@ -1198,10 +1168,8 @@ class TikTokIE(TikTokBaseIE): 'preference': -2, }) - # Also try regex patterns for video URLs in the page if not formats: video_url_candidates = set() - # Look for URLs in escaped JSON format video_url_patterns = [ # TikTok CDN video URLs (exclude audio patterns) r'(https?://v\d+[a-z]?\.tiktokcdn\.com/[^"\'<>\s\\]+)', From d1704531eebe541d85d149504fc955a416cd98a3 Mon Sep 17 00:00:00 2001 From: Bradley <15671567+bradleyhodges@users.noreply.github.com> Date: Thu, 22 Jan 2026 12:26:28 +0800 Subject: [PATCH 4/4] [ie/tiktok] Fix short videos (small file size) getting dismissed as "audio-only" --- yt_dlp/extractor/tiktok.py | 39 +++++++++++++++++++++++++++----------- 1 file changed, 28 insertions(+), 11 deletions(-) diff --git a/yt_dlp/extractor/tiktok.py b/yt_dlp/extractor/tiktok.py index d5713df080..a04e90800e 100644 --- a/yt_dlp/extractor/tiktok.py +++ b/yt_dlp/extractor/tiktok.py @@ -1169,27 +1169,43 @@ class TikTokIE(TikTokBaseIE): }) if not formats: - video_url_candidates = set() + video_url_candidates = [] video_url_patterns = [ # TikTok CDN video URLs (exclude audio patterns) r'(https?://v\d+[a-z]?\.tiktokcdn\.com/[^"\'<>\s\\]+)', r'(https?://v\d+[a-z]?-[a-z]+\.tiktokcdn\.com/[^"\'<>\s\\]+)', - # Escaped URLs in JSON + r'(https?://v\d+m\.tiktokcdn\.com/[^"\'<>\s\\]+)', + # Escaped JSON URLs in embed page r'"(?:playAddr|src)"["\']?\s*:\s*"(https?:[^"]+)"', ] for pattern in video_url_patterns: for video_url in re.findall(pattern, embed_page, re.IGNORECASE): - # Clean up the URL video_url = video_url.replace('\\u002F', '/').replace('\\/', '/').replace('\\u0026', '&') - # Filter out audio-only URLs - if 'audio_mpeg' in video_url or 'mime_type=audio' in video_url: - continue - # Only accept video URLs - if video_url and ('tiktokcdn' in video_url or 'bytedance' in video_url): - if 'video' in video_url or 'mime_type=video' in video_url or 'video_mp4' in video_url: - video_url_candidates.add(video_url) - for i, video_url in enumerate(video_url_candidates): + if any(audio_marker in video_url for audio_marker in ( + 'audio_mpeg', 'mime_type=audio', '/music/', '-music-', + )): + continue + if video_url and ('tiktokcdn' in video_url or 'bytedance' in video_url): + if '/video/' in video_url or 'mime_type=video' in video_url or 'video_mp4' in video_url: + # Prioritize by app ID: a=1233/a=0 > no app ID > a=1180 + app_id_match = re.search(r'[?&]a=(\d+)', video_url) + app_id = app_id_match.group(1) if app_id_match else None + if app_id in ('1233', '0'): + priority = 2 # Preferred + elif app_id == '1180': + priority = 0 # Lower priority (trill app) + else: + priority = 1 # Neutral + video_url_candidates.append((video_url, priority)) + + # Sort by priority and theb by URL + seen_urls = set() + sorted_candidates = sorted(video_url_candidates, key=lambda x: (-x[1], x[0])) + for i, (video_url, priority) in enumerate(sorted_candidates): + if video_url in seen_urls: + continue + seen_urls.add(video_url) formats.append({ 'url': video_url, 'ext': 'mp4', @@ -1197,6 +1213,7 @@ class TikTokIE(TikTokBaseIE): 'format_note': 'From embed page (CDN)', 'vcodec': 'h264', 'acodec': 'aac', + 'preference': priority - 1, }) self._remove_duplicate_formats(formats)