From 91ae7e18dccd7000571d823470fa5b820b86db96 Mon Sep 17 00:00:00 2001
From: David Lee
Date: Sun, 9 Nov 2025 21:17:54 -0800
Subject: [PATCH] Avoid dangerous accesses, more coding conventions

---
 yt_dlp/extractor/heinetworktv.py | 96 ++++++++++++++++----------
 1 file changed, 49 insertions(+), 47 deletions(-)

diff --git a/yt_dlp/extractor/heinetworktv.py b/yt_dlp/extractor/heinetworktv.py
index 517f6efead..6aa8f67d59 100644
--- a/yt_dlp/extractor/heinetworktv.py
+++ b/yt_dlp/extractor/heinetworktv.py
@@ -12,6 +12,7 @@ from ..utils import (
     get_element_html_by_id,
     get_element_text_and_html_by_tag,
     get_elements_html_by_class,
+    int_or_none,
     unified_strdate,
 )
 
@@ -57,7 +58,10 @@ class HEINetworkTVIE(InfoExtractor):
 
     def _real_extract(self, url):
         parts = urllib.parse.urlparse(url).path.split('/')
-        # remove empty parts; get last element
+        # Remove empty parts; get last element.
+        # This isn't necessarily a site-wide unique ID (we'll get that from the
+        # page content), but it at least gives us something unique-ish to pass
+        # to `_download_webpage`.
         item_id = next(filter(None, reversed(parts)))
         webpage = self._download_webpage(url, item_id)
 
@@ -69,74 +73,70 @@ class HEINetworkTVIE(InfoExtractor):
         return self._extract_single_video(webpage, url)
 
     def _extract_collection(self, webpage, url):
-        grid = get_element_html_by_class('group/collection', webpage)
-        linksHtml = get_elements_html_by_class('group/thumb', grid)
-        urls = [extract_attributes(html)['href'] for html in linksHtml]
+        if grid := get_element_html_by_class('group/collection', webpage):
+            if links_html := get_elements_html_by_class('group/thumb', grid):
+                urls = [extract_attributes(html).get('href') for html in links_html]
 
-        return self.playlist_from_matches(
-            urls,
-            ie=HEINetworkTVIE,
-            playlist_id=self._path_components(url)[-1],
-            playlist_title=self._breadcrumbs(webpage)[-1],
-        )
+                return self.playlist_from_matches(
+                    urls,
+                    ie=HEINetworkTVIE,
+                    playlist_id=self._path_components(url)[-1],
+                    playlist_title=self._breadcrumbs(webpage)[-1],
+                )
 
     def _extract_season_name_and_number(self, webpage):
-        bc = self._breadcrumbs(webpage)
-        if len(bc) != 2:
-            return None, None
-        season_name = bc[-1]
-        season_number_match = re.match(r'Season (\d+)', season_name)
-        if not season_number_match:
-            return season_name, None
-        return season_name, int(season_number_match.group(1))
+        # Breadcrumbs are expected to be [series, season] on an episode page
+        if (bc := self._breadcrumbs(webpage)) and len(bc) == 2:
+            season_name = bc[-1]
+            season_number_match = re.match(r'Season (?P<season>\d+)', season_name)
+            if not season_number_match:
+                return season_name, None
+            return season_name, int_or_none(season_number_match.group('season'))
+        return None, None
 
     def _extract_series_name(self, webpage):
-        bc = self._breadcrumbs(webpage)
-        if len(bc) < 1:
-            return None
-        return bc[0]
+        if (bc := self._breadcrumbs(webpage)) and len(bc) >= 1:
+            return bc[0]
 
     def _path_components(self, url):
         return [p for p in urllib.parse.urlparse(url).path.split('/') if p]
 
     def _extract_video_id(self, webpage):
         _text, html = get_element_text_and_html_by_tag('hei-video', webpage)
+        if html is None:
+            return None
         attrs = extract_attributes(html)
-        return attrs['data-episode-id']
+        return attrs.get('data-episode-id')
 
     def _clean_episode_title(self, video_title):
-        match = re.match(r'\d+\s+(?P<title>.+)', video_title)
-        if match:
+        # ex: 1301 Episode Title
+        if match := re.match(r'\d+\s+(?P<title>.+)', video_title):
            return match.group('title')
-        return None
 
     def _episode_number(self, video_title, season_number):
         if season_number is None:
             return None
+        # ex: 1301 -> season 13, episode 01
         match = re.match(fr'{re.escape(str(season_number))}(?P<episode_no>\d+)', video_title)
         if match:
-            return int(match.group('episode_no'))
-        return None
+            return int_or_none(match.group('episode_no'))
 
     def _extract_single_video(self, webpage, url):
-        path_components = self._path_components(url)
         video_id = self._extract_video_id(webpage)
-        video_src = self._extract_video_src(webpage)
-        formats, _subs = self._extract_m3u8_formats_and_subtitles(video_src, video_id)
-        air_date = self._air_date(webpage)
+        formats, _subs = self._extract_m3u8_formats_and_subtitles(
+            self._extract_video_src(webpage), video_id)
         season, season_number = self._extract_season_name_and_number(webpage)
-        series = self._extract_series_name(webpage)
         video_title = self._extract_video_title(webpage)
 
         return {
             'id': video_id,
             'title': video_title,
             'formats': formats,
-            'release_date': air_date,
+            'release_date': self._air_date(webpage),
             'season': season,
             'season_number': season_number,
-            'season_id': path_components[-2],
-            'series': series,
+            'season_id': self._path_components(url)[-2],
+            'series': self._extract_series_name(webpage),
             'episode': self._clean_episode_title(video_title),
             'episode_number': self._episode_number(video_title, season_number),
         }
@@ -150,26 +150,28 @@ class HEINetworkTVIE(InfoExtractor):
         return get_element_by_class('group/collection', webpage) is not None
 
     def _breadcrumbs(self, webpage):
-        breadcrumb_container = get_element_html_by_class('breadcrumbs', webpage)
-        root = ElementTree.fromstring(breadcrumb_container)
-        return [''.join(e.itertext()).strip() for e in root.findall('.//li')]
+        if breadcrumb_container := get_element_html_by_class('breadcrumbs', webpage):
+            root = ElementTree.fromstring(breadcrumb_container)
+            return [''.join(e.itertext()).strip() for e in root.findall('.//li')]
 
     # Single-video helpers
 
     def _extract_video_src(self, webpage):
         _, html = get_element_text_and_html_by_tag('castable-video', webpage)
+        if html is None:
+            return None
         attrs = extract_attributes(html)
-        return attrs['src']
+        return attrs.get('src')
 
     def _extract_video_title(self, webpage):
         _, mux_video = get_element_text_and_html_by_tag('mux-video', webpage)
-        attrs = extract_attributes(mux_video)
-        if 'metadata-video-title' not in attrs:
+        if mux_video is None:
             return None
-        return attrs['metadata-video-title']
+        attrs = extract_attributes(mux_video)
+        return attrs.get('metadata-video-title')
 
     def _air_date(self, webpage):
-        episode_info_container = get_element_html_by_id('hei-episode-title', webpage)
-        release_date_str = get_element_by_class('text-sm', episode_info_container)
-        matches = re.match(r'\s+Air Date: (?P<date>[\w/]+)', release_date_str)
-        return unified_strdate(matches.group('date'), day_first=False)
+        if episode_info_container := get_element_html_by_id('hei-episode-title', webpage):
+            if release_date_str := get_element_by_class('text-sm', episode_info_container):
+                if matches := re.match(r'\s+Air Date: (?P<date>[\w/]+)', release_date_str):
+                    return unified_strdate(matches.group('date'), day_first=False)