From 3c8d9f8b3a3bcec672c2b8257f24cdb09a576a60 Mon Sep 17 00:00:00 2001 From: Moritz Barsnick Date: Sun, 26 Dec 2021 22:55:54 +0100 Subject: [PATCH 1/2] [ie/urort] rework extractor Signed-off-by: Moritz Barsnick --- yt_dlp/extractor/_extractors.py | 5 +- yt_dlp/extractor/urort.py | 114 +++++++++++++++++++++++--------- 2 files changed, 88 insertions(+), 31 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 072169d48d..8b16e13af9 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -2266,7 +2266,10 @@ from .uplynk import ( UplynkIE, UplynkPreplayIE, ) -from .urort import UrortIE +from .urort import ( + UrortIE, + UrortPlaylistIE, +) from .urplay import URPlayIE from .usanetwork import USANetworkIE from .usatoday import USATodayIE diff --git a/yt_dlp/extractor/urort.py b/yt_dlp/extractor/urort.py index 06931293ee..9c7c2ba749 100644 --- a/yt_dlp/extractor/urort.py +++ b/yt_dlp/extractor/urort.py @@ -1,16 +1,23 @@ -import urllib.parse +import re from .common import InfoExtractor -from ..utils import unified_strdate +from ..networking import HEADRequest +from ..utils import ( + determine_ext, + extract_attributes, + get_elements_by_class, + urlhandle_detect_ext, + urljoin, +) -class UrortIE(InfoExtractor): - _WORKING = False - IE_DESC = 'NRK P3 Urørt' - _VALID_URL = r'https?://(?:www\.)?urort\.p3\.no/#!/Band/(?P<id>[^/]+)$' +class UrortPlaylistIE(InfoExtractor): + IE_DESC = 'NRK P3 Urørt Playlist' + _VALID_URL = r'https?://(?:www\.)?urort\.p3\.no/(?:artist|playlist)/(?P<id>[^/?#]+)$' + # FIXME: make a proper playlist test _TEST = { - 'url': 'https://urort.p3.no/#!/Band/Gerilja', + 'url': 'https://urort.p3.no/artist/Gerilja', 'md5': '5ed31a924be8a05e47812678a86e127b', 'info_dict': { 'id': '33124-24', @@ -28,33 +35,80 @@ class UrortIE(InfoExtractor): def _real_extract(self, url): playlist_id = self._match_id(url) - - fstr = urllib.parse.quote(f"InternalBandUrl eq '{playlist_id}'") - json_url = 
f'http://urort.p3.no/breeze/urort/TrackDTOViews?$filter={fstr}&$orderby=Released%20desc&$expand=Tags%2CFiles' - songs = self._download_json(json_url, playlist_id) + webpage = self._download_webpage(url, playlist_id) + playlist_divs = get_elements_by_class('list-track-artist', webpage) + if not playlist_divs: + playlist_divs = get_elements_by_class('list-track-container', webpage) + track_containers = [] + if playlist_divs: + track_containers = get_elements_by_class('track-container', playlist_divs[0]) + if not track_containers: + return {} entries = [] - for s in songs: - formats = [{ - 'tbr': f.get('Quality'), - 'ext': f['FileType'], - 'format_id': '{}-{}'.format(f['FileType'], f.get('Quality', '')), - 'url': 'http://p3urort.blob.core.windows.net/tracks/{}'.format(f['FileRef']), - 'quality': 3 if f['FileType'] == 'mp3' else 2, - } for f in s['Files']] - e = { - 'id': '%d-%s' % (s['BandId'], s['$id']), - 'title': s['Title'], - 'uploader_id': playlist_id, - 'uploader': s.get('BandName', playlist_id), - 'thumbnail': 'http://urort.p3.no/cloud/images/{}'.format(s['Image']), - 'upload_date': unified_strdate(s.get('Released')), - 'formats': formats, - } - entries.append(e) + for track_container in track_containers: + info_divs = get_elements_by_class('info', track_container) + title_divs = get_elements_by_class('title', info_divs[0]) + entries.extend([self.url_result( + urljoin(url, x)) for x in re.findall(r'<a[^>]+href="([^"]+)"', title_divs[0])]) return { '_type': 'playlist', 'id': playlist_id, - 'title': playlist_id, 'entries': entries, + 'title': self._og_search_title(webpage).replace(' | NRK P3 Urørt', ''), + } + + +class UrortIE(InfoExtractor): + IE_DESC = 'NRK P3 Urørt' + _VALID_URL = r'https?://(?:www\.)?urort\.p3\.no/track/[^/]+/(?P<id>[^/&?#$]+)' + + _TEST = { + 'url': 'https://urort.p3.no/track/Gerilja/the-bomb', + 'md5': '5ed31a924be8a05e47812678a86e127b', + 'info_dict': { + 'id': '33124-24', + 'ext': 'mp3', + 'title': 'The Bomb', + 'thumbnail': 
r're:^https?://.+\.jpg', + 'uploader': 'Gerilja', + 'uploader_id': 'Gerilja', + 'upload_date': '20100323', + }, + 'params': { + 'matchtitle': '^The Bomb$', # To test, we want just one video + }, + } + + def _real_extract(self, url): + title_id = self._match_id(url) + webpage = self._download_webpage(url, title_id) + track_info_divs = get_elements_by_class('track-info-page track', webpage) + cover_divs = get_elements_by_class('cover-page', track_info_divs[0]) + if cover_divs: + thumbnail = self._search_regex( + r'<img[^>]+\bsrc=(["\'])(?P<value>(?:(?!\1).)+)\1', cover_divs[0], 'thumbnail', + default=None, group='value') or self._html_search_meta( + ['og:image', 'twitter:image'], webpage, 'thumbnail', default=None) + + track_info_div = self._search_regex(r'(<div[^>]+\bclass="[^"]*track-info["\s][^>]*>)', + track_info_divs[0], 'media URL', default=None) + video_url = extract_attributes(track_info_div)['data-trackurl'] + video_id = extract_attributes(track_info_div).get('data-trackid') or title_id + + # usually no media type known or extension found + ext = determine_ext(video_url) + if ext == 'unknown_video': + urlh = self._request_webpage( + HEADRequest(video_url), video_id, fatal=False, note='Checking media type') + if urlh and urlh.status == 200: + ext = urlhandle_detect_ext(urlh) + + return { + 'display_id': title_id, + 'id': video_id, + 'url': video_url, + 'ext': ext, + 'title': self._og_search_title(webpage).replace(' | NRK P3 Urørt', ''), + 'thumbnail': thumbnail, } From e814d0791098a1da6675aaac5258346d3a681689 Mon Sep 17 00:00:00 2001 From: Moritz Barsnick Date: Wed, 1 Oct 2025 11:42:45 +0200 Subject: [PATCH 2/2] [ie/urort] implement tests Signed-off-by: Moritz Barsnick --- yt_dlp/extractor/urort.py | 67 ++++++++++++++++++++++++++------------- 1 file changed, 45 insertions(+), 22 deletions(-) diff --git a/yt_dlp/extractor/urort.py b/yt_dlp/extractor/urort.py index 9c7c2ba749..80352501ec 100644 --- a/yt_dlp/extractor/urort.py +++ b/yt_dlp/extractor/urort.py @@ -15,23 +15,28 @@ 
class UrortPlaylistIE(InfoExtractor): IE_DESC = 'NRK P3 Urørt Playlist' _VALID_URL = r'https?://(?:www\.)?urort\.p3\.no/(?:artist|playlist)/(?P<id>[^/?#]+)$' - # FIXME: make a proper playlist test - _TEST = { + _TESTS = [{ + 'note': 'artist playlist', 'url': 'https://urort.p3.no/artist/Gerilja', 'md5': '5ed31a924be8a05e47812678a86e127b', 'info_dict': { - 'id': '33124-24', - 'ext': 'mp3', - 'title': 'The Bomb', - 'thumbnail': r're:^https?://.+\.jpg', - 'uploader': 'Gerilja', - 'uploader_id': 'Gerilja', - 'upload_date': '20100323', + 'id': '1355', + 'ext': 'm4a', + 'title': 'Gerilja: Animals (radio edit)', + 'thumbnail': r're:^https?://.+\.(jpg|png)', }, 'params': { - 'matchtitle': '^The Bomb$', # To test, we want just one video + 'matchtitle': '^Gerilja', # To test, we want just one video }, - } + }, { + 'note': 'track playlist', + 'url': 'https://urort.p3.no/playlist/rock', + 'info_dict': { + 'id': str, + 'title': str, + }, + 'playlist_count': 100, + }] def _real_extract(self, url): playlist_id = self._match_id(url) @@ -63,22 +68,40 @@ class UrortIE(InfoExtractor): IE_DESC = 'NRK P3 Urørt' _VALID_URL = r'https?://(?:www\.)?urort\.p3\.no/track/[^/]+/(?P<id>[^/&?#$]+)' - _TEST = { + _TESTS = [{ + 'note': 'MP4 AAC and default empty PNG thumbnail', 'url': 'https://urort.p3.no/track/Gerilja/the-bomb', - 'md5': '5ed31a924be8a05e47812678a86e127b', + 'md5': '9eee22c88598e285a3b4fa06ac387f79', 'info_dict': { - 'id': '33124-24', + 'id': '1356', + 'display_id': 'the-bomb', + 'ext': 'm4a', + 'title': 'Gerilja: The Bomb', + 'thumbnail': r're:^https?://.+\.(jpg|png)', + }, + }, { + 'note': 'MP3 and custom thumbnail', + 'url': 'https://urort.p3.no/track/lokal-politikk/svarteper-1', + 'md5': 'b9cc2b97820016a89b1140f11cf78fac', + 'info_dict': { + 'id': '191567', + 'display_id': 'svarteper-1', 'ext': 'mp3', - 'title': 'The Bomb', - 'thumbnail': r're:^https?://.+\.jpg', - 'uploader': 'Gerilja', - 'uploader_id': 'Gerilja', - 'upload_date': '20100323', + 'title': 'Lokal Politikk : Svarteper', 
+ 'thumbnail': r're:^https?://', }, - 'params': { - 'matchtitle': '^The Bomb$', # To test, we want just one video + }, { + 'note': 'WAV and custom thumbnail', + 'url': 'https://urort.p3.no/track/girl-group/shut-your-mouth-sometimes', + 'md5': '625873985ccdbbc37d05078d8927a522', + 'info_dict': { + 'id': '224148', + 'display_id': 'shut-your-mouth-sometimes', + 'ext': 'wav', + 'title': 'Girl Group: Shut Your Mouth (Sometimes) ', + 'thumbnail': r're:^https?://', }, - } + }] def _real_extract(self, url): title_id = self._match_id(url)