Merge e814d07910 into 36b29bb353

2025-12-06 14:55:02 +01:00 · 2025-12-06 00:27:10 +01:00 · 2025-12-06 00:27:10 +01:00 · 02deebe03d
commit 02deebe03d
parent 36b29bb353 e814d07910
2 changed files with 121 additions and 41 deletions
--- a/yt_dlp/extractor/_extractors.py
+++ b/yt_dlp/extractor/_extractors.py
@@ -2287,7 +2287,10 @@ from .uplynk import (
    UplynkIE,
    UplynkPreplayIE,
 )
-from .urort import UrortIE
+from .urort import (
+    UrortIE,
+    UrortPlaylistIE,
+)
 from .urplay import URPlayIE
 from .usanetwork import USANetworkIE
 from .usatoday import USATodayIE
--- a/yt_dlp/extractor/urort.py
+++ b/yt_dlp/extractor/urort.py
@@ -1,60 +1,137 @@
-import urllib.parse
+import re

 from .common import InfoExtractor
-from ..utils import unified_strdate
+from ..networking import HEADRequest
+from ..utils import (
+    determine_ext,
+    extract_attributes,
+    get_elements_by_class,
+    urlhandle_detect_ext,
+    urljoin,
+)


-class UrortIE(InfoExtractor):
-    _WORKING = False
-    IE_DESC = 'NRK P3 Urørt'
-    _VALID_URL = r'https?://(?:www\.)?urort\.p3\.no/#!/Band/(?P<id>[^/]+)$'
+class UrortPlaylistIE(InfoExtractor):
+    IE_DESC = 'NRK P3 Urørt Playlist'
+    _VALID_URL = r'https?://(?:www\.)?urort\.p3\.no/(?:artist|playlist)/(?P<id>[^/?#]+)$'

-    _TEST = {
-        'url': 'https://urort.p3.no/#!/Band/Gerilja',
+    _TESTS = [{
+        'note': 'artist playlist',
+        'url': 'https://urort.p3.no/artist/Gerilja',
        'md5': '5ed31a924be8a05e47812678a86e127b',
        'info_dict': {
-            'id': '33124-24',
-            'ext': 'mp3',
-            'title': 'The Bomb',
-            'thumbnail': r're:^https?://.+\.jpg',
-            'uploader': 'Gerilja',
-            'uploader_id': 'Gerilja',
-            'upload_date': '20100323',
+            'id': '1355',
+            'ext': 'm4a',
+            'title': 'Gerilja: Animals (radio edit)',
+            'thumbnail': r're:^https?://.+\.(jpg|png)',
        },
        'params': {
-            'matchtitle': '^The Bomb$',  # To test, we want just one video
+            'matchtitle': '^Gerilja',  # To test, we want just one video
        },
-    }
+    }, {
+        'note': 'track playlist',
+        'url': 'https://urort.p3.no/playlist/rock',
+        'info_dict': {
+            'id': str,
+            'title': str,
+        },
+        'playlist_count': 100,
+    }]

    def _real_extract(self, url):
        playlist_id = self._match_id(url)
-
-        fstr = urllib.parse.quote(f"InternalBandUrl eq '{playlist_id}'")
-        json_url = f'http://urort.p3.no/breeze/urort/TrackDTOViews?$filter={fstr}&$orderby=Released%20desc&$expand=Tags%2CFiles'
-        songs = self._download_json(json_url, playlist_id)
+        webpage = self._download_webpage(url, playlist_id)
+        playlist_divs = get_elements_by_class('list-track-artist', webpage)
+        if not playlist_divs:
+            playlist_divs = get_elements_by_class('list-track-container', webpage)
+        track_containers = []
+        if playlist_divs:
+            track_containers = get_elements_by_class('track-container', playlist_divs[0])
+        if not track_containers:
+            return {}
        entries = []
-        for s in songs:
-            formats = [{
-                'tbr': f.get('Quality'),
-                'ext': f['FileType'],
-                'format_id': '{}-{}'.format(f['FileType'], f.get('Quality', '')),
-                'url': 'http://p3urort.blob.core.windows.net/tracks/{}'.format(f['FileRef']),
-                'quality': 3 if f['FileType'] == 'mp3' else 2,
-            } for f in s['Files']]
-            e = {
-                'id': '%d-%s' % (s['BandId'], s['$id']),
-                'title': s['Title'],
-                'uploader_id': playlist_id,
-                'uploader': s.get('BandName', playlist_id),
-                'thumbnail': 'http://urort.p3.no/cloud/images/{}'.format(s['Image']),
-                'upload_date': unified_strdate(s.get('Released')),
-                'formats': formats,
-            }
-            entries.append(e)
+        for track_container in track_containers:
+            info_divs = get_elements_by_class('info', track_container)
+            title_divs = get_elements_by_class('title', info_divs[0])
+            entries.extend([self.url_result(
+                urljoin(url, x)) for x in re.findall(r'<a[^>]+href="([^"]+)"', title_divs[0])])

        return {
            '_type': 'playlist',
            'id': playlist_id,
-            'title': playlist_id,
            'entries': entries,
+            'title': self._og_search_title(webpage).replace(' | NRK P3 Urørt', ''),
+        }
+
+
+class UrortIE(InfoExtractor):
+    IE_DESC = 'NRK P3 Urørt'
+    _VALID_URL = r'https?://(?:www\.)?urort\.p3\.no/track/[^/]+/(?P<id>[^/&?#$]+)'
+
+    _TESTS = [{
+        'note': 'MP4 AAC and default empty PNG thumbnail',
+        'url': 'https://urort.p3.no/track/Gerilja/the-bomb',
+        'md5': '9eee22c88598e285a3b4fa06ac387f79',
+        'info_dict': {
+            'id': '1356',
+            'display_id': 'the-bomb',
+            'ext': 'm4a',
+            'title': 'Gerilja: The Bomb',
+            'thumbnail': r're:^https?://.+\.(jpg|png)',
+        },
+    }, {
+        'note': 'MP3 and custom thumbnail',
+        'url': 'https://urort.p3.no/track/lokal-politikk/svarteper-1',
+        'md5': 'b9cc2b97820016a89b1140f11cf78fac',
+        'info_dict': {
+            'id': '191567',
+            'display_id': 'svarteper-1',
+            'ext': 'mp3',
+            'title': 'Lokal Politikk : Svarteper',
+            'thumbnail': r're:^https?://',
+        },
+    }, {
+        'note': 'WAV and custom thumbnail',
+        'url': 'https://urort.p3.no/track/girl-group/shut-your-mouth-sometimes',
+        'md5': '625873985ccdbbc37d05078d8927a522',
+        'info_dict': {
+            'id': '224148',
+            'display_id': 'shut-your-mouth-sometimes',
+            'ext': 'wav',
+            'title': 'Girl Group: Shut Your Mouth (Sometimes) ',
+            'thumbnail': r're:^https?://',
+        },
+    }]
+
+    def _real_extract(self, url):
+        title_id = self._match_id(url)
+        webpage = self._download_webpage(url, title_id)
+        track_info_divs = get_elements_by_class('track-info-page track', webpage)
+        cover_divs = get_elements_by_class('cover-page', track_info_divs[0])
+        if cover_divs:
+            thumbnail = self._search_regex(
+                r'<img[^>]+\bsrc=(["\'])(?P<value>(?:(?!\1).)+)\1', cover_divs[0], 'thumbnail',
+                default=None, group='value') or self._html_search_meta(
+                    ['og:image', 'twitter:image'], webpage, 'thumbnail', default=None)
+
+        track_info_div = self._search_regex(r'(<div[^>]+\bclass="[^"]*track-info["\s][^>]*>)',
+                                            track_info_divs[0], 'media URL', default=None)
+        video_url = extract_attributes(track_info_div)['data-trackurl']
+        video_id = extract_attributes(track_info_div).get('data-trackid') or title_id
+
+        # usually no media type known or extension found
+        ext = determine_ext(video_url)
+        if ext == 'unknown_video':
+            urlh = self._request_webpage(
+                HEADRequest(video_url), video_id, fatal=False, note='Checking media type')
+            if urlh and urlh.status == 200:
+                ext = urlhandle_detect_ext(urlh)
+
+        return {
+            'display_id': title_id,
+            'id': video_id,
+            'url': video_url,
+            'ext': ext,
+            'title': self._og_search_title(webpage).replace(' | NRK P3 Urørt', ''),
+            'thumbnail': thumbnail,
        }