[ie/mave:channel] Add extractor (#14915)

Authored by: anlar
This commit is contained in:
Anton Larionov 2025-11-17 00:05:44 +01:00 committed by GitHub
parent 4cb5e191ef
commit 5f66ac71f6
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 123 additions and 37 deletions

View file

@ -1094,7 +1094,10 @@ from .markiza import (
from .massengeschmacktv import MassengeschmackTVIE from .massengeschmacktv import MassengeschmackTVIE
from .masters import MastersIE from .masters import MastersIE
from .matchtv import MatchTVIE from .matchtv import MatchTVIE
from .mave import MaveIE from .mave import (
MaveChannelIE,
MaveIE,
)
from .mbn import MBNIE from .mbn import MBNIE
from .mdr import MDRIE from .mdr import MDRIE
from .medaltv import MedalTVIE from .medaltv import MedalTVIE

View file

@ -1,7 +1,9 @@
import re import functools
import math
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import (
InAdvancePagedList,
clean_html, clean_html,
int_or_none, int_or_none,
parse_iso8601, parse_iso8601,
@ -10,15 +12,64 @@ from ..utils import (
from ..utils.traversal import require, traverse_obj from ..utils.traversal import require, traverse_obj
class MaveIE(InfoExtractor): class MaveBaseIE(InfoExtractor):
_VALID_URL = r'https?://(?P<channel>[\w-]+)\.mave\.digital/(?P<id>ep-\d+)' _API_BASE_URL = 'https://api.mave.digital/v1/website'
_API_BASE_STORAGE_URL = 'https://store.cloud.mts.ru/mave/'
def _load_channel_meta(self, channel_id, display_id):
return traverse_obj(self._download_json(
f'{self._API_BASE_URL}/{channel_id}/', display_id,
note='Downloading channel metadata'), 'podcast')
def _load_episode_meta(self, channel_id, episode_code, display_id):
return self._download_json(
f'{self._API_BASE_URL}/{channel_id}/episodes/{episode_code}',
display_id, note='Downloading episode metadata')
def _create_entry(self, channel_id, channel_meta, episode_meta):
episode_code = traverse_obj(episode_meta, ('code', {int}, {require('episode code')}))
return {
'display_id': f'{channel_id}-{episode_code}',
'extractor_key': MaveIE.ie_key(),
'extractor': MaveIE.IE_NAME,
'webpage_url': f'https://{channel_id}.mave.digital/ep-{episode_code}',
'channel_id': channel_id,
'channel_url': f'https://{channel_id}.mave.digital/',
'vcodec': 'none',
**traverse_obj(episode_meta, {
'id': ('id', {str}),
'url': ('audio', {urljoin(self._API_BASE_STORAGE_URL)}),
'title': ('title', {str}),
'description': ('description', {clean_html}),
'thumbnail': ('image', {urljoin(self._API_BASE_STORAGE_URL)}),
'duration': ('duration', {int_or_none}),
'season_number': ('season', {int_or_none}),
'episode_number': ('number', {int_or_none}),
'view_count': ('listenings', {int_or_none}),
'like_count': ('reactions', lambda _, v: v['type'] == 'like', 'count', {int_or_none}, any),
'dislike_count': ('reactions', lambda _, v: v['type'] == 'dislike', 'count', {int_or_none}, any),
'age_limit': ('is_explicit', {bool}, {lambda x: 18 if x else None}),
'timestamp': ('publish_date', {parse_iso8601}),
}),
**traverse_obj(channel_meta, {
'series_id': ('id', {str}),
'series': ('title', {str}),
'channel': ('title', {str}),
'uploader': ('author', {str}),
}),
}
class MaveIE(MaveBaseIE):
IE_NAME = 'mave'
_VALID_URL = r'https?://(?P<channel_id>[\w-]+)\.mave\.digital/ep-(?P<episode_code>\d+)'
_TESTS = [{ _TESTS = [{
'url': 'https://ochenlichnoe.mave.digital/ep-25', 'url': 'https://ochenlichnoe.mave.digital/ep-25',
'md5': 'aa3e513ef588b4366df1520657cbc10c', 'md5': 'aa3e513ef588b4366df1520657cbc10c',
'info_dict': { 'info_dict': {
'id': '4035f587-914b-44b6-aa5a-d76685ad9bc2', 'id': '4035f587-914b-44b6-aa5a-d76685ad9bc2',
'ext': 'mp3', 'ext': 'mp3',
'display_id': 'ochenlichnoe-ep-25', 'display_id': 'ochenlichnoe-25',
'title': 'Между мной и миром: психология самооценки', 'title': 'Между мной и миром: психология самооценки',
'description': 'md5:4b7463baaccb6982f326bce5c700382a', 'description': 'md5:4b7463baaccb6982f326bce5c700382a',
'uploader': 'Самарский университет', 'uploader': 'Самарский университет',
@ -45,7 +96,7 @@ class MaveIE(InfoExtractor):
'info_dict': { 'info_dict': {
'id': '41898bb5-ff57-4797-9236-37a8e537aa21', 'id': '41898bb5-ff57-4797-9236-37a8e537aa21',
'ext': 'mp3', 'ext': 'mp3',
'display_id': 'budem-ep-12', 'display_id': 'budem-12',
'title': 'Екатерина Михайлова: "Горе от ума" не про женщин написана', 'title': 'Екатерина Михайлова: "Горе от ума" не про женщин написана',
'description': 'md5:fa3bdd59ee829dfaf16e3efcb13f1d19', 'description': 'md5:fa3bdd59ee829dfaf16e3efcb13f1d19',
'uploader': 'Полина Цветкова+Евгения Акопова', 'uploader': 'Полина Цветкова+Евгения Акопова',
@ -68,40 +119,72 @@ class MaveIE(InfoExtractor):
'upload_date': '20241230', 'upload_date': '20241230',
}, },
}] }]
_API_BASE_URL = 'https://api.mave.digital/'
def _real_extract(self, url): def _real_extract(self, url):
channel_id, slug = self._match_valid_url(url).group('channel', 'id') channel_id, episode_code = self._match_valid_url(url).group(
display_id = f'{channel_id}-{slug}' 'channel_id', 'episode_code')
webpage = self._download_webpage(url, display_id) display_id = f'{channel_id}-{episode_code}'
data = traverse_obj(
self._search_nuxt_json(webpage, display_id), channel_meta = self._load_channel_meta(channel_id, display_id)
('data', lambda _, v: v['activeEpisodeData'], any, {require('podcast data')})) episode_meta = self._load_episode_meta(channel_id, episode_code, display_id)
return self._create_entry(channel_id, channel_meta, episode_meta)
class MaveChannelIE(MaveBaseIE):
IE_NAME = 'mave:channel'
_VALID_URL = r'https?://(?P<id>[\w-]+)\.mave\.digital/?(?:$|[?#])'
_TESTS = [{
'url': 'https://budem.mave.digital/',
'info_dict': {
'id': 'budem',
'title': 'Все там будем',
'description': 'md5:f04ae12a42be0f1d765c5e326b41987a',
},
'playlist_mincount': 15,
}, {
'url': 'https://ochenlichnoe.mave.digital/',
'info_dict': {
'id': 'ochenlichnoe',
'title': 'Очень личное',
'description': 'md5:ee36a6a52546b91b487fe08c552fdbb2',
},
'playlist_mincount': 20,
}, {
'url': 'https://geekcity.mave.digital/',
'info_dict': {
'id': 'geekcity',
'title': 'Мужчины в трико',
'description': 'md5:4164d425d60a0d97abdce9d1f6f8e049',
},
'playlist_mincount': 80,
}]
_PAGE_SIZE = 50
def _entries(self, channel_id, channel_meta, page_num):
page_data = self._download_json(
f'{self._API_BASE_URL}/{channel_id}/episodes', channel_id, query={
'view': 'all',
'page': page_num + 1,
'sort': 'newest',
'format': 'all',
}, note=f'Downloading page {page_num + 1}')
for ep in traverse_obj(page_data, ('episodes', lambda _, v: v['audio'] and v['id'])):
yield self._create_entry(channel_id, channel_meta, ep)
def _real_extract(self, url):
channel_id = self._match_id(url)
channel_meta = self._load_channel_meta(channel_id, channel_id)
return { return {
'display_id': display_id, '_type': 'playlist',
'channel_id': channel_id, 'id': channel_id,
'channel_url': f'https://{channel_id}.mave.digital/', **traverse_obj(channel_meta, {
'vcodec': 'none',
'thumbnail': re.sub(r'_\d+(?=\.(?:jpg|png))', '', self._og_search_thumbnail(webpage, default='')) or None,
**traverse_obj(data, ('activeEpisodeData', {
'url': ('audio', {urljoin(self._API_BASE_URL)}),
'id': ('id', {str}),
'title': ('title', {str}), 'title': ('title', {str}),
'description': ('description', {clean_html}), 'description': ('description', {str}),
'duration': ('duration', {int_or_none}), }),
'season_number': ('season', {int_or_none}), 'entries': InAdvancePagedList(
'episode_number': ('number', {int_or_none}), functools.partial(self._entries, channel_id, channel_meta),
'view_count': ('listenings', {int_or_none}), math.ceil(channel_meta['episodes_count'] / self._PAGE_SIZE), self._PAGE_SIZE),
'like_count': ('reactions', lambda _, v: v['type'] == 'like', 'count', {int_or_none}, any),
'dislike_count': ('reactions', lambda _, v: v['type'] == 'dislike', 'count', {int_or_none}, any),
'age_limit': ('is_explicit', {bool}, {lambda x: 18 if x else None}),
'timestamp': ('publish_date', {parse_iso8601}),
})),
**traverse_obj(data, ('podcast', 'podcast', {
'series_id': ('id', {str}),
'series': ('title', {str}),
'channel': ('title', {str}),
'uploader': ('author', {str}),
})),
} }