# yt_dlp/extractor/generic.py
import os
import re
import types
import urllib.parse
import xml.etree.ElementTree

from .common import InfoExtractor
from .commonprotocols import RtmpIE
from .youtube import YoutubeIE
from ..compat import compat_etree_fromstring
from ..cookies import LenientSimpleCookie
from ..networking.exceptions import HTTPError
from ..networking.impersonate import ImpersonateTarget
from ..utils import (
    KNOWN_EXTENSIONS,
    MEDIA_EXTENSIONS,
    ExtractorError,
    UnsupportedError,
    determine_ext,
    determine_protocol,
    dict_get,
    extract_basic_auth,
    filter_dict,
    format_field,
    int_or_none,
    is_html,
    js_to_json,
    merge_dicts,
    mimetype2ext,
    orderedSet,
    parse_duration,
    parse_resolution,
    smuggle_url,
    str_or_none,
    traverse_obj,
    try_call,
    unescapeHTML,
    unified_timestamp,
    unsmuggle_url,
    update_url,
    update_url_query,
    url_or_none,
    urlhandle_detect_ext,
    urljoin,
    variadic,
    xpath_attr,
    xpath_text,
    xpath_with_ns,
)
from ..utils._utils import _UnsafeExtensionError
class GenericIE(InfoExtractor):
    """Catch-all extractor: matches any URL and attempts direct links,
    manifests (HLS/DASH/ISM/F4M/SMIL), RSS/XSPF feeds, and embedded players."""
    IE_DESC = 'Generic downloader that works on some sites'
    # Matches everything; the downloader uses this extractor as the last resort
    _VALID_URL = r'.*'
    IE_NAME = 'generic'
    _NETRC_MACHINE = False  # Suppress username warning
_TESTS = [{
# Direct link
# https://github.com/ytdl-org/youtube-dl/commit/c5fa81fe81ce05cd81c20ff4ea6dac3dccdcbf9d
'url': 'https://media.w3.org/2010/05/sintel/trailer.mp4',
'md5': '67d406c2bcb6af27fa886f31aa934bbe',
'info_dict': {
'id': 'trailer',
'ext': 'mp4',
'title': 'trailer',
'direct': True,
'timestamp': 1273772943,
'upload_date': '20100513',
},
}, {
# Direct link: No HEAD support
# https://github.com/ytdl-org/youtube-dl/issues/4032
'url': 'http://ai-radio.org:8000/radio.opus',
'info_dict': {
'id': 'radio',
'ext': 'opus',
'title': 'radio',
},
'skip': 'Invalid URL',
}, {
# Direct link: Incorrect MIME type
# https://github.com/ytdl-org/youtube-dl/commit/c5fa81fe81ce05cd81c20ff4ea6dac3dccdcbf9d
'url': 'https://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
'md5': '4ccbebe5f36706d85221f204d7eb5913',
'info_dict': {
'id': '5_Lennart_Poettering_-_Systemd',
'ext': 'webm',
'title': '5_Lennart_Poettering_-_Systemd',
'direct': True,
'timestamp': 1416498816,
'upload_date': '20141120',
},
}, {
# Direct link: Live HLS; https://castr.com/hlsplayer/
# https://github.com/yt-dlp/yt-dlp/pull/6775
'url': 'https://stream-akamai.castr.com/5b9352dbda7b8c769937e459/live_2361c920455111ea85db6911fe397b9e/index.fmp4.m3u8',
'info_dict': {
'id': 'index.fmp4',
'ext': 'mp4',
'title': str,
'live_status': 'is_live',
},
'params': {'skip_download': 'm3u8'},
}, {
# Compressed when `Accept-Encoding: *`
# https://github.com/ytdl-org/youtube-dl/commit/a074e922967fa571d4f1abb1773c711747060f00
'url': 'http://calimero.tk/muzik/FictionJunction-Parallel_Hearts.flac',
'info_dict': {
'id': 'FictionJunction-Parallel_Hearts',
'ext': 'flac',
'title': 'FictionJunction-Parallel_Hearts',
},
'skip': 'Invalid URL',
}, {
# `Content-Encoding: br` when `Accept-Encoding: *`
# https://github.com/yt-dlp/yt-dlp/commit/3e01ce744a981d8f19ae77ec695005e7000f4703
'url': 'https://www.extra.cz/cauky-lidi-70-dil-babis-predstavil-pohadky-prymulanek-nebo-andrejovy-nove-saty-ac867',
'md5': 'a9a2cad3e54f78e4680c6deef82417e9',
'info_dict': {
'id': 'cauky-lidi-70-dil-babis-predstavil-pohadky-prymulanek-nebo-andrejovy-nove-saty-ac867',
'ext': 'mp4',
'title': 'čauky lidi 70 finall',
'description': 'md5:47b2673a5b76780d9d329783e1fbf5aa',
'direct': True,
'duration': 318.0,
'thumbnail': r're:https?://media\.extra\.cz/static/img/.+\.jpg',
'timestamp': 1654513791,
'upload_date': '20220606',
},
'params': {'extractor_args': {'generic': {'impersonate': ['chrome']}}},
}, {
# HLS: `Content-Type: audio/mpegurl`; https://bitmovin.com/demos/stream-test
# https://github.com/ytdl-org/youtube-dl/commit/20938f768b16c945c6041ba3c0a7ae1a4e790881
'url': 'https://cdn.bitmovin.com/content/assets/art-of-motion-dash-hls-progressive/m3u8s/f08e80da-bf1d-4e3d-8899-f0f6155f6efa.m3u8',
'info_dict': {
'id': 'f08e80da-bf1d-4e3d-8899-f0f6155f6efa',
'ext': 'mp4',
'title': 'f08e80da-bf1d-4e3d-8899-f0f6155f6efa',
'duration': 211,
'timestamp': 1737363648,
'upload_date': '20250120',
},
'params': {'skip_download': 'm3u8'},
}, {
# HLS: `Content-Type: text/plain`; https://github.com/grafov/m3u8
# https://github.com/ytdl-org/youtube-dl/commit/edd9b71c2cca7e5a0df8799710d9ad410ec77d29
'url': 'https://raw.githubusercontent.com/grafov/m3u8/refs/heads/master/sample-playlists/master.m3u8',
'info_dict': {
'id': 'master',
'ext': 'mp4',
'title': 'master',
},
'params': {'skip_download': 'm3u8'},
}, {
# MPEG-DASH; https://bitmovin.com/demos/stream-test
# https://github.com/ytdl-org/youtube-dl/commit/9d939cec48f06a401fb79eb078c1fc50b2aefbe1
'url': 'https://cdn.bitmovin.com/content/assets/art-of-motion-dash-hls-progressive/mpds/f08e80da-bf1d-4e3d-8899-f0f6155f6efa.mpd',
'info_dict': {
'id': 'f08e80da-bf1d-4e3d-8899-f0f6155f6efa',
'ext': 'mp4',
'title': 'f08e80da-bf1d-4e3d-8899-f0f6155f6efa',
'timestamp': 1737363728,
'upload_date': '20250120',
},
'params': {'skip_download': True},
}, {
# Live MPEG-DASH; https://livesim2.dashif.org/urlgen/create
# https://github.com/yt-dlp/yt-dlp/pull/12256
'url': 'https://livesim2.dashif.org/livesim2/ato_10/testpic_2s/Manifest.mpd',
'info_dict': {
'id': 'Manifest',
'ext': 'mp4',
'title': str,
'live_status': 'is_live',
},
'params': {'skip_download': 'livestream'},
}, {
# SMIL
# https://github.com/ytdl-org/youtube-dl/pull/6428
'url': 'https://api.new.livestream.com/accounts/21/events/7954027/videos/166558123.secure.smil',
'info_dict': {
'id': '166558123.secure',
'ext': 'mp4',
'title': '73fb2379-a624-4b6c-bce4-e46086007f2c',
},
'params': {'skip_download': 'smil'},
}, {
# XSPF playlist; https://shellac-archive.ch/de/index.html
# https://github.com/ytdl-org/youtube-dl/commit/1de5cd3ba51ce67d9a1cd3b40157058e78e46692
'url': 'https://shellac-archive.ch/repository/xspf/22-AL0019Z.xspf',
'info_dict': {
'id': '22-AL0019Z',
},
'playlist_count': 12,
'params': {'skip_download': True},
}, {
# RSS feed
# https://github.com/ytdl-org/youtube-dl/commit/c5fa81fe81ce05cd81c20ff4ea6dac3dccdcbf9d
'url': 'http://phihag.de/2014/youtube-dl/rss2.xml',
'info_dict': {
'id': 'https://phihag.de/2014/youtube-dl/rss2.xml',
'title': 'Zero Punctuation',
'description': 'md5:512ae5f840e52eb3c0d08d4bed08eb3e',
},
'playlist_mincount': 11,
}, {
# RSS feed: Includes enclosure, description, and thumbnails
# https://github.com/ytdl-org/youtube-dl/pull/27405
'url': 'https://anchor.fm/s/dd00e14/podcast/rss',
'info_dict': {
'id': 'https://anchor.fm/s/dd00e14/podcast/rss',
'title': '100% Hydrogen ',
'description': 'md5:7ec96327f8b91a2549a2e74f064022a1',
},
'playlist_count': 1,
'params': {'skip_download': True},
}, {
# RSS feed: Includes guid
'url': 'https://www.omnycontent.com/d/playlist/a7b4f8fe-59d9-4afc-a79a-a90101378abf/bf2c1d80-3656-4449-9d00-a903004e8f84/efbff746-e7c1-463a-9d80-a903004e8f8f/podcast.rss',
'info_dict': {
'id': 'https://www.omnycontent.com/d/playlist/a7b4f8fe-59d9-4afc-a79a-a90101378abf/bf2c1d80-3656-4449-9d00-a903004e8f84/efbff746-e7c1-463a-9d80-a903004e8f8f/podcast.rss',
'title': 'The Little Red Podcast',
'description': 'md5:be809a44b63b0c56fb485caf68685520',
},
'playlist_mincount': 76,
}, {
# RSS feed: Includes enclosure and unsupported URLs
# https://github.com/ytdl-org/youtube-dl/pull/16189
'url': 'https://www.interfax.ru/rss.asp',
'info_dict': {
'id': 'https://www.interfax.ru/rss.asp',
'title': 'Интерфакс',
'description': 'md5:49b6b8905772efba21923942bbc0444c',
},
'playlist_mincount': 25,
}, {
# Webpage starts with a duplicate UTF-8 BOM
# https://github.com/yt-dlp/yt-dlp/commit/80e8493ee7c3083f4e215794e4a67ba5265f24f7
'url': 'https://www.filmarkivet.se/movies/paris-d-moll/',
'md5': 'df02cadc719dcc63d43288366f037754',
'info_dict': {
'id': 'paris-d-moll',
'ext': 'mp4',
'title': 'Paris d-moll',
'description': 'md5:319e37ea5542293db37e1e13072fe330',
'thumbnail': r're:https?://www\.filmarkivet\.se/wp-content/uploads/.+\.jpg',
},
}, {
# Multiple HTML5 videos
# https://github.com/ytdl-org/youtube-dl/pull/14107
'url': 'https://www.dagbladet.no/nyheter/etter-ett-ars-planlegging-klaffet-endelig-alt---jeg-matte-ta-en-liten-dans/60413035',
'info_dict': {
'id': '60413035',
'title': 'Etter ett års planlegging, klaffet endelig alt: - Jeg måtte ta en liten dans',
'description': 'md5:bbb4e12e42e78609a74fd421b93b1239',
'thumbnail': r're:https?://www\.dagbladet\.no/images/.+',
},
'playlist_count': 2,
}, {
# Cinerama Player
# https://github.com/ytdl-org/youtube-dl/commit/501f13fbf3d1f7225f91e3e0ad008df2cd3219f1
'url': 'https://www.abc.net.au/res/libraries/cinerama2/examples/single_clip.htm',
'info_dict': {
'id': 'single_clip',
'title': 'Single Clip player examples',
},
'playlist_count': 3,
}, {
# FIXME: Improve extraction
# Flowplayer
# https://github.com/ytdl-org/youtube-dl/commit/4d805e063c6c4ffd557d7c7cb905a3ed9c926b08
'url': 'https://flowplayer.com/resources/demos/standard-setup',
'info_dict': {
'id': 'playlist',
'ext': 'mp4',
'title': 'playlist',
'duration': 13,
'timestamp': 1539082175,
'upload_date': '20181009',
},
'params': {'skip_download': 'm3u8'},
}, {
# JW Player: YouTube
# https://github.com/ytdl-org/youtube-dl/commit/a0f719854463c6f4226e4042dfa80c1b17154e1d
'url': 'https://media.nationalarchives.gov.uk/index.php/webinar-using-discovery-national-archives-online-catalogue/',
'info_dict': {
'id': 'Mrj4DVp2zeA',
'ext': 'mp4',
'title': 'Using Discovery, The National Archives online catalogue',
'age_limit': 0,
'availability': 'unlisted',
'categories': ['Education'],
'channel': 'The National Archives UK',
'channel_follower_count': int,
'channel_id': 'UCUuzebc1yADDJEnOLA5P9xw',
'channel_url': 'https://www.youtube.com/channel/UCUuzebc1yADDJEnOLA5P9xw',
'chapters': 'count:13',
'description': 'md5:a236581cd2449dd2df4f93412f3f01c6',
'duration': 3066,
'like_count': int,
'live_status': 'not_live',
'media_type': 'video',
'playable_in_embed': True,
'tags': 'count:5',
'thumbnail': r're:https?://i\.ytimg\.com/vi/.+',
'timestamp': 1423757117,
'upload_date': '20150212',
'uploader': 'The National Archives UK',
'uploader_id': '@TheNationalArchivesUK',
'uploader_url': 'https://www.youtube.com/@TheNationalArchivesUK',
'view_count': int,
},
'add_ie': ['Youtube'],
}, {
# JW Player: Complex
# https://github.com/ytdl-org/youtube-dl/commit/a4a554a79354981fcab55de8eaab7b95a40bbb48
'url': 'https://www.indiedb.com/games/king-machine/videos',
'info_dict': {
'id': 'videos-1',
'ext': 'mp4',
'title': 'Videos & Audio - King Machine (1)',
'description': 'Browse King Machine videos & audio for sweet media. Your eyes will thank you.',
'thumbnail': r're:https?://media\.indiedb\.com/cache/images/.+\.jpg',
'_old_archive_ids': ['generic videos'],
},
}, {
# JW Player: JSON Feed URL
# https://github.com/yt-dlp/yt-dlp/issues/1476
'url': 'https://foodschmooze.org/',
'info_dict': {
'id': 'z00Frhnw',
'ext': 'mp4',
'title': 'Grilling Beef Tenderloin',
'description': '',
'duration': 392.0,
'thumbnail': r're:https?://cdn\.jwplayer\.com/v2/media/.+',
'timestamp': 1465313685,
'upload_date': '20160607',
},
'params': {'skip_download': 'm3u8'},
}, {
# JW Player: RTMP
# https://github.com/ytdl-org/youtube-dl/issues/11993
'url': 'http://www.suffolk.edu/sjc/live.php',
'info_dict': {
'id': 'live',
'ext': 'flv',
'title': 'Massachusetts Supreme Judicial Court Oral Arguments',
},
'skip': 'Invalid URL',
}, {
# KVS Player v7.3.3
# kt_player.js?v=5.1.1
'url': 'https://bogmedia.org/videos/21217/40-nochey-2016/',
'md5': '94166bdb26b4cb1fb9214319a629fc51',
'info_dict': {
'id': '21217',
'ext': 'mp4',
'title': '40 ночей (2016) - BogMedia.org',
'description': 'md5:4e6d7d622636eb7948275432eb256dc3',
'display_id': '40-nochey-2016',
'thumbnail': r're:https?://bogmedia\.org/contents/videos_screenshots/.+\.jpg',
},
}, {
# KVS Player v7.7.11
# kt_player.js?v=5.5.1
# https://github.com/yt-dlp/yt-dlp/commit/a318f59d14792d25b2206c3f50181e03e8716db7
'url': 'https://youix.com/video/leningrad-zoj/',
'md5': '94f96ba95706dc3880812b27b7d8a2b8',
'info_dict': {
'id': '18485',
'ext': 'mp4',
'title': 'Клип: Ленинград - ЗОЖ скачать, смотреть онлайн | Youix.com',
'display_id': 'leningrad-zoj',
'thumbnail': r're:https?://youix\.com/contents/videos_screenshots/.+\.jpg',
},
}, {
# KVS Player v7.10.3
# kt_player.js?v=12
# https://github.com/ytdl-org/youtube-dl/commit/fc2beab0e701c497a003f11fef5c0df54fba1da3
'url': 'https://shooshtime.com/videos/346037/fresh-out-of-the-shower/',
'md5': 'c9a97ad528607a4516d4df83a3aeb12c',
'info_dict': {
'id': '346037',
'ext': 'mp4',
'title': 'Fresh out of the shower - Shooshtime',
'age_limit': 18,
'description': 'md5:efd70fd3973f8750d285c743b910580a',
'display_id': 'fresh-out-of-the-shower',
'thumbnail': r're:https?://i\.shoosh\.co/contents/videos_screenshots/.+\.jpg',
},
'expected_warnings': ['Untested major version'],
}, {
# FIXME: Unable to extract flashvars
# KVS Player v7.11.4
# kt_player.js?v=2.11.5.1
# https://github.com/yt-dlp/yt-dlp/commit/a318f59d14792d25b2206c3f50181e03e8716db7
'url': 'https://www.kvs-demo.com/video/105/kelis-4th-of-july/',
'info_dict': {
'id': '105',
'ext': 'mp4',
'title': 'Kelis - 4th Of July',
},
}, {
# KVS Player v7.11.4
# kt_player.js?v=6.3.2
# https://github.com/yt-dlp/yt-dlp/commit/a318f59d14792d25b2206c3f50181e03e8716db7
'url': 'https://www.kvs-demo.com/embed/105/',
'md5': '1ff84c70acaddbb03288c6cc5ee1879f',
'info_dict': {
'id': '105',
'ext': 'mp4',
'title': 'Kelis - 4th Of July / Embed Player',
'display_id': 'kelis-4th-of-july',
'thumbnail': r're:https?://www\.kvs-demo\.com/contents/videos_screenshots/.+\.jpg',
},
}, {
# twitter:player:stream
# https://github.com/ytdl-org/youtube-dl/commit/371ddb14fe651d4a1e5a8310d6d7c0e395cd92b0
'url': 'https://beltzlaw.com/',
'info_dict': {
'id': 'beltzlaw-1',
'ext': 'mp4',
'title': str,
'description': str,
'thumbnail': r're:https?://beltzlaw\.com/wp-content/uploads/.+\.jpg',
'timestamp': int, # varies
'upload_date': str,
'_old_archive_ids': ['generic beltzlaw'],
},
}, {
# twitter:player
# https://github.com/ytdl-org/youtube-dl/commit/329179073b93e37ab76e759d1fe96d8f984367f3
'url': 'https://cine.ar/',
'md5': 'd3e33335e339f04008690118698dfd08',
'info_dict': {
'id': 'cine-1',
'ext': 'webm',
'title': 'CINE.AR (1)',
'description': 'md5:a4e58f9e2291c940e485f34251898c4a',
'thumbnail': r're:https?://cine\.ar/img/.+\.png',
'_old_archive_ids': ['generic cine'],
},
'params': {'format': 'webm'},
}, {
# JSON-LD: multiple @type
# https://github.com/yt-dlp/yt-dlp/commit/f3c0c77304bc0e5614a65c45629de22f067685ac
'url': 'https://www.nu.nl/280161/video/hoe-een-bladvlo-dit-verwoestende-japanse-onkruid-moet-vernietigen.html',
'info_dict': {
'id': 'ipy2AcGL',
'ext': 'mp4',
'title': 'Hoe een bladvlo dit verwoestende Japanse onkruid moet vernietigen',
'description': 'md5:6a9d644bab0dc2dc06849c2505d8383d',
'duration': 111.0,
'thumbnail': r're:https?://images\.nu\.nl/.+\.jpg',
'timestamp': 1586584674,
'upload_date': '20200411',
},
'params': {'extractor_args': {'generic': {'impersonate': ['chrome']}}},
}, {
# JSON-LD: unexpected @type
# https://github.com/yt-dlp/yt-dlp/pull/5145
'url': 'https://www.autoweek.nl/autotests/artikel/porsche-911-gt3-rs-rij-impressie-2/',
'info_dict': {
'id': 'porsche-911-gt3-rs-rij-impressie-2',
'ext': 'mp4',
'title': 'Test: Porsche 911 GT3 RS - AutoWeek',
'description': 'md5:a17b5bd84288448d8f11b838505718fc',
'direct': True,
'thumbnail': r're:https?://images\.autoweek\.nl/.+',
'timestamp': 1664920902,
'upload_date': '20221004',
},
'params': {'extractor_args': {'generic': {'impersonate': ['chrome']}}},
}, {
# JSON-LD: VideoObject
# https://github.com/ytdl-org/youtube-dl/commit/6e6b70d65f0681317c425bfe1e157f3474afbbe8
'url': 'https://breezy.hr/',
'info_dict': {
'id': 'k6gl2kt2eq',
'ext': 'mp4',
'title': 'Breezy HR\'s ATS helps you find & hire employees sooner',
'average_rating': 4.5,
'description': 'md5:eee75fdd3044c538003f3be327ba01e1',
'duration': 60.1,
'thumbnail': r're:https?://cdn\.prod\.website-files\.com/.+\.webp',
'timestamp': 1485734400,
'upload_date': '20170130',
},
}, {
# Video.js: VOD HLS
# https://github.com/yt-dlp/yt-dlp/pull/6775
'url': 'https://gist.githubusercontent.com/bashonly/2aae0862c50f4a4b84f220c315767208/raw/e3380d413749dabbe804c9c2d8fd9a45142475c7/videojs_hls_test.html',
'info_dict': {
'id': 'videojs_hls_test',
'ext': 'mp4',
'title': 'video',
'duration': 1800,
},
'params': {'skip_download': 'm3u8'},
}, {
# Video.js: YouTube
# https://github.com/ytdl-org/youtube-dl/commit/63d990d2859d0e981da2e416097655798334431b
'url': 'https://ortcam.com/solidworks-%d1%83%d1%80%d0%be%d0%ba-6-%d0%bd%d0%b0%d1%81%d1%82%d1%80%d0%be%d0%b9%d0%ba%d0%b0-%d1%87%d0%b5%d1%80%d1%82%d0%b5%d0%b6%d0%b0_33f9b7351.html?vid=33f9b7351',
'info_dict': {
'id': 'yygqldloqIk',
'ext': 'mp4',
'title': 'SolidWorks. Урок 6 Настройка чертежа',
'age_limit': 0,
'availability': 'public',
'categories': ['Education'],
'channel': 'PROстое3D',
'channel_follower_count': int,
'channel_id': 'UCy91Bug3dERhbwGh2m2Ijng',
'channel_url': 'https://www.youtube.com/channel/UCy91Bug3dERhbwGh2m2Ijng',
'comment_count': int,
'description': 'md5:baf95267792646afdbf030e4d06b2ab3',
'duration': 1160,
'heatmap': 'count:100',
'like_count': int,
'live_status': 'not_live',
'media_type': 'video',
'playable_in_embed': True,
'tags': 'count:17',
'thumbnail': r're:https?://i\.ytimg\.com/vi/.+',
'timestamp': 1363263144,
'upload_date': '20130314',
'uploader': 'PROстое3D',
'uploader_id': '@PROstoe3D',
'uploader_url': 'https://www.youtube.com/@PROstoe3D',
'view_count': int,
},
'add_ie': ['Youtube'],
}, {
# Redirect
# https://github.com/ytdl-org/youtube-dl/issues/413
'url': 'https://www.google.com/url?rct=j&url=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY',
'info_dict': {
'id': 'cmQHVoWB5FY',
'ext': 'mp4',
'title': 'First Firefox OS phones side-by-side',
'age_limit': 0,
'availability': 'public',
'categories': ['Entertainment'],
'channel': 'The Verge',
'channel_follower_count': int,
'channel_id': 'UCddiUEpeqJcYeBxX1IVBKvQ',
'channel_is_verified': True,
'channel_url': 'https://www.youtube.com/channel/UCddiUEpeqJcYeBxX1IVBKvQ',
'comment_count': int,
'description': 'md5:7a676046ad24d9ea55cdde4a6657c5b3',
'duration': 207,
'like_count': int,
'live_status': 'not_live',
'media_type': 'video',
'playable_in_embed': True,
'tags': 'count:15',
'thumbnail': r're:https?://i\.ytimg\.com/vi/.+',
'timestamp': 1361738430,
'upload_date': '20130224',
'uploader': 'The Verge',
'uploader_id': '@TheVerge',
'uploader_url': 'https://www.youtube.com/@TheVerge',
'view_count': int,
},
'add_ie': ['Youtube'],
}]
2013-06-23 20:31:45 +02:00
def report_following_redirect(self, new_url):
"""Report information extraction."""
self._downloader.to_screen(f'[redirect] Following redirect to {new_url}')
2013-06-23 20:31:45 +02:00
def report_detected(self, name, num=1, note=None):
    """Write a debug line reporting that *num* embeds/players named *name* were found.

    A falsy *num* logs nothing; exactly one occurrence is reported as 'a <name>'.
    """
    if not num:
        return
    if num > 1:
        name += 's'
    else:
        num = 'a'
    suffix = format_field(note, None, '; %s')
    self._downloader.write_debug(f'Identified {num} {name}{suffix}')
def _extra_manifest_info(self, info, manifest_url):
    """Apply generic extractor-args to a manifest-based *info* dict, in place.

    Honors the `fragment_query`, `key_query`, `hls_key`, `variant_query` and
    `is_live` extractor-args, then probes the first HLS format (if any) to
    classify the stream as live or set a VOD duration.
    """
    # fragment_query: query string appended to every fragment URL; a bare
    # value counts as the query itself, else fall back to the manifest's query
    fragment_query = self._configuration_arg('fragment_query', [None], casesense=True)[0]
    if fragment_query is not None:
        info['extra_param_to_segment_url'] = (
            urllib.parse.urlparse(fragment_query).query or fragment_query
            or urllib.parse.urlparse(manifest_url).query or None)
    key_query = self._configuration_arg('key_query', [None], casesense=True)[0]
    if key_query is not None:
        info['extra_param_to_key_url'] = (
            urllib.parse.urlparse(key_query).query or key_query
            or urllib.parse.urlparse(manifest_url).query or None)

    def hex_or_none(value):
        # Accept only (optionally 0x-prefixed) hex strings for key/iv
        return value if re.fullmatch(r'(0x)?[\da-f]+', value, re.IGNORECASE) else None

    # hls_key arg: first value is a key URI or hex key, second is a hex IV
    info['hls_aes'] = traverse_obj(self._configuration_arg('hls_key', casesense=True), {
        'uri': (0, {url_or_none}), 'key': (0, {hex_or_none}), 'iv': (1, {hex_or_none}),
    }) or None

    # variant_query: query parameters merged into every format URL
    variant_query = self._configuration_arg('variant_query', [None], casesense=True)[0]
    if variant_query is not None:
        query = urllib.parse.parse_qs(
            urllib.parse.urlparse(variant_query).query or variant_query
            or urllib.parse.urlparse(manifest_url).query)
        for fmt in self._downloader._get_formats(info):
            fmt['url'] = update_url_query(fmt['url'], query)

    # Attempt to detect live HLS or set VOD duration
    m3u8_format = next((f for f in self._downloader._get_formats(info)
                        if determine_protocol(f) == 'm3u8_native'), None)
    if m3u8_format:
        # Explicit is_live arg short-circuits the network probe below
        is_live = self._configuration_arg('is_live', [None])[0]
        if is_live is not None:
            info['live_status'] = 'not_live' if is_live == 'false' else 'is_live'
            return
        headers = m3u8_format.get('http_headers') or info.get('http_headers') or {}
        display_id = info.get('id')
        urlh = self._request_webpage(
            m3u8_format['url'], display_id, 'Checking m3u8 live status', errnote=False,
            headers={**headers, 'Accept-Encoding': 'identity'}, fatal=False)
        if urlh is False:
            return
        # Only proceed when the response really is an M3U playlist
        first_bytes = urlh.read(512)
        if not first_bytes.startswith(b'#EXTM3U'):
            return
        m3u8_doc = self._webpage_read_content(
            urlh, urlh.url, display_id, prefix=first_bytes, fatal=False, errnote=False)
        if not m3u8_doc:
            return
        # No parseable VOD duration implies a live playlist
        duration = self._parse_m3u8_vod_duration(m3u8_doc, display_id)
        if not duration:
            info['live_status'] = 'is_live'
        info['duration'] = info.get('duration') or duration
def _extract_rss(self, url, video_id, doc):
2021-01-01 17:56:37 +05:30
NS_MAP = {
'itunes': 'http://www.itunes.com/dtds/podcast-1.0.dtd',
}
entries = []
for it in doc.findall('./channel/item'):
next_url = next(
(e.attrib.get('url') for e in it.findall('./enclosure')),
xpath_text(it, 'link', fatal=False))
if not next_url:
continue
guid = try_call(lambda: it.find('guid').text)
if guid:
next_url = smuggle_url(next_url, {'force_videoid': guid})
2021-01-01 17:56:37 +05:30
def itunes(key):
return xpath_text(it, xpath_with_ns(f'./itunes:{key}', NS_MAP), default=None)
2021-01-01 17:56:37 +05:30
entries.append({
'_type': 'url_transparent',
'url': next_url,
'title': try_call(lambda: it.find('title').text),
2021-01-01 17:56:37 +05:30
'description': xpath_text(it, 'description', default=None),
'timestamp': unified_timestamp(xpath_text(it, 'pubDate', default=None)),
'duration': parse_duration(itunes('duration')),
2021-01-01 17:56:37 +05:30
'thumbnail': url_or_none(xpath_attr(it, xpath_with_ns('./itunes:image', NS_MAP), 'href')),
'episode': itunes('title'),
'episode_number': int_or_none(itunes('episode')),
'season_number': int_or_none(itunes('season')),
'age_limit': {'true': 18, 'yes': 18, 'false': 0, 'no': 0}.get((itunes('explicit') or '').lower()),
})
return {
'_type': 'playlist',
'id': url,
'title': try_call(lambda: doc.find('./channel/title').text),
'description': try_call(lambda: doc.find('./channel/description').text),
'entries': entries,
}
@classmethod
def _kvs_get_real_url(cls, video_url, license_code):
if not video_url.startswith('function/0/'):
return video_url # not obfuscated
parsed = urllib.parse.urlparse(video_url[len('function/0/'):])
license_token = cls._kvs_get_license_token(license_code)
urlparts = parsed.path.split('/')
HASH_LENGTH = 32
hash_ = urlparts[3][:HASH_LENGTH]
indices = list(range(HASH_LENGTH))
# Swap indices of hash according to the destination calculated from the license token
accum = 0
for src in reversed(range(HASH_LENGTH)):
accum += license_token[src]
dest = (src + accum) % HASH_LENGTH
indices[src], indices[dest] = indices[dest], indices[src]
urlparts[3] = ''.join(hash_[index] for index in indices) + urlparts[3][HASH_LENGTH:]
return urllib.parse.urlunparse(parsed._replace(path='/'.join(urlparts)))
@staticmethod
def _kvs_get_license_token(license_code):
license_code = license_code.replace('$', '')
license_values = [int(char) for char in license_code]
modlicense = license_code.replace('0', '1')
center = len(modlicense) // 2
fronthalf = int(modlicense[:center + 1])
backhalf = int(modlicense[center:])
modlicense = str(4 * abs(fronthalf - backhalf))[:center + 1]
return [
(license_values[index + offset] + current) % 10
for index, current in enumerate(map(int, modlicense))
for offset in range(4)
]
def _extract_kvs(self, url, webpage, video_id):
    """Extract video info from a page using the Kernel Video Sharing (KVS) player.

    Reads the player's `flashvars` JS object from the page, de-obfuscates the
    video URLs via the page's license code, and returns a standard info dict.
    """
    # flashvars is a JS object literal; js_to_json makes it parseable
    flashvars = self._search_json(
        r'(?s:<script\b[^>]*>.*?var\s+flashvars\s*=)',
        webpage, 'flashvars', video_id, transform_source=js_to_json)

    # extract the part after the last / as the display_id from the
    # canonical URL.
    display_id = self._search_regex(
        r'(?:<link href="https?://[^"]+/(.+?)/?" rel="canonical"\s*/?>'
        r'|<link rel="canonical" href="https?://[^"]+/(.+?)/?"\s*/?>)',
        webpage, 'display_id', fatal=False)
    title = self._html_search_regex(r'<(?:h1|title)>(?:Video: )?(.+?)</(?:h1|title)>', webpage, 'title')

    thumbnail = flashvars['preview_url']
    if thumbnail.startswith('//'):
        # Protocol-relative thumbnail: reuse the page URL's scheme
        protocol, _, _ = url.partition('/')
        thumbnail = protocol + thumbnail

    # Formats live in flashvars under video_url / video_alt_url / video_alt_url2 ...
    url_keys = list(filter(re.compile(r'^video_(?:url|alt_url\d*)$').match, flashvars.keys()))
    formats = []
    for key in url_keys:
        if '/get_file/' not in flashvars[key]:
            continue
        format_id = flashvars.get(f'{key}_text', key)
        formats.append({
            'url': urljoin(url, self._kvs_get_real_url(flashvars[key], flashvars['license_code'])),
            'format_id': format_id,
            'ext': 'mp4',
            # Resolution is usually encoded in the format label or the URL
            **(parse_resolution(format_id) or parse_resolution(flashvars[key])),
            'http_headers': {'Referer': url},
        })
        # A format without a detectable height is presumably the best one
        if not formats[-1].get('height'):
            formats[-1]['quality'] = 1

    return {
        'id': flashvars['video_id'],
        'display_id': display_id,
        'title': title,
        'thumbnail': urljoin(url, thumbnail),
        'formats': formats,
    }
2013-06-23 20:31:45 +02:00
def _real_extract(self, url):
    """Main entry point: classify *url* and extract accordingly.

    Order of attempts: scheme fixup -> HTTP fetch (with optional
    impersonation) -> direct media link by Content-Type -> M3U sniff ->
    non-HTML direct link -> XML documents (RSS/ISM/SMIL/XSPF/MPD/F4M) ->
    embedded players found in the HTML.
    """
    if url.startswith('//'):
        # Scheme-relative URL: re-enter with the configured default scheme
        return self.url_result(self.http_scheme() + url)

    parsed_url = urllib.parse.urlparse(url)
    if not parsed_url.scheme:
        # No scheme at all: either fix it up, error out, or fall back to a
        # search, depending on --default-search
        default_search = self.get_param('default_search')
        if default_search is None:
            default_search = 'fixup_error'

        if default_search in ('auto', 'auto_warning', 'fixup_error'):
            if re.match(r'[^\s/]+\.[^\s/]+/', url):
                self.report_warning('The url doesn\'t specify the protocol, trying with https')
                return self.url_result('https://' + url)
            elif default_search != 'fixup_error':
                if default_search == 'auto_warning':
                    if re.match(r'^(?:url|URL)$', url):
                        raise ExtractorError(
                            f'Invalid URL: {url!r} . Call yt-dlp like this: yt-dlp -v "https://www.youtube.com/watch?v=BaW_jenozKc" ',
                            expected=True)
                    else:
                        self.report_warning(
                            f'Falling back to youtube search for {url} . Set --default-search "auto" to suppress this warning.')
                return self.url_result('ytsearch:' + url)

        if default_search in ('error', 'fixup_error'):
            raise ExtractorError(f'{url!r} is not a valid URL', expected=True)
        else:
            # Any other value is treated as a search prefix (e.g. 'ytsearch')
            if ':' not in default_search:
                default_search += ':'
            return self.url_result(default_search + url)

    original_url = url
    url, smuggled_data = unsmuggle_url(url, {})
    force_videoid = None
    is_intentional = smuggled_data.get('to_generic')
    if 'force_videoid' in smuggled_data:
        # An upstream extractor pinned the video ID (e.g. an RSS GUID)
        force_videoid = smuggled_data['force_videoid']
        video_id = force_videoid
    else:
        video_id = self._generic_id(url)

    # Do not impersonate by default; see https://github.com/yt-dlp/yt-dlp/issues/11335
    impersonate = self._configuration_arg('impersonate', ['false'])
    if 'false' in impersonate:
        impersonate = None

    # Some webservers may serve compressed content of rather big size (e.g. gzipped flac)
    # making it impossible to download only chunk of the file (yet we need only 512kB to
    # test whether it's HTML or not). According to yt-dlp default Accept-Encoding
    # that will always result in downloading the whole file that is not desirable.
    # Therefore for extraction pass we have to override Accept-Encoding to any in order
    # to accept raw bytes and being able to download only a chunk.
    # It may probably better to solve this by checking Content-Type for application/octet-stream
    # after a HEAD request, but not sure if we can rely on this.
    try:
        full_response = self._request_webpage(url, video_id, headers=filter_dict({
            'Accept-Encoding': 'identity',
            'Referer': smuggled_data.get('referer'),
        }), impersonate=impersonate)
    except ExtractorError as e:
        # Only a Cloudflare challenge (403 + cf-mitigated header) while not
        # already impersonating is recoverable with advice; re-raise the rest
        if not (isinstance(e.cause, HTTPError) and e.cause.status == 403
                and e.cause.response.get_header('cf-mitigated') == 'challenge'
                and e.cause.response.extensions.get('impersonate') is None):
            raise
        # A stale __cf_bm cookie can keep triggering the challenge; drop it
        cf_cookie_domain = traverse_obj(
            LenientSimpleCookie(e.cause.response.get_header('set-cookie')),
            ('__cf_bm', 'domain'))
        if cf_cookie_domain:
            self.write_debug(f'Clearing __cf_bm cookie for {cf_cookie_domain}')
            self.cookiejar.clear(domain=cf_cookie_domain, path='/', name='__cf_bm')
        msg = 'Got HTTP Error 403 caused by Cloudflare anti-bot challenge; '
        if not self._downloader._impersonate_target_available(ImpersonateTarget()):
            msg += ('see  https://github.com/yt-dlp/yt-dlp#impersonation  for '
                    'how to install the required impersonation dependency, and ')
        raise ExtractorError(
            f'{msg}try again with  --extractor-args "generic:impersonate"', expected=True)

    new_url = full_response.url
    if new_url != extract_basic_auth(url)[0]:
        # The server redirected us: restart extraction from the final URL so
        # a more specific extractor gets a chance to match it
        self.report_following_redirect(new_url)
        if force_videoid:
            new_url = smuggle_url(new_url, {'force_videoid': force_videoid})
        return self.url_result(new_url)

    info_dict = {
        'id': video_id,
        'title': self._generic_title(url),
        'timestamp': unified_timestamp(full_response.headers.get('Last-Modified')),
    }

    # Check for direct link to a video
    content_type = full_response.headers.get('Content-Type', '').lower()
    m = re.match(r'(?P<type>audio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P<format_id>[^;\s]+)', content_type)
    if m:
        self.report_detected('direct video link')
        headers = filter_dict({'Referer': smuggled_data.get('referer')})
        format_id = str(m.group('format_id'))
        ext = determine_ext(url, default_ext=None) or urlhandle_detect_ext(full_response)
        subtitles = {}
        if format_id.endswith('mpegurl') or ext == 'm3u8':
            formats, subtitles = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4', headers=headers)
        elif format_id == 'f4m' or ext == 'f4m':
            formats = self._extract_f4m_formats(url, video_id, headers=headers)
        # Don't check for DASH/mpd here, do it later w/ first_bytes. Same number of requests either way
        else:
            formats = [{
                'format_id': format_id,
                'url': url,
                'ext': ext,
                'vcodec': 'none' if m.group('type') == 'audio' else None,
            }]
            info_dict['direct'] = True
        info_dict.update({
            'formats': formats,
            'subtitles': subtitles,
            'http_headers': headers or None,
        })
        self._extra_manifest_info(info_dict, url)
        return info_dict

    if not self.get_param('test', False) and not is_intentional:
        force = self.get_param('force_generic_extractor', False)
        self.report_warning('%s generic information extractor' % ('Forcing' if force else 'Falling back on'))

    first_bytes = full_response.read(512)

    # Is it an M3U playlist?
    if first_bytes.startswith(b'#EXTM3U'):
        self.report_detected('M3U playlist')
        info_dict['formats'], info_dict['subtitles'] = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4')
        self._extra_manifest_info(info_dict, url)
        return info_dict

    # Maybe it's a direct link to a video?
    # Be careful not to download the whole thing!
    if not is_html(first_bytes):
        self.report_warning(
            'URL could be a direct video link, returning it as such.')
        ext = determine_ext(url)
        if ext not in _UnsafeExtensionError.ALLOWED_EXTENSIONS:
            ext = 'unknown_video'
        info_dict.update({
            'direct': True,
            'url': url,
            'ext': ext,
        })
        return info_dict

    webpage = self._webpage_read_content(
        full_response, url, video_id, prefix=first_bytes)

    if '<title>DPG Media Privacy Gate</title>' in webpage:
        # Consent interstitial; refetch to get the real page
        webpage = self._download_webpage(url, video_id)

    self.report_extraction(video_id)

    # Is it an RSS feed, a SMIL file, an XSPF playlist or a MPD manifest?
    try:
        try:
            doc = compat_etree_fromstring(webpage)
        except xml.etree.ElementTree.ParseError:
            doc = compat_etree_fromstring(webpage.encode())
        if doc.tag == 'rss':
            self.report_detected('RSS feed')
            return self._extract_rss(url, video_id, doc)
        elif doc.tag == 'SmoothStreamingMedia':
            info_dict['formats'], info_dict['subtitles'] = self._parse_ism_formats_and_subtitles(doc, url)
            self.report_detected('ISM manifest')
            return info_dict
        elif re.match(r'^(?:{[^}]+})?smil$', doc.tag):
            smil = self._parse_smil(doc, url, video_id)
            self.report_detected('SMIL file')
            return smil
        elif doc.tag == '{http://xspf.org/ns/0/}playlist':
            self.report_detected('XSPF playlist')
            return self.playlist_result(
                self._parse_xspf(
                    doc, video_id, xspf_url=url,
                    xspf_base_url=new_url),
                video_id)
        elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag):
            info_dict['formats'], info_dict['subtitles'] = self._parse_mpd_formats_and_subtitles(
                doc,
                # Do not use yt_dlp.utils.base_url here since it will raise on file:// URLs
                mpd_base_url=update_url(new_url, query=None, fragment=None).rpartition('/')[0],
                mpd_url=url)
            info_dict['live_status'] = 'is_live' if doc.get('type') == 'dynamic' else None
            self._extra_manifest_info(info_dict, url)
            self.report_detected('DASH manifest')
            return info_dict
        elif re.match(r'^{http://ns\.adobe\.com/f4m/[12]\.0}manifest$', doc.tag):
            info_dict['formats'] = self._parse_f4m_formats(doc, url, video_id)
            self.report_detected('F4M manifest')
            return info_dict
    except xml.etree.ElementTree.ParseError:
        pass

    info_dict.update({
        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        'title': self._generic_title('', webpage, default='video'),
        'description': self._og_search_description(webpage, default=None),
        'thumbnail': self._og_search_thumbnail(webpage, default=None),
        'age_limit': self._rta_search(webpage),
    })

    self._downloader.write_debug('Looking for embeds')
    embeds = list(self._extract_embeds(original_url, webpage, urlh=full_response, info_dict=info_dict))
    if len(embeds) == 1:
        # A single embed is merged with the page-level metadata
        return merge_dicts(embeds[0], info_dict)
    elif embeds:
        return self.playlist_result(embeds, **info_dict)
    raise UnsupportedError(url)
def _extract_embeds(self, url, webpage, *, urlh=None, info_dict={}):
    """Return a list of video entries embedded in a webpage.

    url       -- the page URL (may carry smuggled data, e.g. 'block_ies')
    webpage   -- the downloaded page content
    urlh      -- optional response handle; its final URL is used as Referer
    info_dict -- read-only metadata from the caller ('display_id'/'id',
                 'title', 'age_limit' are consulted)

    Returns [] when nothing is detected.
    """
    # The mutable default is safe here: it is immediately wrapped read-only.
    info_dict = types.MappingProxyType(info_dict)  # Prevents accidental mutation
    video_id = traverse_obj(info_dict, 'display_id', 'id') or self._generic_id(url)
    url, smuggled_data = unsmuggle_url(url, {})
    actual_url = urlh.url if urlh else url

    # Sometimes embedded video player is hidden behind percent encoding
    # (e.g. https://github.com/ytdl-org/youtube-dl/issues/2448)
    # Unescaping the whole page allows to handle those cases in a generic way
    # FIXME: unescaping the whole page may break URLs, commenting out for now.
    # There probably should be a second run of generic extractor on unescaped webpage.
    # webpage = urllib.parse.unquote(webpage)

    # First pass: let every registered extractor search the page for its
    # own embeds. An extractor may raise StopExtraction to claim the page
    # exclusively, discarding anything found so far.
    embeds = []
    for ie in self._downloader._ies.values():
        if ie.ie_key() in smuggled_data.get('block_ies', []):
            continue
        gen = ie.extract_from_webpage(self._downloader, url, webpage)
        current_embeds = []
        try:
            while True:
                current_embeds.append(next(gen))
        except self.StopExtraction:
            self.report_detected(f'{ie.IE_NAME} exclusive embed', len(current_embeds),
                                 embeds and 'discarding other embeds')
            return current_embeds
        except StopIteration:
            self.report_detected(f'{ie.IE_NAME} embed', len(current_embeds))
            embeds.extend(current_embeds)
    if embeds:
        return embeds

    # JW Player configuration blobs embedded in the page JS
    jwplayer_data = self._find_jwplayer_data(
        webpage, video_id, transform_source=js_to_json)
    if jwplayer_data:
        if isinstance(jwplayer_data.get('playlist'), str):
            self.report_detected('JW Player playlist')
            return [self.url_result(jwplayer_data['playlist'], 'JWPlatform')]
        try:
            info = self._parse_jwplayer_data(
                jwplayer_data, video_id, require_title=False, base_url=url)
            # Only accept the parse if it actually yielded formats
            if traverse_obj(info, 'formats', ('entries', ..., 'formats')):
                self.report_detected('JW Player data')
                return [info]
        except ExtractorError:
            # See https://github.com/ytdl-org/youtube-dl/pull/16735
            pass

    # Video.js embed
    mobj = re.search(
        r'(?s)\bvideojs\s*\(.+?([a-zA-Z0-9_$]+)\.src\s*\(\s*((?:\[.+?\]|{.+?}))\s*\)\s*;',
        webpage)
    if mobj is not None:
        varname = mobj.group(1)
        # .src() accepts either a single source object or an array of them
        sources = variadic(self._parse_json(
            mobj.group(2), video_id, transform_source=js_to_json, fatal=False) or [])
        formats, subtitles, src = [], {}, None
        for source in sources:
            src = source.get('src')
            if not src or not isinstance(src, str):
                continue
            src = urllib.parse.urljoin(url, src)
            src_type = source.get('type')
            if isinstance(src_type, str):
                src_type = src_type.lower()
            ext = determine_ext(src).lower()
            if src_type == 'video/youtube':
                return [self.url_result(src, YoutubeIE.ie_key())]
            if src_type == 'application/dash+xml' or ext == 'mpd':
                fmts, subs = self._extract_mpd_formats_and_subtitles(
                    src, video_id, mpd_id='dash', fatal=False)
                formats.extend(fmts)
                self._merge_subtitles(subs, target=subtitles)
            elif src_type == 'application/x-mpegurl' or ext == 'm3u8':
                fmts, subs = self._extract_m3u8_formats_and_subtitles(
                    src, video_id, 'mp4', entry_protocol='m3u8_native',
                    m3u8_id='hls', fatal=False)
                formats.extend(fmts)
                self._merge_subtitles(subs, target=subtitles)

            if not formats:
                formats.append({
                    'url': src,
                    'ext': (mimetype2ext(src_type)
                            or ext if ext in KNOWN_EXTENSIONS else 'mp4'),
                    'http_headers': {
                        'Referer': actual_url,
                    },
                })
        # https://docs.videojs.com/player#addRemoteTextTrack
        # https://html.spec.whatwg.org/multipage/media.html#htmltrackelement
        for sub_match in re.finditer(rf'(?s){re.escape(varname)}' + r'\.addRemoteTextTrack\(({.+?})\s*,\s*(?:true|false)\)', webpage):
            sub = self._parse_json(
                sub_match.group(1), video_id, transform_source=js_to_json, fatal=False) or {}
            sub_src = str_or_none(sub.get('src'))
            if not sub_src:
                continue
            subtitles.setdefault(dict_get(sub, ('language', 'srclang')) or 'und', []).append({
                'url': urllib.parse.urljoin(url, sub_src),
                'name': sub.get('label'),
                'http_headers': {
                    'Referer': actual_url,
                },
            })
        if formats or subtitles:
            self.report_detected('video.js embed')
            info_dict = {'formats': formats, 'subtitles': subtitles}
            if formats:
                # src still holds the last processed source URL here
                self._extra_manifest_info(info_dict, src)
            return [info_dict]

    # Look for generic KVS player (before json-ld bc of some urls that break otherwise)
    found = self._search_regex((
        r'<script\b[^>]+?\bsrc\s*=\s*(["\'])https?://(?:(?!\1)[^?#])+/kt_player\.js\?v=(?P<ver>\d+(?:\.\d+)+)\1[^>]*>',
        r'kt_player\s*\(\s*(["\'])(?:(?!\1)[\w\W])+\1\s*,\s*(["\'])https?://(?:(?!\2)[^?#])+/kt_player\.swf\?v=(?P<ver>\d+(?:\.\d+)+)\2\s*,',
    ), webpage, 'KVS player', group='ver', default=False)
    if found:
        self.report_detected('KVS Player')
        if found.split('.')[0] not in ('4', '5', '6'):
            self.report_warning(f'Untested major version ({found}) in player engine - download may fail.')
        return [self._extract_kvs(url, webpage, video_id)]

    # Looking for http://schema.org/VideoObject
    json_ld = self._search_json_ld(webpage, video_id, default={})
    if json_ld.get('url') not in (url, None):
        self.report_detected('JSON LD')
        # Direct media links become plain videos; manifests/pages are
        # delegated transparently back through the extraction machinery
        is_direct = json_ld.get('ext') not in (None, *MEDIA_EXTENSIONS.manifests)
        return [merge_dicts({
            '_type': 'video' if is_direct else 'url_transparent',
            'url': smuggle_url(json_ld['url'], {
                'force_videoid': video_id,
                'to_generic': True,
                'referer': url,
            }),
        }, json_ld)]

    def check_video(vurl):
        # Accept YouTube/RTMP URLs outright; otherwise reject obvious
        # non-media extensions (images, subtitles, scripts, manifests)
        if YoutubeIE.suitable(vurl):
            return True
        if RtmpIE.suitable(vurl):
            return True
        vpath = urllib.parse.urlparse(vurl).path
        vext = determine_ext(vpath, None)
        return vext not in (None, 'swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml', 'js', 'xml')

    def filter_video(urls):
        return list(filter(check_video, urls))

    # Start with something easy: JW Player in SWFObject
    found = filter_video(re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage))
    if found:
        self.report_detected('JW Player in SFWObject')
    else:
        # Look for gorilla-vid style embedding
        found = filter_video(re.findall(r'''(?sx)
            (?:
                jw_plugins|
                JWPlayerOptions|
                jwplayer\s*\(\s*["'][^'"]+["']\s*\)\s*\.setup
            )
            .*?
            ['"]?file['"]?\s*:\s*["\'](.*?)["\']''', webpage))
        if found:
            self.report_detected('JW Player embed')
    if not found:
        # Broaden the search a little bit
        found = filter_video(re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage))
        if found:
            self.report_detected('video file')
    if not found:
        # Broaden the findall a little bit: JWPlayer JS loader
        found = filter_video(re.findall(
            r'[^A-Za-z0-9]?(?:file|video_url)["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage))
        if found:
            self.report_detected('JW Player JS loader')
    if not found:
        # Flow player
        found = filter_video(re.findall(r'''(?xs)
            flowplayer\("[^"]+",\s*
                \{[^}]+?\}\s*,
                \s*\{[^}]+? ["']?clip["']?\s*:\s*\{\s*
                    ["']?url["']?\s*:\s*["']([^"']+)["']
        ''', webpage))
        if found:
            self.report_detected('Flow Player')
    if not found:
        # Cinerama player
        found = re.findall(
            r"cinerama\.embedPlayer\(\s*\'[^']+\',\s*'([^']+)'", webpage)
        if found:
            self.report_detected('Cinerama player')
    if not found:
        # Try to find twitter cards info
        # twitter:player:stream should be checked before twitter:player since
        # it is expected to contain a raw stream (see
        # https://dev.twitter.com/cards/types/player#On_twitter.com_via_desktop_browser)
        found = filter_video(re.findall(
            r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage))
        if found:
            self.report_detected('Twitter card')
    if not found:
        # We look for Open Graph info:
        # We have to match any number spaces between elements, some sites try to align them, e.g.: statigr.am
        m_video_type = re.findall(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
        # We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
        # NOTE(review): re.findall never returns None, so this check is
        # always true; left as-is to preserve established behavior.
        if m_video_type is not None:
            found = filter_video(re.findall(r'<meta.*?property="og:(?:video|audio)".*?content="(.*?)"', webpage))
        if found:
            self.report_detected('Open Graph video info')
    if not found:
        REDIRECT_REGEX = r'[0-9]{,2};\s*(?:URL|url)=\'?([^\'"]+)'
        found = re.search(
            r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")'
            rf'(?:[a-z-]+="[^"]+"\s+)*?content="{REDIRECT_REGEX}',
            webpage)
        if not found:
            # Look also in Refresh HTTP header
            refresh_header = urlh and urlh.headers.get('Refresh')
            if refresh_header:
                found = re.search(REDIRECT_REGEX, refresh_header)
        if found:
            new_url = urllib.parse.urljoin(url, unescapeHTML(found.group(1)))
            if new_url != url:
                self.report_following_redirect(new_url)
                return [self.url_result(new_url)]
            else:
                found = None

    if not found:
        # twitter:player is a https URL to iframe player that may or may not
        # be supported by yt-dlp thus this is checked the very last (see
        # https://dev.twitter.com/cards/types/player#On_twitter.com_via_desktop_browser)
        embed_url = self._html_search_meta('twitter:player', webpage, default=None)
        if embed_url and embed_url != url:
            self.report_detected('twitter:player iframe')
            return [self.url_result(embed_url)]

    if not found:
        return []

    domain_name = self._search_regex(r'^(?:https?://)?([^/]*)/.*', url, 'video uploader', default=None)

    entries = []
    for video_url in orderedSet(found):
        # Undo JS string escaping and HTML entity encoding of the URL
        video_url = video_url.encode().decode('unicode-escape')
        video_url = unescapeHTML(video_url)
        video_url = video_url.replace('\\/', '/')
        video_url = urllib.parse.urljoin(url, video_url)
        video_id = urllib.parse.unquote(os.path.basename(video_url))

        # Sometimes, jwplayer extraction will result in a YouTube URL
        if YoutubeIE.suitable(video_url):
            entries.append(self.url_result(video_url, 'Youtube'))
            continue

        video_id = os.path.splitext(video_id)[0]
        headers = {
            'referer': actual_url,
        }

        entry_info_dict = {
            'id': video_id,
            'uploader': domain_name,
            'title': info_dict['title'],
            'age_limit': info_dict['age_limit'],
            'http_headers': headers,
        }

        if RtmpIE.suitable(video_url):
            entry_info_dict.update({
                '_type': 'url_transparent',
                'ie_key': RtmpIE.ie_key(),
                'url': video_url,
            })
            entries.append(entry_info_dict)
            continue

        # Dispatch on the URL extension to the matching manifest parser
        ext = determine_ext(video_url)
        if ext == 'smil':
            entry_info_dict = {**self._extract_smil_info(video_url, video_id), **entry_info_dict}
        elif ext == 'xspf':
            return [self._extract_xspf_playlist(video_url, video_id)]
        elif ext == 'm3u8':
            entry_info_dict['formats'], entry_info_dict['subtitles'] = self._extract_m3u8_formats_and_subtitles(video_url, video_id, ext='mp4', headers=headers)
            self._extra_manifest_info(entry_info_dict, video_url)
        elif ext == 'mpd':
            entry_info_dict['formats'], entry_info_dict['subtitles'] = self._extract_mpd_formats_and_subtitles(video_url, video_id, headers=headers)
            self._extra_manifest_info(entry_info_dict, video_url)
        elif ext == 'f4m':
            entry_info_dict['formats'] = self._extract_f4m_formats(video_url, video_id, headers=headers)
        elif re.search(r'(?i)\.(?:ism|smil)/manifest', video_url) and video_url != url:
            # Just matching .ism/manifest is not enough to be reliably sure
            # whether it's actually an ISM manifest or some other streaming
            # manifest since there are various streaming URL formats
            # possible (see [1]) as well as some other shenanigans like
            # .smil/manifest URLs that actually serve an ISM (see [2]) and
            # so on.
            # Thus the most reasonable way to solve this is to delegate
            # to generic extractor in order to look into the contents of
            # the manifest itself.
            # 1. https://azure.microsoft.com/en-us/documentation/articles/media-services-deliver-content-overview/#streaming-url-formats
            # 2. https://svs.itworkscdn.net/lbcivod/smil:itwfcdn/lbci/170976.smil/Manifest
            entry_info_dict = self.url_result(
                smuggle_url(video_url, {'to_generic': True}),
                GenericIE.ie_key())
        else:
            entry_info_dict['url'] = video_url

        entries.append(entry_info_dict)

    if len(entries) > 1:
        for num, e in enumerate(entries, start=1):
            # 'url' results don't have a title
            if e.get('title') is not None:
                e['title'] = '{} ({})'.format(e['title'], num)

    return entries