diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml index 04de087085..ee89f27d43 100644 --- a/.github/FUNDING.yml +++ b/.github/FUNDING.yml @@ -10,4 +10,4 @@ liberapay: # Replace with a single Liberapay username issuehunt: # Replace with a single IssueHunt username otechie: # Replace with a single Otechie username -custom: ['https://github.com/yt-dlp/yt-dlp/blob/master/Collaborators.md#collaborators'] +custom: ['https://github.com/yt-dlp/yt-dlp/blob/master/Maintainers.md#maintainers'] diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index fda92cea88..69f8eb0c8d 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -194,7 +194,7 @@ jobs: UPDATE_TO: yt-dlp/yt-dlp@2025.09.05 steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 with: fetch-depth: 0 # Needed for changelog @@ -214,7 +214,7 @@ jobs: - name: Build Unix platform-independent binary run: | - make all tar + make all-extra tar - name: Verify --update-to if: vars.UPDATE_TO_VERIFICATION @@ -255,7 +255,7 @@ jobs: SKIP_ONEFILE_BUILD: ${{ (!matrix.onefile && '1') || '' }} steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 - name: Cache requirements if: matrix.cache_requirements @@ -318,7 +318,7 @@ jobs: UPDATE_TO: yt-dlp/yt-dlp@2025.09.05 steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 # NB: Building universal2 does not work with python from actions/setup-python - name: Cache requirements @@ -341,14 +341,14 @@ jobs: brew uninstall --ignore-dependencies python3 python3 -m venv ~/yt-dlp-build-venv source ~/yt-dlp-build-venv/bin/activate - python3 devscripts/install_deps.py -o --include build - python3 devscripts/install_deps.py --print --include pyinstaller > requirements.txt + python3 devscripts/install_deps.py --only-optional-groups --include-group build + python3 devscripts/install_deps.py --print --include-group pyinstaller > requirements.txt # We need to ignore wheels otherwise we break universal2 builds python3 -m pip install -U --no-binary :all: -r requirements.txt # We need to fuse our own universal2 wheels for curl_cffi python3 -m pip install -U 'delocate==0.11.0' mkdir curl_cffi_whls curl_cffi_universal2 - python3 devscripts/install_deps.py --print -o --include curl-cffi > requirements.txt + python3 devscripts/install_deps.py --print --only-optional-groups --include-group curl-cffi > requirements.txt for platform in "macosx_11_0_arm64" "macosx_11_0_x86_64"; do python3 -m pip download \ --only-binary=:all: \ @@ -448,7 +448,7 @@ jobs: PYI_WHEEL: pyinstaller-${{ matrix.pyi_version }}-py3-none-${{ matrix.platform_tag }}.whl steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 - uses: actions/setup-python@v6 with: python-version: ${{ matrix.python_version }} @@ -482,11 +482,11 @@ jobs: mkdir /pyi-wheels python -m pip download -d /pyi-wheels --no-deps --require-hashes "pyinstaller@${Env:PYI_URL}#sha256=${Env:PYI_HASH}" python -m pip install --force-reinstall -U "/pyi-wheels/${Env:PYI_WHEEL}" - python devscripts/install_deps.py -o --include build + python devscripts/install_deps.py --only-optional-groups --include-group build if ("${Env:ARCH}" -eq "x86") { python devscripts/install_deps.py } else { - python devscripts/install_deps.py --include curl-cffi + python devscripts/install_deps.py --include-group curl-cffi } - name: Prepare @@ -536,7 +536,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Download artifacts - uses: actions/download-artifact@v4 + uses: actions/download-artifact@v5 with: path: artifact pattern: build-bin-* @@ 
-558,35 +558,39 @@ jobs: cat >> _update_spec << EOF # This file is used for regulating self-update lock 2022.08.18.36 .+ Python 3\.6 - lock 2023.11.16 (?!win_x86_exe).+ Python 3\.7 + lock 2023.11.16 zip Python 3\.7 lock 2023.11.16 win_x86_exe .+ Windows-(?:Vista|2008Server) lock 2024.10.22 py2exe .+ lock 2024.10.22 zip Python 3\.8 lock 2024.10.22 win(?:_x86)?_exe Python 3\.[78].+ Windows-(?:7-|2008ServerR2) lock 2025.08.11 darwin_legacy_exe .+ lock 2025.08.27 linux_armv7l_exe .+ + lock 2025.10.14 zip Python 3\.9 lockV2 yt-dlp/yt-dlp 2022.08.18.36 .+ Python 3\.6 - lockV2 yt-dlp/yt-dlp 2023.11.16 (?!win_x86_exe).+ Python 3\.7 + lockV2 yt-dlp/yt-dlp 2023.11.16 zip Python 3\.7 lockV2 yt-dlp/yt-dlp 2023.11.16 win_x86_exe .+ Windows-(?:Vista|2008Server) lockV2 yt-dlp/yt-dlp 2024.10.22 py2exe .+ lockV2 yt-dlp/yt-dlp 2024.10.22 zip Python 3\.8 lockV2 yt-dlp/yt-dlp 2024.10.22 win(?:_x86)?_exe Python 3\.[78].+ Windows-(?:7-|2008ServerR2) lockV2 yt-dlp/yt-dlp 2025.08.11 darwin_legacy_exe .+ lockV2 yt-dlp/yt-dlp 2025.08.27 linux_armv7l_exe .+ - lockV2 yt-dlp/yt-dlp-nightly-builds 2023.11.15.232826 (?!win_x86_exe).+ Python 3\.7 + lockV2 yt-dlp/yt-dlp 2025.10.14 zip Python 3\.9 + lockV2 yt-dlp/yt-dlp-nightly-builds 2023.11.15.232826 zip Python 3\.7 lockV2 yt-dlp/yt-dlp-nightly-builds 2023.11.15.232826 win_x86_exe .+ Windows-(?:Vista|2008Server) lockV2 yt-dlp/yt-dlp-nightly-builds 2024.10.22.051025 py2exe .+ lockV2 yt-dlp/yt-dlp-nightly-builds 2024.10.22.051025 zip Python 3\.8 lockV2 yt-dlp/yt-dlp-nightly-builds 2024.10.22.051025 win(?:_x86)?_exe Python 3\.[78].+ Windows-(?:7-|2008ServerR2) lockV2 yt-dlp/yt-dlp-nightly-builds 2025.08.12.233030 darwin_legacy_exe .+ lockV2 yt-dlp/yt-dlp-nightly-builds 2025.08.30.232839 linux_armv7l_exe .+ - lockV2 yt-dlp/yt-dlp-master-builds 2023.11.15.232812 (?!win_x86_exe).+ Python 3\.7 + lockV2 yt-dlp/yt-dlp-nightly-builds 2025.10.14.232845 zip Python 3\.9 + lockV2 yt-dlp/yt-dlp-master-builds 2023.11.15.232812 zip Python 3\.7 lockV2 yt-dlp/yt-dlp-master-builds 2023.11.15.232812 win_x86_exe .+ Windows-(?:Vista|2008Server) lockV2 yt-dlp/yt-dlp-master-builds 2024.10.22.045052 py2exe .+ lockV2 yt-dlp/yt-dlp-master-builds 2024.10.22.060347 zip Python 3\.8 lockV2 yt-dlp/yt-dlp-master-builds 2024.10.22.060347 win(?:_x86)?_exe Python 3\.[78].+ Windows-(?:7-|2008ServerR2) lockV2 yt-dlp/yt-dlp-master-builds 2025.08.12.232447 darwin_legacy_exe .+ lockV2 yt-dlp/yt-dlp-master-builds 2025.09.05.212910 linux_armv7l_exe .+ + lockV2 yt-dlp/yt-dlp-master-builds 2025.10.14.232330 zip Python 3\.9 EOF - name: Sign checksum files diff --git a/.github/workflows/challenge-tests.yml b/.github/workflows/challenge-tests.yml new file mode 100644 index 0000000000..89895eb07b --- /dev/null +++ b/.github/workflows/challenge-tests.yml @@ -0,0 +1,77 @@ +name: Challenge Tests +on: + push: + paths: + - .github/workflows/challenge-tests.yml + - test/test_jsc/*.py + - yt_dlp/extractor/youtube/jsc/**.js + - yt_dlp/extractor/youtube/jsc/**.py + - yt_dlp/extractor/youtube/pot/**.py + - yt_dlp/utils/_jsruntime.py + pull_request: + paths: + - .github/workflows/challenge-tests.yml + - test/test_jsc/*.py + - yt_dlp/extractor/youtube/jsc/**.js + - yt_dlp/extractor/youtube/jsc/**.py + - yt_dlp/extractor/youtube/pot/**.py + - yt_dlp/utils/_jsruntime.py +permissions: + contents: read + +concurrency: + group: challenge-tests-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: ${{ github.event_name == 'pull_request' }} + +jobs: + tests: + name: Challenge Tests + runs-on: ${{ matrix.os }} + 
strategy: + fail-fast: false + matrix: + os: [ubuntu-latest, windows-latest] + python-version: ['3.10', '3.11', '3.12', '3.13', '3.14', pypy-3.11] + env: + QJS_VERSION: '2025-04-26' # Earliest version with rope strings + steps: + - uses: actions/checkout@v5 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v6 + with: + python-version: ${{ matrix.python-version }} + - name: Install Deno + uses: denoland/setup-deno@v2 + with: + deno-version: '2.0.0' # minimum supported version + - name: Install Bun + uses: oven-sh/setup-bun@v2 + with: + # minimum supported version is 1.0.31 but earliest available Windows version is 1.1.0 + bun-version: ${{ (matrix.os == 'windows-latest' && '1.1.0') || '1.0.31' }} + - name: Install Node + uses: actions/setup-node@v6 + with: + node-version: '20.0' # minimum supported version + - name: Install QuickJS (Linux) + if: matrix.os == 'ubuntu-latest' + run: | + wget "https://bellard.org/quickjs/binary_releases/quickjs-linux-x86_64-${QJS_VERSION}.zip" -O quickjs.zip + unzip quickjs.zip qjs + sudo install qjs /usr/local/bin/qjs + - name: Install QuickJS (Windows) + if: matrix.os == 'windows-latest' + shell: pwsh + run: | + Invoke-WebRequest "https://bellard.org/quickjs/binary_releases/quickjs-win-x86_64-${Env:QJS_VERSION}.zip" -OutFile quickjs.zip + unzip quickjs.zip + - name: Install test requirements + run: | + python ./devscripts/install_deps.py --print --only-optional-groups --include-group test > requirements.txt + python ./devscripts/install_deps.py --print -c certifi -c requests -c urllib3 -c yt-dlp-ejs >> requirements.txt + python -m pip install -U -r requirements.txt + - name: Run tests + timeout-minutes: 15 + run: | + python -m yt_dlp -v --js-runtimes node --js-runtimes bun --js-runtimes quickjs || true + python ./devscripts/run_tests.py test/test_jsc -k download diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 910c409efc..6d4dbdf193 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -29,7 +29,7 @@ jobs: steps: - name: Checkout repository - uses: actions/checkout@v4 + uses: actions/checkout@v5 # Initializes the CodeQL tools for scanning. 
- name: Initialize CodeQL diff --git a/.github/workflows/core.yml b/.github/workflows/core.yml index 93442529f5..ae3dc95e1b 100644 --- a/.github/workflows/core.yml +++ b/.github/workflows/core.yml @@ -7,6 +7,7 @@ on: - test/** - yt_dlp/**.py - '!yt_dlp/extractor/**.py' + - yt_dlp/extractor/youtube/**.py - yt_dlp/extractor/__init__.py - yt_dlp/extractor/common.py - yt_dlp/extractor/extractors.py @@ -17,6 +18,7 @@ on: - test/** - yt_dlp/**.py - '!yt_dlp/extractor/**.py' + - yt_dlp/extractor/youtube/**.py - yt_dlp/extractor/__init__.py - yt_dlp/extractor/common.py - yt_dlp/extractor/extractors.py @@ -36,12 +38,10 @@ jobs: fail-fast: false matrix: os: [ubuntu-latest] - # CPython 3.9 is in quick-test - python-version: ['3.10', '3.11', '3.12', '3.13', '3.14-dev', pypy-3.11] + # CPython 3.10 is in quick-test + python-version: ['3.11', '3.12', '3.13', '3.14', pypy-3.11] include: # atleast one of each CPython/PyPy tests must be in windows - - os: windows-latest - python-version: '3.9' - os: windows-latest python-version: '3.10' - os: windows-latest @@ -51,17 +51,17 @@ jobs: - os: windows-latest python-version: '3.13' - os: windows-latest - python-version: '3.14-dev' + python-version: '3.14' - os: windows-latest python-version: pypy-3.11 steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v6 with: python-version: ${{ matrix.python-version }} - name: Install test requirements - run: python3 ./devscripts/install_deps.py --include test --include curl-cffi + run: python ./devscripts/install_deps.py --include-group test --include-group curl-cffi - name: Run tests timeout-minutes: 15 continue-on-error: False diff --git a/.github/workflows/download.yml b/.github/workflows/download.yml index 8dbfee6f88..d075270d7b 100644 --- a/.github/workflows/download.yml +++ b/.github/workflows/download.yml @@ -9,16 +9,16 @@ jobs: if: "contains(github.event.head_commit.message, 'ci run dl')" runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 - name: Set up Python uses: actions/setup-python@v6 with: - python-version: 3.9 + python-version: '3.10' - name: Install test requirements - run: python3 ./devscripts/install_deps.py --include dev + run: python ./devscripts/install_deps.py --include-group dev - name: Run tests continue-on-error: true - run: python3 ./devscripts/run_tests.py download + run: python ./devscripts/run_tests.py download full: name: Full Download Tests @@ -28,21 +28,21 @@ jobs: fail-fast: true matrix: os: [ubuntu-latest] - python-version: ['3.10', '3.11', '3.12', '3.13', '3.14-dev', pypy-3.11] + python-version: ['3.11', '3.12', '3.13', '3.14', pypy-3.11] include: # atleast one of each CPython/PyPy tests must be in windows - os: windows-latest - python-version: '3.9' + python-version: '3.10' - os: windows-latest python-version: pypy-3.11 steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v6 with: python-version: ${{ matrix.python-version }} - name: Install test requirements - run: python3 ./devscripts/install_deps.py --include dev + run: python ./devscripts/install_deps.py --include-group dev - name: Run tests continue-on-error: true - run: python3 ./devscripts/run_tests.py download + run: python ./devscripts/run_tests.py download diff --git a/.github/workflows/quick-test.yml b/.github/workflows/quick-test.yml index 1c0fb4e4da..a6e84b1d80 100644 --- a/.github/workflows/quick-test.yml 
+++ b/.github/workflows/quick-test.yml @@ -9,13 +9,13 @@ jobs: if: "!contains(github.event.head_commit.message, 'ci skip all')" runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 - - name: Set up Python 3.9 + - uses: actions/checkout@v5 + - name: Set up Python 3.10 uses: actions/setup-python@v6 with: - python-version: '3.9' + python-version: '3.10' - name: Install test requirements - run: python3 ./devscripts/install_deps.py -o --include test + run: python ./devscripts/install_deps.py --only-optional-groups --include-group test - name: Run tests timeout-minutes: 15 run: | @@ -26,14 +26,14 @@ jobs: if: "!contains(github.event.head_commit.message, 'ci skip all')" runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 - uses: actions/setup-python@v6 with: - python-version: '3.9' + python-version: '3.10' - name: Install dev dependencies - run: python3 ./devscripts/install_deps.py -o --include static-analysis + run: python ./devscripts/install_deps.py --only-optional-groups --include-group static-analysis - name: Make lazy extractors - run: python3 ./devscripts/make_lazy_extractors.py + run: python ./devscripts/make_lazy_extractors.py - name: Run ruff run: ruff check --output-format github . - name: Run autopep8 diff --git a/.github/workflows/release-master.yml b/.github/workflows/release-master.yml index 7dfda9f842..f44da792f8 100644 --- a/.github/workflows/release-master.yml +++ b/.github/workflows/release-master.yml @@ -38,7 +38,7 @@ jobs: id-token: write # mandatory for trusted publishing steps: - name: Download artifacts - uses: actions/download-artifact@v4 + uses: actions/download-artifact@v5 with: path: dist name: build-pypi diff --git a/.github/workflows/release-nightly.yml b/.github/workflows/release-nightly.yml index 13cce8c33f..26be60fe61 100644 --- a/.github/workflows/release-nightly.yml +++ b/.github/workflows/release-nightly.yml @@ -12,7 +12,7 @@ jobs: outputs: commit: ${{ steps.check_for_new_commits.outputs.commit }} steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 with: fetch-depth: 0 - name: Check for new commits @@ -53,7 +53,7 @@ jobs: id-token: write # mandatory for trusted publishing steps: - name: Download artifacts - uses: actions/download-artifact@v4 + uses: actions/download-artifact@v5 with: path: dist name: build-pypi diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 52c61808fc..afe1d384b4 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -75,7 +75,7 @@ jobs: head_sha: ${{ steps.get_target.outputs.head_sha }} steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 with: fetch-depth: 0 @@ -170,7 +170,7 @@ jobs: id-token: write # mandatory for trusted publishing steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 with: fetch-depth: 0 - uses: actions/setup-python@v6 @@ -180,7 +180,7 @@ jobs: - name: Install Requirements run: | sudo apt -y install pandoc man - python devscripts/install_deps.py -o --include build + python devscripts/install_deps.py --only-optional-groups --include-group build - name: Prepare env: @@ -233,10 +233,10 @@ jobs: VERSION: ${{ needs.prepare.outputs.version }} HEAD_SHA: ${{ needs.prepare.outputs.head_sha }} steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 with: fetch-depth: 0 - - uses: actions/download-artifact@v4 + - uses: actions/download-artifact@v5 with: path: artifact pattern: build-* @@ -259,7 +259,7 @@ jobs: 
"[![Discord](https://img.shields.io/discord/807245652072857610?color=blue&labelColor=555555&label=&logo=discord&style=for-the-badge)]" \ "(https://discord.gg/H5MNcFW63r \"Discord\") " \ "[![Donate](https://img.shields.io/badge/_-Donate-red.svg?logo=githubsponsors&labelColor=555555&style=for-the-badge)]" \ - "(https://github.com/${BASE_REPO}/blob/master/Collaborators.md#collaborators \"Donate\") " \ + "(https://github.com/${BASE_REPO}/blob/master/Maintainers.md#maintainers \"Donate\") " \ "[![Documentation](https://img.shields.io/badge/-Docs-brightgreen.svg?style=for-the-badge&logo=GitBook&labelColor=555555)]" \ "(https://github.com/${REPOSITORY}${DOCS_PATH}#readme \"Documentation\") " > ./RELEASE_NOTES if [[ "${TARGET_REPO}" == "${BASE_REPO}" ]]; then @@ -269,9 +269,10 @@ jobs: "[![Master](https://img.shields.io/badge/Master%20builds-lightblue.svg?style=for-the-badge)]" \ "(https://github.com/${MASTER_REPO}/releases/latest \"Master builds\")" >> ./RELEASE_NOTES fi - printf '\n\n%s\n\n%s%s\n\n---\n' \ + printf '\n\n%s\n\n%s%s%s\n\n---\n' \ "#### A description of the various files is in the [README](https://github.com/${REPOSITORY}#release-files)" \ - "The PyInstaller-bundled executables are subject to the licenses described in " \ + "The zipimport Unix executable contains code licensed under ISC and MIT. " \ + "The PyInstaller-bundled executables are subject to these and other licenses, all of which are compiled in " \ "[THIRD_PARTY_LICENSES.txt](https://github.com/${BASE_REPO}/blob/${HEAD_SHA}/THIRD_PARTY_LICENSES.txt)" >> ./RELEASE_NOTES python ./devscripts/make_changelog.py -vv --collapsible >> ./RELEASE_NOTES printf '%s\n\n' '**This is a pre-release build**' >> ./PRERELEASE_NOTES diff --git a/.github/workflows/signature-tests.yml b/.github/workflows/signature-tests.yml deleted file mode 100644 index ae2221d28a..0000000000 --- a/.github/workflows/signature-tests.yml +++ /dev/null @@ -1,41 +0,0 @@ -name: Signature Tests -on: - push: - paths: - - .github/workflows/signature-tests.yml - - test/test_youtube_signature.py - - yt_dlp/jsinterp.py - pull_request: - paths: - - .github/workflows/signature-tests.yml - - test/test_youtube_signature.py - - yt_dlp/jsinterp.py -permissions: - contents: read - -concurrency: - group: signature-tests-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: ${{ github.event_name == 'pull_request' }} - -jobs: - tests: - name: Signature Tests - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - os: [ubuntu-latest, windows-latest] - python-version: ['3.9', '3.10', '3.11', '3.12', '3.13', '3.14-dev', pypy-3.11] - steps: - - uses: actions/checkout@v4 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v6 - with: - python-version: ${{ matrix.python-version }} - - name: Install test requirements - run: python3 ./devscripts/install_deps.py --only-optional --include test - - name: Run tests - timeout-minutes: 15 - run: | - python3 -m yt_dlp -v || true # Print debug head - python3 ./devscripts/run_tests.py test/test_youtube_signature.py diff --git a/.github/workflows/test-workflows.yml b/.github/workflows/test-workflows.yml index 684ec6cc44..37bf044d69 100644 --- a/.github/workflows/test-workflows.yml +++ b/.github/workflows/test-workflows.yml @@ -26,7 +26,7 @@ jobs: name: Check workflows runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 - uses: actions/setup-python@v6 with: python-version: "3.10" # Keep this in sync with release.yml's prepare job @@ -34,7 +34,7 
@@ jobs: env: ACTIONLINT_TARBALL: ${{ format('actionlint_{0}_linux_amd64.tar.gz', env.ACTIONLINT_VERSION) }} run: | - python -m devscripts.install_deps -o --include test + python -m devscripts.install_deps --only-optional-groups --include-group test sudo apt -y install shellcheck python -m pip install -U pyflakes curl -LO "${ACTIONLINT_REPO}/releases/download/v${ACTIONLINT_VERSION}/${ACTIONLINT_TARBALL}" diff --git a/.gitignore b/.gitignore index 40bb34d2aa..af6da639db 100644 --- a/.gitignore +++ b/.gitignore @@ -107,6 +107,7 @@ README.txt test/testdata/sigs/player-*.js test/testdata/thumbnails/empty.webp test/testdata/thumbnails/foo\ %d\ bar/foo_%d.* +.ejs-* # Binary /youtube-dl @@ -129,3 +130,6 @@ yt-dlp.zip # Plugins ytdlp_plugins/ yt-dlp-plugins + +# Packages +yt_dlp_ejs/ diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 25cdf21e20..89327581c0 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -284,7 +284,7 @@ After you have ensured this site is distributing its content legally, you can fo You can use `hatch fmt` to automatically fix problems. Rules that the linter/formatter enforces should not be disabled with `# noqa` unless a maintainer requests it. The only exception allowed is for old/printf-style string formatting in GraphQL query templates (use `# noqa: UP031`). -1. Make sure your code works under all [Python](https://www.python.org/) versions supported by yt-dlp, namely CPython >=3.9 and PyPy >=3.11. Backward compatibility is not required for even older versions of Python. +1. Make sure your code works under all [Python](https://www.python.org/) versions supported by yt-dlp, namely CPython >=3.10 and PyPy >=3.11. Backward compatibility is not required for even older versions of Python. 1. When the tests pass, [add](https://git-scm.com/docs/git-add) the new files, [commit](https://git-scm.com/docs/git-commit) them and [push](https://git-scm.com/docs/git-push) the result, like this: ```shell diff --git a/CONTRIBUTORS b/CONTRIBUTORS index 9064ebd7f8..888d48d561 100644 --- a/CONTRIBUTORS +++ b/CONTRIBUTORS @@ -1,10 +1,10 @@ pukkandan (owner) -shirt-dev (collaborator) -coletdjnz/colethedj (collaborator) -Ashish0804 (collaborator) -bashonly (collaborator) -Grub4K (collaborator) -seproDev (collaborator) +shirt-dev (maintainer) +coletdjnz (maintainer) +Ashish0804 (maintainer) +bashonly (maintainer) +Grub4K (maintainer) +seproDev (maintainer) h-h-h-h pauldubois98 nixxo @@ -811,3 +811,10 @@ zakaryan2004 cdce8p nicolaasjan willsmillie +CasualYT31 +cecilia-sanare +dhwz +robin-mu +shssoichiro +thanhtaivtt +uoag diff --git a/Changelog.md b/Changelog.md index 9d563c8ac9..8737441e86 100644 --- a/Changelog.md +++ b/Changelog.md @@ -4,6 +4,56 @@ # To create a release, dispatch the https://github.com/yt-dlp/yt-dlp/actions/workflows/release.yml workflow on master --> +### 2025.10.22 + +#### Important changes +- **A stopgap release with a *TEMPORARY partial* fix for YouTube support** +Some formats may still be unavailable, especially if cookies are passed to yt-dlp. The ***NEXT*** release, expected very soon, **will require an external JS runtime (e.g. Deno)** in order for YouTube downloads to work properly. [Read more](https://github.com/yt-dlp/yt-dlp/issues/14404) +- **The minimum *required* Python version has been raised to 3.10** +Python 3.9 has reached its end-of-life as of October 2025, and yt-dlp has now removed support for it. 
[Read more](https://github.com/yt-dlp/yt-dlp/issues/13858) + +#### Core changes +- [Remove Python 3.9 support](https://github.com/yt-dlp/yt-dlp/commit/4e6a693057cfaf1ce1f07b019ed3bfce2bf936f6) ([#13861](https://github.com/yt-dlp/yt-dlp/issues/13861)) by [bashonly](https://github.com/bashonly) + +#### Extractor changes +- **appleconnect**: [Rework extractor](https://github.com/yt-dlp/yt-dlp/commit/78748b506f0dca8236ac0045ed7f72f7cf334b62) ([#13229](https://github.com/yt-dlp/yt-dlp/issues/13229)) by [doe1080](https://github.com/doe1080) +- **idagio**: [Support URLs with country codes](https://github.com/yt-dlp/yt-dlp/commit/c9356f308dd3c5f9f494cb40ed14c5df017b4fe0) ([#14655](https://github.com/yt-dlp/yt-dlp/issues/14655)) by [robin-mu](https://github.com/robin-mu) +- **tvnoe**: [Rework Extractor](https://github.com/yt-dlp/yt-dlp/commit/fe5ae54a7b08ebe679f03afdeafbe1cee5784d5b) ([#13369](https://github.com/yt-dlp/yt-dlp/issues/13369)) by [doe1080](https://github.com/doe1080) +- **youtube**: [Use temporary player client workaround](https://github.com/yt-dlp/yt-dlp/commit/2c9091e355a7ba5d1edb69796ecdca48199b77fb) ([#14693](https://github.com/yt-dlp/yt-dlp/issues/14693)) by [gamer191](https://github.com/gamer191) + +#### Misc. changes +- **cleanup** + - Miscellaneous + - [c7bda21](https://github.com/yt-dlp/yt-dlp/commit/c7bda2192aa24afce40fdbbbe056d269aa3b2872) by [bashonly](https://github.com/bashonly), [seproDev](https://github.com/seproDev) + - [de7b3c0](https://github.com/yt-dlp/yt-dlp/commit/de7b3c0705022cb777c5b4b7f0c69c59ad6ff538) by [bashonly](https://github.com/bashonly) +- **docs**: [Update list of maintainers](https://github.com/yt-dlp/yt-dlp/commit/dfc0a84c192a7357dd1768cc345d590253a14fe5) ([#14148](https://github.com/yt-dlp/yt-dlp/issues/14148)) by [bashonly](https://github.com/bashonly), [coletdjnz](https://github.com/coletdjnz), [seproDev](https://github.com/seproDev) + +### 2025.10.14 + +#### Core changes +- [Fix `prefer-vp9-sort` compat option](https://github.com/yt-dlp/yt-dlp/commit/a6673a8e82276ea529c1773ed09e5bc4a22e822a) ([#14603](https://github.com/yt-dlp/yt-dlp/issues/14603)) by [seproDev](https://github.com/seproDev) + +#### Extractor changes +- **10play** + - [Handle geo-restriction errors](https://github.com/yt-dlp/yt-dlp/commit/ad55bfcfb700fbfc1364c04e3425761d6f95c0a7) ([#14618](https://github.com/yt-dlp/yt-dlp/issues/14618)) by [bashonly](https://github.com/bashonly) + - [Rework extractor](https://github.com/yt-dlp/yt-dlp/commit/eafedc21817bb0de20e9aaccd7151a1d4c4e1ebd) ([#14417](https://github.com/yt-dlp/yt-dlp/issues/14417)) by [seproDev](https://github.com/seproDev), [Sipherdrakon](https://github.com/Sipherdrakon) +- **abc.net.au**: [Support listen URLs](https://github.com/yt-dlp/yt-dlp/commit/0ea5d5882def84415f946907cfc00ab431c18fed) ([#14389](https://github.com/yt-dlp/yt-dlp/issues/14389)) by [uoag](https://github.com/uoag) +- **cbc.ca**: listen: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/df160ab18db523f6629f2e7e20123d7a3551df28) ([#14391](https://github.com/yt-dlp/yt-dlp/issues/14391)) by [uoag](https://github.com/uoag) +- **dropout**: [Update extractor for new domain](https://github.com/yt-dlp/yt-dlp/commit/8eb8695139dece6351aac10463df63b87b45b000) ([#14531](https://github.com/yt-dlp/yt-dlp/issues/14531)) by [cecilia-sanare](https://github.com/cecilia-sanare) +- **idagio**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/a98e7f9f58a9492d2cb216baa59c890ed8ce02f3) ([#14586](https://github.com/yt-dlp/yt-dlp/issues/14586)) by 
[robin-mu](https://github.com/robin-mu) +- **musescore**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/87be1bb96ac47abaaa4cfc6d7dd651e511b74551) ([#14598](https://github.com/yt-dlp/yt-dlp/issues/14598)) by [seproDev](https://github.com/seproDev) +- **prankcastpost**: [Rework extractor](https://github.com/yt-dlp/yt-dlp/commit/5d7678195a7d0c045a9fe0418383171a71a7ea43) ([#14445](https://github.com/yt-dlp/yt-dlp/issues/14445)) by [columndeeply](https://github.com/columndeeply) +- **slideslive**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/c2e124881f9aa02097589e853b3d3505e78372c4) ([#14619](https://github.com/yt-dlp/yt-dlp/issues/14619)) by [bashonly](https://github.com/bashonly) +- **soundcloud**: [Support new API URLs](https://github.com/yt-dlp/yt-dlp/commit/6d41aaf21c61a87e74564646abd0a8ee887e888d) ([#14449](https://github.com/yt-dlp/yt-dlp/issues/14449)) by [seproDev](https://github.com/seproDev) +- **tiktok** + - [Support browser impersonation](https://github.com/yt-dlp/yt-dlp/commit/5513036104ed9710f624c537fb3644b07a0680db) ([#14473](https://github.com/yt-dlp/yt-dlp/issues/14473)) by [bashonly](https://github.com/bashonly), [thanhtaivtt](https://github.com/thanhtaivtt) + - user: [Fix private account extraction](https://github.com/yt-dlp/yt-dlp/commit/cdc533b114c35ceb8a2e9dd3eb9c172a8737ae5e) ([#14585](https://github.com/yt-dlp/yt-dlp/issues/14585)) by [CasualYT31](https://github.com/CasualYT31) +- **vidyard**: [Extract chapters](https://github.com/yt-dlp/yt-dlp/commit/5f94f054907c12e68129cd9ac2508ed8aba1b223) ([#14478](https://github.com/yt-dlp/yt-dlp/issues/14478)) by [exterrestris](https://github.com/exterrestris) +- **xhamster**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/739125d40f8ede3beb7be68fc4df55bec0d226fd) ([#14446](https://github.com/yt-dlp/yt-dlp/issues/14446)) by [dhwz](https://github.com/dhwz), [dirkf](https://github.com/dirkf), [shssoichiro](https://github.com/shssoichiro) +- **youtube** + - [Detect experiment binding GVS PO Token to video id](https://github.com/yt-dlp/yt-dlp/commit/bd5ed90419eea18adfb2f0d8efa9d22b2029119f) ([#14471](https://github.com/yt-dlp/yt-dlp/issues/14471)) by [coletdjnz](https://github.com/coletdjnz) + - tab: [Fix approximate timestamp extraction for feeds](https://github.com/yt-dlp/yt-dlp/commit/ccc25d6710a4aa373b7e15c558e07f8a2ffae5f3) ([#14539](https://github.com/yt-dlp/yt-dlp/issues/14539)) by [coletdjnz](https://github.com/coletdjnz) + ### 2025.09.26 #### Extractor changes diff --git a/Collaborators.md b/Maintainers.md similarity index 54% rename from Collaborators.md rename to Maintainers.md index ee748eb7fd..8b52daf5fa 100644 --- a/Collaborators.md +++ b/Maintainers.md @@ -1,59 +1,34 @@ -# Collaborators +# Maintainers -This is a list of the collaborators of the project and their major contributions. See the [Changelog](Changelog.md) for more details. +This file lists the maintainers of yt-dlp and their major contributions. See the [Changelog](Changelog.md) for more details. You can also find lists of all [contributors of yt-dlp](CONTRIBUTORS) and [authors of youtube-dl](https://github.com/ytdl-org/youtube-dl/blob/master/AUTHORS) +## Core Maintainers -## [pukkandan](https://github.com/pukkandan) +Core Maintainers are responsible for reviewing and merging contributions, publishing releases, and steering the overall direction of the project. 
-[![ko-fi](https://img.shields.io/badge/_-Ko--fi-red.svg?logo=kofi&labelColor=555555&style=for-the-badge)](https://ko-fi.com/pukkandan)
-[![gh-sponsor](https://img.shields.io/badge/_-Github-white.svg?logo=github&labelColor=555555&style=for-the-badge)](https://github.com/sponsors/pukkandan)
+**You can contact the core maintainers via `maintainers@yt-dlp.org`.**
 
-* Owner of the fork
-
-
-
-## [shirt](https://github.com/shirt-dev)
-
-[![ko-fi](https://img.shields.io/badge/_-Ko--fi-red.svg?logo=kofi&labelColor=555555&style=for-the-badge)](https://ko-fi.com/shirt)
-
-* Multithreading (`-N`) and aria2c support for fragment downloads
-* Support for media initialization and discontinuity in HLS
-* The self-updater (`-U`)
-
-
-
-## [coletdjnz](https://github.com/coletdjnz)
+### [coletdjnz](https://github.com/coletdjnz)
 
 [![gh-sponsor](https://img.shields.io/badge/_-Github-white.svg?logo=github&labelColor=555555&style=for-the-badge)](https://github.com/sponsors/coletdjnz)
 
-* Improved plugin architecture
-* Rewrote the networking infrastructure, implemented support for `requests`
-* YouTube improvements including: age-gate bypass, private playlists, multiple-clients (to avoid throttling) and a lot of under-the-hood improvements
-* Added support for new websites YoutubeWebArchive, MainStreaming, PRX, nzherald, Mediaklikk, StarTV etc
-* Improved/fixed support for Patreon, panopto, gfycat, itv, pbs, SouthParkDE etc
+* Overhauled the networking stack and implemented support for `requests` and `curl_cffi` (`--impersonate`) HTTP clients
+* Reworked the plugin architecture to support installing plugins across all yt-dlp distributions (exe, pip, etc.)
+* Maintains support for YouTube
+* Added and fixed support for various other sites
+
+### [bashonly](https://github.com/bashonly)
+
+* Rewrote and maintains the build/release workflows and the self-updater: executables, automated/nightly/master releases, `--update-to`
+* Overhauled external downloader cookie handling
+* Added `--cookies-from-browser` support for Firefox containers
+* Overhauled and maintains support for sites like YouTube, Vimeo, Twitter, TikTok, etc
+* Added support for sites like Dacast, Kick, Loom, SproutVideo, Triller, Weverse, etc
 
-
-## [Ashish0804](https://github.com/Ashish0804) [Inactive]
-
-[![ko-fi](https://img.shields.io/badge/_-Ko--fi-red.svg?logo=kofi&labelColor=555555&style=for-the-badge)](https://ko-fi.com/ashish0804)
-
-* Added support for new websites BiliIntl, DiscoveryPlusIndia, OlympicsReplay, PlanetMarathi, ShemarooMe, Utreon, Zee5 etc
-* Added playlist/series downloads for Hotstar, ParamountPlus, Rumble, SonyLIV, Trovo, TubiTv, Voot etc
-* Improved/fixed support for HiDive, HotStar, Hungama, LBRY, LinkedInLearning, Mxplayer, SonyLiv, TV2, Vimeo, VLive etc
-
-
-## [bashonly](https://github.com/bashonly)
-
-* `--update-to`, self-updater rewrite, automated/nightly/master releases
-* `--cookies-from-browser` support for Firefox containers, external downloader cookie handling overhaul
-* Added support for new websites like Dacast, Kick, NBCStations, Triller, VideoKen, Weverse, WrestleUniverse etc
-* Improved/fixed support for Anvato, Brightcove, Reddit, SlidesLive, TikTok, Twitter, Vimeo etc
-
-
-## [Grub4K](https://github.com/Grub4K)
+### [Grub4K](https://github.com/Grub4K)
 
 [![gh-sponsor](https://img.shields.io/badge/_-Github-white.svg?logo=github&labelColor=555555&style=for-the-badge)](https://github.com/sponsors/Grub4K)
[![ko-fi](https://img.shields.io/badge/_-Ko--fi-red.svg?logo=kofi&labelColor=555555&style=for-the-badge)](https://ko-fi.com/Grub4K) @@ -63,8 +38,48 @@ You can also find lists of all [contributors of yt-dlp](CONTRIBUTORS) and [autho * Improved/fixed/added Bundestag, crunchyroll, pr0gramm, Twitter, WrestleUniverse etc -## [sepro](https://github.com/seproDev) +### [sepro](https://github.com/seproDev) * UX improvements: Warn when ffmpeg is missing, warn when double-clicking exe * Code cleanup: Remove dead extractors, mark extractors as broken, enable/apply ruff rules * Improved/fixed/added ArdMediathek, DRTV, Floatplane, MagentaMusik, Naver, Nebula, OnDemandKorea, Vbox7 etc + + +## Inactive Core Maintainers + +### [pukkandan](https://github.com/pukkandan) + +[![ko-fi](https://img.shields.io/badge/_-Ko--fi-red.svg?logo=kofi&labelColor=555555&style=for-the-badge)](https://ko-fi.com/pukkandan) +[![gh-sponsor](https://img.shields.io/badge/_-Github-white.svg?logo=github&labelColor=555555&style=for-the-badge)](https://github.com/sponsors/pukkandan) + +* Founder of the fork +* Lead Maintainer from 2021-2024 + + +### [shirt](https://github.com/shirt-dev) + +[![ko-fi](https://img.shields.io/badge/_-Ko--fi-red.svg?logo=kofi&labelColor=555555&style=for-the-badge)](https://ko-fi.com/shirt) + +* Multithreading (`-N`) and aria2c support for fragment downloads +* Support for media initialization and discontinuity in HLS +* The self-updater (`-U`) + + +### [Ashish0804](https://github.com/Ashish0804) + +[![ko-fi](https://img.shields.io/badge/_-Ko--fi-red.svg?logo=kofi&labelColor=555555&style=for-the-badge)](https://ko-fi.com/ashish0804) + +* Added support for new websites BiliIntl, DiscoveryPlusIndia, OlympicsReplay, PlanetMarathi, ShemarooMe, Utreon, Zee5 etc +* Added playlist/series downloads for Hotstar, ParamountPlus, Rumble, SonyLIV, Trovo, TubiTv, Voot etc +* Improved/fixed support for HiDive, HotStar, Hungama, LBRY, LinkedInLearning, Mxplayer, SonyLiv, TV2, Vimeo, VLive etc + +## Triage Maintainers + +Triage Maintainers are frequent contributors who can manage issues and pull requests. 
+ +- [gamer191](https://github.com/gamer191) +- [garret1317](https://github.com/garret1317) +- [pzhlkj6612](https://github.com/pzhlkj6612) +- [DTrombett](https://github.com/dtrombett) +- [doe1080](https://github.com/doe1080) +- [grqz](https://github.com/grqz) diff --git a/Makefile b/Makefile index 404250c815..88727219b8 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,5 @@ all: lazy-extractors yt-dlp doc pypi-files +all-extra: lazy-extractors yt-dlp-extra doc pypi-files clean: clean-test clean-dist clean-all: clean clean-cache completions: completion-bash completion-fish completion-zsh @@ -15,7 +16,11 @@ pypi-files: AUTHORS Changelog.md LICENSE README.md README.txt supportedsites \ .PHONY: all clean clean-all clean-test clean-dist clean-cache \ completions completion-bash completion-fish completion-zsh \ doc issuetemplates supportedsites ot offlinetest codetest test \ - tar pypi-files lazy-extractors install uninstall + tar pypi-files lazy-extractors install uninstall \ + all-extra yt-dlp-extra current-ejs-version + +.IGNORE: current-ejs-version +.SILENT: current-ejs-version clean-test: rm -rf tmp/ *.annotations.xml *.aria2 *.description *.dump *.frag \ @@ -25,7 +30,8 @@ clean-test: test/testdata/sigs/player-*.js test/testdata/thumbnails/empty.webp "test/testdata/thumbnails/foo %d bar/foo_%d."* clean-dist: rm -rf yt-dlp.1.temp.md yt-dlp.1 README.txt MANIFEST build/ dist/ .coverage cover/ yt-dlp.tar.gz completions/ \ - yt_dlp/extractor/lazy_extractors.py *.spec CONTRIBUTING.md.tmp yt-dlp yt-dlp.exe yt_dlp.egg-info/ AUTHORS + yt_dlp/extractor/lazy_extractors.py *.spec CONTRIBUTING.md.tmp yt-dlp yt-dlp.exe yt_dlp.egg-info/ AUTHORS \ + yt-dlp.zip .ejs-* yt_dlp_ejs/ clean-cache: find . \( \ -type d -name ".*_cache" -o -type d -name __pycache__ -o -name "*.pyc" -o -name "*.class" \ @@ -81,28 +87,49 @@ test: offlinetest: codetest $(PYTHON) -m pytest -Werror -m "not download" -CODE_FOLDERS_CMD = find yt_dlp -type f -name '__init__.py' | sed 's,/__init__.py,,' | grep -v '/__' | sort -CODE_FOLDERS != $(CODE_FOLDERS_CMD) -CODE_FOLDERS ?= $(shell $(CODE_FOLDERS_CMD)) -CODE_FILES_CMD = for f in $(CODE_FOLDERS) ; do echo "$$f" | sed 's,$$,/*.py,' ; done -CODE_FILES != $(CODE_FILES_CMD) -CODE_FILES ?= $(shell $(CODE_FILES_CMD)) -yt-dlp: $(CODE_FILES) +PY_CODE_FOLDERS_CMD = find yt_dlp -type f -name '__init__.py' | sed 's|/__init__\.py||' | grep -v '/__' | sort +PY_CODE_FOLDERS != $(PY_CODE_FOLDERS_CMD) +PY_CODE_FOLDERS ?= $(shell $(PY_CODE_FOLDERS_CMD)) + +PY_CODE_FILES_CMD = for f in $(PY_CODE_FOLDERS) ; do echo "$$f" | sed 's|$$|/*.py|' ; done +PY_CODE_FILES != $(PY_CODE_FILES_CMD) +PY_CODE_FILES ?= $(shell $(PY_CODE_FILES_CMD)) + +JS_CODE_FOLDERS_CMD = find yt_dlp -type f -name '*.js' | sed 's|/[^/]\{1,\}\.js$$||' | uniq +JS_CODE_FOLDERS != $(JS_CODE_FOLDERS_CMD) +JS_CODE_FOLDERS ?= $(shell $(JS_CODE_FOLDERS_CMD)) + +JS_CODE_FILES_CMD = for f in $(JS_CODE_FOLDERS) ; do echo "$$f" | sed 's|$$|/*.js|' ; done +JS_CODE_FILES != $(JS_CODE_FILES_CMD) +JS_CODE_FILES ?= $(shell $(JS_CODE_FILES_CMD)) + +yt-dlp.zip: $(PY_CODE_FILES) $(JS_CODE_FILES) mkdir -p zip - for d in $(CODE_FOLDERS) ; do \ + for d in $(PY_CODE_FOLDERS) ; do \ mkdir -p zip/$$d ;\ cp -pPR $$d/*.py zip/$$d/ ;\ done - (cd zip && touch -t 200001010101 $(CODE_FILES)) - mv zip/yt_dlp/__main__.py zip/ - (cd zip && zip -q ../yt-dlp $(CODE_FILES) __main__.py) + for d in $(JS_CODE_FOLDERS) ; do \ + mkdir -p zip/$$d ;\ + cp -pPR $$d/*.js zip/$$d/ ;\ + done + (cd zip && touch -t 200001010101 $(PY_CODE_FILES) $(JS_CODE_FILES)) + rm -f 
zip/yt_dlp/__main__.py + (cd zip && zip -q ../yt-dlp.zip $(PY_CODE_FILES) $(JS_CODE_FILES)) rm -rf zip + +yt-dlp: yt-dlp.zip + mkdir -p zip + cp -pP yt_dlp/__main__.py zip/ + touch -t 200001010101 zip/__main__.py + (cd zip && zip -q ../yt-dlp.zip __main__.py) echo '#!$(PYTHON)' > yt-dlp cat yt-dlp.zip >> yt-dlp rm yt-dlp.zip chmod a+x yt-dlp + rm -rf zip -README.md: $(CODE_FILES) devscripts/make_readme.py +README.md: $(PY_CODE_FILES) devscripts/make_readme.py COLUMNS=80 $(PYTHON) yt_dlp/__main__.py --ignore-config --help | $(PYTHON) devscripts/make_readme.py CONTRIBUTING.md: README.md devscripts/make_contributing.py @@ -127,15 +154,15 @@ yt-dlp.1: README.md devscripts/prepare_manpage.py pandoc -s -f $(MARKDOWN) -t man yt-dlp.1.temp.md -o yt-dlp.1 rm -f yt-dlp.1.temp.md -completions/bash/yt-dlp: $(CODE_FILES) devscripts/bash-completion.in +completions/bash/yt-dlp: $(PY_CODE_FILES) devscripts/bash-completion.in mkdir -p completions/bash $(PYTHON) devscripts/bash-completion.py -completions/zsh/_yt-dlp: $(CODE_FILES) devscripts/zsh-completion.in +completions/zsh/_yt-dlp: $(PY_CODE_FILES) devscripts/zsh-completion.in mkdir -p completions/zsh $(PYTHON) devscripts/zsh-completion.py -completions/fish/yt-dlp.fish: $(CODE_FILES) devscripts/fish-completion.in +completions/fish/yt-dlp.fish: $(PY_CODE_FILES) devscripts/fish-completion.in mkdir -p completions/fish $(PYTHON) devscripts/fish-completion.py @@ -157,7 +184,7 @@ yt-dlp.tar.gz: all --exclude '.git' \ -- \ README.md supportedsites.md Changelog.md LICENSE \ - CONTRIBUTING.md Collaborators.md CONTRIBUTORS AUTHORS \ + CONTRIBUTING.md Maintainers.md CONTRIBUTORS AUTHORS \ Makefile yt-dlp.1 README.txt completions .gitignore \ yt-dlp yt_dlp pyproject.toml devscripts test @@ -172,3 +199,45 @@ CONTRIBUTORS: Changelog.md echo 'Updating $@ from git commit history' ; \ $(PYTHON) devscripts/make_changelog.py -v -c > /dev/null ; \ fi + +# The following EJS_-prefixed variables are auto-generated by devscripts/update_ejs.py +# DO NOT EDIT! 
+EJS_VERSION = 0.3.0 +EJS_WHEEL_NAME = yt_dlp_ejs-0.3.0-py3-none-any.whl +EJS_WHEEL_HASH = sha256:abbf269fa1674cab7b7b266e51e89e0e60b01a11a0fdf3cd63528683190cdd07 +EJS_PY_FOLDERS = yt_dlp_ejs yt_dlp_ejs/yt yt_dlp_ejs/yt/solver +EJS_PY_FILES = yt_dlp_ejs/__init__.py yt_dlp_ejs/_version.py yt_dlp_ejs/yt/__init__.py yt_dlp_ejs/yt/solver/__init__.py +EJS_JS_FOLDERS = yt_dlp_ejs/yt/solver +EJS_JS_FILES = yt_dlp_ejs/yt/solver/core.min.js yt_dlp_ejs/yt/solver/lib.min.js + +yt-dlp-extra: current-ejs-version .ejs-$(EJS_VERSION) $(EJS_PY_FILES) $(EJS_JS_FILES) yt-dlp.zip + mkdir -p zip + for d in $(EJS_PY_FOLDERS) ; do \ + mkdir -p zip/$$d ;\ + cp -pPR $$d/*.py zip/$$d/ ;\ + done + for d in $(EJS_JS_FOLDERS) ; do \ + mkdir -p zip/$$d ;\ + cp -pPR $$d/*.js zip/$$d/ ;\ + done + (cd zip && touch -t 200001010101 $(EJS_PY_FILES) $(EJS_JS_FILES)) + (cd zip && zip -q ../yt-dlp.zip $(EJS_PY_FILES) $(EJS_JS_FILES)) + cp -pP yt_dlp/__main__.py zip/ + touch -t 200001010101 zip/__main__.py + (cd zip && zip -q ../yt-dlp.zip __main__.py) + echo '#!$(PYTHON)' > yt-dlp + cat yt-dlp.zip >> yt-dlp + rm yt-dlp.zip + chmod a+x yt-dlp + rm -rf zip + +.ejs-$(EJS_VERSION): + @echo Downloading yt-dlp-ejs + @echo "yt-dlp-ejs==$(EJS_VERSION) --hash $(EJS_WHEEL_HASH)" > .ejs-requirements.txt + $(PYTHON) -m pip download -d ./build --no-deps --require-hashes -r .ejs-requirements.txt + unzip -o build/$(EJS_WHEEL_NAME) "yt_dlp_ejs/*" + @touch .ejs-$(EJS_VERSION) + +current-ejs-version: + rm -rf .ejs-* + touch .ejs-$$($(PYTHON) -c 'import sys; sys.path = [""]; from yt_dlp_ejs import version; print(version)' 2>/dev/null) diff --git a/README.md b/README.md index 60c312114b..4e7f442a60 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ [![Release version](https://img.shields.io/github/v/release/yt-dlp/yt-dlp?color=brightgreen&label=Download&style=for-the-badge)](#installation "Installation") [![PyPI](https://img.shields.io/badge/-PyPI-blue.svg?logo=pypi&labelColor=555555&style=for-the-badge)](https://pypi.org/project/yt-dlp "PyPI") -[![Donate](https://img.shields.io/badge/_-Donate-red.svg?logo=githubsponsors&labelColor=555555&style=for-the-badge)](Collaborators.md#collaborators "Donate") +[![Donate](https://img.shields.io/badge/_-Donate-red.svg?logo=githubsponsors&labelColor=555555&style=for-the-badge)](Maintainers.md#maintainers "Donate") [![Discord](https://img.shields.io/discord/807245652072857610?color=blue&labelColor=555555&label=&logo=discord&style=for-the-badge)](https://discord.gg/H5MNcFW63r "Discord") [![Supported Sites](https://img.shields.io/badge/-Supported_Sites-brightgreen.svg?style=for-the-badge)](supportedsites.md "Supported Sites") [![License: Unlicense](https://img.shields.io/badge/-Unlicense-blue.svg?style=for-the-badge)](LICENSE "License") @@ -145,9 +145,11 @@ While yt-dlp is licensed under the [Unlicense](LICENSE), many of the release fil Most notably, the PyInstaller-bundled executables include GPLv3+ licensed code, and as such the combined work is licensed under [GPLv3+](https://www.gnu.org/licenses/gpl-3.0.html). -See [THIRD_PARTY_LICENSES.txt](THIRD_PARTY_LICENSES.txt) for details. +The zipimport Unix executable (`yt-dlp`) contains [ISC](https://github.com/meriyah/meriyah/blob/main/LICENSE.md) licensed code from [`meriyah`](https://github.com/meriyah/meriyah) and [MIT](https://github.com/davidbonnet/astring/blob/main/LICENSE) licensed code from [`astring`](https://github.com/davidbonnet/astring). 
-The zipimport binary (`yt-dlp`), the source tarball (`yt-dlp.tar.gz`), and the PyPI source distribution & wheel only contain code licensed under the [Unlicense](LICENSE). +See [THIRD_PARTY_LICENSES.txt](THIRD_PARTY_LICENSES.txt) for more details. + +The git repository, the source tarball (`yt-dlp.tar.gz`), the PyPI source distribution and the PyPI built distribution (wheel) only contain code licensed under the [Unlicense](LICENSE). @@ -194,14 +196,14 @@ When running a yt-dlp version that is older than 90 days, you will see a warning You can suppress this warning by adding `--no-update` to your command or configuration file. ## DEPENDENCIES -Python versions 3.9+ (CPython) and 3.11+ (PyPy) are supported. Other versions and implementations may or may not work correctly. +Python versions 3.10+ (CPython) and 3.11+ (PyPy) are supported. Other versions and implementations may or may not work correctly. -While all the other dependencies are optional, `ffmpeg` and `ffprobe` are highly recommended +While all the other dependencies are optional, `ffmpeg`, `ffprobe`, `yt-dlp-ejs` and a JavaScript runtime are highly recommended ### Strongly recommended @@ -211,6 +213,10 @@ While all the other dependencies are optional, `ffmpeg` and `ffprobe` are highly **Important**: What you need is ffmpeg *binary*, **NOT** [the Python package of the same name](https://pypi.org/project/ffmpeg) +* [**yt-dlp-ejs**](https://github.com/yt-dlp/ejs) - Required for deciphering YouTube n/sig values. Licensed under [Unlicense](https://github.com/yt-dlp/ejs/blob/main/LICENSE), bundles [MIT](https://github.com/davidbonnet/astring/blob/main/LICENSE) and [ISC](https://github.com/meriyah/meriyah/blob/main/LICENSE.md) components. + + A JavaScript runtime like [**deno**](https://deno.land) (recommended), [**node.js**](https://nodejs.org), [**bun**](https://bun.sh), or [**QuickJS**](https://bellard.org/quickjs/) is also required to run yt-dlp-ejs. See [the wiki](https://github.com/yt-dlp/yt-dlp/wiki/EJS). + ### Networking * [**certifi**](https://github.com/certifi/python-certifi)\* - Provides Mozilla's root certificate bundle. Licensed under [MPLv2](https://github.com/certifi/python-certifi/blob/master/LICENSE) * [**brotli**](https://github.com/google/brotli)\* or [**brotlicffi**](https://github.com/python-hyper/brotlicffi) - [Brotli](https://en.wikipedia.org/wiki/Brotli) content encoding support. Both licensed under MIT [1](https://github.com/google/brotli/blob/master/LICENSE) [2](https://github.com/python-hyper/brotlicffi/blob/master/LICENSE) @@ -235,7 +241,7 @@ The following provide support for impersonating browser requests. This may be re ### Misc * [**pycryptodomex**](https://github.com/Legrandin/pycryptodome)\* - For decrypting AES-128 HLS streams and various other data. Licensed under [BSD-2-Clause](https://github.com/Legrandin/pycryptodome/blob/master/LICENSE.rst) -* [**phantomjs**](https://github.com/ariya/phantomjs) - Used in extractors where javascript needs to be run. Licensed under [BSD-3-Clause](https://github.com/ariya/phantomjs/blob/master/LICENSE.BSD) +* [**phantomjs**](https://github.com/ariya/phantomjs) - Used in some extractors where JavaScript needs to be run. No longer used for YouTube. To be deprecated in the near future. Licensed under [BSD-3-Clause](https://github.com/ariya/phantomjs/blob/master/LICENSE.BSD) * [**secretstorage**](https://github.com/mitya57/secretstorage)\* - For `--cookies-from-browser` to access the **Gnome** keyring while decrypting cookies of **Chromium**-based browsers on **Linux**. 
Licensed under [BSD-3-Clause](https://github.com/mitya57/secretstorage/blob/master/LICENSE) * Any external downloader that you want to use with `--downloader` @@ -273,7 +279,7 @@ On some systems, you may need to use `py` or `python` instead of `python3`. **Important**: Running `pyinstaller` directly **instead of** using `python -m bundle.pyinstaller` is **not** officially supported. This may or may not work correctly. ### Platform-independent Binary (UNIX) -You will need the build tools `python` (3.9+), `zip`, `make` (GNU), `pandoc`\* and `pytest`\*. +You will need the build tools `python` (3.10+), `zip`, `make` (GNU), `pandoc`\* and `pytest`\*. After installing these, simply run `make`. @@ -362,6 +368,26 @@ Tip: Use `CTRL`+`F` (or `Command`+`F`) to search by keywords --no-plugin-dirs Clear plugin directories to search, including defaults and those provided by previous --plugin-dirs + --js-runtimes RUNTIME[:PATH] Additional JavaScript runtime to enable, + with an optional path to the runtime + location. This option can be used multiple + times to enable multiple runtimes. Supported + runtimes: deno, node, bun, quickjs. By + default, only "deno" runtime is enabled. + --no-js-runtimes Clear JavaScript runtimes to enable, + including defaults and those provided by + previous --js-runtimes + --remote-components COMPONENT Remote components to allow yt-dlp to fetch + when required. You can use this option + multiple times to allow multiple components. + Supported values: ejs:npm (external + JavaScript components from npm), ejs:github + (external JavaScript components from yt-dlp- + ejs GitHub). By default, no remote + components are allowed. + --no-remote-components Disallow fetching of all remote components, + including any previously allowed by + --remote-components or defaults. --flat-playlist Do not extract a playlist's URL result entries; some entry metadata may be missing and downloading may be bypassed @@ -1814,12 +1840,12 @@ The following extractors use this feature: #### youtube * `lang`: Prefer translated metadata (`title`, `description` etc) of this language code (case-sensitive). By default, the video primary language metadata is preferred, with a fallback to `en` translated. See [youtube/_base.py](https://github.com/yt-dlp/yt-dlp/blob/415b4c9f955b1a0391204bd24a7132590e7b3bdb/yt_dlp/extractor/youtube/_base.py#L402-L409) for the list of supported content language codes * `skip`: One or more of `hls`, `dash` or `translated_subs` to skip extraction of the m3u8 manifests, dash manifests and [auto-translated subtitles](https://github.com/yt-dlp/yt-dlp/issues/4090#issuecomment-1158102032) respectively -* `player_client`: Clients to extract video data from. The currently available clients are `web`, `web_safari`, `web_embedded`, `web_music`, `web_creator`, `mweb`, `ios`, `android`, `android_vr`, `tv`, `tv_simply` and `tv_embedded`. By default, `tv,web_safari,web` is used, and `tv,web_creator,web` is used with premium accounts. The `web_music` client is added for `music.youtube.com` URLs when logged-in cookies are used. The `web_embedded` client is added for age-restricted videos but only works if the video is embeddable. The `tv_embedded` and `web_creator` clients are added for age-restricted videos if account age-verification is required. Some clients, such as `web` and `web_music`, require a `po_token` for their formats to be downloadable. Some clients, such as `web_creator`, will only work with authentication. Not all clients support authentication via cookies. 
You can use `default` for the default clients, or you can use `all` for all clients (not recommended). You can prefix a client with `-` to exclude it, e.g. `youtube:player_client=default,-ios` +* `player_client`: Clients to extract video data from. The currently available clients are `web`, `web_safari`, `web_embedded`, `web_music`, `web_creator`, `mweb`, `ios`, `android`, `android_sdkless`, `android_vr`, `tv`, `tv_simply` and `tv_embedded`. By default, `tv,android_sdkless,web` is used. If no JavaScript runtime is available, then `android_sdkless,web_safari,web` is used. If logged-in cookies are passed to yt-dlp, then `tv,web_safari,web` is used for free accounts and `tv,web_creator,web` is used for premium accounts. The `web_music` client is added for `music.youtube.com` URLs when logged-in cookies are used. The `web_embedded` client is added for age-restricted videos but only works if the video is embeddable. The `tv_embedded` and `web_creator` clients are added for age-restricted videos if account age-verification is required. Some clients, such as `web` and `web_music`, require a `po_token` for their formats to be downloadable. Some clients, such as `web_creator`, will only work with authentication. Not all clients support authentication via cookies. You can use `default` for the default clients, or you can use `all` for all clients (not recommended). You can prefix a client with `-` to exclude it, e.g. `youtube:player_client=default,-ios` * `player_skip`: Skip some network requests that are generally needed for robust extraction. One or more of `configs` (skip client configs), `webpage` (skip initial webpage), `js` (skip js player), `initial_data` (skip initial data/next ep request). While these options can help reduce the number of requests needed or avoid some rate-limiting, they could cause issues such as missing formats or metadata. See [#860](https://github.com/yt-dlp/yt-dlp/pull/860) and [#12826](https://github.com/yt-dlp/yt-dlp/issues/12826) for more details * `webpage_skip`: Skip extraction of embedded webpage data. One or both of `player_response`, `initial_data`. These options are for testing purposes and don't skip any network requests * `player_params`: YouTube player parameters to use for player requests. Will overwrite any default ones set by yt-dlp. * `player_js_variant`: The player javascript variant to use for n/sig deciphering. The known variants are: `main`, `tcc`, `tce`, `es5`, `es6`, `tv`, `tv_es6`, `phone`, `tablet`. The default is `main`, and the others are for debugging purposes. You can use `actual` to go with what is prescribed by the site -* `player_js_version`: The player javascript version to use for n/sig deciphering, in the format of `signature_timestamp@hash`. Currently, the default is to force `20348@0004de42`. You can use `actual` to go with what is prescribed by the site +* `player_js_version`: The player javascript version to use for n/sig deciphering, in the format of `signature_timestamp@hash` (e.g. `20348@0004de42`). The default is to use what is prescribed by the site, and can be selected with `actual` * `comment_sort`: `top` or `new` (default) - choose comment sorting mode (on YouTube's side) * `max_comments`: Limit the amount of comments to gather. Comma-separated list of integers representing `max-comments,max-parents,max-replies,max-replies-per-thread`. Default is `all,all,all,all` * E.g. `all,all,1000,10` will get a maximum of 1000 replies total, with up to 10 replies per thread. 
`1000,all,100` will get a maximum of 1000 comments, with a maximum of 100 replies total
@@ -1833,6 +1859,10 @@ The following extractors use this feature:
 * `pot_trace`: Enable debug logging for PO Token fetching. Either `true` or `false` (default)
 * `fetch_pot`: Policy to use for fetching a PO Token from providers. One of `always` (always try fetch a PO Token regardless if the client requires one for the given context), `never` (never fetch a PO Token), or `auto` (default; only fetch a PO Token if the client requires one for the given context)
 * `playback_wait`: Duration (in seconds) to wait inbetween the extraction and download stages in order to ensure the formats are available. The default is `6` seconds
+* `jsc_trace`: Enable debug logging for JS Challenge fetching. Either `true` or `false` (default)
+
+#### youtube-ejs
+* `jitless`: Run supported JavaScript engines in JIT-less mode. Supported runtimes are `deno`, `node` and `bun`. Provides better security at the cost of performance/speed. Do note that `node` and `bun` are still considered insecure. Either `true` or `false` (default)
 
 #### youtubepot-webpo
 * `bind_to_visitor_id`: Whether to use the Visitor ID instead of Visitor Data for caching WebPO tokens. Either `true` (default) or `false`
@@ -2255,7 +2285,7 @@ Features marked with a **\*** have been back-ported to youtube-dl
 
 Some of yt-dlp's default options are different from that of youtube-dl and youtube-dlc:
 
-* yt-dlp supports only [Python 3.9+](## "Windows 8"), and will remove support for more versions as they [become EOL](https://devguide.python.org/versions/#python-release-cycle); while [youtube-dl still supports Python 2.6+ and 3.2+](https://github.com/ytdl-org/youtube-dl/issues/30568#issue-1118238743)
+* yt-dlp supports only [Python 3.10+](## "Windows 8"), and will remove support for more versions as they [become EOL](https://devguide.python.org/versions/#python-release-cycle); while [youtube-dl still supports Python 2.6+ and 3.2+](https://github.com/ytdl-org/youtube-dl/issues/30568#issue-1118238743)
 * The options `--auto-number` (`-A`), `--title` (`-t`) and `--literal` (`-l`), no longer work. See [removed options](#Removed) for details
 * `avconv` is not supported as an alternative to `ffmpeg`
 * yt-dlp stores config files in slightly different locations to youtube-dl. See [CONFIGURATION](#configuration) for a list of correct locations
diff --git a/THIRD_PARTY_LICENSES.txt b/THIRD_PARTY_LICENSES.txt
index 1040046541..f7977064a0 100644
--- a/THIRD_PARTY_LICENSES.txt
+++ b/THIRD_PARTY_LICENSES.txt
@@ -4431,3 +4431,43 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 POSSIBILITY OF SUCH DAMAGE.
+
+
+
+--------------------------------------------------------------------------------
+Meriyah | ISC
+URL: https://github.com/meriyah/meriyah
+--------------------------------------------------------------------------------
+ISC License
+
+Copyright (c) 2019 and later, KFlash and others.
+
+Permission to use, copy, modify, and/or distribute this software for any purpose with or without fee is hereby granted, provided that the above copyright notice and this permission notice appear in all copies.
+
+THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + + + +-------------------------------------------------------------------------------- +Astring | MIT +URL: https://github.com/davidbonnet/astring/ +-------------------------------------------------------------------------------- +Copyright (c) 2015, David Bonnet + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/bundle/docker/linux/build.sh b/bundle/docker/linux/build.sh index 71adaad058..b30d40980e 100755 --- a/bundle/docker/linux/build.sh +++ b/bundle/docker/linux/build.sh @@ -15,12 +15,12 @@ function venvpy { } INCLUDES=( - --include pyinstaller - --include secretstorage + --include-group pyinstaller + --include-group secretstorage ) if [[ -z "${EXCLUDE_CURL_CFFI:-}" ]]; then - INCLUDES+=(--include curl-cffi) + INCLUDES+=(--include-group curl-cffi) fi runpy -m venv /yt-dlp-build-venv @@ -28,7 +28,7 @@ runpy -m venv /yt-dlp-build-venv source /yt-dlp-build-venv/bin/activate # Inside the venv we use venvpy instead of runpy venvpy -m ensurepip --upgrade --default-pip -venvpy -m devscripts.install_deps -o --include build +venvpy -m devscripts.install_deps --only-optional-groups --include-group build venvpy -m devscripts.install_deps "${INCLUDES[@]}" venvpy -m devscripts.make_lazy_extractors venvpy devscripts/update-version.py -c "${CHANNEL}" -r "${ORIGIN}" "${VERSION}" diff --git a/devscripts/changelog_override.json b/devscripts/changelog_override.json index 618e7fe6fe..e906838175 100644 --- a/devscripts/changelog_override.json +++ b/devscripts/changelog_override.json @@ -298,5 +298,15 @@ "action": "add", "when": "08d78996831bd8e1e3c2592d740c3def00bbf548", "short": "[priority] **Several options have been deprecated**\nIn order to simplify the codebase and reduce maintenance burden, various options have been deprecated. Please remove them from your commands/configurations. [Read more](https://github.com/yt-dlp/yt-dlp/issues/14198)" + }, + { + "action": "add", + "when": "4e6a693057cfaf1ce1f07b019ed3bfce2bf936f6", + "short": "[priority] **The minimum *required* Python version has been raised to 3.10**\nPython 3.9 has reached its end-of-life as of October 2025, and yt-dlp has now removed support for it. 
[Read more](https://github.com/yt-dlp/yt-dlp/issues/13858)" + }, + { + "action": "add", + "when": "2c9091e355a7ba5d1edb69796ecdca48199b77fb", + "short": "[priority] **A stopgap release with a *TEMPORARY partial* fix for YouTube support**\nSome formats may still be unavailable, especially if cookies are passed to yt-dlp. The ***NEXT*** release, expected very soon, **will require an external JS runtime (e.g. Deno)** in order for YouTube downloads to work properly. [Read more](https://github.com/yt-dlp/yt-dlp/issues/14404)" } ] diff --git a/devscripts/generate_third_party_licenses.py b/devscripts/generate_third_party_licenses.py index db615d2e35..322d56f633 100644 --- a/devscripts/generate_third_party_licenses.py +++ b/devscripts/generate_third_party_licenses.py @@ -271,6 +271,19 @@ DEPENDENCIES: list[Dependency] = [ license_url='https://raw.githubusercontent.com/python-websockets/websockets/refs/heads/main/LICENSE', project_url='https://websockets.readthedocs.io/', ), + # Dependencies of yt-dlp-ejs + Dependency( + name='Meriyah', + license='ISC', + license_url='https://raw.githubusercontent.com/meriyah/meriyah/refs/heads/main/LICENSE.md', + project_url='https://github.com/meriyah/meriyah', + ), + Dependency( + name='Astring', + license='MIT', + license_url='https://raw.githubusercontent.com/davidbonnet/astring/refs/heads/main/LICENSE', + project_url='https://github.com/davidbonnet/astring/', + ), ] diff --git a/devscripts/install_deps.py b/devscripts/install_deps.py index d292505458..07c646a4c0 100755 --- a/devscripts/install_deps.py +++ b/devscripts/install_deps.py @@ -22,14 +22,19 @@ def parse_args(): 'input', nargs='?', metavar='TOMLFILE', default=Path(__file__).parent.parent / 'pyproject.toml', help='input file (default: %(default)s)') parser.add_argument( - '-e', '--exclude', metavar='DEPENDENCY', action='append', - help='exclude a dependency') + '-e', '--exclude-dependency', metavar='DEPENDENCY', action='append', + help='exclude a dependency (can be used multiple times)') parser.add_argument( - '-i', '--include', metavar='GROUP', action='append', - help='include an optional dependency group') + '-i', '--include-group', metavar='GROUP', action='append', + help='include an optional dependency group (can be used multiple times)') parser.add_argument( - '-o', '--only-optional', action='store_true', - help='only install optional dependencies') + '-c', '--cherry-pick', metavar='DEPENDENCY', action='append', + help=( + 'only include a specific dependency from the resulting dependency list ' + '(can be used multiple times)')) + parser.add_argument( + '-o', '--only-optional-groups', action='store_true', + help='omit default dependencies unless the "default" group is specified with --include-group') parser.add_argument( '-p', '--print', action='store_true', help='only print requirements to stdout') @@ -39,30 +44,41 @@ def parse_args(): return parser.parse_args() +def uniq(arg) -> dict[str, None]: + return dict.fromkeys(map(str.lower, arg or ())) + + def main(): args = parse_args() project_table = parse_toml(read_file(args.input))['project'] recursive_pattern = re.compile(rf'{project_table["name"]}\[(?P[\w-]+)\]') optional_groups = project_table['optional-dependencies'] - excludes = args.exclude or [] + + excludes = uniq(args.exclude_dependency) + only_includes = uniq(args.cherry_pick) + include_groups = uniq(args.include_group) def yield_deps(group): for dep in group: if mobj := recursive_pattern.fullmatch(dep): - yield from optional_groups.get(mobj.group('group_name'), []) + yield from 
optional_groups.get(mobj.group('group_name'), ()) else: yield dep - targets = [] - if not args.only_optional: # `-o` should exclude 'dependencies' and the 'default' group - targets.extend(project_table['dependencies']) - if 'default' not in excludes: # `--exclude default` should exclude entire 'default' group - targets.extend(yield_deps(optional_groups['default'])) + targets = {} + if not args.only_optional_groups: + # legacy: 'dependencies' is empty now + targets.update(dict.fromkeys(project_table['dependencies'])) + targets.update(dict.fromkeys(yield_deps(optional_groups['default']))) - for include in filter(None, map(optional_groups.get, args.include or [])): - targets.extend(yield_deps(include)) + for include in filter(None, map(optional_groups.get, include_groups)): + targets.update(dict.fromkeys(yield_deps(include))) - targets = [t for t in targets if re.match(r'[\w-]+', t).group(0).lower() not in excludes] + def target_filter(target): + name = re.match(r'[\w-]+', target).group(0).lower() + return name not in excludes and (not only_includes or name in only_includes) + + targets = list(filter(target_filter, targets)) if args.print: for target in targets: diff --git a/devscripts/make_changelog.py b/devscripts/make_changelog.py index 7c876101b4..0b2eb93b4e 100644 --- a/devscripts/make_changelog.py +++ b/devscripts/make_changelog.py @@ -373,7 +373,7 @@ class CommitRange: issues = [issue.strip()[1:] for issue in issues.split(',')] if issues else [] if prefix: - groups, details, sub_details = zip(*map(self.details_from_prefix, prefix.split(','))) + groups, details, sub_details = zip(*map(self.details_from_prefix, prefix.split(',')), strict=True) group = next(iter(filter(None, groups)), None) details = ', '.join(unique(details)) sub_details = list(itertools.chain.from_iterable(sub_details)) diff --git a/devscripts/update_ejs.py b/devscripts/update_ejs.py new file mode 100644 index 0000000000..cffb1aa2b4 --- /dev/null +++ b/devscripts/update_ejs.py @@ -0,0 +1,164 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import contextlib +import io +import json +import hashlib +import pathlib +import urllib.request +import zipfile + + +TEMPLATE = '''\ +# This file is generated by devscripts/update_ejs.py. DO NOT MODIFY! 
+ +VERSION = {version!r} +HASHES = {{ +{hash_mapping} +}} +''' +PREFIX = ' "yt-dlp-ejs==' +BASE_PATH = pathlib.Path(__file__).parent.parent +PYPROJECT_PATH = BASE_PATH / 'pyproject.toml' +PACKAGE_PATH = BASE_PATH / 'yt_dlp/extractor/youtube/jsc/_builtin/vendor' +RELEASE_URL = 'https://api.github.com/repos/yt-dlp/ejs/releases/latest' +ASSETS = { + 'yt.solver.lib.js': False, + 'yt.solver.lib.min.js': False, + 'yt.solver.deno.lib.js': True, + 'yt.solver.bun.lib.js': True, + 'yt.solver.core.min.js': False, + 'yt.solver.core.js': True, +} +MAKEFILE_PATH = BASE_PATH / 'Makefile' + + +def request(url: str): + return contextlib.closing(urllib.request.urlopen(url)) + + +def makefile_variables( + version: str | None = None, + name: str | None = None, + digest: str | None = None, + data: bytes | None = None, + keys_only: bool = False, +) -> dict[str, str | None]: + assert keys_only or all(arg is not None for arg in (version, name, digest, data)) + + return { + 'EJS_VERSION': None if keys_only else version, + 'EJS_WHEEL_NAME': None if keys_only else name, + 'EJS_WHEEL_HASH': None if keys_only else digest, + 'EJS_PY_FOLDERS': None if keys_only else list_wheel_contents(data, 'py', files=False), + 'EJS_PY_FILES': None if keys_only else list_wheel_contents(data, 'py', folders=False), + 'EJS_JS_FOLDERS': None if keys_only else list_wheel_contents(data, 'js', files=False), + 'EJS_JS_FILES': None if keys_only else list_wheel_contents(data, 'js', folders=False), + } + + +def list_wheel_contents( + wheel_data: bytes, + suffix: str | None = None, + folders: bool = True, + files: bool = True, +) -> str: + assert folders or files, 'at least one of "folders" or "files" must be True' + + path_gen = (zinfo.filename for zinfo in zipfile.ZipFile(io.BytesIO(wheel_data)).infolist()) + filtered = filter(lambda path: path.startswith('yt_dlp_ejs/'), path_gen) + if suffix: + filtered = filter(lambda path: path.endswith(f'.{suffix}'), filtered) + + files_list = list(filtered) + if not folders: + return ' '.join(files_list) + + folders_list = list(dict.fromkeys(path.rpartition('/')[0] for path in files_list)) + if not files: + return ' '.join(folders_list) + + return ' '.join(folders_list + files_list) + + +def main(): + current_version = None + with PYPROJECT_PATH.open() as file: + for line in file: + if not line.startswith(PREFIX): + continue + current_version, _, _ = line.removeprefix(PREFIX).partition('"') + + if not current_version: + print('yt-dlp-ejs dependency line could not be found') + return + + makefile_info = makefile_variables(keys_only=True) + prefixes = tuple(f'{key} = ' for key in makefile_info) + with MAKEFILE_PATH.open() as file: + for line in file: + if not line.startswith(prefixes): + continue + key, _, val = line.partition(' = ') + makefile_info[key] = val.rstrip() + + with request(RELEASE_URL) as resp: + info = json.load(resp) + + version = info['tag_name'] + if version == current_version: + print(f'yt-dlp-ejs is up to date! 
({version})') + return + + print(f'Updating yt-dlp-ejs from {current_version} to {version}') + hashes = [] + wheel_info = {} + for asset in info['assets']: + name = asset['name'] + is_wheel = name.startswith('yt_dlp_ejs-') and name.endswith('.whl') + if not is_wheel and name not in ASSETS: + continue + with request(asset['browser_download_url']) as resp: + data = resp.read() + + # verify digest from github + digest = asset['digest'] + algo, _, expected = digest.partition(':') + hexdigest = hashlib.new(algo, data).hexdigest() + assert hexdigest == expected, f'downloaded attest mismatch ({hexdigest!r} != {expected!r})' + + if is_wheel: + wheel_info = makefile_variables(version, name, digest, data) + continue + + # calculate sha3-512 digest + asset_hash = hashlib.sha3_512(data).hexdigest() + hashes.append(f' {name!r}: {asset_hash!r},') + + if ASSETS[name]: + (PACKAGE_PATH / name).write_bytes(data) + + hash_mapping = '\n'.join(hashes) + for asset_name in ASSETS: + assert asset_name in hash_mapping, f'{asset_name} not found in release' + + assert all(wheel_info.get(key) for key in makefile_info), 'wheel info not found in release' + + (PACKAGE_PATH / '_info.py').write_text(TEMPLATE.format( + version=version, + hash_mapping=hash_mapping, + )) + + content = PYPROJECT_PATH.read_text() + updated = content.replace(PREFIX + current_version, PREFIX + version) + PYPROJECT_PATH.write_text(updated) + + makefile = MAKEFILE_PATH.read_text() + for key in wheel_info: + makefile = makefile.replace(f'{key} = {makefile_info[key]}', f'{key} = {wheel_info[key]}') + MAKEFILE_PATH.write_text(makefile) + + +if __name__ == '__main__': + main() diff --git a/pyproject.toml b/pyproject.toml index 3438140525..0f6202ca08 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,8 +4,11 @@ build-backend = "hatchling.build" [project] name = "yt-dlp" -maintainers = [ +authors = [ {name = "pukkandan", email = "pukkandan.ytdlp@gmail.com"}, +] +maintainers = [ + {email = "maintainers@yt-dlp.org"}, {name = "Grub4K", email = "contact@grub4k.xyz"}, {name = "bashonly", email = "bashonly@protonmail.com"}, {name = "coletdjnz", email = "coletdjnz@protonmail.com"}, @@ -13,7 +16,7 @@ maintainers = [ ] description = "A feature-rich command-line audio/video downloader" readme = "README.md" -requires-python = ">=3.9" +requires-python = ">=3.10" keywords = [ "cli", "downloader", @@ -30,7 +33,6 @@ classifiers = [ "Environment :: Console", "Programming Language :: Python", "Programming Language :: Python :: 3 :: Only", - "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", @@ -54,6 +56,7 @@ default = [ "requests>=2.32.2,<3", "urllib3>=2.0.2,<3", "websockets>=13.0", + "yt-dlp-ejs==0.3.0", ] curl-cffi = [ "curl-cffi>=0.5.10,!=0.6.*,!=0.7.*,!=0.8.*,!=0.9.*,<0.14; implementation_name=='cpython'", @@ -76,7 +79,7 @@ dev = [ ] static-analysis = [ "autopep8~=2.0", - "ruff~=0.13.0", + "ruff~=0.14.0", ] test = [ "pytest~=8.1", @@ -90,7 +93,7 @@ pyinstaller = [ Documentation = "https://github.com/yt-dlp/yt-dlp#readme" Repository = "https://github.com/yt-dlp/yt-dlp" Tracker = "https://github.com/yt-dlp/yt-dlp/issues" -Funding = "https://github.com/yt-dlp/yt-dlp/blob/master/Collaborators.md#collaborators" +Funding = "https://github.com/yt-dlp/yt-dlp/blob/master/Maintainers.md#maintainers" [project.scripts] yt-dlp = "yt_dlp:main" @@ -120,7 +123,12 @@ artifacts = [ [tool.hatch.build.targets.wheel] packages = ["yt_dlp"] -artifacts = 
["/yt_dlp/extractor/lazy_extractors.py"] +artifacts = [ + "/yt_dlp/extractor/lazy_extractors.py", +] +exclude = [ + "/yt_dlp/**/*.md", +] [tool.hatch.build.targets.wheel.shared-data] "completions/bash/yt-dlp" = "share/bash-completion/completions/yt-dlp" @@ -168,7 +176,6 @@ run-cov = "echo Code coverage not implemented && exit 1" [[tool.hatch.envs.hatch-test.matrix]] python = [ - "3.9", "3.10", "3.11", "3.12", diff --git a/supportedsites.md b/supportedsites.md index 513fd93989..a546819286 100644 --- a/supportedsites.md +++ b/supportedsites.md @@ -85,7 +85,7 @@ The only reliable way to check if a site is supported is to try it. - **aol.com**: Yahoo screen and movies (**Currently broken**) - **APA** - **Aparat** - - **AppleConnect** + - **apple:​music:connect**: Apple Music Connect - **AppleDaily**: 臺灣蘋果日報 - **ApplePodcasts** - **appletrailers** @@ -242,6 +242,7 @@ The only reliable way to check if a site is supported is to try it. - **Canalsurmas** - **CaracolTvPlay**: [*caracoltv-play*](## "netrc machine") - **cbc.ca** + - **cbc.ca:listen** - **cbc.ca:player** - **cbc.ca:​player:playlist** - **CBS**: (**Currently broken**) @@ -579,6 +580,11 @@ The only reliable way to check if a site is supported is to try it. - **Hypem** - **Hytale** - **Icareus** + - **IdagioAlbum** + - **IdagioPersonalPlaylist** + - **IdagioPlaylist** + - **IdagioRecording** + - **IdagioTrack** - **IdolPlus** - **iflix:episode** - **IflixSeries** @@ -1535,7 +1541,7 @@ The only reliable way to check if a site is supported is to try it. - **tvigle**: Интернет-телевидение Tvigle.ru - **TVIPlayer** - **TVN24**: (**Currently broken**) - - **TVNoe**: (**Currently broken**) + - **tvnoe**: Televize Noe - **tvopengr:embed**: tvopen.gr embedded videos - **tvopengr:watch**: tvopen.gr (and ethnos.gr) videos - **tvp**: Telewizja Polska diff --git a/test/helper.py b/test/helper.py index e4cb478e28..e96835fc46 100644 --- a/test/helper.py +++ b/test/helper.py @@ -176,7 +176,7 @@ def _iter_differences(got, expected, field): yield field, f'expected length of {len(expected)}, got {len(got)}' return - for index, (got_val, expected_val) in enumerate(zip(got, expected)): + for index, (got_val, expected_val) in enumerate(zip(got, expected, strict=True)): field_name = str(index) if field is None else f'{field}.{index}' yield from _iter_differences(got_val, expected_val, field_name) return diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 91312e4e5f..2705accb76 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -17,7 +17,6 @@ import json from test.helper import FakeYDL, assertRegexpMatches, try_rm from yt_dlp import YoutubeDL -from yt_dlp.extractor import YoutubeIE from yt_dlp.extractor.common import InfoExtractor from yt_dlp.postprocessor.common import PostProcessor from yt_dlp.utils import ( @@ -336,99 +335,6 @@ class TestFormatSelection(unittest.TestCase): ydl = YDL({'format': '[format_id!*=-]'}) self.assertRaises(ExtractorError, ydl.process_ie_result, info_dict.copy()) - def test_youtube_format_selection(self): - # FIXME: Rewrite in accordance with the new format sorting options - return - - order = [ - '38', '37', '46', '22', '45', '35', '44', '18', '34', '43', '6', '5', '17', '36', '13', - # Apple HTTP Live Streaming - '96', '95', '94', '93', '92', '132', '151', - # 3D - '85', '84', '102', '83', '101', '82', '100', - # Dash video - '137', '248', '136', '247', '135', '246', - '245', '244', '134', '243', '133', '242', '160', - # Dash audio - '141', '172', '140', '171', '139', - ] - - def format_info(f_id): - 
info = YoutubeIE._formats[f_id].copy() - - # XXX: In real cases InfoExtractor._parse_mpd_formats() fills up 'acodec' - # and 'vcodec', while in tests such information is incomplete since - # commit a6c2c24479e5f4827ceb06f64d855329c0a6f593 - # test_YoutubeDL.test_youtube_format_selection is broken without - # this fix - if 'acodec' in info and 'vcodec' not in info: - info['vcodec'] = 'none' - elif 'vcodec' in info and 'acodec' not in info: - info['acodec'] = 'none' - - info['format_id'] = f_id - info['url'] = 'url:' + f_id - return info - formats_order = [format_info(f_id) for f_id in order] - - info_dict = _make_result(list(formats_order), extractor='youtube') - ydl = YDL({'format': 'bestvideo+bestaudio'}) - ydl.sort_formats(info_dict) - ydl.process_ie_result(info_dict) - downloaded = ydl.downloaded_info_dicts[0] - self.assertEqual(downloaded['format_id'], '248+172') - self.assertEqual(downloaded['ext'], 'mp4') - - info_dict = _make_result(list(formats_order), extractor='youtube') - ydl = YDL({'format': 'bestvideo[height>=999999]+bestaudio/best'}) - ydl.sort_formats(info_dict) - ydl.process_ie_result(info_dict) - downloaded = ydl.downloaded_info_dicts[0] - self.assertEqual(downloaded['format_id'], '38') - - info_dict = _make_result(list(formats_order), extractor='youtube') - ydl = YDL({'format': 'bestvideo/best,bestaudio'}) - ydl.sort_formats(info_dict) - ydl.process_ie_result(info_dict) - downloaded_ids = [info['format_id'] for info in ydl.downloaded_info_dicts] - self.assertEqual(downloaded_ids, ['137', '141']) - - info_dict = _make_result(list(formats_order), extractor='youtube') - ydl = YDL({'format': '(bestvideo[ext=mp4],bestvideo[ext=webm])+bestaudio'}) - ydl.sort_formats(info_dict) - ydl.process_ie_result(info_dict) - downloaded_ids = [info['format_id'] for info in ydl.downloaded_info_dicts] - self.assertEqual(downloaded_ids, ['137+141', '248+141']) - - info_dict = _make_result(list(formats_order), extractor='youtube') - ydl = YDL({'format': '(bestvideo[ext=mp4],bestvideo[ext=webm])[height<=720]+bestaudio'}) - ydl.sort_formats(info_dict) - ydl.process_ie_result(info_dict) - downloaded_ids = [info['format_id'] for info in ydl.downloaded_info_dicts] - self.assertEqual(downloaded_ids, ['136+141', '247+141']) - - info_dict = _make_result(list(formats_order), extractor='youtube') - ydl = YDL({'format': '(bestvideo[ext=none]/bestvideo[ext=webm])+bestaudio'}) - ydl.sort_formats(info_dict) - ydl.process_ie_result(info_dict) - downloaded_ids = [info['format_id'] for info in ydl.downloaded_info_dicts] - self.assertEqual(downloaded_ids, ['248+141']) - - for f1, f2 in zip(formats_order, formats_order[1:]): - info_dict = _make_result([f1, f2], extractor='youtube') - ydl = YDL({'format': 'best/bestvideo'}) - ydl.sort_formats(info_dict) - ydl.process_ie_result(info_dict) - downloaded = ydl.downloaded_info_dicts[0] - self.assertEqual(downloaded['format_id'], f1['format_id']) - - info_dict = _make_result([f2, f1], extractor='youtube') - ydl = YDL({'format': 'best/bestvideo'}) - ydl.sort_formats(info_dict) - ydl.process_ie_result(info_dict) - downloaded = ydl.downloaded_info_dicts[0] - self.assertEqual(downloaded['format_id'], f1['format_id']) - def test_audio_only_extractor_format_selection(self): # For extractors with incomplete formats (all formats are audio-only or # video-only) best and worst should fallback to corresponding best/worst @@ -749,7 +655,7 @@ class TestYoutubeDL(unittest.TestCase): if not isinstance(expected, (list, tuple)): expected = (expected, expected) - for (name, got), expect 
in zip((('outtmpl', out), ('filename', fname)), expected): + for (name, got), expect in zip((('outtmpl', out), ('filename', fname)), expected, strict=True): if callable(expect): self.assertTrue(expect(got), f'Wrong {name} from {tmpl}') elif expect is not None: @@ -1147,7 +1053,7 @@ class TestYoutubeDL(unittest.TestCase): entries = func(evaluated) results = [(v['playlist_autonumber'] - 1, (int(v['id']), v['playlist_index'])) for v in get_downloaded_info_dicts(params, entries)] - self.assertEqual(results, list(enumerate(zip(expected_ids, expected_ids))), f'Entries of {name} for {params}') + self.assertEqual(results, list(enumerate(zip(expected_ids, expected_ids, strict=True))), f'Entries of {name} for {params}') self.assertEqual(sorted(evaluated), expected_eval, f'Evaluation of {name} for {params}') test_selection({}, INDICES) diff --git a/test/test_jsc/conftest.py b/test/test_jsc/conftest.py new file mode 100644 index 0000000000..28d6734122 --- /dev/null +++ b/test/test_jsc/conftest.py @@ -0,0 +1,60 @@ +import re +import pathlib + +import pytest + +import yt_dlp.globals +from yt_dlp import YoutubeDL +from yt_dlp.extractor.common import InfoExtractor + + +_TESTDATA_PATH = pathlib.Path(__file__).parent.parent / 'testdata/sigs' +_player_re = re.compile(r'^.+/player/(?P[a-zA-Z0-9_/.-]+)\.js$') +_player_id_trans = str.maketrans(dict.fromkeys('/.-', '_')) + + +@pytest.fixture +def ie() -> InfoExtractor: + runtime_names = yt_dlp.globals.supported_js_runtimes.value + ydl = YoutubeDL({'js_runtimes': {key: {} for key in runtime_names}}) + ie = ydl.get_info_extractor('Youtube') + + def _load_player(video_id, player_url, fatal=True): + match = _player_re.match(player_url) + test_id = match.group('id').translate(_player_id_trans) + cached_file = _TESTDATA_PATH / f'player-{test_id}.js' + + if cached_file.exists(): + return cached_file.read_text() + + if code := ie._download_webpage(player_url, video_id, fatal=fatal): + _TESTDATA_PATH.mkdir(exist_ok=True, parents=True) + cached_file.write_text(code) + return code + + return None + + ie._load_player = _load_player + return ie + + +class MockLogger: + def trace(self, message: str): + print(f'trace: {message}') + + def debug(self, message: str, *, once=False): + print(f'debug: {message}') + + def info(self, message: str): + print(f'info: {message}') + + def warning(self, message: str, *, once=False): + print(f'warning: {message}') + + def error(self, message: str): + print(f'error: {message}') + + +@pytest.fixture +def logger(): + return MockLogger() diff --git a/test/test_jsc/test_ejs_integration.py b/test/test_jsc/test_ejs_integration.py new file mode 100644 index 0000000000..7984810794 --- /dev/null +++ b/test/test_jsc/test_ejs_integration.py @@ -0,0 +1,128 @@ +from __future__ import annotations + +import dataclasses +import enum +import importlib.util +import json + +import pytest + +from yt_dlp.extractor.youtube.jsc.provider import ( + JsChallengeRequest, + JsChallengeType, + JsChallengeProviderResponse, + JsChallengeResponse, + NChallengeInput, + NChallengeOutput, + SigChallengeInput, + SigChallengeOutput, +) +from yt_dlp.extractor.youtube.jsc._builtin.bun import BunJCP +from yt_dlp.extractor.youtube.jsc._builtin.deno import DenoJCP +from yt_dlp.extractor.youtube.jsc._builtin.node import NodeJCP +from yt_dlp.extractor.youtube.jsc._builtin.quickjs import QuickJSJCP + + +_has_ejs = bool(importlib.util.find_spec('yt_dlp_ejs')) +pytestmark = pytest.mark.skipif(not _has_ejs, reason='yt-dlp-ejs not available') + + +class Variant(enum.Enum): + main = 
'player_ias.vflset/en_US/base.js' + tcc = 'player_ias_tcc.vflset/en_US/base.js' + tce = 'player_ias_tce.vflset/en_US/base.js' + es5 = 'player_es5.vflset/en_US/base.js' + es6 = 'player_es6.vflset/en_US/base.js' + tv = 'tv-player-ias.vflset/tv-player-ias.js' + tv_es6 = 'tv-player-es6.vflset/tv-player-es6.js' + phone = 'player-plasma-ias-phone-en_US.vflset/base.js' + tablet = 'player-plasma-ias-tablet-en_US.vflset/base.js' + + +@dataclasses.dataclass +class Challenge: + player: str + variant: Variant + type: JsChallengeType + values: dict[str, str] = dataclasses.field(default_factory=dict) + + def url(self, /): + return f'https://www.youtube.com/s/player/{self.player}/{self.variant.value}' + + +CHALLENGES: list[Challenge] = [ + Challenge('3d3ba064', Variant.tce, JsChallengeType.N, { + 'ZdZIqFPQK-Ty8wId': 'qmtUsIz04xxiNW', + '4GMrWHyKI5cEvhDO': 'N9gmEX7YhKTSmw', + }), + Challenge('3d3ba064', Variant.tce, JsChallengeType.SIG, { + 'gN7a-hudCuAuPH6fByOk1_GNXN0yNMHShjZXS2VOgsEItAJz0tipeavEOmNdYN-wUtcEqD3bCXjc0iyKfAyZxCBGgIARwsSdQfJ2CJtt': + 'ttJC2JfQdSswRAIgGBCxZyAfKyi0cjXCb3gqEctUw-NYdNmOEvaepit0zJAtIEsgOV2SXZjhSHMNy0NXNG_1kNyBf6HPuAuCduh-a7O', + }), + Challenge('5ec65609', Variant.tce, JsChallengeType.N, { + '0eRGgQWJGfT5rFHFj': '4SvMpDQH-vBJCw', + }), + Challenge('5ec65609', Variant.tce, JsChallengeType.SIG, { + 'AAJAJfQdSswRQIhAMG5SN7-cAFChdrE7tLA6grH0rTMICA1mmDc0HoXgW3CAiAQQ4=CspfaF_vt82XH5yewvqcuEkvzeTsbRuHssRMyJQ=I': + 'AJfQdSswRQIhAMG5SN7-cAFChdrE7tLA6grI0rTMICA1mmDc0HoXgW3CAiAQQ4HCspfaF_vt82XH5yewvqcuEkvzeTsbRuHssRMyJQ==', + }), + Challenge('6742b2b9', Variant.tce, JsChallengeType.N, { + '_HPB-7GFg1VTkn9u': 'qUAsPryAO_ByYg', + 'K1t_fcB6phzuq2SF': 'Y7PcOt3VE62mog', + }), + Challenge('6742b2b9', Variant.tce, JsChallengeType.SIG, { + 'MMGZJMUucirzS_SnrSPYsc85CJNnTUi6GgR5NKn-znQEICACojE8MHS6S7uYq4TGjQX_D4aPk99hNU6wbTvorvVVMgIARwsSdQfJAA': + 'AJfQdSswRAIgMVVvrovTbw6UNh99kPa4D_XQjGT4qYu7S6SHM8EjoCACIEQnz-nKN5RgG6iUTnNJC58csYPSrnS_SzricuUMJZGM', + }), + Challenge('2b83d2e0', Variant.main, JsChallengeType.N, { + '0eRGgQWJGfT5rFHFj': 'euHbygrCMLksxd', + }), + Challenge('2b83d2e0', Variant.main, JsChallengeType.SIG, { + 'MMGZJMUucirzS_SnrSPYsc85CJNnTUi6GgR5NKn-znQEICACojE8MHS6S7uYq4TGjQX_D4aPk99hNU6wbTvorvVVMgIARwsSdQfJA': + '-MGZJMUucirzS_SnrSPYsc85CJNnTUi6GgR5NKnMznQEICACojE8MHS6S7uYq4TGjQX_D4aPk99hNU6wbTvorvVVMgIARwsSdQfJ', + }), + Challenge('638ec5c6', Variant.main, JsChallengeType.N, { + 'ZdZIqFPQK-Ty8wId': '1qov8-KM-yH', + }), + Challenge('638ec5c6', Variant.main, JsChallengeType.SIG, { + 'gN7a-hudCuAuPH6fByOk1_GNXN0yNMHShjZXS2VOgsEItAJz0tipeavEOmNdYN-wUtcEqD3bCXjc0iyKfAyZxCBGgIARwsSdQfJ2CJtt': + 'MhudCuAuP-6fByOk1_GNXN7gNHHShjyXS2VOgsEItAJz0tipeav0OmNdYN-wUtcEqD3bCXjc0iyKfAyZxCBGgIARwsSdQfJ2CJtt', + }), +] + +requests: list[JsChallengeRequest] = [] +responses: list[JsChallengeProviderResponse] = [] +for test in CHALLENGES: + input_type, output_type = { + JsChallengeType.N: (NChallengeInput, NChallengeOutput), + JsChallengeType.SIG: (SigChallengeInput, SigChallengeOutput), + }[test.type] + + request = JsChallengeRequest(test.type, input_type(test.url(), list(test.values.keys())), test.player) + requests.append(request) + responses.append(JsChallengeProviderResponse(request, JsChallengeResponse(test.type, output_type(test.values)))) + + +@pytest.fixture(params=[BunJCP, DenoJCP, NodeJCP, QuickJSJCP]) +def jcp(request, ie, logger): + obj = request.param(ie, logger, None) + if not obj.is_available(): + pytest.skip(f'{obj.PROVIDER_NAME} is not available') + obj.is_dev = True + return obj + + 
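For orientation, here is a minimal sketch (not part of the test module) of the round trip these fixtures set up. It would run inside a test body with the `jcp` fixture injected, using the names imported above and the expected mapping already recorded in `CHALLENGES` for the `3d3ba064` tce player:

```python
# Build one N-challenge request, exactly as the CHALLENGES loop above does
# (the third positional argument serves as the video/player id in these tests).
request = JsChallengeRequest(
    JsChallengeType.N,
    NChallengeInput(
        'https://www.youtube.com/s/player/3d3ba064/player_ias_tce.vflset/en_US/base.js',
        ['ZdZIqFPQK-Ty8wId'],
    ),
    '3d3ba064',
)

# bulk_solve yields one JsChallengeProviderResponse per request; its output
# maps each input challenge string to the value solved from the player JS.
response = next(iter(jcp.bulk_solve([request])))
assert response.response.output.results == {'ZdZIqFPQK-Ty8wId': 'qmtUsIz04xxiNW'}
```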
+@pytest.mark.download +def test_bulk_requests(jcp): + assert list(jcp.bulk_solve(requests)) == responses + + +@pytest.mark.download +def test_using_cached_player(jcp): + first_player_requests = requests[:3] + player = jcp._get_player(first_player_requests[0].video_id, first_player_requests[0].input.player_url) + initial = json.loads(jcp._run_js_runtime(jcp._construct_stdin(player, False, first_player_requests))) + preprocessed = initial.pop('preprocessed_player') + result = json.loads(jcp._run_js_runtime(jcp._construct_stdin(preprocessed, True, first_player_requests))) + + assert initial == result diff --git a/test/test_jsc/test_provider.py b/test/test_jsc/test_provider.py new file mode 100644 index 0000000000..3342f77546 --- /dev/null +++ b/test/test_jsc/test_provider.py @@ -0,0 +1,194 @@ + +import pytest + +from yt_dlp.extractor.youtube.jsc.provider import ( + JsChallengeProvider, + JsChallengeRequest, + JsChallengeProviderResponse, + JsChallengeProviderRejectedRequest, + JsChallengeType, + JsChallengeResponse, + NChallengeOutput, + NChallengeInput, + JsChallengeProviderError, + register_provider, + register_preference, +) +from yt_dlp.extractor.youtube.pot._provider import IEContentProvider +from yt_dlp.utils import ExtractorError +from yt_dlp.extractor.youtube.jsc._registry import _jsc_preferences, _jsc_providers + + +class ExampleJCP(JsChallengeProvider): + PROVIDER_NAME = 'example-provider' + PROVIDER_VERSION = '0.0.1' + BUG_REPORT_LOCATION = 'https://example.com/issues' + + _SUPPORTED_TYPES = [JsChallengeType.N] + + def is_available(self) -> bool: + return True + + def _real_bulk_solve(self, requests): + for request in requests: + results = dict.fromkeys(request.input.challenges, 'example-solution') + response = JsChallengeResponse( + type=request.type, + output=NChallengeOutput(results=results)) + yield JsChallengeProviderResponse(request=request, response=response) + + +PLAYER_URL = 'https://example.com/player.js' + + +class TestJsChallengeProvider: + # note: some test covered in TestPoTokenProvider which shares the same base class + def test_base_type(self): + assert issubclass(JsChallengeProvider, IEContentProvider) + + def test_create_provider_missing_bulk_solve_method(self, ie, logger): + class MissingMethodsJCP(JsChallengeProvider): + def is_available(self) -> bool: + return True + + with pytest.raises(TypeError, match='bulk_solve'): + MissingMethodsJCP(ie=ie, logger=logger, settings={}) + + def test_create_provider_missing_available_method(self, ie, logger): + class MissingMethodsJCP(JsChallengeProvider): + def _real_bulk_solve(self, requests): + raise JsChallengeProviderRejectedRequest('Not implemented') + + with pytest.raises(TypeError, match='is_available'): + MissingMethodsJCP(ie=ie, logger=logger, settings={}) + + def test_barebones_provider(self, ie, logger): + class BarebonesProviderJCP(JsChallengeProvider): + def is_available(self) -> bool: + return True + + def _real_bulk_solve(self, requests): + raise JsChallengeProviderRejectedRequest('Not implemented') + + provider = BarebonesProviderJCP(ie=ie, logger=logger, settings={}) + assert provider.PROVIDER_NAME == 'BarebonesProvider' + assert provider.PROVIDER_KEY == 'BarebonesProvider' + assert provider.PROVIDER_VERSION == '0.0.0' + assert provider.BUG_REPORT_MESSAGE == 'please report this issue to the provider developer at (developer has not provided a bug report location) .' 
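On the consumer side, a hedged sketch (assuming, as the equality checks in these tests imply, that `error` defaults to `None` on success): each yielded `JsChallengeProviderResponse` carries either a payload or an error, so callers check both. The unsupported-challenge-type test below exercises exactly this split:

```python
# Iterate responses defensively: a response either failed validation/solving
# (resp.error is set, e.g. a JsChallengeProviderRejectedRequest) or succeeded
# (resp.response holds the JsChallengeResponse with the solved results).
for resp in provider.bulk_solve(requests):
    if resp.error is not None:
        print(f'{resp.request.type}: {resp.error}')
    else:
        print(resp.response.output.results)
```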
+ + def test_example_provider_success(self, ie, logger): + provider = ExampleJCP(ie=ie, logger=logger, settings={}) + + request = JsChallengeRequest( + type=JsChallengeType.N, + input=NChallengeInput(player_url=PLAYER_URL, challenges=['example-challenge'])) + + request_two = JsChallengeRequest( + type=JsChallengeType.N, + input=NChallengeInput(player_url=PLAYER_URL, challenges=['example-challenge-2'])) + + responses = list(provider.bulk_solve([request, request_two])) + assert len(responses) == 2 + assert all(isinstance(r, JsChallengeProviderResponse) for r in responses) + assert responses == [ + JsChallengeProviderResponse( + request=request, + response=JsChallengeResponse( + type=JsChallengeType.N, + output=NChallengeOutput(results={'example-challenge': 'example-solution'}), + ), + ), + JsChallengeProviderResponse( + request=request_two, + response=JsChallengeResponse( + type=JsChallengeType.N, + output=NChallengeOutput(results={'example-challenge-2': 'example-solution'}), + ), + ), + ] + + def test_provider_unsupported_challenge_type(self, ie, logger): + provider = ExampleJCP(ie=ie, logger=logger, settings={}) + request_supported = JsChallengeRequest( + type=JsChallengeType.N, + input=NChallengeInput(player_url=PLAYER_URL, challenges=['example-challenge'])) + request_unsupported = JsChallengeRequest( + type=JsChallengeType.SIG, + input=NChallengeInput(player_url=PLAYER_URL, challenges=['example-challenge'])) + responses = list(provider.bulk_solve([request_supported, request_unsupported, request_supported])) + assert len(responses) == 3 + # Requests are validated first before continuing to _real_bulk_solve + assert isinstance(responses[0], JsChallengeProviderResponse) + assert isinstance(responses[0].error, JsChallengeProviderRejectedRequest) + assert responses[0].request is request_unsupported + assert str(responses[0].error) == 'JS Challenge type "JsChallengeType.SIG" is not supported by example-provider' + + assert responses[1:] == [ + JsChallengeProviderResponse( + request=request_supported, + response=JsChallengeResponse( + type=JsChallengeType.N, + output=NChallengeOutput(results={'example-challenge': 'example-solution'}), + ), + ), + JsChallengeProviderResponse( + request=request_supported, + response=JsChallengeResponse( + type=JsChallengeType.N, + output=NChallengeOutput(results={'example-challenge': 'example-solution'}), + ), + ), + ] + + def test_provider_get_player(self, ie, logger): + ie._load_player = lambda video_id, player_url, fatal: (video_id, player_url, fatal) + provider = ExampleJCP(ie=ie, logger=logger, settings={}) + assert provider._get_player('video123', PLAYER_URL) == ('video123', PLAYER_URL, True) + + def test_provider_get_player_error(self, ie, logger): + def raise_error(video_id, player_url, fatal): + raise ExtractorError('Failed to load player') + + ie._load_player = raise_error + provider = ExampleJCP(ie=ie, logger=logger, settings={}) + with pytest.raises(JsChallengeProviderError, match='Failed to load player for JS challenge'): + provider._get_player('video123', PLAYER_URL) + + def test_require_class_end_with_suffix(self, ie, logger): + class InvalidSuffix(JsChallengeProvider): + PROVIDER_NAME = 'invalid-suffix' + + def _real_bulk_solve(self, requests): + raise JsChallengeProviderRejectedRequest('Not implemented') + + def is_available(self) -> bool: + return True + + provider = InvalidSuffix(ie=ie, logger=logger, settings={}) + + with pytest.raises(AssertionError): + provider.PROVIDER_KEY # noqa: B018 + + +def test_register_provider(ie): + + 
@register_provider + class UnavailableProviderJCP(JsChallengeProvider): + def is_available(self) -> bool: + return False + + def _real_bulk_solve(self, requests): + raise JsChallengeProviderRejectedRequest('Not implemented') + + assert _jsc_providers.value.get('UnavailableProvider') == UnavailableProviderJCP + _jsc_providers.value.pop('UnavailableProvider') + + +def test_register_preference(ie): + before = len(_jsc_preferences.value) + + @register_preference(ExampleJCP) + def unavailable_preference(*args, **kwargs): + return 1 + + assert len(_jsc_preferences.value) == before + 1 diff --git a/test/test_postprocessors.py b/test/test_postprocessors.py index ecc73e39eb..3911567066 100644 --- a/test/test_postprocessors.py +++ b/test/test_postprocessors.py @@ -115,7 +115,7 @@ class TestModifyChaptersPP(unittest.TestCase): self.assertEqual(len(ends), len(titles)) start = 0 chapters = [] - for e, t in zip(ends, titles): + for e, t in zip(ends, titles, strict=True): chapters.append(self._chapter(start, e, t)) start = e return chapters diff --git a/test/test_pot/test_pot_builtin_utils.py b/test/test_pot/test_pot_builtin_utils.py index 7645ba601f..15a25cff2f 100644 --- a/test/test_pot/test_pot_builtin_utils.py +++ b/test/test_pot/test_pot_builtin_utils.py @@ -45,3 +45,8 @@ class TestGetWebPoContentBinding: def test_invalid_base64(self, pot_request): pot_request.visitor_data = 'invalid-base64' assert get_webpo_content_binding(pot_request, bind_to_visitor_id=True) == (pot_request.visitor_data, ContentBindingType.VISITOR_DATA) + + def test_gvs_video_id_binding_experiment(self, pot_request): + pot_request.context = PoTokenContext.GVS + pot_request._gvs_bind_to_video_id = True + assert get_webpo_content_binding(pot_request) == ('example-video-id', ContentBindingType.VIDEO_ID) diff --git a/test/test_pot/test_pot_framework.py b/test/test_pot/test_pot_framework.py index d2de1dd290..fae6c80027 100644 --- a/test/test_pot/test_pot_framework.py +++ b/test/test_pot/test_pot_framework.py @@ -1,6 +1,6 @@ import pytest -from yt_dlp.extractor.youtube.pot._provider import IEContentProvider +from yt_dlp.extractor.youtube.pot._provider import IEContentProvider, configuration_arg from yt_dlp.cookies import YoutubeDLCookieJar from yt_dlp.utils.networking import HTTPHeaderDict from yt_dlp.extractor.youtube.pot.provider import ( @@ -627,3 +627,13 @@ def test_logger_log_level(logger): assert logger.LogLevel('debuG') == logger.LogLevel.DEBUG assert logger.LogLevel(10) == logger.LogLevel.DEBUG assert logger.LogLevel('UNKNOWN') == logger.LogLevel.INFO + + +def test_configuration_arg(): + config = {'abc': ['123D'], 'xyz': ['456a', '789B']} + + assert configuration_arg(config, 'abc') == ['123d'] + assert configuration_arg(config, 'abc', default=['default']) == ['123d'] + assert configuration_arg(config, 'ABC', default=['default']) == ['default'] + assert configuration_arg(config, 'abc', casesense=True) == ['123D'] + assert configuration_arg(config, 'xyz', casesense=False) == ['456a', '789b'] diff --git a/test/test_traversal.py b/test/test_traversal.py index 52215f5a7b..d4abacc597 100644 --- a/test/test_traversal.py +++ b/test/test_traversal.py @@ -417,7 +417,7 @@ class TestTraversal: def test_traversal_morsel(self): morsel = http.cookies.Morsel() - values = dict(zip(morsel, 'abcdefghijklmnop')) + values = dict(zip(morsel, 'abcdefghijklmnop', strict=False)) morsel.set('item_key', 'item_value', 'coded_value') morsel.update(values) values['key'] = 'item_key' diff --git a/test/test_utils.py b/test/test_utils.py index 
83916b46d9..0865b39810 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -1863,7 +1863,7 @@ Line 1 self.assertEqual( list(get_elements_text_and_html_by_attribute('class', 'foo bar', html)), - list(zip(['nice', 'also nice'], self.GET_ELEMENTS_BY_CLASS_RES))) + list(zip(['nice', 'also nice'], self.GET_ELEMENTS_BY_CLASS_RES, strict=True))) self.assertEqual(list(get_elements_text_and_html_by_attribute('class', 'foo', html)), []) self.assertEqual(list(get_elements_text_and_html_by_attribute('class', 'no-such-foo', html)), []) diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py deleted file mode 100644 index 2e9c974db2..0000000000 --- a/test/test_youtube_signature.py +++ /dev/null @@ -1,504 +0,0 @@ -#!/usr/bin/env python3 - -# Allow direct execution -import os -import sys -import unittest - -sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - - -import contextlib -import re -import string -import urllib.request - -from test.helper import FakeYDL, is_download_test -from yt_dlp.extractor import YoutubeIE -from yt_dlp.jsinterp import JSInterpreter - -_SIG_TESTS = [ - ( - 'https://s.ytimg.com/yts/jsbin/html5player-vflHOr_nV.js', - 86, - '>=<;:/.-[+*)(\'&%$#"!ZYX0VUTSRQPONMLKJIHGFEDCBA\\yxwvutsrqponmlkjihgfedcba987654321', - ), - ( - 'https://s.ytimg.com/yts/jsbin/html5player-vfldJ8xgI.js', - 85, - '3456789a0cdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRS[UVWXYZ!"#$%&\'()*+,-./:;<=>?@', - ), - ( - 'https://s.ytimg.com/yts/jsbin/html5player-vfle-mVwz.js', - 90, - ']\\[@?>=<;:/.-,+*)(\'&%$#"hZYXWVUTSRQPONMLKJIHGFEDCBAzyxwvutsrqponmlkjiagfedcb39876', - ), - ( - 'https://s.ytimg.com/yts/jsbin/html5player-en_US-vfl0Cbn9e.js', - 84, - 'O1I3456789abcde0ghijklmnopqrstuvwxyzABCDEFGHfJKLMN2PQRSTUVW@YZ!"#$%&\'()*+,-./:;<=', - ), - ( - 'https://s.ytimg.com/yts/jsbin/html5player-en_US-vflXGBaUN.js', - '2ACFC7A61CA478CD21425E5A57EBD73DDC78E22A.2094302436B2D377D14A3BBA23022D023B8BC25AA', - 'A52CB8B320D22032ABB3A41D773D2B6342034902.A22E87CDD37DBE75A5E52412DC874AC16A7CFCA2', - ), - ( - 'https://s.ytimg.com/yts/jsbin/html5player-en_US-vflBb0OQx.js', - 84, - '123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQ0STUVWXYZ!"#$%&\'()*+,@./:;<=>', - ), - ( - 'https://s.ytimg.com/yts/jsbin/html5player-en_US-vfl9FYC6l.js', - 83, - '123456789abcdefghijklmnopqr0tuvwxyzABCDETGHIJKLMNOPQRS>UVWXYZ!"#$%&\'()*+,-./:;<=F', - ), - ( - 'https://s.ytimg.com/yts/jsbin/html5player-en_US-vflCGk6yw/html5player.js', - '4646B5181C6C3020DF1D9C7FCFEA.AD80ABF70C39BD369CCCAE780AFBB98FA6B6CB42766249D9488C288', - '82C8849D94266724DC6B6AF89BBFA087EACCD963.B93C07FBA084ACAEFCF7C9D1FD0203C6C1815B6B', - ), - ( - 'https://s.ytimg.com/yts/jsbin/html5player-en_US-vflKjOTVq/html5player.js', - '312AA52209E3623129A412D56A40F11CB0AF14AE.3EE09501CB14E3BCDC3B2AE808BF3F1D14E7FBF12', - '112AA5220913623229A412D56A40F11CB0AF14AE.3EE0950FCB14EEBCDC3B2AE808BF331D14E7FBF3', - ), - ( - 'https://www.youtube.com/s/player/6ed0d907/player_ias.vflset/en_US/base.js', - '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', - 'AOq0QJ8wRAIgXmPlOPSBkkUs1bYFYlJCfe29xx8j7v1pDL2QwbdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_EMu-m37KtXJoOySqa0', - ), - ( - 'https://www.youtube.com/s/player/3bb1f723/player_ias.vflset/en_US/base.js', - '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', - 'MyOSJXtKI3m-uME_jv7-pT12gOFC02RFkGoqWpzE0Cs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', - ), - ( - 
'https://www.youtube.com/s/player/2f1832d2/player_ias.vflset/en_US/base.js', - '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', - '0QJ8wRAIgXmPlOPSBkkUs1bYFYlJCfe29xxAj7v1pDL0QwbdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_EMu-m37KtXJ2OySqa0q', - ), - ( - 'https://www.youtube.com/s/player/643afba4/tv-player-ias.vflset/tv-player-ias.js', - '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', - 'AAOAOq0QJ8wRAIgXmPlOPSBkkUs1bYFYlJCfe29xx8j7vgpDL0QwbdV06sCIEzpWqMGkFR20CFOS21Tp-7vj_EMu-m37KtXJoOy1', - ), - ( - 'https://www.youtube.com/s/player/363db69b/player_ias.vflset/en_US/base.js', - '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', - '0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpz2ICs6EVdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', - ), - ( - 'https://www.youtube.com/s/player/363db69b/player_ias_tce.vflset/en_US/base.js', - '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', - '0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpz2ICs6EVdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', - ), - ( - 'https://www.youtube.com/s/player/4fcd6e4a/player_ias.vflset/en_US/base.js', - '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', - 'wAOAOq0QJ8ARAIgXmPlOPSBkkUs1bYFYlJCfe29xx8q7v1pDL0QwbdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_EMu-m37KtXJoOySqa0', - ), - ( - 'https://www.youtube.com/s/player/4fcd6e4a/player_ias_tce.vflset/en_US/base.js', - '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', - 'wAOAOq0QJ8ARAIgXmPlOPSBkkUs1bYFYlJCfe29xx8q7v1pDL0QwbdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_EMu-m37KtXJoOySqa0', - ), - ( - 'https://www.youtube.com/s/player/20830619/player_ias.vflset/en_US/base.js', - '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', - '7AOq0QJ8wRAIgXmPlOPSBkkAs1bYFYlJCfe29xx8jOv1pDL0Q2bdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_EMu-m37KtXJoOySqa0qaw', - ), - ( - 'https://www.youtube.com/s/player/20830619/player_ias_tce.vflset/en_US/base.js', - '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', - '7AOq0QJ8wRAIgXmPlOPSBkkAs1bYFYlJCfe29xx8jOv1pDL0Q2bdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_EMu-m37KtXJoOySqa0qaw', - ), - ( - 'https://www.youtube.com/s/player/20830619/player-plasma-ias-phone-en_US.vflset/base.js', - '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', - '7AOq0QJ8wRAIgXmPlOPSBkkAs1bYFYlJCfe29xx8jOv1pDL0Q2bdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_EMu-m37KtXJoOySqa0qaw', - ), - ( - 'https://www.youtube.com/s/player/20830619/player-plasma-ias-tablet-en_US.vflset/base.js', - '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', - '7AOq0QJ8wRAIgXmPlOPSBkkAs1bYFYlJCfe29xx8jOv1pDL0Q2bdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_EMu-m37KtXJoOySqa0qaw', - ), - ( - 'https://www.youtube.com/s/player/8a8ac953/player_ias_tce.vflset/en_US/base.js', - '2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', - 'IAOAOq0QJ8wRAAgXmPlOPSBkkUs1bYFYlJCfe29xx8j7v1pDL0QwbdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_E2u-m37KtXJoOySqa0', - ), - ( - 'https://www.youtube.com/s/player/8a8ac953/tv-player-es6.vflset/tv-player-es6.js', - 
'2aq0aqSyOoJXtK73m-uME_jv7-pT15gOFC02RFkGMqWpzEICs69VdbwQ0LDp1v7j8xx92efCJlYFYb1sUkkBSPOlPmXgIARw8JQ0qOAOAA', - 'IAOAOq0QJ8wRAAgXmPlOPSBkkUs1bYFYlJCfe29xx8j7v1pDL0QwbdV96sCIEzpWqMGkFR20CFOg51Tp-7vj_E2u-m37KtXJoOySqa0', - ), - ( - 'https://www.youtube.com/s/player/e12fbea4/player_ias.vflset/en_US/base.js', - 'gN7a-hudCuAuPH6fByOk1_GNXN0yNMHShjZXS2VOgsEItAJz0tipeavEOmNdYN-wUtcEqD3bCXjc0iyKfAyZxCBGgIARwsSdQfJ2CJtt', - 'JC2JfQdSswRAIgGBCxZyAfKyi0cjXCb3DqEctUw-NYdNmOEvaepit0zJAtIEsgOV2SXZjhSHMNy0NXNG_1kOyBf6HPuAuCduh-a', - ), - ( - 'https://www.youtube.com/s/player/010fbc8d/player_es5.vflset/en_US/base.js', - 'gN7a-hudCuAuPH6fByOk1_GNXN0yNMHShjZXS2VOgsEItAJz0tipeavEOmNdYN-wUtcEqD3bCXjc0iyKfAyZxCBGgIARwsSdQfJ2CJtt', - 'ttJC2JfQdSswRAIgGBCxZyAfKyi0cjXCb3DqEctUw-NYdNmOEvaepit2zJAsIEggOVaSXZjhSHMNy0NXNG_1kOyBf6HPuAuCduh-', - ), - ( - 'https://www.youtube.com/s/player/010fbc8d/player_es6.vflset/en_US/base.js', - 'gN7a-hudCuAuPH6fByOk1_GNXN0yNMHShjZXS2VOgsEItAJz0tipeavEOmNdYN-wUtcEqD3bCXjc0iyKfAyZxCBGgIARwsSdQfJ2CJtt', - 'ttJC2JfQdSswRAIgGBCxZyAfKyi0cjXCb3DqEctUw-NYdNmOEvaepit2zJAsIEggOVaSXZjhSHMNy0NXNG_1kOyBf6HPuAuCduh-', - ), - ( - 'https://www.youtube.com/s/player/5ec65609/player_ias_tcc.vflset/en_US/base.js', - 'AAJAJfQdSswRAIgNSN0GDUcHnCIXkKcF61yLBgDHiX1sUhOJdY4_GxunRYCIDeYNYP_16mQTPm5f1OVq3oV1ijUNYPjP4iUSMAjO9bZ', - 'AJfQdSswRAIgNSN0GDUcHnCIXkKcF61ZLBgDHiX1sUhOJdY4_GxunRYCIDyYNYP_16mQTPm5f1OVq3oV1ijUNYPjP4iUSMAjO9be', - ), -] - -_NSIG_TESTS = [ - ( - 'https://www.youtube.com/s/player/7862ca1f/player_ias.vflset/en_US/base.js', - 'X_LCxVDjAavgE5t', 'yxJ1dM6iz5ogUg', - ), - ( - 'https://www.youtube.com/s/player/9216d1f7/player_ias.vflset/en_US/base.js', - 'SLp9F5bwjAdhE9F-', 'gWnb9IK2DJ8Q1w', - ), - ( - 'https://www.youtube.com/s/player/f8cb7a3b/player_ias.vflset/en_US/base.js', - 'oBo2h5euWy6osrUt', 'ivXHpm7qJjJN', - ), - ( - 'https://www.youtube.com/s/player/2dfe380c/player_ias.vflset/en_US/base.js', - 'oBo2h5euWy6osrUt', '3DIBbn3qdQ', - ), - ( - 'https://www.youtube.com/s/player/f1ca6900/player_ias.vflset/en_US/base.js', - 'cu3wyu6LQn2hse', 'jvxetvmlI9AN9Q', - ), - ( - 'https://www.youtube.com/s/player/8040e515/player_ias.vflset/en_US/base.js', - 'wvOFaY-yjgDuIEg5', 'HkfBFDHmgw4rsw', - ), - ( - 'https://www.youtube.com/s/player/e06dea74/player_ias.vflset/en_US/base.js', - 'AiuodmaDDYw8d3y4bf', 'ankd8eza2T6Qmw', - ), - ( - 'https://www.youtube.com/s/player/5dd88d1d/player-plasma-ias-phone-en_US.vflset/base.js', - 'kSxKFLeqzv_ZyHSAt', 'n8gS8oRlHOxPFA', - ), - ( - 'https://www.youtube.com/s/player/324f67b9/player_ias.vflset/en_US/base.js', - 'xdftNy7dh9QGnhW', '22qLGxrmX8F1rA', - ), - ( - 'https://www.youtube.com/s/player/4c3f79c5/player_ias.vflset/en_US/base.js', - 'TDCstCG66tEAO5pR9o', 'dbxNtZ14c-yWyw', - ), - ( - 'https://www.youtube.com/s/player/c81bbb4a/player_ias.vflset/en_US/base.js', - 'gre3EcLurNY2vqp94', 'Z9DfGxWP115WTg', - ), - ( - 'https://www.youtube.com/s/player/1f7d5369/player_ias.vflset/en_US/base.js', - 'batNX7sYqIJdkJ', 'IhOkL_zxbkOZBw', - ), - ( - 'https://www.youtube.com/s/player/009f1d77/player_ias.vflset/en_US/base.js', - '5dwFHw8aFWQUQtffRq', 'audescmLUzI3jw', - ), - ( - 'https://www.youtube.com/s/player/dc0c6770/player_ias.vflset/en_US/base.js', - '5EHDMgYLV6HPGk_Mu-kk', 'n9lUJLHbxUI0GQ', - ), - ( - 'https://www.youtube.com/s/player/113ca41c/player_ias.vflset/en_US/base.js', - 'cgYl-tlYkhjT7A', 'hI7BBr2zUgcmMg', - ), - ( - 'https://www.youtube.com/s/player/c57c113c/player_ias.vflset/en_US/base.js', - 'M92UUMHa8PdvPd3wyM', '3hPqLJsiNZx7yA', - ), - ( - 
'https://www.youtube.com/s/player/5a3b6271/player_ias.vflset/en_US/base.js', - 'B2j7f_UPT4rfje85Lu_e', 'm5DmNymaGQ5RdQ', - ), - ( - 'https://www.youtube.com/s/player/7a062b77/player_ias.vflset/en_US/base.js', - 'NRcE3y3mVtm_cV-W', 'VbsCYUATvqlt5w', - ), - ( - 'https://www.youtube.com/s/player/dac945fd/player_ias.vflset/en_US/base.js', - 'o8BkRxXhuYsBCWi6RplPdP', '3Lx32v_hmzTm6A', - ), - ( - 'https://www.youtube.com/s/player/6f20102c/player_ias.vflset/en_US/base.js', - 'lE8DhoDmKqnmJJ', 'pJTTX6XyJP2BYw', - ), - ( - 'https://www.youtube.com/s/player/cfa9e7cb/player_ias.vflset/en_US/base.js', - 'aCi3iElgd2kq0bxVbQ', 'QX1y8jGb2IbZ0w', - ), - ( - 'https://www.youtube.com/s/player/8c7583ff/player_ias.vflset/en_US/base.js', - '1wWCVpRR96eAmMI87L', 'KSkWAVv1ZQxC3A', - ), - ( - 'https://www.youtube.com/s/player/b7910ca8/player_ias.vflset/en_US/base.js', - '_hXMCwMt9qE310D', 'LoZMgkkofRMCZQ', - ), - ( - 'https://www.youtube.com/s/player/590f65a6/player_ias.vflset/en_US/base.js', - '1tm7-g_A9zsI8_Lay_', 'xI4Vem4Put_rOg', - ), - ( - 'https://www.youtube.com/s/player/b22ef6e7/player_ias.vflset/en_US/base.js', - 'b6HcntHGkvBLk_FRf', 'kNPW6A7FyP2l8A', - ), - ( - 'https://www.youtube.com/s/player/3400486c/player_ias.vflset/en_US/base.js', - 'lL46g3XifCKUZn1Xfw', 'z767lhet6V2Skl', - ), - ( - 'https://www.youtube.com/s/player/20dfca59/player_ias.vflset/en_US/base.js', - '-fLCxedkAk4LUTK2', 'O8kfRq1y1eyHGw', - ), - ( - 'https://www.youtube.com/s/player/b12cc44b/player_ias.vflset/en_US/base.js', - 'keLa5R2U00sR9SQK', 'N1OGyujjEwMnLw', - ), - ( - 'https://www.youtube.com/s/player/3bb1f723/player_ias.vflset/en_US/base.js', - 'gK15nzVyaXE9RsMP3z', 'ZFFWFLPWx9DEgQ', - ), - ( - 'https://www.youtube.com/s/player/2f1832d2/player_ias.vflset/en_US/base.js', - 'YWt1qdbe8SAfkoPHW5d', 'RrRjWQOJmBiP', - ), - ( - 'https://www.youtube.com/s/player/9c6dfc4a/player_ias.vflset/en_US/base.js', - 'jbu7ylIosQHyJyJV', 'uwI0ESiynAmhNg', - ), - ( - 'https://www.youtube.com/s/player/e7567ecf/player_ias_tce.vflset/en_US/base.js', - 'Sy4aDGc0VpYRR9ew_', '5UPOT1VhoZxNLQ', - ), - ( - 'https://www.youtube.com/s/player/d50f54ef/player_ias_tce.vflset/en_US/base.js', - 'Ha7507LzRmH3Utygtj', 'XFTb2HoeOE5MHg', - ), - ( - 'https://www.youtube.com/s/player/074a8365/player_ias_tce.vflset/en_US/base.js', - 'Ha7507LzRmH3Utygtj', 'ufTsrE0IVYrkl8v', - ), - ( - 'https://www.youtube.com/s/player/643afba4/player_ias.vflset/en_US/base.js', - 'N5uAlLqm0eg1GyHO', 'dCBQOejdq5s-ww', - ), - ( - 'https://www.youtube.com/s/player/69f581a5/tv-player-ias.vflset/tv-player-ias.js', - '-qIP447rVlTTwaZjY', 'KNcGOksBAvwqQg', - ), - ( - 'https://www.youtube.com/s/player/643afba4/tv-player-ias.vflset/tv-player-ias.js', - 'ir9-V6cdbCiyKxhr', '2PL7ZDYAALMfmA', - ), - ( - 'https://www.youtube.com/s/player/363db69b/player_ias.vflset/en_US/base.js', - 'eWYu5d5YeY_4LyEDc', 'XJQqf-N7Xra3gg', - ), - ( - 'https://www.youtube.com/s/player/4fcd6e4a/player_ias.vflset/en_US/base.js', - 'o_L251jm8yhZkWtBW', 'lXoxI3XvToqn6A', - ), - ( - 'https://www.youtube.com/s/player/4fcd6e4a/player_ias_tce.vflset/en_US/base.js', - 'o_L251jm8yhZkWtBW', 'lXoxI3XvToqn6A', - ), - ( - 'https://www.youtube.com/s/player/20830619/tv-player-ias.vflset/tv-player-ias.js', - 'ir9-V6cdbCiyKxhr', '9YE85kNjZiS4', - ), - ( - 'https://www.youtube.com/s/player/20830619/player-plasma-ias-phone-en_US.vflset/base.js', - 'ir9-V6cdbCiyKxhr', '9YE85kNjZiS4', - ), - ( - 'https://www.youtube.com/s/player/20830619/player-plasma-ias-tablet-en_US.vflset/base.js', - 'ir9-V6cdbCiyKxhr', '9YE85kNjZiS4', - ), - ( - 
'https://www.youtube.com/s/player/8a8ac953/player_ias_tce.vflset/en_US/base.js', - 'MiBYeXx_vRREbiCCmh', 'RtZYMVvmkE0JE', - ), - ( - 'https://www.youtube.com/s/player/8a8ac953/tv-player-es6.vflset/tv-player-es6.js', - 'MiBYeXx_vRREbiCCmh', 'RtZYMVvmkE0JE', - ), - ( - 'https://www.youtube.com/s/player/59b252b9/player_ias.vflset/en_US/base.js', - 'D3XWVpYgwhLLKNK4AGX', 'aZrQ1qWJ5yv5h', - ), - ( - 'https://www.youtube.com/s/player/fc2a56a5/player_ias.vflset/en_US/base.js', - 'qTKWg_Il804jd2kAC', 'OtUAm2W6gyzJjB9u', - ), - ( - 'https://www.youtube.com/s/player/fc2a56a5/tv-player-ias.vflset/tv-player-ias.js', - 'qTKWg_Il804jd2kAC', 'OtUAm2W6gyzJjB9u', - ), - ( - 'https://www.youtube.com/s/player/a74bf670/player_ias_tce.vflset/en_US/base.js', - 'kM5r52fugSZRAKHfo3', 'hQP7k1hA22OrNTnq', - ), - ( - 'https://www.youtube.com/s/player/6275f73c/player_ias_tce.vflset/en_US/base.js', - 'kM5r52fugSZRAKHfo3', '-I03XF0iyf6I_X0A', - ), - ( - 'https://www.youtube.com/s/player/20c72c18/player_ias_tce.vflset/en_US/base.js', - 'kM5r52fugSZRAKHfo3', '-I03XF0iyf6I_X0A', - ), - ( - 'https://www.youtube.com/s/player/9fe2e06e/player_ias_tce.vflset/en_US/base.js', - 'kM5r52fugSZRAKHfo3', '6r5ekNIiEMPutZy', - ), - ( - 'https://www.youtube.com/s/player/680f8c75/player_ias_tce.vflset/en_US/base.js', - 'kM5r52fugSZRAKHfo3', '0ml9caTwpa55Jf', - ), - ( - 'https://www.youtube.com/s/player/14397202/player_ias_tce.vflset/en_US/base.js', - 'kM5r52fugSZRAKHfo3', 'ozZFAN21okDdJTa', - ), - ( - 'https://www.youtube.com/s/player/5dcb2c1f/player_ias_tce.vflset/en_US/base.js', - 'kM5r52fugSZRAKHfo3', 'p7iTbRZDYAF', - ), - ( - 'https://www.youtube.com/s/player/a10d7fcc/player_ias_tce.vflset/en_US/base.js', - 'kM5r52fugSZRAKHfo3', '9Zue7DDHJSD', - ), - ( - 'https://www.youtube.com/s/player/8e20cb06/player_ias_tce.vflset/en_US/base.js', - 'kM5r52fugSZRAKHfo3', '5-4tTneTROTpMzba', - ), - ( - 'https://www.youtube.com/s/player/e12fbea4/player_ias_tce.vflset/en_US/base.js', - 'kM5r52fugSZRAKHfo3', 'XkeRfXIPOkSwfg', - ), - ( - 'https://www.youtube.com/s/player/ef259203/player_ias_tce.vflset/en_US/base.js', - 'rPqBC01nJpqhhi2iA2U', 'hY7dbiKFT51UIA', - ), - ( - 'https://www.youtube.com/s/player/010fbc8d/player_es5.vflset/en_US/base.js', - '0hlOAlqjFszVvF4Z', 'R-H23bZGAsRFTg', - ), - ( - 'https://www.youtube.com/s/player/010fbc8d/player_es6.vflset/en_US/base.js', - '0hlOAlqjFszVvF4Z', 'R-H23bZGAsRFTg', - ), - ( - 'https://www.youtube.com/s/player/5ec65609/player_ias_tcc.vflset/en_US/base.js', - '6l5CTNx4AzIqH4MXM', 'NupToduxHBew1g', - ), -] - - -@is_download_test -class TestPlayerInfo(unittest.TestCase): - def test_youtube_extract_player_info(self): - PLAYER_URLS = ( - ('https://www.youtube.com/s/player/4c3f79c5/player_ias.vflset/en_US/base.js', '4c3f79c5'), - ('https://www.youtube.com/s/player/64dddad9/player_ias.vflset/en_US/base.js', '64dddad9'), - ('https://www.youtube.com/s/player/64dddad9/player_ias.vflset/fr_FR/base.js', '64dddad9'), - ('https://www.youtube.com/s/player/64dddad9/player-plasma-ias-phone-en_US.vflset/base.js', '64dddad9'), - ('https://www.youtube.com/s/player/64dddad9/player-plasma-ias-phone-de_DE.vflset/base.js', '64dddad9'), - ('https://www.youtube.com/s/player/64dddad9/player-plasma-ias-tablet-en_US.vflset/base.js', '64dddad9'), - ('https://www.youtube.com/s/player/e7567ecf/player_ias_tce.vflset/en_US/base.js', 'e7567ecf'), - ('https://www.youtube.com/s/player/643afba4/tv-player-ias.vflset/tv-player-ias.js', '643afba4'), - # obsolete - ('https://www.youtube.com/yts/jsbin/player_ias-vfle4-e03/en_US/base.js', 
'vfle4-e03'), - ('https://www.youtube.com/yts/jsbin/player_ias-vfl49f_g4/en_US/base.js', 'vfl49f_g4'), - ('https://www.youtube.com/yts/jsbin/player_ias-vflCPQUIL/en_US/base.js', 'vflCPQUIL'), - ('https://www.youtube.com/yts/jsbin/player-vflzQZbt7/en_US/base.js', 'vflzQZbt7'), - ('https://www.youtube.com/yts/jsbin/player-en_US-vflaxXRn1/base.js', 'vflaxXRn1'), - ('https://s.ytimg.com/yts/jsbin/html5player-en_US-vflXGBaUN.js', 'vflXGBaUN'), - ('https://s.ytimg.com/yts/jsbin/html5player-en_US-vflKjOTVq/html5player.js', 'vflKjOTVq'), - ) - for player_url, expected_player_id in PLAYER_URLS: - player_id = YoutubeIE._extract_player_info(player_url) - self.assertEqual(player_id, expected_player_id) - - -@is_download_test -class TestSignature(unittest.TestCase): - def setUp(self): - TEST_DIR = os.path.dirname(os.path.abspath(__file__)) - self.TESTDATA_DIR = os.path.join(TEST_DIR, 'testdata/sigs') - if not os.path.exists(self.TESTDATA_DIR): - os.mkdir(self.TESTDATA_DIR) - - def tearDown(self): - with contextlib.suppress(OSError): - for f in os.listdir(self.TESTDATA_DIR): - os.remove(f) - - -def t_factory(name, sig_func, url_pattern): - def make_tfunc(url, sig_input, expected_sig): - m = url_pattern.match(url) - assert m, f'{url!r} should follow URL format' - test_id = re.sub(r'[/.-]', '_', m.group('id') or m.group('compat_id')) - - def test_func(self): - basename = f'player-{test_id}.js' - fn = os.path.join(self.TESTDATA_DIR, basename) - - if not os.path.exists(fn): - urllib.request.urlretrieve(url, fn) - with open(fn, encoding='utf-8') as testf: - jscode = testf.read() - self.assertEqual(sig_func(jscode, sig_input, url), expected_sig) - - test_func.__name__ = f'test_{name}_js_{test_id}' - setattr(TestSignature, test_func.__name__, test_func) - return make_tfunc - - -def signature(jscode, sig_input, player_url): - func = YoutubeIE(FakeYDL())._parse_sig_js(jscode, player_url) - src_sig = ( - str(string.printable[:sig_input]) - if isinstance(sig_input, int) else sig_input) - return func(src_sig) - - -def n_sig(jscode, sig_input, player_url): - ie = YoutubeIE(FakeYDL()) - funcname = ie._extract_n_function_name(jscode, player_url=player_url) - jsi = JSInterpreter(jscode) - func = jsi.extract_function_from_code(*ie._fixup_n_function_code(*jsi.extract_function_code(funcname), jscode, player_url)) - return func([sig_input]) - - -make_sig_test = t_factory( - 'signature', signature, - re.compile(r'''(?x) - .+(?: - /player/(?P<id>[a-zA-Z0-9_/.-]+)| - /html5player-(?:en_US-)?(?P<compat_id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)? - )\.js$''')) -for test_spec in _SIG_TESTS: - make_sig_test(*test_spec) - -make_nsig_test = t_factory( - 'nsig', n_sig, re.compile(r'.+/player/(?P<id>[a-zA-Z0-9_/.-]+)\.js$')) -for test_spec in _NSIG_TESTS: - make_nsig_test(*test_spec) - - -if __name__ == '__main__': - unittest.main() diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 3b41a15196..539b10fe29 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -42,6 +42,8 @@ from .globals import ( plugin_pps, all_plugins_loaded, plugin_dirs, + supported_js_runtimes, + supported_remote_components, ) from .minicurses import format_text from .networking import HEADRequest, Request, RequestDirector @@ -533,6 +535,18 @@ class YoutubeDL: See "EXTRACTOR ARGUMENTS" for details. Argument values must always be a list of string(s). E.g. {'youtube': {'skip': ['dash', 'hls']}} + js_runtimes: A dictionary of JavaScript runtime keys (in lower case) to enable + and a dictionary of additional configuration for the runtime.
+ Currently supported runtimes are 'deno', 'node', 'bun', and 'quickjs'. + If None, the default runtime of "deno" will be enabled. + The runtime configuration dictionary can have the following keys: + - path: Path to the executable (optional) + E.g. {'deno': {'path': '/path/to/deno'}} + remote_components: A list of remote components that are allowed to be fetched when required. + Supported components: + - ejs:npm (external JavaScript components from npm) + - ejs:github (external JavaScript components from yt-dlp-ejs GitHub) + By default, no remote components are allowed to be fetched. mark_watched: Mark videos watched (even with --simulate). Only for YouTube The following options are deprecated and may be removed in the future: @@ -717,6 +731,13 @@ class YoutubeDL: else: raise + # Note: this must be after plugins are loaded + self.params['js_runtimes'] = self.params.get('js_runtimes', {'deno': {}}) + self._clean_js_runtimes(self.params['js_runtimes']) + + self.params['remote_components'] = set(self.params.get('remote_components', ())) + self._clean_remote_components(self.params['remote_components']) + self.params['compat_opts'] = set(self.params.get('compat_opts', ())) self.params['http_headers'] = HTTPHeaderDict(std_headers, self.params.get('http_headers')) self._load_cookies(self.params['http_headers'].get('Cookie')) # compat @@ -829,6 +850,36 @@ class YoutubeDL: self.archive = preload_download_archive(self.params.get('download_archive')) + def _clean_js_runtimes(self, runtimes): + if not ( + isinstance(runtimes, dict) + and all(isinstance(k, str) and (v is None or isinstance(v, dict)) for k, v in runtimes.items()) + ): + raise ValueError('Invalid js_runtimes format, expected a dict of {runtime: {config}}') + + if unsupported_runtimes := runtimes.keys() - supported_js_runtimes.value.keys(): + self.report_warning( + f'Ignoring unsupported JavaScript runtime(s): {", ".join(unsupported_runtimes)}.' + f' Supported runtimes: {", ".join(supported_js_runtimes.value.keys())}.') + for rt in unsupported_runtimes: + runtimes.pop(rt) + + def _clean_remote_components(self, remote_components: set): + if unsupported_remote_components := set(remote_components) - set(supported_remote_components.value): + self.report_warning( + f'Ignoring unsupported remote component(s): {", ".join(unsupported_remote_components)}.' + f' Supported remote components: {", ".join(supported_remote_components.value)}.') + for rt in unsupported_remote_components: + remote_components.remove(rt) + + @functools.cached_property + def _js_runtimes(self): + runtimes = {} + for name, config in self.params.get('js_runtimes', {}).items(): + runtime_cls = supported_js_runtimes.value.get(name) + runtimes[name] = runtime_cls(path=config.get('path')) if runtime_cls else None + return runtimes + def warn_if_short_id(self, argv): # short YouTube ID starting with dash?
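A rough usage sketch of the two options documented above (illustrative only, not part of the patch; the deno path is a placeholder):

    # Enable the Deno runtime with an explicit executable path and allow the
    # external JavaScript components to be fetched from GitHub. Unsupported
    # keys are warned about and dropped by _clean_js_runtimes() and
    # _clean_remote_components() as shown in the hunk above.
    from yt_dlp import YoutubeDL

    ydl = YoutubeDL({
        'js_runtimes': {'deno': {'path': '/usr/local/bin/deno'}},  # placeholder path
        'remote_components': ['ejs:github'],
    })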
idxs = [ @@ -2007,7 +2058,7 @@ class YoutubeDL: else: entries = resolved_entries = list(entries) n_entries = len(resolved_entries) - ie_result['requested_entries'], ie_result['entries'] = tuple(zip(*resolved_entries)) or ([], []) + ie_result['requested_entries'], ie_result['entries'] = tuple(zip(*resolved_entries, strict=True)) or ([], []) if not ie_result.get('playlist_count'): # Better to do this after potentially exhausting entries ie_result['playlist_count'] = all_entries.get_full_count() @@ -2785,7 +2836,7 @@ class YoutubeDL: dummy_chapter = {'end_time': 0, 'start_time': info_dict.get('duration')} for idx, (prev, current, next_) in enumerate(zip( - (dummy_chapter, *chapters), chapters, (*chapters[1:], dummy_chapter)), 1): + (dummy_chapter, *chapters), chapters, (*chapters[1:], dummy_chapter), strict=False), 1): if current.get('start_time') is None: current['start_time'] = prev.get('end_time') if not current.get('end_time'): @@ -3370,7 +3421,7 @@ class YoutubeDL: def existing_video_file(*filepaths): ext = info_dict.get('ext') converted = lambda file: replace_extension(file, self.params.get('final_ext') or ext, ext) - file = self.existing_file(itertools.chain(*zip(map(converted, filepaths), filepaths)), + file = self.existing_file(itertools.chain(*zip(map(converted, filepaths), filepaths, strict=True)), default_overwrite=False) if file: info_dict['ext'] = os.path.splitext(file)[1][1:] @@ -3956,7 +4007,7 @@ class YoutubeDL: def render_subtitles_table(self, video_id, subtitles): def _row(lang, formats): - exts, names = zip(*((f['ext'], f.get('name') or 'unknown') for f in reversed(formats))) + exts, names = zip(*((f['ext'], f.get('name') or 'unknown') for f in reversed(formats)), strict=True) if len(set(names)) == 1: names = [] if names[0] == 'unknown' else names[:1] return [lang, ', '.join(names), ', '.join(exts)] @@ -4064,6 +4115,18 @@ class YoutubeDL: join_nonempty(*get_package_info(m)) for m in available_dependencies.values() })) or 'none')) + if not self.params.get('js_runtimes'): + write_debug('JS runtimes: none (disabled)') + else: + write_debug('JS runtimes: %s' % (', '.join(sorted( + f'{name} (unknown)' if runtime is None + else join_nonempty( + runtime.info.name, + runtime.info.version + (' (unsupported)' if runtime.info.supported is False else ''), + ) + for name, runtime in self._js_runtimes.items() if runtime is None or runtime.info is not None + )) or 'none')) + write_debug(f'Proxy map: {self.proxies}') write_debug(f'Request Handlers: {", ".join(rh.RH_NAME for rh in self._request_director.handlers.values())}') @@ -4112,8 +4175,7 @@ class YoutubeDL: self.params.get('cookiefile'), self.params.get('cookiesfrombrowser'), self) except CookieLoadError as error: cause = error.__context__ - # compat: <=py3.9: `traceback.format_exception` has a different signature - self.report_error(str(cause), tb=''.join(traceback.format_exception(None, cause, cause.__traceback__))) + self.report_error(str(cause), tb=''.join(traceback.format_exception(cause))) raise @property diff --git a/yt_dlp/__init__.py b/yt_dlp/__init__.py index 09c022fa0e..2f6ba47832 100644 --- a/yt_dlp/__init__.py +++ b/yt_dlp/__init__.py @@ -1,8 +1,8 @@ import sys -if sys.version_info < (3, 9): +if sys.version_info < (3, 10): raise ImportError( - f'You are using an unsupported version of Python. Only Python versions 3.9 and above are supported by yt-dlp') # noqa: F541 + f'You are using an unsupported version of Python. 
Only Python versions 3.10 and above are supported by yt-dlp') # noqa: F541 __license__ = 'The Unlicense' @@ -61,8 +61,15 @@ from .utils import ( shell_quote, variadic, write_string, + ) from .utils._utils import _UnsafeExtensionError +from .utils._jsruntime import ( + BunJsRuntime as _BunJsRuntime, + DenoJsRuntime as _DenoJsRuntime, + NodeJsRuntime as _NodeJsRuntime, + QuickJsRuntime as _QuickJsRuntime, +) from .YoutubeDL import YoutubeDL @@ -155,7 +162,7 @@ def set_compat_opts(opts): if 'format-sort' in opts.compat_opts: opts.format_sort.extend(FormatSorter.ytdl_default) elif 'prefer-vp9-sort' in opts.compat_opts: - opts.format_sort.extend(FormatSorter._prefer_vp9_sort) + FormatSorter.default = FormatSorter._prefer_vp9_sort if 'mtime-by-default' in opts.compat_opts: if opts.updatetime is None: @@ -773,6 +780,10 @@ def parse_options(argv=None): else opts.audioformat if (opts.extractaudio and opts.audioformat in FFmpegExtractAudioPP.SUPPORTED_EXTS) else None) + js_runtimes = { + runtime.lower(): {'path': path} for runtime, path in ( + [*arg.split(':', 1), None][:2] for arg in opts.js_runtimes)} + return ParsedOptions(parser, opts, urls, { 'usenetrc': opts.usenetrc, 'netrc_location': opts.netrc_location, @@ -940,6 +951,8 @@ def parse_options(argv=None): 'geo_bypass_country': opts.geo_bypass_country, 'geo_bypass_ip_block': opts.geo_bypass_ip_block, 'useid': opts.useid or None, + 'js_runtimes': js_runtimes, + 'remote_components': opts.remote_components, 'warn_when_outdated': opts.update_self is None, '_warnings': warnings, '_deprecation_warnings': deprecation_warnings, @@ -974,13 +987,8 @@ def _real_main(argv=None): try: updater = Updater(ydl, opts.update_self) - if opts.update_self and updater.update() and actual_use: - if updater.cmd: - return updater.restart() - # This code is reachable only for zip variant in py < 3.10 - # It makes sense to exit here, but the old behavior is to continue - ydl.report_warning('Restart yt-dlp to use the updated version') - # return 100, 'ERROR: The program must exit for the update to complete' + if opts.update_self and updater.update() and actual_use and updater.cmd: + return updater.restart() except Exception: traceback.print_exc() ydl._download_retcode = 100 @@ -1086,6 +1094,16 @@ def main(argv=None): from .extractor import gen_extractors, list_extractors +# Register JS runtimes and remote components +from .globals import supported_js_runtimes, supported_remote_components +supported_js_runtimes.value['deno'] = _DenoJsRuntime +supported_js_runtimes.value['node'] = _NodeJsRuntime +supported_js_runtimes.value['bun'] = _BunJsRuntime +supported_js_runtimes.value['quickjs'] = _QuickJsRuntime + +supported_remote_components.value.append('ejs:github') +supported_remote_components.value.append('ejs:npm') + __all__ = [ 'YoutubeDL', 'gen_extractors', diff --git a/yt_dlp/__pyinstaller/hook-yt_dlp.py b/yt_dlp/__pyinstaller/hook-yt_dlp.py index 8e7f42f596..0c4bf7d63b 100644 --- a/yt_dlp/__pyinstaller/hook-yt_dlp.py +++ b/yt_dlp/__pyinstaller/hook-yt_dlp.py @@ -34,3 +34,4 @@ print(f'Adding imports: {hiddenimports}') excludedimports = ['youtube_dl', 'youtube_dlc', 'test', 'ytdlp_plugins', 'devscripts', 'bundle'] datas = collect_data_files('curl_cffi', includes=['cacert.pem']) +datas += collect_data_files('yt_dlp_ejs', includes=['**/*.js']) diff --git a/yt_dlp/aes.py b/yt_dlp/aes.py index 600cb12a89..e5a2e67cff 100644 --- a/yt_dlp/aes.py +++ b/yt_dlp/aes.py @@ -447,7 +447,7 @@ def key_schedule_core(data, rcon_iteration): def xor(data1, data2): - return [x ^ y for x, y in 
zip(data1, data2)] + return [x ^ y for x, y in zip(data1, data2, strict=False)] def iter_mix_columns(data, matrix): diff --git a/yt_dlp/compat/types.py b/yt_dlp/compat/types.py deleted file mode 100644 index 4aa3b0efdd..0000000000 --- a/yt_dlp/compat/types.py +++ /dev/null @@ -1,13 +0,0 @@ -# flake8: noqa: F405 -from types import * # noqa: F403 - -from .compat_utils import passthrough_module - -passthrough_module(__name__, 'types') -del passthrough_module - -try: - # NB: pypy has builtin NoneType, so checking NameError won't work - from types import NoneType # >= 3.10 -except ImportError: - NoneType = type(None) diff --git a/yt_dlp/compat/urllib/request.py b/yt_dlp/compat/urllib/request.py index dfc7f4a2dc..ddb4e6f031 100644 --- a/yt_dlp/compat/urllib/request.py +++ b/yt_dlp/compat/urllib/request.py @@ -22,15 +22,11 @@ if os.name == 'nt': def getproxies_registry_patched(): proxies = getproxies_registry() - if ( - sys.version_info >= (3, 10, 5) # https://docs.python.org/3.10/whatsnew/changelog.html#python-3-10-5-final - or (3, 9, 13) <= sys.version_info < (3, 10) # https://docs.python.org/3.9/whatsnew/changelog.html#python-3-9-13-final - ): - return proxies - for scheme in ('https', 'ftp'): - if scheme in proxies and proxies[scheme].startswith(f'{scheme}://'): - proxies[scheme] = 'http' + proxies[scheme][len(scheme):] + if sys.version_info < (3, 10, 5): # https://docs.python.org/3.10/whatsnew/changelog.html#python-3-10-5-final + for scheme in ('https', 'ftp'): + if scheme in proxies and proxies[scheme].startswith(f'{scheme}://'): + proxies[scheme] = 'http' + proxies[scheme][len(scheme):] return proxies diff --git a/yt_dlp/dependencies/__init__.py b/yt_dlp/dependencies/__init__.py index 0d58da2bd5..cf2bcfb37e 100644 --- a/yt_dlp/dependencies/__init__.py +++ b/yt_dlp/dependencies/__init__.py @@ -81,6 +81,12 @@ except ImportError: from . 
import Cryptodome +try: + import yt_dlp_ejs +except ImportError: + yt_dlp_ejs = None + + all_dependencies = {k: v for k, v in globals().items() if not k.startswith('_')} available_dependencies = {k: v for k, v in all_dependencies.items() if v} diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 8d09923819..947d894bd0 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -143,6 +143,8 @@ from .archiveorg import ( from .arcpublishing import ArcPublishingIE from .ard import ( ARDIE, + ARDAudiothekIE, + ARDAudiothekPlaylistIE, ARDBetaMediathekIE, ARDMediathekCollectionIE, ) @@ -337,6 +339,7 @@ from .cbc import ( CBCGemIE, CBCGemLiveIE, CBCGemPlaylistIE, + CBCListenIE, CBCPlayerIE, CBCPlayerPlaylistIE, ) @@ -824,6 +827,13 @@ from .ichinanalive import ( IchinanaLiveIE, IchinanaLiveVODIE, ) +from .idagio import ( + IdagioAlbumIE, + IdagioPersonalPlaylistIE, + IdagioPlaylistIE, + IdagioRecordingIE, + IdagioTrackIE, +) from .idolplus import IdolPlusIE from .ign import ( IGNIE, @@ -1209,6 +1219,7 @@ from .n1 import ( N1InfoAssetIE, N1InfoIIE, ) +from .nascar import NascarClassicsIE from .nate import ( NateIE, NateProgramIE, diff --git a/yt_dlp/extractor/abc.py b/yt_dlp/extractor/abc.py index 7296be73b3..2e6617842b 100644 --- a/yt_dlp/extractor/abc.py +++ b/yt_dlp/extractor/abc.py @@ -21,7 +21,7 @@ from ..utils import ( class ABCIE(InfoExtractor): IE_NAME = 'abc.net.au' - _VALID_URL = r'https?://(?:www\.)?abc\.net\.au/(?:news|btn)/(?:[^/]+/){1,4}(?P<id>\d{5,})' + _VALID_URL = r'https?://(?:www\.)?abc\.net\.au/(?:news|btn|listen)/(?:[^/?#]+/){1,4}(?P<id>\d{5,})' _TESTS = [{ 'url': 'http://www.abc.net.au/news/2014-11-05/australia-to-staff-ebola-treatment-centre-in-sierra-leone/5868334', @@ -53,8 +53,9 @@ class ABCIE(InfoExtractor): 'info_dict': { 'id': '6880080', 'ext': 'mp3', - 'title': 'NAB lifts interest rates, following Westpac and CBA', + 'title': 'NAB lifts interest rates, following Westpac and CBA - ABC listen', 'description': 'md5:f13d8edc81e462fce4a0437c7dc04728', + 'thumbnail': r're:https://live-production\.wcms\.abc-cdn\.net\.au/2193d7437c84b25eafd6360c82b5fa21', }, }, { 'url': 'http://www.abc.net.au/news/2015-10-19/6866214', @@ -64,8 +65,9 @@ class ABCIE(InfoExtractor): 'info_dict': { 'id': '10527914', 'ext': 'mp4', - 'title': 'WWI Centenary', - 'description': 'md5:c2379ec0ca84072e86b446e536954546', + 'title': 'WWI Centenary - Behind The News', + 'description': 'md5:fa4405939ff750fade46ff0cd4c66a52', + 'thumbnail': r're:https://live-production\.wcms\.abc-cdn\.net\.au/bcc3433c97bf992dff32ec5a768713c9', }, }, { 'url': 'https://www.abc.net.au/news/programs/the-world/2020-06-10/black-lives-matter-protests-spawn-support-for/12342074', 'info_dict': { 'id': '12342074', 'ext': 'mp4', 'title': 'Black Lives Matter protests spawn support for Papuans in Indonesia', - 'description': 'md5:2961a17dc53abc558589ccd0fb8edd6f', + 'description': 'md5:625257209f2d14ce23cb4e3785da9beb', + 'thumbnail': r're:https://live-production\.wcms\.abc-cdn\.net\.au/7ee6f190de6d7dbb04203e514bfae9ec', }, }, { 'url': 'https://www.abc.net.au/btn/newsbreak/btn-newsbreak-20200814/12560476', @@ -93,7 +96,16 @@ class ABCIE(InfoExtractor): 'title': 'Wagner Group retreating from Russia, leader Prigozhin to move to Belarus', 'ext': 'mp4', 'description': 'Wagner troops leave Rostov-on-Don and\xa0Yevgeny Prigozhin will move to Belarus under a deal brokered by Belarusian President Alexander Lukashenko to end the mutiny.', - 'thumbnail':
'https://live-production.wcms.abc-cdn.net.au/0c170f5b57f0105c432f366c0e8e267b?impolicy=wcms_crop_resize&cropH=2813&cropW=5000&xPos=0&yPos=249&width=862&height=485', + 'thumbnail': r're:https://live-production\.wcms\.abc-cdn\.net\.au/0c170f5b57f0105c432f366c0e8e267b', + }, + }, { + 'url': 'https://www.abc.net.au/listen/programs/the-followers-madness-of-two/presents-followers-madness-of-two/105697646', + 'info_dict': { + 'id': '105697646', + 'title': 'INTRODUCING — The Followers: Madness of Two - ABC listen', + 'ext': 'mp3', + 'description': 'md5:2310cd0d440a4e01656abea15db8d1f3', + 'thumbnail': r're:https://live-production\.wcms\.abc-cdn\.net\.au/90d7078214e5d66553ffb7fcf0da0cda', + }, }] diff --git a/yt_dlp/extractor/appleconnect.py b/yt_dlp/extractor/appleconnect.py index 433eb4ed8e..786719447a 100644 --- a/yt_dlp/extractor/appleconnect.py +++ b/yt_dlp/extractor/appleconnect.py @@ -1,47 +1,125 @@ +import time + from .common import InfoExtractor -from ..utils import ExtractorError, str_to_int +from ..utils import ( + ExtractorError, + extract_attributes, + float_or_none, + jwt_decode_hs256, + jwt_encode, + parse_resolution, + qualities, + unified_strdate, + update_url, + url_or_none, + urljoin, +) +from ..utils.traversal import ( + find_element, + require, + traverse_obj, +) class AppleConnectIE(InfoExtractor): - _VALID_URL = r'https?://itunes\.apple\.com/\w{0,2}/?post/(?:id)?sa\.(?P<id>[\w-]+)' + IE_NAME = 'apple:music:connect' + IE_DESC = 'Apple Music Connect' + + _BASE_URL = 'https://music.apple.com' + _QUALITIES = { + 'provisionalUploadVideo': None, + 'sdVideo': 480, + 'sdVideoWithPlusAudio': 480, + 'sd480pVideo': 480, + '720pHdVideo': 720, + '1080pHdVideo': 1080, + } + _VALID_URL = r'https?://music\.apple\.com/[\w-]+/post/(?P<id>\d+)' _TESTS = [{ - 'url': 'https://itunes.apple.com/us/post/idsa.4ab17a39-2720-11e5-96c5-a5b38f6c42d3', - 'md5': 'c1d41f72c8bcaf222e089434619316e4', + 'url': 'https://music.apple.com/us/post/1018290019', 'info_dict': { - 'id': '4ab17a39-2720-11e5-96c5-a5b38f6c42d3', + 'id': '1018290019', 'ext': 'm4v', 'title': 'Energy', - 'uploader': 'Drake', - 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 177.911, + 'thumbnail': r're:https?://.+\.png', 'upload_date': '20150710', - 'timestamp': 1436545535, + 'uploader': 'Drake', }, }, { - 'url': 'https://itunes.apple.com/us/post/sa.0fe0229f-2457-11e5-9f40-1bb645f2d5d9', - 'only_matching': True, + 'url': 'https://music.apple.com/us/post/1016746627', + 'info_dict': { + 'id': '1016746627', + 'ext': 'm4v', + 'title': 'Body Shop (Madonna) - Chellous Lima (Acoustic Cover)', + 'duration': 210.278, + 'thumbnail': r're:https?://.+\.png', + 'upload_date': '20150706', + 'uploader': 'Chellous Lima', + }, }] + _jwt = None + + @staticmethod + def _jwt_is_expired(token): + return jwt_decode_hs256(token)['exp'] - time.time() < 120 + + def _get_token(self, webpage, video_id): + if self._jwt and not self._jwt_is_expired(self._jwt): + return self._jwt + + js_url = traverse_obj(webpage, ( + {find_element(tag='script', attr='crossorigin', value='', html=True)}, + {extract_attributes}, 'src', {urljoin(self._BASE_URL)}, {require('JS URL')})) + js = self._download_webpage( + js_url, video_id, 'Downloading token JS', 'Unable to download token JS') + + header = jwt_encode({}, '', headers={'alg': 'ES256', 'kid': 'WebPlayKid'}).split('.')[0] + self._jwt = self._search_regex( + fr'(["\'])(?P<jwt>{header}(?:\.[\w-]+){{2}})\1', js, 'JSON Web Token', group='jwt') + if self._jwt_is_expired(self._jwt): + raise ExtractorError('The fetched token is already expired') + + 
return self._jwt + def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - try: - video_json = self._html_search_regex( - r'class="auc-video-data">(\{.*?\})', webpage, 'json') - except ExtractorError: - raise ExtractorError('This post doesn\'t contain a video', expected=True) + videos = self._download_json( + 'https://amp-api.music.apple.com/v1/catalog/us/uploaded-videos', + video_id, headers={ + 'Authorization': f'Bearer {self._get_token(webpage, video_id)}', + 'Origin': self._BASE_URL, + }, query={'ids': video_id, 'l': 'en-US'}) + attributes = traverse_obj(videos, ( + 'data', ..., 'attributes', any, {require('video information')})) - video_data = self._parse_json(video_json, video_id) - timestamp = str_to_int(self._html_search_regex(r'data-timestamp="(\d+)"', webpage, 'timestamp')) - like_count = str_to_int(self._html_search_regex(r'(\d+) Loves', webpage, 'like count', default=None)) + formats = [] + quality = qualities(list(self._QUALITIES.keys())) + for format_id, src_url in traverse_obj(attributes, ( + 'assetTokens', {dict.items}, lambda _, v: url_or_none(v[1]), + )): + formats.append({ + 'ext': 'm4v', + 'format_id': format_id, + 'height': self._QUALITIES.get(format_id), + 'quality': quality(format_id), + 'url': src_url, + **parse_resolution(update_url(src_url, query=None), lenient=True), + }) return { 'id': video_id, - 'url': video_data['sslSrc'], - 'title': video_data['title'], - 'description': video_data['description'], - 'uploader': video_data['artistName'], - 'thumbnail': video_data['artworkUrl'], - 'timestamp': timestamp, - 'like_count': like_count, + 'formats': formats, + 'thumbnail': self._html_search_meta( + ['og:image', 'og:image:secure_url', 'twitter:image'], webpage), + **traverse_obj(attributes, { + 'title': ('name', {str}), + 'duration': ('durationInMilliseconds', {float_or_none(scale=1000)}), + 'upload_date': ('uploadDate', {unified_strdate}), + 'uploader': (('artistName', 'uploadingArtistName'), {str}, any), + 'webpage_url': ('postUrl', {url_or_none}), + }), } diff --git a/yt_dlp/extractor/archiveorg.py b/yt_dlp/extractor/archiveorg.py index 1864ddbfd9..3746c58fb7 100644 --- a/yt_dlp/extractor/archiveorg.py +++ b/yt_dlp/extractor/archiveorg.py @@ -5,12 +5,9 @@ import re import urllib.parse from .common import InfoExtractor -from .youtube import YoutubeBaseInfoExtractor, YoutubeIE -from ..networking import HEADRequest -from ..networking.exceptions import HTTPError +from .youtube import YoutubeBaseInfoExtractor from ..utils import ( KNOWN_EXTENSIONS, - ExtractorError, bug_reports_message, clean_html, dict_get, @@ -21,18 +18,14 @@ from ..utils import ( join_nonempty, js_to_json, merge_dicts, - mimetype2ext, orderedSet, parse_duration, parse_qs, str_or_none, - str_to_int, traverse_obj, - try_get, unified_strdate, unified_timestamp, url_or_none, - urlhandle_detect_ext, ) @@ -471,7 +464,7 @@ class YoutubeWebArchiveIE(InfoExtractor): 'url': 'https://web.archive.org/web/20110712231407/http://www.youtube.com/watch?v=lTx3G6h2xyA', 'info_dict': { 'id': 'lTx3G6h2xyA', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Madeon - Pop Culture (live mashup)', 'upload_date': '20110711', 'uploader': 'Madeon', @@ -578,7 +571,7 @@ class YoutubeWebArchiveIE(InfoExtractor): 'url': 'https://web.archive.org/web/20110126141719/http://www.youtube.com/watch?v=Q_yjX80U7Yc', 'info_dict': { 'id': 'Q_yjX80U7Yc', - 'ext': 'flv', + 'ext': 'webm', 'title': 'Spray Paint Art by Clay Butler: Purple Fantasy Forest', 'uploader_id': 'claybutlermusic', 
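For reference, the token-expiry check AppleConnectIE uses above reduces to comparing the JWT's 'exp' claim against the current time; a standalone sketch assuming only what the patch shows (jwt_decode_hs256 merely base64-decodes the payload, it does not verify the signature):

    import time

    from yt_dlp.utils import jwt_decode_hs256

    def jwt_is_expired(token, buffer=120):
        # Treat the token as expired once it is within `buffer` seconds of its 'exp' claim
        return jwt_decode_hs256(token)['exp'] - time.time() < buffer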
'description': 'md5:4595264559e3d0a0ceb3f011f6334543', @@ -680,6 +673,37 @@ class YoutubeWebArchiveIE(InfoExtractor): 'upload_date': '20120407', 'uploader_id': 'thecomputernerd01', }, + }, { + # Contains split audio/video formats + 'url': 'ytarchive:o_T_S_TU12M', + 'info_dict': { + 'id': 'o_T_S_TU12M', + 'ext': 'mp4', + 'title': 'Prairie Pulse 1218; Lin Enger, Paul Olson', + 'description': 'md5:36e7a34cdc8508e35a920ec042e799c7', + 'uploader': 'Prairie Public', + 'channel_id': 'UC4BOzQel6tvJm7OEDd3vZlw', + 'channel_url': 'https://www.youtube.com/channel/UC4BOzQel6tvJm7OEDd3vZlw', + 'duration': 1606, + 'upload_date': '20150213', + }, + }, { + # Video unavailable through wayback-fakeurl + 'url': 'ytarchive:SQCom7wjGDs', + 'info_dict': { + 'id': 'SQCom7wjGDs', + 'ext': 'mp4', + 'title': 'Jamin Warren from PBS Game/Show decides that Portal is a feminist Game [Top Hats and No Brain]', + 'description': 'md5:c0cb876dd075483ead9afcc86798efb0', + 'uploader': 'Top Hats and Champagne', + 'uploader_id': 'sparrowtm', + 'uploader_url': 'https://www.youtube.com/user/sparrowtm', + 'channel_id': 'UCW3T5nG4iEkI7HjG-Du3HQA', + 'channel_url': 'https://www.youtube.com/channel/UCW3T5nG4iEkI7HjG-Du3HQA', + 'duration': 1500, + 'thumbnail': 'https://web.archive.org/web/20160108040020if_/https://i.ytimg.com/vi/SQCom7wjGDs/maxresdefault.jpg', + 'upload_date': '20160107', + }, }, { 'url': 'https://web.archive.org/web/http://www.youtube.com/watch?v=kH-G_aIBlFw', 'only_matching': True, @@ -724,6 +748,113 @@ class YoutubeWebArchiveIE(InfoExtractor): _OLDEST_CAPTURE_DATE = 20050214000000 _NEWEST_CAPTURE_DATE = 20500101000000 + _FORMATS = { + '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'vcodec': 'h263'}, + '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'vcodec': 'h263'}, + '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'}, + '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'vcodec': 'mp4v'}, + '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'vcodec': 'h264'}, + '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'vcodec': 'h264'}, + '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'vcodec': 'h264'}, + '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'vcodec': 'h264'}, + # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well + '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'}, + '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'vcodec': 'h264'}, + '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'vcodec': 'h264'}, + '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'vcodec': 'vp8'}, + '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'vcodec': 'vp8'}, + '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'vcodec': 'vp8'}, + '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'vcodec': 'vp8'}, + '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'vcodec': 'h264'}, + '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'vcodec': 'h264'}, + + + # 3D videos + '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'vcodec': 'h264', 'preference': -20}, + '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'vcodec': 'h264', 'preference': -20}, + '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'vcodec': 'h264', 
'preference': -20}, + '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'vcodec': 'h264', 'preference': -20}, + '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'vcodec': 'vp8', 'preference': -20}, + '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'vcodec': 'vp8', 'preference': -20}, + '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'vcodec': 'vp8', 'preference': -20}, + + # Apple HTTP Live Streaming + '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'vcodec': 'h264'}, + '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'vcodec': 'h264'}, + '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'vcodec': 'h264'}, + '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'vcodec': 'h264'}, + '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'vcodec': 'h264'}, + '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'vcodec': 'h264'}, + '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'vcodec': 'h264'}, + '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'vcodec': 'h264'}, + + # DASH mp4 video + '133': {'ext': 'mp4', 'height': 240, 'vcodec': 'h264', 'acodec': 'none'}, + '134': {'ext': 'mp4', 'height': 360, 'vcodec': 'h264', 'acodec': 'none'}, + '135': {'ext': 'mp4', 'height': 480, 'vcodec': 'h264', 'acodec': 'none'}, + '136': {'ext': 'mp4', 'height': 720, 'vcodec': 'h264', 'acodec': 'none'}, + '137': {'ext': 'mp4', 'height': 1080, 'vcodec': 'h264', 'acodec': 'none'}, + '138': {'ext': 'mp4', 'vcodec': 'h264', 'acodec': 'none'}, # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559) + '160': {'ext': 'mp4', 'height': 144, 'vcodec': 'h264', 'acodec': 'none'}, + '212': {'ext': 'mp4', 'height': 480, 'vcodec': 'h264', 'acodec': 'none'}, + '264': {'ext': 'mp4', 'height': 1440, 'vcodec': 'h264', 'acodec': 'none'}, + '298': {'ext': 'mp4', 'height': 720, 'vcodec': 'h264', 'fps': 60, 'acodec': 'none'}, + '299': {'ext': 'mp4', 'height': 1080, 'vcodec': 'h264', 'fps': 60, 'acodec': 'none'}, + '266': {'ext': 'mp4', 'height': 2160, 'vcodec': 'h264', 'acodec': 'none'}, + + # Dash mp4 audio + '139': {'ext': 'm4a', 'acodec': 'aac', 'vcodec': 'none'}, + '140': {'ext': 'm4a', 'acodec': 'aac', 'vcodec': 'none'}, + '141': {'ext': 'm4a', 'acodec': 'aac', 'vcodec': 'none'}, + '256': {'ext': 'm4a', 'acodec': 'aac', 'vcodec': 'none'}, + '258': {'ext': 'm4a', 'acodec': 'aac', 'vcodec': 'none'}, + '325': {'ext': 'm4a', 'acodec': 'dtse', 'vcodec': 'none'}, + '328': {'ext': 'm4a', 'acodec': 'ec-3', 'vcodec': 'none'}, + + # Dash webm + '167': {'ext': 'webm', 'height': 360, 'width': 640, 'vcodec': 'vp8'}, + '168': {'ext': 'webm', 'height': 480, 'width': 854, 'vcodec': 'vp8'}, + '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'vcodec': 'vp8'}, + '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'vcodec': 'vp8'}, + '218': {'ext': 'webm', 'height': 480, 'width': 854, 'vcodec': 'vp8'}, + '219': {'ext': 'webm', 'height': 480, 'width': 854, 'vcodec': 'vp8'}, + '278': {'ext': 'webm', 'height': 144, 'vcodec': 'vp9', 'acodec': 'none'}, + '242': {'ext': 'webm', 'height': 240, 'vcodec': 'vp9', 'acodec': 'none'}, + '243': {'ext': 'webm', 'height': 360, 'vcodec': 'vp9', 'acodec': 'none'}, + '244': {'ext': 'webm', 'height': 480, 'vcodec': 'vp9', 'acodec': 'none'}, + '245': {'ext': 'webm', 'height': 480, 'vcodec': 
'vp9', 'acodec': 'none'}, + '246': {'ext': 'webm', 'height': 480, 'vcodec': 'vp9', 'acodec': 'none'}, + '247': {'ext': 'webm', 'height': 720, 'vcodec': 'vp9', 'acodec': 'none'}, + '248': {'ext': 'webm', 'height': 1080, 'vcodec': 'vp9', 'acodec': 'none'}, + '271': {'ext': 'webm', 'height': 1440, 'vcodec': 'vp9', 'acodec': 'none'}, + # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug) + '272': {'ext': 'webm', 'height': 2160, 'vcodec': 'vp9', 'acodec': 'none'}, + '302': {'ext': 'webm', 'height': 720, 'vcodec': 'vp9', 'fps': 60, 'acodec': 'none'}, + '303': {'ext': 'webm', 'height': 1080, 'vcodec': 'vp9', 'fps': 60, 'acodec': 'none'}, + '308': {'ext': 'webm', 'height': 1440, 'vcodec': 'vp9', 'fps': 60, 'acodec': 'none'}, + '313': {'ext': 'webm', 'height': 2160, 'vcodec': 'vp9', 'acodec': 'none'}, + '315': {'ext': 'webm', 'height': 2160, 'vcodec': 'vp9', 'fps': 60, 'acodec': 'none'}, + + # Dash webm audio + '171': {'ext': 'webm', 'acodec': 'vorbis', 'vcodec': 'none'}, + '172': {'ext': 'webm', 'acodec': 'vorbis', 'vcodec': 'none'}, + + # Dash webm audio with opus inside + '249': {'ext': 'webm', 'acodec': 'opus', 'vcodec': 'none'}, + '250': {'ext': 'webm', 'acodec': 'opus', 'vcodec': 'none'}, + '251': {'ext': 'webm', 'acodec': 'opus', 'vcodec': 'none'}, + + # av01 video only formats sometimes served with "unknown" codecs + '394': {'ext': 'mp4', 'height': 144, 'vcodec': 'av01.0.00M.08', 'acodec': 'none'}, + '395': {'ext': 'mp4', 'height': 240, 'vcodec': 'av01.0.00M.08', 'acodec': 'none'}, + '396': {'ext': 'mp4', 'height': 360, 'vcodec': 'av01.0.01M.08', 'acodec': 'none'}, + '397': {'ext': 'mp4', 'height': 480, 'vcodec': 'av01.0.04M.08', 'acodec': 'none'}, + '398': {'ext': 'mp4', 'height': 720, 'vcodec': 'av01.0.05M.08', 'acodec': 'none'}, + '399': {'ext': 'mp4', 'height': 1080, 'vcodec': 'av01.0.08M.08', 'acodec': 'none'}, + '400': {'ext': 'mp4', 'height': 1440, 'vcodec': 'av01.0.12M.08', 'acodec': 'none'}, + '401': {'ext': 'mp4', 'height': 2160, 'vcodec': 'av01.0.12M.08', 'acodec': 'none'}, + } + def _call_cdx_api(self, item_id, url, filters: list | None = None, collapse: list | None = None, query: dict | None = None, note=None, fatal=False): # CDX docs: https://github.com/internetarchive/wayback/blob/master/wayback-cdx-server/README.md query = { @@ -740,7 +871,7 @@ class YoutubeWebArchiveIE(InfoExtractor): note or 'Downloading CDX API JSON', query=query, fatal=fatal) if isinstance(res, list) and len(res) >= 2: # format response to make it easier to use - return [dict(zip(res[0], v)) for v in res[1:]] + return [dict(zip(res[0], v)) for v in res[1:]] # noqa: B905 elif not isinstance(res, list) or len(res) != 0: self.report_warning('Error while parsing CDX API response' + bug_reports_message()) @@ -933,23 +1064,13 @@ class YoutubeWebArchiveIE(InfoExtractor): video_id, url_date, url_date_2 = self._match_valid_url(url).group('id', 'date', 'date2') url_date = url_date or url_date_2 - urlh = None - retry_manager = self.RetryManager(fatal=False) - for retry in retry_manager: - try: - urlh = self._request_webpage( - HEADRequest(f'https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/{video_id}'), - video_id, note='Fetching archived video file url', expected_status=True) - except ExtractorError as e: - # HTTP Error 404 is expected if the video is not saved. 
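A small illustration of the row-to-dict reshaping done in _call_cdx_api above (the sample rows are invented): the CDX server returns a JSON array whose first element lists the column names, and every following row is zipped against it:

    header, *rows = [
        ['urlkey', 'timestamp'],
        ['com,youtube)/watch?v=example', '20200101000000'],
    ]
    records = [dict(zip(header, row)) for row in rows]
    # -> [{'urlkey': 'com,youtube)/watch?v=example', 'timestamp': '20200101000000'}]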
- if isinstance(e.cause, HTTPError) and e.cause.status == 404: - self.raise_no_formats( - 'The requested video is not archived, indexed, or there is an issue with web.archive.org (try again later)', expected=True) - else: - retry.error = e + video_info = self._download_json( + 'https://web.archive.org/__wb/videoinfo', video_id, + query={'vtype': 'youtube', 'vid': video_id}) - if retry_manager.error: - self.raise_no_formats(retry_manager.error, expected=True, video_id=video_id) + if not traverse_obj(video_info, 'formats'): + self.raise_no_formats( + 'The requested video is not archived or indexed', expected=True) capture_dates = self._get_capture_dates(video_id, int_or_none(url_date)) self.write_debug('Captures to try: ' + join_nonempty(*capture_dates, delim=', ')) @@ -968,25 +1089,18 @@ class YoutubeWebArchiveIE(InfoExtractor): info['thumbnails'] = self._extract_thumbnails(video_id) - if urlh: - url = urllib.parse.unquote(urlh.url) - video_file_url_qs = parse_qs(url) - # Attempt to recover any ext & format info from playback url & response headers - fmt = {'url': url, 'filesize': int_or_none(urlh.headers.get('x-archive-orig-content-length'))} - itag = try_get(video_file_url_qs, lambda x: x['itag'][0]) - if itag and itag in YoutubeIE._formats: - fmt.update(YoutubeIE._formats[itag]) - fmt.update({'format_id': itag}) - else: - mime = try_get(video_file_url_qs, lambda x: x['mime'][0]) - ext = (mimetype2ext(mime) - or urlhandle_detect_ext(urlh) - or mimetype2ext(urlh.headers.get('x-archive-guessed-content-type'))) - fmt.update({'ext': ext}) - info['formats'] = [fmt] - if not info.get('duration'): - info['duration'] = str_to_int(try_get(video_file_url_qs, lambda x: x['dur'][0])) + formats = [] + for fmt in traverse_obj(video_info, ('formats', lambda _, v: url_or_none(v['url']))): + format_id = traverse_obj(fmt, ('url', {parse_qs}, 'itag', 0)) + formats.append({ + 'format_id': format_id, + **self._FORMATS.get(format_id, {}), + **traverse_obj(fmt, { + 'url': ('url', {lambda x: f'https://web.archive.org/web/2id_/{x}'}), + 'ext': ('ext', {str}), + 'filesize': ('url', {parse_qs}, 'clen', 0, {int_or_none}), + }), + }) + info['formats'] = formats - if not info.get('title'): - info['title'] = video_id return info diff --git a/yt_dlp/extractor/ard.py b/yt_dlp/extractor/ard.py index 89d3299213..5bcf74e1d0 100644 --- a/yt_dlp/extractor/ard.py +++ b/yt_dlp/extractor/ard.py @@ -1,4 +1,5 @@ import functools +import json import re from .common import InfoExtractor @@ -15,11 +16,12 @@ from ..utils import ( remove_start, str_or_none, unified_strdate, + update_url, update_url_query, url_or_none, xpath_text, ) -from ..utils.traversal import traverse_obj +from ..utils.traversal import traverse_obj, value class ARDMediathekBaseIE(InfoExtractor): @@ -601,3 +603,163 @@ class ARDMediathekCollectionIE(InfoExtractor): return self.playlist_result( OnDemandPagedList(fetch_page, self._PAGE_SIZE), full_id, display_id=display_id, title=page_data.get('title'), description=page_data.get('synopsis')) + + +class ARDAudiothekBaseIE(InfoExtractor): + def _graphql_query(self, urn, query): + return self._download_json( + 'https://api.ardaudiothek.de/graphql', urn, + data=json.dumps({ + 'query': query, + 'variables': {'id': urn}, + }).encode(), headers={ + 'Content-Type': 'application/json', + })['data'] + + +class ARDAudiothekIE(ARDAudiothekBaseIE): + _VALID_URL = r'https?://(?:www\.)?ardaudiothek\.de/episode/(?P<id>urn:ard:(?:episode|section|extra):[a-f0-9]{16})' + + _TESTS = [{ + 'url': 
'https://www.ardaudiothek.de/episode/urn:ard:episode:eabead1add170e93/', + 'info_dict': { + 'id': 'urn:ard:episode:eabead1add170e93', + 'ext': 'mp3', + 'upload_date': '20240717', + 'duration': 3339, + 'title': 'CAIMAN CLUB (S04E04): Cash Out', + 'thumbnail': 'https://api.ardmediathek.de/image-service/images/urn:ard:image:ed64411a07a4b405', + 'description': 'md5:0e5d127a3832ae59e8bab40a91a5dadc', + 'display_id': 'urn:ard:episode:eabead1add170e93', + 'timestamp': 1721181641, + 'series': '1LIVE Caiman Club', + 'channel': 'WDR', + 'episode': 'Episode 4', + 'episode_number': 4, + }, + }, { + 'url': 'https://www.ardaudiothek.de/episode/urn:ard:section:855c7a53dac72e0a/', + 'info_dict': { + 'id': 'urn:ard:section:855c7a53dac72e0a', + 'ext': 'mp4', + 'upload_date': '20241231', + 'duration': 3304, + 'title': 'Illegaler DDR-Detektiv: Doberschütz und die letzte Staatsjagd (1/2) - Wendezeit', + 'thumbnail': 'https://api.ardmediathek.de/image-service/images/urn:ard:image:b9b4f1e8b93da4dd', + 'description': 'md5:3552d571e1959754cff66c1da6c0fdae', + 'display_id': 'urn:ard:section:855c7a53dac72e0a', + 'timestamp': 1735629900, + 'series': 'Auf der Spur – Die ARD Ermittlerkrimis', + 'channel': 'ARD', + 'episode': 'Episode 1', + 'episode_number': 1, + }, + }, { + 'url': 'https://www.ardaudiothek.de/episode/urn:ard:extra:d2fe7303d2dcbf5d/', + 'info_dict': { + 'id': 'urn:ard:extra:d2fe7303d2dcbf5d', + 'ext': 'mp3', + 'title': 'Trailer: Fanta Vier Forever, Baby!?!', + 'description': 'md5:b64a586f2e976b8bb5ea0a79dbd8751c', + 'channel': 'SWR', + 'duration': 62, + 'thumbnail': 'https://api.ardmediathek.de/image-service/images/urn:ard:image:48d3c255969be803', + 'series': 'Fanta Vier Forever, Baby!?!', + 'timestamp': 1732108217, + 'upload_date': '20241120', + }, + }] + + _QUERY_ITEM = '''\ + query($id: ID!) { + item(id: $id) { + audioList { + href + distributionType + audioBitrate + audioCodec + } + show { + title + } + image { + url1X1 + } + programSet { + publicationService { + organizationName + } + } + description + title + duration + startDate + episodeNumber + } + }''' + + def _real_extract(self, url): + urn = self._match_id(url) + item = self._graphql_query(urn, self._QUERY_ITEM)['item'] + return { + 'id': urn, + **traverse_obj(item, { + 'formats': ('audioList', lambda _, v: url_or_none(v['href']), { + 'url': 'href', + 'format_id': ('distributionType', {str}), + 'abr': ('audioBitrate', {int_or_none}), + 'acodec': ('audioCodec', {str}), + 'vcodec': {value('none')}, + }), + 'channel': ('programSet', 'publicationService', 'organizationName', {str}), + 'description': ('description', {str}), + 'duration': ('duration', {int_or_none}), + 'series': ('show', 'title', {str}), + 'episode_number': ('episodeNumber', {int_or_none}), + 'thumbnail': ('image', 'url1X1', {url_or_none}, {update_url(query=None)}), + 'timestamp': ('startDate', {parse_iso8601}), + 'title': ('title', {str}), + }), + } + + +class ARDAudiothekPlaylistIE(ARDAudiothekBaseIE): + _VALID_URL = r'https?://(?:www\.)?ardaudiothek\.de/sendung/(?P<playlist>[\w-]+)/(?P<id>urn:ard:show:[a-f0-9]{16})' + + _TESTS = [{ + 'url': 'https://www.ardaudiothek.de/sendung/mia-insomnia/urn:ard:show:c405aa26d9a4060a/', + 'info_dict': { + 'display_id': 'mia-insomnia', + 'title': 'Mia Insomnia', + 'id': 'urn:ard:show:c405aa26d9a4060a', + 'description': 'md5:d9ceb7a6b4d26a4db3316573bb564292', + }, + 'playlist_mincount': 37, + }, { + 'url': 'https://www.ardaudiothek.de/sendung/100-berlin/urn:ard:show:4d248e0806ce37bc/', + 'only_matching': True, + }] + + _QUERY_PLAYLIST = ''' + query($id: ID!) 
{ show(id: $id) { title description items(filter: { isPublished: { equalTo: true } }) { nodes { url } } } }''' + + def _real_extract(self, url): + urn, playlist = self._match_valid_url(url).group('id', 'playlist') + playlist_info = self._graphql_query(urn, self._QUERY_PLAYLIST)['show'] + entries = [] + for url in traverse_obj(playlist_info, ('items', 'nodes', ..., 'url', {url_or_none})): + entries.append(self.url_result(url, ie=ARDAudiothekIE)) + return self.playlist_result(entries, urn, display_id=playlist, **traverse_obj(playlist_info, { + 'title': ('title', {str}), + 'description': ('description', {str}), + })) diff --git a/yt_dlp/extractor/cbc.py b/yt_dlp/extractor/cbc.py index 319771655e..aa4024a90b 100644 --- a/yt_dlp/extractor/cbc.py +++ b/yt_dlp/extractor/cbc.py @@ -31,7 +31,7 @@ from ..utils.traversal import require, traverse_obj, trim_str class CBCIE(InfoExtractor): IE_NAME = 'cbc.ca' - _VALID_URL = r'https?://(?:www\.)?cbc\.ca/(?!player/)(?:[^/]+/)+(?P<id>[^/?#]+)' + _VALID_URL = r'https?://(?:www\.)?cbc\.ca/(?!player/|listen/|i/caffeine/syndicate/)(?:[^/?#]+/)+(?P<id>[^/?#]+)' _TESTS = [{ # with mediaId 'url': 'http://www.cbc.ca/22minutes/videos/clips-season-23/don-cherry-play-offs', @@ -112,10 +112,6 @@ class CBCIE(InfoExtractor): 'playlist_mincount': 6, }] - @classmethod - def suitable(cls, url): - return False if CBCPlayerIE.suitable(url) else super().suitable(url) - def _extract_player_init(self, player_init, display_id): player_info = self._parse_json(player_init, display_id, js_to_json) media_id = player_info.get('mediaId') @@ -913,3 +909,63 @@ class CBCGemLiveIE(InfoExtractor): 'thumbnail': ('images', 'card', 'url'), }), } + + +class CBCListenIE(InfoExtractor): + IE_NAME = 'cbc.ca:listen' + _VALID_URL = r'https?://(?:www\.)?cbc\.ca/listen/(?:cbc-podcasts|live-radio)/[\w-]+/[\w-]+/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.cbc.ca/listen/cbc-podcasts/1353-the-naked-emperor/episode/16142603-introducing-understood-who-broke-the-internet', + 'info_dict': { + 'id': '16142603', + 'title': 'Introducing Understood: Who Broke the Internet?', + 'ext': 'mp3', + 'description': 'md5:c605117500084e43f08a950adc6a708c', + 'duration': 229, + 'timestamp': 1745812800, + 'release_timestamp': 1745827200, + 'release_date': '20250428', + 'upload_date': '20250428', + }, + }, { + 'url': 'https://www.cbc.ca/listen/live-radio/1-64-the-house/clip/16170773-should-canada-suck-stand-donald-trump', + 'info_dict': { + 'id': '16170773', + 'title': 'Should Canada suck up or stand up to Donald Trump?', + 'ext': 'mp3', + 'description': 'md5:7385194f1cdda8df27ba3764b35e7976', + 'duration': 3159, + 'timestamp': 1758340800, + 'release_timestamp': 1758254400, + 'release_date': '20250919', + 'upload_date': '20250920', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + response = self._download_json( + f'https://www.cbc.ca/listen/api/v1/clips/{video_id}', video_id, fatal=False) + data = traverse_obj(response, ('data', {dict})) + if not data: + self.report_warning('API failed to return data. 
Falling back to webpage parsing') + webpage = self._download_webpage(url, video_id) + preloaded_state = self._search_json( + r'window\.__PRELOADED_STATE__\s*=', webpage, 'preloaded state', + video_id, transform_source=js_to_json) + data = traverse_obj(preloaded_state, ( + ('podcastDetailData', 'showDetailData'), ..., 'episodes', + lambda _, v: str(v['clipID']) == video_id, any, {require('episode data')})) + + return { + 'id': video_id, + **traverse_obj(data, { + 'url': (('src', 'url'), {url_or_none}, any), + 'title': ('title', {str}), + 'description': ('description', {str}), + 'release_timestamp': ('releasedAt', {int_or_none(scale=1000)}), + 'timestamp': ('airdate', {int_or_none(scale=1000)}), + 'duration': ('duration', {int_or_none}), + }), + } diff --git a/yt_dlp/extractor/cellebrite.py b/yt_dlp/extractor/cellebrite.py index 54367c4d52..75405e966b 100644 --- a/yt_dlp/extractor/cellebrite.py +++ b/yt_dlp/extractor/cellebrite.py @@ -5,18 +5,6 @@ from ..utils import ExtractorError, make_archive_id, url_basename class CellebriteIE(VidyardBaseIE): _VALID_URL = r'https?://cellebrite\.com/(?:\w+)?/(?P<id>[\w-]+)' _TESTS = [{ - 'url': 'https://cellebrite.com/en/collect-data-from-android-devices-with-cellebrite-ufed/', - 'info_dict': { - 'id': 'ZqmUss3dQfEMGpauambPuH', - 'display_id': '16025876', - 'ext': 'mp4', - 'title': 'Ask the Expert: Chat Capture - Collect Data from Android Devices in Cellebrite UFED', - 'description': 'md5:dee48fe12bbae5c01fe6a053f7676da4', - 'thumbnail': 'https://cellebrite.com/wp-content/uploads/2021/05/Chat-Capture-1024x559.png', - 'duration': 455.979, - '_old_archive_ids': ['cellebrite 16025876'], - }, - }, { 'url': 'https://cellebrite.com/en/how-to-lawfully-collect-the-maximum-amount-of-data-from-android-devices/', 'info_dict': { 'id': 'QV1U8a2yzcxigw7VFnqKyg', diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 573a9c5a0a..d6e2596ae1 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -1663,7 +1663,7 @@ class InfoExtractor: 'end_time': part.get('endOffset'), } for part in variadic(e.get('hasPart') or []) if part.get('@type') == 'Clip'] for idx, (last_c, current_c, next_c) in enumerate(zip( - [{'end_time': 0}, *chapters], chapters, chapters[1:])): + [{'end_time': 0}, *chapters], chapters, chapters[1:], strict=False)): current_c['end_time'] = current_c['end_time'] or next_c['start_time'] current_c['start_time'] = current_c['start_time'] or last_c['end_time'] if None in current_c.values(): @@ -1848,7 +1848,7 @@ class InfoExtractor: return {} args = dict(zip(arg_keys.split(','), map(json.dumps, self._parse_json( - f'[{arg_vals}]', video_id, transform_source=js_to_json, fatal=fatal) or ()))) + f'[{arg_vals}]', video_id, transform_source=js_to_json, fatal=fatal) or ()), strict=True)) ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal) return traverse_obj(ret, traverse) or {} diff --git a/yt_dlp/extractor/dplay.py b/yt_dlp/extractor/dplay.py index 86950b2445..9868f0e4d2 100644 --- a/yt_dlp/extractor/dplay.py +++ b/yt_dlp/extractor/dplay.py @@ -13,6 +13,7 @@ from ..utils import ( try_get, unified_timestamp, ) +from ..utils.traversal import traverse_obj class DPlayBaseIE(InfoExtractor): @@ -1053,7 +1054,7 @@ class DiscoveryPlusIndiaIE(DiscoveryPlusBaseIE): class DiscoveryNetworksDeIE(DiscoveryPlusBaseIE): - _VALID_URL = r'https?://(?:www\.)?(?P<domain>(?:tlc|dmax)\.de|dplay\.co\.uk)/(?:programme|show|sendungen)/(?P<programme>[^/]+)/(?:video/)?(?P<alternate_id>[^/]+)' + _VALID_URL = 
r'https?://(?:www\.)?(?P<domain>(?:tlc|dmax)\.de)/(?:programme|show|sendungen)/(?P<programme>[^/?#]+)/(?:video/)?(?P<alternate_id>[^/?#]+)' _TESTS = [{ 'url': 'https://dmax.de/sendungen/goldrausch-in-australien/german-gold', @@ -1062,7 +1063,7 @@ class DiscoveryNetworksDeIE(DiscoveryPlusBaseIE): 'ext': 'mp4', 'title': 'German Gold', 'description': 'md5:f3073306553a8d9b40e6ac4cdbf09fc6', - 'display_id': 'goldrausch-in-australien/german-gold', + 'display_id': 'german-gold', 'episode': 'Episode 1', 'episode_number': 1, 'season': 'Season 5', @@ -1074,6 +1075,7 @@ class DiscoveryNetworksDeIE(DiscoveryPlusBaseIE): 'creators': ['DMAX'], 'thumbnail': 'https://eu1-prod-images.disco-api.com/2023/05/09/f72fb510-7992-3b12-af7f-f16a2c22d1e3.jpeg', 'tags': ['schatzsucher', 'schatz', 'nugget', 'bodenschätze', 'down under', 'australien', 'goldrausch'], + 'categories': ['Gold', 'Schatzsucher'], }, 'params': {'skip_download': 'm3u8'}, }, { @@ -1100,20 +1102,95 @@ class DiscoveryNetworksDeIE(DiscoveryPlusBaseIE): }, { 'url': 'https://www.dmax.de/programme/dmax-highlights/video/tuning-star-sidney-hoffmann-exklusiv-bei-dmax/191023082312316', 'only_matching': True, - }, { - 'url': 'https://www.dplay.co.uk/show/ghost-adventures/video/hotel-leger-103620/EHD_280313B', - 'only_matching': True, }, { 'url': 'https://tlc.de/sendungen/breaking-amish/die-welt-da-drauen/', 'only_matching': True, + }, { + 'url': 'https://dmax.de/sendungen/feuerwache-3-alarm-in-muenchen/24-stunden-auf-der-feuerwache-3', + 'info_dict': { + 'id': '8873549', + 'ext': 'mp4', + 'title': '24 Stunden auf der Feuerwache 3', + 'description': 'md5:f3084ef6170bfb79f9a6e0c030e09330', + 'display_id': '24-stunden-auf-der-feuerwache-3', + 'episode': 'Episode 1', + 'episode_number': 1, + 'season': 'Season 1', + 'season_number': 1, + 'series': 'Feuerwache 3 - Alarm in München', + 'duration': 2632.0, + 'upload_date': '20251016', + 'timestamp': 1760645100, + 'creators': ['DMAX'], + 'thumbnail': 'https://eu1-prod-images.disco-api.com/2025/10/14/0bdee68c-a8d8-33d9-9204-16eb61108552.jpeg', + 'tags': [], + 'categories': ['DMAX Originals', 'Jobs', 'Blaulicht'], + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://tlc.de/sendungen/ghost-adventures/der-poltergeist-im-kostumladen', + 'info_dict': { + 'id': '4550602', + 'ext': 'mp4', + 'title': 'Der Poltergeist im Kostümladen', + 'description': 'md5:20b52b9736a0a3a7873d19a238fad7fc', + 'display_id': 'der-poltergeist-im-kostumladen', + 'episode': 'Episode 1', + 'episode_number': 1, + 'season': 'Season 25', + 'season_number': 25, + 'series': 'Ghost Adventures', + 'duration': 2493.0, + 'upload_date': '20241223', + 'timestamp': 1734948900, + 'creators': ['TLC'], + 'thumbnail': 'https://eu1-prod-images.disco-api.com/2023/04/05/59941d26-a81b-365f-829f-69d8cd81fd0f.jpeg', + 'tags': [], + 'categories': ['Paranormal', 'Gruselig!'], + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://tlc.de/sendungen/evil-gesichter-des-boesen/das-geheimnis-meines-bruders', + 'info_dict': { + 'id': '7792288', + 'ext': 'mp4', + 'title': 'Das Geheimnis meines Bruders', + 'description': 'md5:3167550bb582eb9c92875c86a0a20882', + 'display_id': 'das-geheimnis-meines-bruders', + 'episode': 'Episode 1', + 'episode_number': 1, + 'season': 'Season 1', + 'season_number': 1, + 'series': 'Evil - Gesichter des Bösen', + 'duration': 2626.0, + 'upload_date': '20240926', + 'timestamp': 1727388000, + 'creators': ['TLC'], + 'thumbnail': 'https://eu1-prod-images.disco-api.com/2024/11/29/e9f3e3ae-74ec-3631-81b7-fc7bbe844741.jpeg', + 'tags': 'count:13', 
'categories': ['True Crime', 'Mord'], + }, + 'params': {'skip_download': 'm3u8'}, }] def _real_extract(self, url): domain, programme, alternate_id = self._match_valid_url(url).groups() - country = 'GB' if domain == 'dplay.co.uk' else 'DE' - realm = 'questuk' if country == 'GB' else domain.replace('.', '') - return self._get_disco_api_info( - url, f'{programme}/{alternate_id}', 'eu1-prod.disco-api.com', realm, country) + meta = self._download_json( + f'https://de-api.loma-cms.com/feloma/videos/{alternate_id}/', + alternate_id, query={ + 'environment': domain.split('.')[0], + 'v': '2', + 'filter[show.slug]': programme, + }, fatal=False) + video_id = traverse_obj(meta, ('uid', {str}, {lambda s: s[-7:]})) or alternate_id + + disco_api_info = self._get_disco_api_info( + url, video_id, 'eu1-prod.disco-api.com', domain.replace('.', ''), 'DE') + disco_api_info['display_id'] = alternate_id + disco_api_info['categories'] = traverse_obj(meta, ( + 'taxonomies', lambda _, v: v['category'] == 'genre', 'title', {str.strip}, filter, all, filter)) + + return disco_api_info def _update_disco_api_headers(self, headers, disco_base, display_id, realm): headers.update({ diff --git a/yt_dlp/extractor/dropout.py b/yt_dlp/extractor/dropout.py index a0d8aacdbe..2cfb0800fb 100644 --- a/yt_dlp/extractor/dropout.py +++ b/yt_dlp/extractor/dropout.py @@ -18,15 +18,15 @@ from ..utils import ( class DropoutIE(InfoExtractor): - _LOGIN_URL = 'https://www.dropout.tv/login' + _LOGIN_URL = 'https://watch.dropout.tv/login' _NETRC_MACHINE = 'dropout' - _VALID_URL = r'https?://(?:www\.)?dropout\.tv/(?:[^/]+/)*videos/(?P<id>[^/]+)/?$' + _VALID_URL = r'https?://(?:watch\.)?dropout\.tv/(?:[^/?#]+/)*videos/(?P<id>[^/?#]+)/?(?:[?#]|$)' _TESTS = [ { - 'url': 'https://www.dropout.tv/game-changer/season:2/videos/yes-or-no', + 'url': 'https://watch.dropout.tv/game-changer/season:2/videos/yes-or-no', 'note': 'Episode in a series', - 'md5': '5e000fdfd8d8fa46ff40456f1c2af04a', + 'md5': '4b76963f904f8bc4ba22dcf0e66ada06', 'info_dict': { 'id': '738153', 'display_id': 'yes-or-no', @@ -45,35 +45,35 @@ class DropoutIE(InfoExtractor): 'uploader_url': 'https://vimeo.com/user80538407', 'uploader': 'OTT Videos', }, - 'expected_warnings': ['Ignoring subtitle tracks found in the HLS manifest'], + 'expected_warnings': ['Ignoring subtitle tracks found in the HLS manifest', 'Failed to parse XML: not well-formed'], }, { - 'url': 'https://www.dropout.tv/dimension-20-fantasy-high/season:1/videos/episode-1', + 'url': 'https://watch.dropout.tv/tablepop-presents-megadungeon-live/season:1/videos/enter-through-the-gift-shop', 'note': 'Episode in a series (missing release_date)', - 'md5': '712caf7c191f1c47c8f1879520c2fa5c', + 'md5': 'b08fb03050585ea25cd7ee092db9134c', 'info_dict': { - 'id': '320562', - 'display_id': 'episode-1', + 'id': '624270', + 'display_id': 'enter-through-the-gift-shop', 'ext': 'mp4', - 'title': 'The Beginning Begins', - 'description': 'The cast introduces their PCs, including a neurotic elf, a goblin PI, and a corn-worshipping cleric.', - 'thumbnail': 'https://vhx.imgix.net/chuncensoredstaging/assets/4421ed0d-f630-4c88-9004-5251b2b8adfa.jpg', - 'series': 'Dimension 20: Fantasy High', + 'title': 'Enter Through the Gift Shop', + 'description': 'A new adventuring party explores a gift shop and runs into a friendly orc -- and some angry goblins.', + 'thumbnail': 'https://vhx.imgix.net/chuncensoredstaging/assets/a1d876c3-3dee-4cd0-87c6-27a851b1d0ec.jpg', + 'series': 'TablePop Presents: MEGADUNGEON LIVE!', 'season_number': 1, 'season': 'Season 1', 
'episode_number': 1, - 'episode': 'The Beginning Begins', - 'duration': 6838, + 'episode': 'Enter Through the Gift Shop', + 'duration': 7101, 'uploader_id': 'user80538407', 'uploader_url': 'https://vimeo.com/user80538407', 'uploader': 'OTT Videos', }, - 'expected_warnings': ['Ignoring subtitle tracks found in the HLS manifest'], + 'expected_warnings': ['Ignoring subtitle tracks found in the HLS manifest', 'Failed to parse XML: not well-formed'], }, { - 'url': 'https://www.dropout.tv/videos/misfits-magic-holiday-special', + 'url': 'https://watch.dropout.tv/videos/misfits-magic-holiday-special', 'note': 'Episode not in a series', - 'md5': 'c30fa18999c5880d156339f13c953a26', + 'md5': '1e6428f7756b02c93b573d39ddd789fe', 'info_dict': { 'id': '1915774', 'display_id': 'misfits-magic-holiday-special', @@ -87,7 +87,7 @@ class DropoutIE(InfoExtractor): 'uploader_url': 'https://vimeo.com/user80538407', 'uploader': 'OTT Videos', }, - 'expected_warnings': ['Ignoring subtitle tracks found in the HLS manifest'], + 'expected_warnings': ['Ignoring subtitle tracks found in the HLS manifest', 'Failed to parse XML: not well-formed'], }, ] @@ -125,7 +125,7 @@ class DropoutIE(InfoExtractor): display_id = self._match_id(url) webpage = None - if self._get_cookies('https://www.dropout.tv').get('_session'): + if self._get_cookies('https://watch.dropout.tv').get('_session'): webpage = self._download_webpage(url, display_id) if not webpage or '
[^\/$&?#]+)(?:/?$|/season:(?P<season>[0-9]+)/?$)'
+    _VALID_URL = r'https?://(?:watch\.)?dropout\.tv/(?P<id>[^\/$&?#]+)(?:/?$|/season:(?P<season>[0-9]+)/?$)'
     _TESTS = [
         {
-            'url': 'https://www.dropout.tv/dimension-20-fantasy-high/season:1',
+            'url': 'https://watch.dropout.tv/dimension-20-fantasy-high/season:1',
             'note': 'Multi-season series with the season in the url',
             'playlist_count': 24,
             'info_dict': {
@@ -179,7 +179,7 @@ class DropoutSeasonIE(InfoExtractor):
            },
        },
        {
-            'url': 'https://www.dropout.tv/dimension-20-fantasy-high',
+            'url': 'https://watch.dropout.tv/dimension-20-fantasy-high',
             'note': 'Multi-season series with the season not in the url',
             'playlist_count': 24,
             'info_dict': {
@@ -188,7 +188,7 @@ class DropoutSeasonIE(InfoExtractor):
            },
        },
        {
-            'url': 'https://www.dropout.tv/dimension-20-shriek-week',
+            'url': 'https://watch.dropout.tv/dimension-20-shriek-week',
             'note': 'Single-season series',
             'playlist_count': 4,
             'info_dict': {
@@ -197,7 +197,7 @@ class DropoutSeasonIE(InfoExtractor):
            },
        },
        {
-            'url': 'https://www.dropout.tv/breaking-news-no-laugh-newsroom/season:3',
+            'url': 'https://watch.dropout.tv/breaking-news-no-laugh-newsroom/season:3',
             'note': 'Multi-season series with season in the url that requires pagination',
             'playlist_count': 25,
             'info_dict': {
diff --git a/yt_dlp/extractor/dtube.py b/yt_dlp/extractor/dtube.py
index 0d87820c4c..2c47de8448 100644
--- a/yt_dlp/extractor/dtube.py
+++ b/yt_dlp/extractor/dtube.py
@@ -1,5 +1,4 @@
 import json
-import socket

 from .common import InfoExtractor
 from ..utils import (
@@ -56,7 +55,7 @@ class DTubeIE(InfoExtractor):
            try:
                self.to_screen(f'{video_id}: Checking {format_id} video format URL')
                self._downloader._opener.open(video_url, timeout=5).close()
-            except socket.timeout:
+            except TimeoutError:
                self.to_screen(
                    f'{video_id}: {format_id} URL is invalid, skipping')
                continue
diff --git a/yt_dlp/extractor/fujitv.py b/yt_dlp/extractor/fujitv.py
index a2d1a828b4..3319b12681 100644
--- a/yt_dlp/extractor/fujitv.py
+++ b/yt_dlp/extractor/fujitv.py
@@ -56,7 +56,7 @@ class FujiTVFODPlus7IE(InfoExtractor):
                fmt, subs = self._extract_m3u8_formats_and_subtitles(src['url'], video_id, 'ts')
                for f in fmt:
                    f.update(dict(zip(('height', 'width'),
-                        self._BITRATE_MAP.get(f.get('tbr'), ())))))
+                        self._BITRATE_MAP.get(f.get('tbr'), ()), strict=False)))
                formats.extend(fmt)
            subtitles = self._merge_subtitles(subtitles, subs)
diff --git a/yt_dlp/extractor/googledrive.py b/yt_dlp/extractor/googledrive.py
index 0c84f0b241..91c9f60cd8 100644
--- a/yt_dlp/extractor/googledrive.py
+++ b/yt_dlp/extractor/googledrive.py
@@ -1,21 +1,20 @@
 import re
-import urllib.parse

 from .common import InfoExtractor
-from .youtube import YoutubeIE
 from ..utils import (
-    ExtractorError,
-    bug_reports_message,
     determine_ext,
     extract_attributes,
+    filter_dict,
     get_element_by_class,
     get_element_html_by_id,
     int_or_none,
-    lowercase_escape,
-    parse_qs,
-    try_get,
+    mimetype2ext,
+    parse_duration,
+    str_or_none,
     update_url_query,
+    url_or_none,
 )
+from ..utils.traversal import traverse_obj, value


 class GoogleDriveIE(InfoExtractor):
@@ -38,8 +37,8 @@ class GoogleDriveIE(InfoExtractor):
            'id': '0ByeS4oOUV-49Zzh4R1J6R09zazQ',
            'ext': 'mp4',
            'title': 'Big Buck Bunny.mp4',
-            'duration': 45,
-            'thumbnail': 'https://drive.google.com/thumbnail?id=0ByeS4oOUV-49Zzh4R1J6R09zazQ',
+            'duration': 45.069,
+            'thumbnail': r're:https://lh3\.googleusercontent\.com/drive-storage/',
        },
    }, {
        # has itag 50 which is not in YoutubeIE._formats (royalty Free music from 1922)
@@ -49,8 +48,29 @@ class GoogleDriveIE(InfoExtractor):
            'id':
'1IP0o8dHcQrIHGgVyp0Ofvx2cGfLzyO1x', 'ext': 'mp3', 'title': 'My Buddy - Henry Burr - Gus Kahn - Walter Donaldson.mp3', - 'duration': 184, - 'thumbnail': 'https://drive.google.com/thumbnail?id=1IP0o8dHcQrIHGgVyp0Ofvx2cGfLzyO1x', + 'duration': 184.68, + }, + }, { + # Has subtitle track + 'url': 'https://drive.google.com/file/d/1RAGWRgzn85TXCaCk4gxnwF6TGUaZatzE/view', + 'md5': '05488c528da6ef737ec8c962bfa9724e', + 'info_dict': { + 'id': '1RAGWRgzn85TXCaCk4gxnwF6TGUaZatzE', + 'ext': 'mp4', + 'title': 'test.mp4', + 'duration': 9.999, + 'thumbnail': r're:https://lh3\.googleusercontent\.com/drive-storage/', + }, + }, { + # Has subtitle track with kind 'asr' + 'url': 'https://drive.google.com/file/d/1Prvv9-mtDDfN_gkJgtt1OFvIULK8c3Ev/view', + 'md5': 'ccae12d07f18b5988900b2c8b92801fc', + 'info_dict': { + 'id': '1Prvv9-mtDDfN_gkJgtt1OFvIULK8c3Ev', + 'ext': 'mp4', + 'title': 'LEE NA GYUNG-3410-VOICE_MESSAGE.mp4', + 'duration': 8.766, + 'thumbnail': r're:https://lh3\.googleusercontent\.com/drive-storage/', }, }, { # video can't be watched anonymously due to view count limit reached, @@ -71,17 +91,6 @@ class GoogleDriveIE(InfoExtractor): 'url': 'https://drive.usercontent.google.com/download?id=0ByeS4oOUV-49Zzh4R1J6R09zazQ', 'only_matching': True, }] - _FORMATS_EXT = { - **{k: v['ext'] for k, v in YoutubeIE._formats.items() if v.get('ext')}, - '50': 'm4a', - } - _BASE_URL_CAPTIONS = 'https://drive.google.com/timedtext' - _CAPTIONS_ENTRY_TAG = { - 'subtitles': 'track', - 'automatic_captions': 'target', - } - _caption_formats_ext = [] - _captions_xml = None @classmethod def _extract_embed_urls(cls, url, webpage): @@ -91,129 +100,73 @@ class GoogleDriveIE(InfoExtractor): if mobj: yield 'https://drive.google.com/file/d/{}'.format(mobj.group('id')) - def _download_subtitles_xml(self, video_id, subtitles_id, hl): - if self._captions_xml: - return - self._captions_xml = self._download_xml( - self._BASE_URL_CAPTIONS, video_id, query={ - 'id': video_id, - 'vid': subtitles_id, - 'hl': hl, + @staticmethod + def _construct_subtitle_url(base_url, video_id, language, fmt, kind): + return update_url_query( + base_url, filter_dict({ + 'hl': 'en-US', 'v': video_id, + 'type': 'track', + 'lang': language, + 'fmt': fmt, + 'kind': kind, + })) + + def _get_subtitles(self, video_id, video_info): + subtitles = {} + timed_text_base_url = traverse_obj(video_info, ('timedTextDetails', 'timedTextBaseUrl', {url_or_none})) + if not timed_text_base_url: + return subtitles + subtitle_data = self._download_xml( + timed_text_base_url, video_id, 'Downloading subtitles XML', fatal=False, query={ + 'hl': 'en-US', 'type': 'list', - 'tlangs': '1', - 'fmts': '1', - 'vssids': '1', - }, note='Downloading subtitles XML', - errnote='Unable to download subtitles XML', fatal=False) - if self._captions_xml: - for f in self._captions_xml.findall('format'): - if f.attrib.get('fmt_code') and not f.attrib.get('default'): - self._caption_formats_ext.append(f.attrib['fmt_code']) - - def _get_captions_by_type(self, video_id, subtitles_id, caption_type, - origin_lang_code=None, origin_lang_name=None): - if not subtitles_id or not caption_type: - return - captions = {} - for caption_entry in self._captions_xml.findall( - self._CAPTIONS_ENTRY_TAG[caption_type]): - caption_lang_code = caption_entry.attrib.get('lang_code') - caption_name = caption_entry.attrib.get('name') or origin_lang_name - if not caption_lang_code or not caption_name: - self.report_warning(f'Missing necessary caption metadata. ' - f'Need lang_code and name attributes. 
'
-                                    f'Found: {caption_entry.attrib}')
-                continue
-            caption_format_data = []
-            for caption_format in self._caption_formats_ext:
-                query = {
-                    'vid': subtitles_id,
-                    'v': video_id,
-                    'fmt': caption_format,
-                    'lang': (caption_lang_code if origin_lang_code is None
-                             else origin_lang_code),
-                    'type': 'track',
-                    'name': caption_name,
-                    'kind': '',
-                }
-                if origin_lang_code is not None:
-                    query.update({'tlang': caption_lang_code})
-                caption_format_data.append({
-                    'url': update_url_query(self._BASE_URL_CAPTIONS, query),
-                    'ext': caption_format,
-                })
-            captions[caption_lang_code] = caption_format_data
-        return captions
-
-    def _get_subtitles(self, video_id, subtitles_id, hl):
-        if not subtitles_id or not hl:
-            return
-        self._download_subtitles_xml(video_id, subtitles_id, hl)
-        if not self._captions_xml:
-            return
-        return self._get_captions_by_type(video_id, subtitles_id, 'subtitles')
-
-    def _get_automatic_captions(self, video_id, subtitles_id, hl):
-        if not subtitles_id or not hl:
-            return
-        self._download_subtitles_xml(video_id, subtitles_id, hl)
-        if not self._captions_xml:
-            return
-        track = next((t for t in self._captions_xml.findall('track') if t.attrib.get('cantran') == 'true'), None)
-        if track is None:
-            return
-        origin_lang_code = track.attrib.get('lang_code')
-        origin_lang_name = track.attrib.get('name')
-        if not origin_lang_code or not origin_lang_name:
-            return
-        return self._get_captions_by_type(
-            video_id, subtitles_id, 'automatic_captions', origin_lang_code, origin_lang_name)
+                'tlangs': 1,
+                'v': video_id,
+                'vssids': 1,
+            })
+        subtitle_formats = traverse_obj(subtitle_data, (lambda _, v: v.tag == 'format', {lambda x: x.get('fmt_code')}, {str}))
+        for track in traverse_obj(subtitle_data, (lambda _, v: v.tag == 'track' and v.get('lang_code'))):
+            language = track.get('lang_code')
+            subtitles.setdefault(language, []).extend([{
+                'url': self._construct_subtitle_url(
+                    timed_text_base_url, video_id, language, sub_fmt, track.get('kind')),
+                'name': track.get('lang_original'),
+                'ext': sub_fmt,
+            } for sub_fmt in subtitle_formats])
+        return subtitles

     def _real_extract(self, url):
         video_id = self._match_id(url)
-        video_info = urllib.parse.parse_qs(self._download_webpage(
-            'https://drive.google.com/get_video_info',
-            video_id, 'Downloading video webpage', query={'docid': video_id}))
-
-        def get_value(key):
-            return try_get(video_info, lambda x: x[key][0])
-
-        reason = get_value('reason')
-        title = get_value('title')
+        video_info = self._download_json(
+            f'https://content-workspacevideo-pa.googleapis.com/v1/drive/media/{video_id}/playback',
+            video_id, 'Downloading video webpage', query={'key': 'AIzaSyDVQw45DwoYh632gvsP5vPDqEKvb-Ywnb8'},
+            headers={'Referer': 'https://drive.google.com/'})

         formats = []
-        fmt_stream_map = (get_value('fmt_stream_map') or '').split(',')
-        fmt_list = (get_value('fmt_list') or '').split(',')
-        if fmt_stream_map and fmt_list:
-            resolutions = {}
-            for fmt in fmt_list:
-                mobj = re.search(
-                    r'^(?P<format_id>\d+)/(?P<width>\d+)[xX](?P<height>\d+)', fmt)
-                if mobj:
-                    resolutions[mobj.group('format_id')] = (
-                        int(mobj.group('width')), int(mobj.group('height')))
+        for fmt in traverse_obj(video_info, (
+                'mediaStreamingData', 'formatStreamingData', ('adaptiveTranscodes', 'progressiveTranscodes'),
+                lambda _, v: url_or_none(v['url']))):
+            formats.append({
+                **traverse_obj(fmt, {
+                    'url': 'url',
+                    'format_id': ('itag', {int}, {str_or_none}),
+                }),
+                **traverse_obj(fmt, ('transcodeMetadata', {
+                    'ext': ('mimeType', {mimetype2ext}),
+                    'width': ('width', {int_or_none}),
+                    'height': ('height', {int_or_none}),
+                    'fps': ('videoFps', {int_or_none}),
+                    'filesize': ('contentLength', {int_or_none}),
+                    'vcodec': ((('videoCodecString', {str}), {value('none')}), any),
+                    'acodec': ((('audioCodecString', {str}), {value('none')}), any),
+                })),
+                'downloader_options': {
+                    'http_chunk_size': 10 << 20,
+                },
+            })
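+            # Note on the chunk size above: 10 << 20 == 10 * 2**20 == 10485760,
+            # i.e. the HTTP downloader requests the file in 10 MiB ranges
+            # rather than in a single request.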
-            for fmt_stream in fmt_stream_map:
-                fmt_stream_split = fmt_stream.split('|')
-                if len(fmt_stream_split) < 2:
-                    continue
-                format_id, format_url = fmt_stream_split[:2]
-                ext = self._FORMATS_EXT.get(format_id)
-                if not ext:
-                    self.report_warning(f'Unknown format {format_id}{bug_reports_message()}')
-                f = {
-                    'url': lowercase_escape(format_url),
-                    'format_id': format_id,
-                    'ext': ext,
-                }
-                resolution = resolutions.get(format_id)
-                if resolution:
-                    f.update({
-                        'width': resolution[0],
-                        'height': resolution[1],
-                    })
-                formats.append(f)
+        title = traverse_obj(video_info, ('mediaMetadata', 'title', {str}))

         source_url = update_url_query(
             'https://drive.usercontent.google.com/download', {
@@ -264,30 +217,20 @@ class GoogleDriveIE(InfoExtractor):
                or get_element_by_class('uc-error-caption', confirmation_webpage)
                or 'unable to extract confirmation code')

-        if not formats and reason:
-            if title:
-                self.raise_no_formats(reason, expected=True)
-            else:
-                raise ExtractorError(reason, expected=True)
-
-        hl = get_value('hl')
-        subtitles_id = None
-        ttsurl = get_value('ttsurl')
-        if ttsurl:
-            # the subtitles ID is the vid param of the ttsurl query
-            subtitles_id = parse_qs(ttsurl).get('vid', [None])[-1]
-
-        self.cookiejar.clear(domain='.google.com', path='/', name='NID')
-
         return {
             'id': video_id,
             'title': title,
-            'thumbnail': 'https://drive.google.com/thumbnail?id=' + video_id,
-            'duration': int_or_none(get_value('length_seconds')),
+            **traverse_obj(video_info, {
+                'duration': ('mediaMetadata', 'duration', {parse_duration}),
+                'thumbnails': ('thumbnails', lambda _, v: url_or_none(v['url']), {
+                    'url': 'url',
+                    'ext': ('mimeType', {mimetype2ext}),
+                    'width': ('width', {int}),
+                    'height': ('height', {int}),
+                }),
+            }),
             'formats': formats,
-            'subtitles': self.extract_subtitles(video_id, subtitles_id, hl),
-            'automatic_captions': self.extract_automatic_captions(
-                video_id, subtitles_id, hl),
+            'subtitles': self.extract_subtitles(video_id, video_info),
         }
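A sketch of the subtitle URL shape this rework produces, assuming a hypothetical
base URL (the real one comes from 'timedTextDetails' in the playback response;
the absent 'kind' is dropped by filter_dict):

    GoogleDriveIE._construct_subtitle_url(
        'https://drive.google.com/timedtext', '0ByeS4oOUV-49Zzh4R1J6R09zazQ', 'en', 'vtt', None)
    # -> 'https://drive.google.com/timedtext?hl=en-US&v=0ByeS4oOUV-49Zzh4R1J6R09zazQ&type=track&lang=en&fmt=vtt'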
diff --git a/yt_dlp/extractor/idagio.py b/yt_dlp/extractor/idagio.py
new file mode 100644
index 0000000000..a99c559065
--- /dev/null
+++ b/yt_dlp/extractor/idagio.py
@@ -0,0 +1,262 @@
+from .common import InfoExtractor
+from ..utils import int_or_none, unified_timestamp, url_or_none
+from ..utils.traversal import traverse_obj
+
+
+class IdagioTrackIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?app\.idagio\.com(?:/[a-z]{2})?/recordings/\d+\?(?:[^#]+&)?trackId=(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'https://app.idagio.com/recordings/30576934?trackId=30576943',
+        'md5': '15148bd71804b2450a2508931a116b56',
+        'info_dict': {
+            'id': '30576943',
+            'ext': 'mp3',
+            'title': 'Theme. Andante',
+            'duration': 82,
+            'composers': ['Edward Elgar'],
+            'artists': ['Vasily Petrenko', 'Royal Liverpool Philharmonic Orchestra'],
+            'genres': ['Orchestral', 'Other Orchestral Music'],
+            'track': 'Theme. Andante',
+            'timestamp': 1554474370,
+            'upload_date': '20190405',
+        },
+    }, {
+        'url': 'https://app.idagio.com/recordings/20514467?trackId=20514478&utm_source=pcl',
+        'md5': '3acef2ea0feadf889123b70e5a1e7fa7',
+        'info_dict': {
+            'id': '20514478',
+            'ext': 'mp3',
+            'title': 'I. Adagio sostenuto',
+            'duration': 316,
+            'composers': ['Ludwig van Beethoven'],
+            'genres': ['Keyboard', 'Sonata (Keyboard)'],
+            'track': 'I. Adagio sostenuto',
+            'timestamp': 1518076337,
+            'upload_date': '20180208',
+        },
+    }, {
+        'url': 'https://app.idagio.com/de/recordings/20514467?trackId=20514478&utm_source=pcl',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        track_id = self._match_id(url)
+        track_info = self._download_json(
+            f'https://api.idagio.com/v2.0/metadata/tracks/{track_id}',
+            track_id, fatal=False, expected_status=406)
+        if traverse_obj(track_info, 'error_code') == 'idagio.error.blocked.location':
+            self.raise_geo_restricted()
+
+        content_info = self._download_json(
+            f'https://api.idagio.com/v1.8/content/track/{track_id}', track_id,
+            query={
+                'quality': '0',
+                'format': '2',
+                'client_type': 'web-4',
+            })
+
+        return {
+            'ext': 'mp3',
+            'vcodec': 'none',
+            'id': track_id,
+            'url': traverse_obj(content_info, ('url', {url_or_none})),
+            **traverse_obj(track_info, ('result', {
+                'title': ('piece', 'title', {str}),
+                'timestamp': ('recording', 'created_at', {int_or_none(scale=1000)}),
+                'location': ('recording', 'location', {str}),
+                'duration': ('duration', {int_or_none}),
+                'track': ('piece', 'title', {str}),
+                'artists': ('recording', ('conductor', ('ensembles', ...), ('soloists', ...)), 'name', {str}, filter),
+                'composers': ('piece', 'workpart', 'work', 'composer', 'name', {str}, filter, all, filter),
+                'genres': ('piece', 'workpart', 'work', ('genre', 'subgenre'), 'title', {str}, filter),
+            })),
+        }
+
+
+class IdagioPlaylistBaseIE(InfoExtractor):
+    """Subclasses must set _API_URL_TMPL and define _parse_playlist_metadata"""
+    _PLAYLIST_ID_KEY = 'id'  # vs. 'display_id'
+
+    def _entries(self, playlist_info):
+        for track_data in traverse_obj(playlist_info, ('tracks', lambda _, v: v['id'] and v['recording']['id'])):
+            track_id = track_data['id']
+            recording_id = track_data['recording']['id']
+            yield self.url_result(
+                f'https://app.idagio.com/recordings/{recording_id}?trackId={track_id}',
+                ie=IdagioTrackIE, video_id=track_id)
+
+    def _real_extract(self, url):
+        playlist_id = self._match_id(url)
+        playlist_info = self._download_json(
+            self._API_URL_TMPL.format(playlist_id), playlist_id)['result']
+
+        return {
+            '_type': 'playlist',
+            self._PLAYLIST_ID_KEY: playlist_id,
+            'entries': self._entries(playlist_info),
+            **self._parse_playlist_metadata(playlist_info),
+        }
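+# A hypothetical minimal subclass, to illustrate the contract stated in the
+# docstring above (an endpoint template plus a metadata mapping; every name
+# below is invented for illustration only):
+#
+#   class IdagioExampleIE(IdagioPlaylistBaseIE):
+#       _VALID_URL = r'https?://(?:www\.)?app\.idagio\.com/examples/(?P<id>[\w-]+)'
+#       _API_URL_TMPL = 'https://api.idagio.com/v2.0/examples/{}'
+#
+#       def _parse_playlist_metadata(self, playlist_info):
+#           return traverse_obj(playlist_info, {'title': ('title', {str})})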
+
+
+class IdagioRecordingIE(IdagioPlaylistBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?app\.idagio\.com(?:/[a-z]{2})?/recordings/(?P<id>\d+)(?![^#]*[&?]trackId=\d+)'
+    _TESTS = [{
+        'url': 'https://app.idagio.com/recordings/30576934',
+        'info_dict': {
+            'id': '30576934',
+            'title': 'Variations on an Original Theme op. 36',
+            'composers': ['Edward Elgar'],
+            'artists': ['Vasily Petrenko', 'Royal Liverpool Philharmonic Orchestra'],
+            'genres': ['Orchestral', 'Other Orchestral Music'],
+            'timestamp': 1554474370,
+            'modified_timestamp': 1554474370,
+            'modified_date': '20190405',
+            'upload_date': '20190405',
+        },
+        'playlist_count': 15,
+    }, {
+        'url': 'https://app.idagio.com/de/recordings/20514467',
+        'info_dict': {
+            'id': '20514467',
+            'title': 'Sonata for Piano No. 14 in C sharp minor op. 27/2',
+            'composers': ['Ludwig van Beethoven'],
+            'genres': ['Keyboard', 'Sonata (Keyboard)'],
+            'timestamp': 1518076337,
+            'upload_date': '20180208',
+            'modified_timestamp': 1518076337,
+            'modified_date': '20180208',
+        },
+        'playlist_count': 3,
+    }]
+    _API_URL_TMPL = 'https://api.idagio.com/v2.0/metadata/recordings/{}'
+
+    def _parse_playlist_metadata(self, playlist_info):
+        return traverse_obj(playlist_info, {
+            'title': ('work', 'title', {str}),
+            'timestamp': ('created_at', {int_or_none(scale=1000)}),
+            'modified_timestamp': ('created_at', {int_or_none(scale=1000)}),
+            'location': ('location', {str}),
+            'artists': (('conductor', ('ensembles', ...), ('soloists', ...)), 'name', {str}),
+            'composers': ('work', 'composer', 'name', {str}, all),
+            'genres': ('work', ('genre', 'subgenre'), 'title', {str}),
+            'tags': ('tags', ..., {str}),
+        })
+
+
+class IdagioAlbumIE(IdagioPlaylistBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?app\.idagio\.com(?:/[a-z]{2})?/albums/(?P<id>[\w-]+)'
+    _TESTS = [{
+        'url': 'https://app.idagio.com/albums/elgar-enigma-variations-in-the-south-serenade-for-strings',
+        'info_dict': {
+            'id': 'a9f139b8-f70d-4b8a-a9a4-5fe8d35eaf9c',
+            'display_id': 'elgar-enigma-variations-in-the-south-serenade-for-strings',
+            'title': 'Elgar: Enigma Variations, In the South, Serenade for Strings',
+            'description': '',
+            'thumbnail': r're:https://.+/albums/880040420521/main\.jpg',
+            'artists': ['Vasily Petrenko', 'Royal Liverpool Philharmonic Orchestra', 'Edward Elgar'],
+            'timestamp': 1553817600,
+            'upload_date': '20190329',
+            'modified_timestamp': 1562566559.0,
+            'modified_date': '20190708',
+        },
+        'playlist_count': 19,
+    }, {
+        'url': 'https://app.idagio.com/de/albums/brahms-ein-deutsches-requiem-3B403DF6-62D7-4A42-807B-47173F3E0192',
+        'info_dict': {
+            'id': '2862ad4e-4a61-45ad-9ce4-7fcf0c2626fe',
+            'display_id': 'brahms-ein-deutsches-requiem-3B403DF6-62D7-4A42-807B-47173F3E0192',
+            'title': 'Brahms: Ein deutsches Requiem',
+            'description': 'GRAMOPHONE CLASSICAL MUSIC AWARDS 2025 Recording of the Year & Choral',
+            'thumbnail': r're:https://.+/albums/3149020954522/main\.jpg',
+            'artists': ['Sabine Devieilhe', 'Stéphane Degout', 'Raphaël Pichon', 'Pygmalion', 'Johannes Brahms'],
+            'timestamp': 1760054400,
+            'upload_date': '20251010',
+            'modified_timestamp': 1760624868,
+            'modified_date': '20251016',
+            'tags': ['recommended', 'recent-release'],
+        },
+        'playlist_count': 7,
+    }]
+    _API_URL_TMPL = 'https://api.idagio.com/v2.0/metadata/albums/{}'
+    _PLAYLIST_ID_KEY = 'display_id'
+
+    def _parse_playlist_metadata(self, playlist_info):
+        return traverse_obj(playlist_info, {
+            'id': ('id', {str}),
+            'title': ('title', {str}),
+            'timestamp': ('publishDate', {unified_timestamp}),
+            'modified_timestamp': ('lastModified', {unified_timestamp}),
+            'thumbnail': ('imageUrl', {url_or_none}),
+            'description': ('description', {str}),
+            'artists': ('participants', ..., 'name', {str}),
+            'tags': ('tags', ..., {str}),
+        })
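+# Note on _PLAYLIST_ID_KEY = 'display_id' above: album URLs carry a slug rather
+# than the API UUID, so the matched slug becomes 'display_id' while
+# _parse_playlist_metadata fills 'id' from the API response (e.g. the slug
+# 'elgar-enigma-variations-in-the-south-serenade-for-strings' resolves to the
+# id 'a9f139b8-f70d-4b8a-a9a4-5fe8d35eaf9c' in the first test above).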
+
+
+class IdagioPlaylistIE(IdagioPlaylistBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?app\.idagio\.com(?:/[a-z]{2})?/playlists/(?!personal/)(?P<id>[\w-]+)'
+    _TESTS = [{
+        'url': 'https://app.idagio.com/playlists/beethoven-the-most-beautiful-piano-music',
+        'info_dict': {
+            'id': '31652bec-8c5b-460e-a3f0-cf1f69817f53',
+            'display_id': 'beethoven-the-most-beautiful-piano-music',
+            'title': 'Beethoven: the most beautiful piano music',
+            'description': 'md5:d41bb04b8896bb69377f5c2cd9345ad1',
+            'thumbnail': r're:https://.+/playlists/31652bec-8c5b-460e-a3f0-cf1f69817f53/main\.jpg',
+            'creators': ['IDAGIO'],
+        },
+        'playlist_mincount': 16,  # one entry is geo-restricted
+    }, {
+        'url': 'https://app.idagio.com/de/playlists/piano-music-for-an-autumn-day',
+        'info_dict': {
+            'id': 'd70e9c7f-7080-4308-ae0f-f890dddeda82',
+            'display_id': 'piano-music-for-an-autumn-day',
+            'title': 'Piano Music for an Autumn Day',
+            'description': 'Get ready to snuggle up and enjoy all the musical colours of this cosy, autumnal playlist.',
+            'thumbnail': r're:https://.+/playlists/d70e9c7f-7080-4308-ae0f-f890dddeda82/main\.jpg',
+            'creators': ['IDAGIO'],
+        },
+        'playlist_count': 35,
+    }]
+    _API_URL_TMPL = 'https://api.idagio.com/v2.0/playlists/{}'
+    _PLAYLIST_ID_KEY = 'display_id'
+
+    def _parse_playlist_metadata(self, playlist_info):
+        return traverse_obj(playlist_info, {
+            'id': ('id', {str}),
+            'title': ('title', {str}),
+            'thumbnail': ('imageUrl', {url_or_none}),
+            'description': ('description', {str}),
+            'creators': ('curator', 'name', {str}, all),
+        })
+
+
+class IdagioPersonalPlaylistIE(IdagioPlaylistBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?app\.idagio\.com(?:/[a-z]{2})?/playlists/personal/(?P<id>[\da-f-]+)'
+    _TESTS = [{
+        'url': 'https://app.idagio.com/playlists/personal/99dad72e-7b3a-45a4-b216-867c08046ed8',
+        'info_dict': {
+            'id': '99dad72e-7b3a-45a4-b216-867c08046ed8',
+            'title': 'Test',
+            'creators': ['1a6f16a6-4514-4d0c-b481-3a9877835626'],
+            'thumbnail': r're:https://.+/artists/86371/main\.jpg',
+            'timestamp': 1602859138,
+            'modified_timestamp': 1755616667,
+            'upload_date': '20201016',
+            'modified_date': '20250819',
+        },
+        'playlist_count': 100,
+    }, {
+        'url': 'https://app.idagio.com/de/playlists/personal/99dad72e-7b3a-45a4-b216-867c08046ed8',
+        'only_matching': True,
+    }]
+    _API_URL_TMPL = 'https://api.idagio.com/v1.0/personal-playlists/{}'
+
+    def _parse_playlist_metadata(self, playlist_info):
+        return traverse_obj(playlist_info, {
+            'title': ('title', {str}),
+            'thumbnail': ('image_url', {url_or_none}),
+            'creators': ('user_id', {str}, all),
+            'timestamp': ('created_at', {int_or_none(scale=1000)}),
+            'modified_timestamp': ('updated_at', {int_or_none(scale=1000)}),
+        })
diff --git a/yt_dlp/extractor/kaltura.py b/yt_dlp/extractor/kaltura.py
index dc11cbf6be..98516b8ce1 100644
--- a/yt_dlp/extractor/kaltura.py
+++ b/yt_dlp/extractor/kaltura.py
@@ -437,7 +437,7 @@ class KalturaIE(InfoExtractor):
        params = urllib.parse.parse_qs(query)
        if path:
            splitted_path = path.split('/')
-            params.update(dict(zip(splitted_path[::2], [[v] for v in splitted_path[1::2]])))
+            params.update(dict(zip(splitted_path[::2], [[v] for v in splitted_path[1::2]])))  # noqa: B905
        if 'wid' in params:
            partner_id = remove_start(params['wid'][0], '_')
        elif 'p' in params:
diff --git a/yt_dlp/extractor/kika.py b/yt_dlp/extractor/kika.py
index e277564524..94798b9ac3 100644
--- a/yt_dlp/extractor/kika.py
+++ b/yt_dlp/extractor/kika.py
@@ -17,57 +17,60 @@ class KikaIE(InfoExtractor):
     _GEO_COUNTRIES = ['DE']

     _TESTS = [{
-        'url': 'https://www.kika.de/logo/videos/logo-vom-samstag-einunddreissig-august-zweitausendvierundzwanzig-100',
-        'md5': 'fbfc8da483719ef06f396e5e5b938c69',
+        # Video without season/episode info
+        'url': 'https://www.kika.de/logo/videos/logo-vom-dienstag-achtundzwanzig-oktober-zweitausendfuenfundzwanzig-100',
+        'md5': '4a9f6e0f9c6bfcc82394c294f186d6db',
        'info_dict': {
-            'id': 'logo-vom-samstag-einunddreissig-august-zweitausendvierundzwanzig-100',
+            'id': 'logo-vom-dienstag-achtundzwanzig-oktober-zweitausendfuenfundzwanzig-100',
            'ext': 'mp4',
-            'upload_date': '20240831',
-            'timestamp': 1725126600,
-            'season_number':
2024, - 'modified_date': '20240831', - 'episode': 'Episode 476', - 'episode_number': 476, - 'season': 'Season 2024', - 'duration': 634, - 'title': 'logo! vom Samstag, 31. August 2024', - 'modified_timestamp': 1725129983, + 'title': 'logo! vom Dienstag, 28. Oktober 2025', + 'description': 'md5:4d28b92cef423bec99740ffaa3e7ec04', + 'duration': 651, + 'timestamp': 1761678000, + 'upload_date': '20251028', + 'modified_timestamp': 1761682624, + 'modified_date': '20251028', }, }, { + # Video with season/episode info + # Also: Video with subtitles 'url': 'https://www.kika.de/kaltstart/videos/video92498', - 'md5': '710ece827e5055094afeb474beacb7aa', + 'md5': 'e58073070acb195906c55c4ad31dceb3', 'info_dict': { 'id': 'video92498', 'ext': 'mp4', 'title': '7. Wo ist Leo?', 'description': 'md5:fb48396a5b75068bcac1df74f1524920', 'duration': 436, + 'season': 'Season 1', + 'season_number': 1, + 'episode': 'Episode 7', + 'episode_number': 7, 'timestamp': 1702926876, 'upload_date': '20231218', - 'episode_number': 7, - 'modified_date': '20240319', 'modified_timestamp': 1710880610, - 'episode': 'Episode 7', - 'season_number': 1, - 'season': 'Season 1', + 'modified_date': '20240319', + 'subtitles': 'count:1', }, }, { - 'url': 'https://www.kika.de/bernd-das-brot/astrobrot/videos/video90088', - 'md5': 'ffd1b700d7de0a6616a1d08544c77294', + # Video without subtitles + 'url': 'https://www.kika.de/die-pfefferkoerner/videos/abgezogen-102', + 'md5': '62e97961ce5343c19f0f330a1b6dd736', 'info_dict': { - 'id': 'video90088', + 'id': 'abgezogen-102', 'ext': 'mp4', - 'upload_date': '20221102', - 'timestamp': 1667390580, - 'duration': 197, - 'modified_timestamp': 1711093771, - 'episode_number': 8, - 'title': 'Es ist nicht leicht, ein Astrobrot zu sein', - 'modified_date': '20240322', - 'description': 'md5:d3641deaf1b5515a160788b2be4159a9', - 'season_number': 1, - 'episode': 'Episode 8', + 'title': '1. Abgezogen', + 'description': 'md5:42d87963364391f9f8eba8affcb30bd2', + 'duration': 1574, 'season': 'Season 1', + 'season_number': 1, + 'episode': 'Episode 1', + 'episode_number': 1, + 'timestamp': 1735382700, + 'upload_date': '20241228', + 'modified_timestamp': 1757344051, + 'modified_date': '20250908', + 'subtitles': 'count:0', }, }] @@ -78,16 +81,19 @@ class KikaIE(InfoExtractor): video_assets = self._download_json(doc['assets']['url'], video_id) subtitles = {} - if ttml_resource := url_or_none(video_assets.get('videoSubtitle')): - subtitles['de'] = [{ - 'url': ttml_resource, - 'ext': 'ttml', - }] - if webvtt_resource := url_or_none(video_assets.get('webvttUrl')): - subtitles.setdefault('de', []).append({ - 'url': webvtt_resource, - 'ext': 'vtt', - }) + # Subtitle API endpoints may be present in the JSON even if there are no subtitles. + # They then return HTTP 200 with invalid data. So we must check explicitly. 
+ if doc.get('hasSubtitle'): + if ttml_resource := url_or_none(video_assets.get('videoSubtitle')): + subtitles['de'] = [{ + 'url': ttml_resource, + 'ext': 'ttml', + }] + if webvtt_resource := url_or_none(video_assets.get('webvttUrl')): + subtitles.setdefault('de', []).append({ + 'url': webvtt_resource, + 'ext': 'vtt', + }) return { 'id': video_id, diff --git a/yt_dlp/extractor/lynda.py b/yt_dlp/extractor/lynda.py index bfd4619337..f7cf9261a8 100644 --- a/yt_dlp/extractor/lynda.py +++ b/yt_dlp/extractor/lynda.py @@ -1,3 +1,4 @@ +import itertools import re import urllib.parse @@ -216,7 +217,7 @@ class LyndaIE(LyndaBaseIE): def _fix_subtitles(self, subs): srt = '' seq_counter = 0 - for seq_current, seq_next in zip(subs, subs[1:]): + for seq_current, seq_next in itertools.pairwise(subs): m_current = re.match(self._TIMECODE_REGEX, seq_current['Timecode']) if m_current is None: continue diff --git a/yt_dlp/extractor/mojevideo.py b/yt_dlp/extractor/mojevideo.py index 145e306970..1f95ed8bc0 100644 --- a/yt_dlp/extractor/mojevideo.py +++ b/yt_dlp/extractor/mojevideo.py @@ -92,7 +92,7 @@ class MojevideoIE(InfoExtractor): contains_pattern=r'\[(?s:.+)\]', transform_source=js_to_json) formats = [] - for video_hash, (suffix, quality, format_note) in zip(video_hashes, [ + for video_hash, (suffix, quality, format_note) in zip(video_hashes, [ # noqa: B905 ('', 1, 'normálna kvalita'), ('_lq', 0, 'nízka kvalita'), ('_hd', 2, 'HD-720p'), diff --git a/yt_dlp/extractor/musescore.py b/yt_dlp/extractor/musescore.py index 0ef2fa0c88..c171e58b3f 100644 --- a/yt_dlp/extractor/musescore.py +++ b/yt_dlp/extractor/musescore.py @@ -1,3 +1,5 @@ +import hashlib + from .common import InfoExtractor @@ -9,10 +11,10 @@ class MuseScoreIE(InfoExtractor): 'id': '142975', 'ext': 'mp3', 'title': 'WA Mozart Marche Turque (Turkish March fingered)', - 'description': 'md5:7ede08230e4eaabd67a4a98bb54d07be', - 'thumbnail': r're:https?://(?:www\.)?musescore\.com/.*\.png[^$]+', + 'description': 'md5:0ca4cf6b79d7f5868a1fee74097394ab', + 'thumbnail': r're:https?://cdn\.ustatik\.com/musescore/.*\.jpg', 'uploader': 'PapyPiano', - 'creator': 'Wolfgang Amadeus Mozart', + 'creators': ['Wolfgang Amadeus Mozart'], }, }, { 'url': 'https://musescore.com/user/36164500/scores/6837638', @@ -20,10 +22,10 @@ class MuseScoreIE(InfoExtractor): 'id': '6837638', 'ext': 'mp3', 'title': 'Sweet Child O\' Mine – Guns N\' Roses sweet child', - 'description': 'md5:4dca71191c14abc312a0a4192492eace', - 'thumbnail': r're:https?://(?:www\.)?musescore\.com/.*\.png[^$]+', + 'description': 'md5:2cd49bd6b4e48a75a3c469d4775d5079', + 'thumbnail': r're:https?://cdn\.ustatik\.com/musescore/.*\.png', 'uploader': 'roxbelviolin', - 'creator': 'Guns N´Roses Arr. Roxbel Violin', + 'creators': ['Guns N´Roses Arr. 
Roxbel Violin'],
        },
    }, {
        'url': 'https://musescore.com/classicman/fur-elise',
@@ -31,22 +33,28 @@ class MuseScoreIE(InfoExtractor):
            'id': '33816',
            'ext': 'mp3',
            'title': 'Für Elise – Beethoven',
-            'description': 'md5:49515a3556d5ecaf9fa4b2514064ac34',
-            'thumbnail': r're:https?://(?:www\.)?musescore\.com/.*\.png[^$]+',
+            'description': 'md5:e37b241c0280b33e9ac25651b815d06e',
+            'thumbnail': r're:https?://cdn\.ustatik\.com/musescore/.*\.jpg',
            'uploader': 'ClassicMan',
-            'creator': 'Ludwig van Beethoven (1770–1827)',
+            'creators': ['Ludwig van Beethoven (1770–1827)'],
        },
    }, {
        'url': 'https://musescore.com/minh_cuteee/scores/6555384',
        'only_matching': True,
    }]

+    @staticmethod
+    def _generate_auth_token(video_id):
+        return hashlib.md5((video_id + 'mp30gs').encode()).hexdigest()[:4]
+
    def _real_extract(self, url):
        webpage = self._download_webpage(url, None)
        url = self._og_search_url(webpage) or url
        video_id = self._match_id(url)
-        mp3_url = self._download_json(f'https://musescore.com/api/jmuse?id={video_id}&index=0&type=mp3&v2=1', video_id,
-                                      headers={'authorization': '63794e5461e4cfa046edfbdddfccc1ac16daffd2'})['info']['url']
+        mp3_url = self._download_json(
+            'https://musescore.com/api/jmuse', video_id,
+            headers={'authorization': self._generate_auth_token(video_id)},
+            query={'id': video_id, 'index': '0', 'type': 'mp3'})['info']['url']
        formats = [{
            'url': mp3_url,
            'ext': 'mp3',
@@ -57,7 +65,7 @@ class MuseScoreIE(InfoExtractor):
            'id': video_id,
            'formats': formats,
            'title': self._og_search_title(webpage),
-            'description': self._og_search_description(webpage),
+            'description': self._html_search_meta('description', webpage, 'description'),
            'thumbnail': self._og_search_thumbnail(webpage),
            'uploader': self._html_search_meta('musescore:author', webpage, 'uploader'),
            'creator': self._html_search_meta('musescore:composer', webpage, 'composer'),
diff --git a/yt_dlp/extractor/nascar.py b/yt_dlp/extractor/nascar.py
new file mode 100644
index 0000000000..b14a3b0aa1
--- /dev/null
+++ b/yt_dlp/extractor/nascar.py
@@ -0,0 +1,60 @@
+from .common import InfoExtractor
+from ..utils import (
+    float_or_none,
+    parse_iso8601,
+    url_or_none,
+)
+from ..utils.traversal import traverse_obj
+
+
+class NascarClassicsIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?classics\.nascar\.com/video/(?P<id>[\w~-]+)'
+    _TESTS = [{
+        'url': 'https://classics.nascar.com/video/Ka5qGuxzZ~SIvJii7uAC~wszPshklHN',
+        'md5': '81d712eccffa7169c328281b8cc28f77',
+        'info_dict': {
+            'id': 'Ka5qGuxzZ~SIvJii7uAC~wszPshklHN',
+            'ext': 'mp4',
+            'title': 'Cook Out 400 2023',
+            'thumbnail': 'https://va.aws.nascar.com/IMAGES/CUP_2023_22_RICHMOND_THUMB_NCD.jpg',
+            'timestamp': 1690732800,
+            'upload_date': '20230730',
+            'tags': ['2023', 'race #22', 'richmond', 'chris buescher', 'cup'],
+            'chapters': 'count:18',
+        },
+    }, {
+        'url': 'https://classics.nascar.com/video/UASvPDOwEha~SIvJii7uAC~wszPshklHN',
+        'md5': 'a5e8d6ec6005da3857d25ba2df5e7133',
+        'info_dict': {
+            'id': 'UASvPDOwEha~SIvJii7uAC~wszPshklHN',
+            'ext': 'mp4',
+            'title': 'I Love New York 355 at the Glen 2017',
+            'thumbnail': 'https://va.aws.nascar.com/IMAGES/CUP_2017_22_WATKINSGLEN_THUMB_NCD.jpg',
+            'timestamp': 1501995600,
+            'upload_date': '20170806',
+            'tags': ['watkins glen', 'race #22', '2017', 'martin truex jr.', 'cup'],
+            'chapters': 'count:13',
+        },
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+        content_data = self._search_nextjs_data(
+            webpage, video_id)['props']['pageProps']['contentData']
+
return { + 'id': video_id, + 'formats': self._extract_m3u8_formats(content_data['input']['src'], video_id, 'mp4'), + **traverse_obj(content_data, { + 'title': ('input', 'name', {str}), + 'description': ('input', 'description', {str}, filter), + 'thumbnail': ('input', 'thumbnail', {url_or_none}), + 'tags': ('input', 'settings', 'tags', ..., {str}), + 'timestamp': ('input', 'start_time', {parse_iso8601}), + 'chapters': ('overlay', 'data', 'timelines', 0, 'events', lambda _, v: float(v['timestamp']) is not None, { + 'start_time': ('timestamp', {float_or_none}), + 'title': ('name', {str}), + }), + }), + } diff --git a/yt_dlp/extractor/nbc.py b/yt_dlp/extractor/nbc.py index caa9dc0175..41811b8a20 100644 --- a/yt_dlp/extractor/nbc.py +++ b/yt_dlp/extractor/nbc.py @@ -63,7 +63,7 @@ class NBCUniversalBaseIE(ThePlatformBaseIE): # formats='mpeg4' will return either a working m3u8 URL or an m3u8 template for non-DRM HLS # formats='m3u+none,mpeg4' may return DRM HLS but w/the "folders" needed for non-DRM template query['formats'] = 'm3u+none,mpeg4' - m3u8_url = self._download_nbcu_smil_and_extract_m3u8_url(tp_path, video_id, query) + orig_m3u8_url = m3u8_url = self._download_nbcu_smil_and_extract_m3u8_url(tp_path, video_id, query) if mobj := re.fullmatch(self._M3U8_RE, m3u8_url): query['formats'] = 'mpeg4' @@ -76,7 +76,17 @@ class NBCUniversalBaseIE(ThePlatformBaseIE): if '/mpeg_cenc' in m3u8_url or '/mpeg_cbcs' in m3u8_url: self.report_drm(video_id) - return self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, 'mp4', m3u8_id='hls') + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + m3u8_url, video_id, 'mp4', m3u8_id='hls', fatal=False) + + if not formats and m3u8_url != orig_m3u8_url: + orig_fmts, subtitles = self._extract_m3u8_formats_and_subtitles( + orig_m3u8_url, video_id, 'mp4', m3u8_id='hls', fatal=False) + formats = [f for f in orig_fmts if not f.get('has_drm')] + if orig_fmts and not formats: + self.report_drm(video_id) + + return formats, subtitles def _extract_nbcu_video(self, url, display_id, old_ie_key=None): webpage = self._download_webpage(url, display_id) diff --git a/yt_dlp/extractor/nhk.py b/yt_dlp/extractor/nhk.py index 14fbd6ce82..eef3ed820c 100644 --- a/yt_dlp/extractor/nhk.py +++ b/yt_dlp/extractor/nhk.py @@ -503,7 +503,7 @@ class NhkForSchoolBangumiIE(InfoExtractor): 'start_time': s, 'end_time': e, 'title': t, - } for s, e, t in zip(start_time, end_time, chapter_titles)] + } for s, e, t in zip(start_time, end_time, chapter_titles, strict=True)] return { 'id': video_id, diff --git a/yt_dlp/extractor/pbs.py b/yt_dlp/extractor/pbs.py index 53b1994156..dde734ff3d 100644 --- a/yt_dlp/extractor/pbs.py +++ b/yt_dlp/extractor/pbs.py @@ -181,7 +181,7 @@ class PBSIE(InfoExtractor): ) IE_NAME = 'pbs' - IE_DESC = 'Public Broadcasting Service (PBS) and member stations: {}'.format(', '.join(list(zip(*_STATIONS))[1])) + IE_DESC = 'Public Broadcasting Service (PBS) and member stations: {}'.format(', '.join(list(zip(*_STATIONS, strict=True))[1])) _VALID_URL = r'''(?x)https?:// (?: @@ -193,7 +193,7 @@ class PBSIE(InfoExtractor): (?:[^/?#]+/){{1,5}}(?P[^/?#]+?)(?:\.html)?/?(?:$|[?#]) ) ) - '''.format('|'.join(next(zip(*_STATIONS)))) + '''.format('|'.join(next(zip(*_STATIONS, strict=True)))) _GEO_COUNTRIES = ['US'] diff --git a/yt_dlp/extractor/polskieradio.py b/yt_dlp/extractor/polskieradio.py index 9d0496bdf0..8777f987e6 100644 --- a/yt_dlp/extractor/polskieradio.py +++ b/yt_dlp/extractor/polskieradio.py @@ -405,7 +405,7 @@ class PolskieRadioCategoryIE(InfoExtractor): 
tab_content = self._download_json( 'https://www.polskieradio.pl/CMS/TemplateBoxesManagement/TemplateBoxTabContent.aspx/GetTabContent', category_id, f'Downloading page {page_num}', headers={'content-type': 'application/json'}, - data=json.dumps(dict(zip(( + data=json.dumps(dict(zip(( # noqa: B905 'boxInstanceId', 'tabId', 'categoryType', 'sectionId', 'categoryId', 'pagerMode', 'subjectIds', 'tagIndexId', 'queryString', 'name', 'openArticlesInParentTemplate', 'idSectionFromUrl', 'maxDocumentAge', 'showCategoryForArticle', 'pageNumber', diff --git a/yt_dlp/extractor/pr0gramm.py b/yt_dlp/extractor/pr0gramm.py index d5d6ecdfd8..0ba78a4f42 100644 --- a/yt_dlp/extractor/pr0gramm.py +++ b/yt_dlp/extractor/pr0gramm.py @@ -155,7 +155,7 @@ class Pr0grammIE(InfoExtractor): # Sorted by "confidence", higher confidence = earlier in list confidences = traverse_obj(metadata, ('tags', ..., 'confidence', ({int}, {float}))) if confidences: - tags = [tag for _, tag in sorted(zip(confidences, tags), reverse=True)] + tags = [tag for _, tag in sorted(zip(confidences, tags), reverse=True)] # noqa: B905 formats = traverse_obj(video_info, ('variants', ..., { 'format_id': ('name', {str}), diff --git a/yt_dlp/extractor/prankcast.py b/yt_dlp/extractor/prankcast.py index 84e6f7ebcf..68eb382e70 100644 --- a/yt_dlp/extractor/prankcast.py +++ b/yt_dlp/extractor/prankcast.py @@ -1,8 +1,8 @@ import json from .common import InfoExtractor -from ..utils import float_or_none, parse_iso8601, str_or_none, try_call -from ..utils.traversal import traverse_obj +from ..utils import float_or_none, parse_iso8601, str_or_none, try_call, url_or_none +from ..utils.traversal import traverse_obj, value class PrankCastIE(InfoExtractor): @@ -100,9 +100,38 @@ class PrankCastPostIE(InfoExtractor): 'duration': 263.287, 'cast': ['despicabledogs'], 'description': 'https://imgur.com/a/vtxLvKU', - 'categories': [], 'upload_date': '20240104', }, + }, { + 'url': 'https://prankcast.com/drtomservo/posts/11988-butteye-s-late-night-stank-episode-1-part-1-', + 'info_dict': { + 'id': '11988', + 'ext': 'mp3', + 'title': 'Butteye\'s Late Night Stank Episode 1 (Part 1)', + 'display_id': 'butteye-s-late-night-stank-episode-1-part-1-', + 'timestamp': 1754238686, + 'uploader': 'DrTomServo', + 'channel_id': '136', + 'duration': 2176.464, + 'cast': ['DrTomServo'], + 'description': '', + 'upload_date': '20250803', + }, + }, { + 'url': 'https://prankcast.com/drtomservo/posts/12105-butteye-s-late-night-stank-episode-08-16-2025-part-2', + 'info_dict': { + 'id': '12105', + 'ext': 'mp3', + 'title': 'Butteye\'s Late Night Stank Episode 08-16-2025 Part 2', + 'display_id': 'butteye-s-late-night-stank-episode-08-16-2025-part-2', + 'timestamp': 1755453505, + 'uploader': 'DrTomServo', + 'channel_id': '136', + 'duration': 19018.392, + 'cast': ['DrTomServo'], + 'description': '', + 'upload_date': '20250817', + }, }] def _real_extract(self, url): @@ -112,26 +141,28 @@ class PrankCastPostIE(InfoExtractor): post = self._search_nextjs_data(webpage, video_id)['props']['pageProps']['ssr_data_posts'] content = self._parse_json(post['post_contents_json'], video_id)[0] - uploader = post.get('user_name') - guests_json = traverse_obj(content, ('guests_json', {json.loads}, {dict})) or {} - return { 'id': video_id, - 'title': post.get('post_title') or self._og_search_title(webpage), 'display_id': display_id, - 'url': content.get('url'), - 'timestamp': parse_iso8601(content.get('start_date') or content.get('crdate'), ' '), - 'uploader': uploader, - 'channel_id': 
str_or_none(post.get('user_id')),
-            'duration': float_or_none(content.get('duration')),
-            'cast': list(filter(None, [uploader, *traverse_obj(guests_json, (..., 'name'))])),
-            'description': post.get('post_body'),
-            'categories': list(filter(None, [content.get('category')])),
-            'tags': try_call(lambda: list(filter(None, post['post_tags'].split(',')))),
-            'subtitles': {
-                'live_chat': [{
-                    'url': f'https://prankcast.com/api/private/chat/select-broadcast?id={post["content_id"]}&cache=',
-                    'ext': 'json',
-                }],
-            } if post.get('content_id') else None,
+            'title': self._og_search_title(webpage),
+            **traverse_obj(post, {
+                'title': ('post_title', {str}),
+                'description': ('post_body', {str}),
+                'tags': ('post_tags', {lambda x: x.split(',')}, ..., {str.strip}, filter),
+                'channel_id': ('user_id', {int}, {str_or_none}),
+                'uploader': ('user_name', {str}),
+            }),
+            **traverse_obj(content, {
+                'url': (('secure_url', 'url'), {url_or_none}, any),
+                'timestamp': ((
+                    (('start_date', 'crdate'), {parse_iso8601(delimiter=' ')}),
+                    ('created_at', {parse_iso8601}),
+                ), any),
+                'duration': ('duration', {float_or_none}),
+                'categories': ('category', {str}, filter, all, filter),
+                'cast': ((
+                    {value(post.get('user_name'))},
+                    ('guests_json', {json.loads}, ..., 'name'),
+                ), {str}, filter),
+            }),
        }
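A sketch of what the new tags traversal above yields, assuming a hypothetical
'post_tags' value of 'prank, soundboard, ,radio':

    traverse_obj({'post_tags': 'prank, soundboard, ,radio'},
                 ('post_tags', {lambda x: x.split(',')}, ..., {str.strip}, filter))
    # -> ['prank', 'soundboard', 'radio']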
diff --git a/yt_dlp/extractor/slideslive.py b/yt_dlp/extractor/slideslive.py
index e684ac7b8e..c95feb9d34 100644
--- a/yt_dlp/extractor/slideslive.py
+++ b/yt_dlp/extractor/slideslive.py
@@ -248,35 +248,17 @@ class SlidesLiveIE(InfoExtractor):
            'skip_download': 'm3u8',
        },
    }, {
-        # /v3/ slides, .jpg and .png, service_name = youtube
+        # /v3/ slides, .jpg and .png, formerly service_name = youtube, now native
        'url': 'https://slideslive.com/embed/38932460/',
        'info_dict': {
-            'id': 'RTPdrgkyTiE',
-            'display_id': '38932460',
+            'id': '38932460',
            'ext': 'mp4',
            'title': 'Active Learning for Hierarchical Multi-Label Classification',
-            'description': 'Watch full version of this video at https://slideslive.com/38932460.',
-            'channel': 'SlidesLive Videos - A',
-            'channel_id': 'UC62SdArr41t_-_fX40QCLRw',
-            'channel_url': 'https://www.youtube.com/channel/UC62SdArr41t_-_fX40QCLRw',
-            'uploader': 'SlidesLive Videos - A',
-            'uploader_id': '@slideslivevideos-a6075',
-            'uploader_url': 'https://www.youtube.com/@slideslivevideos-a6075',
-            'upload_date': '20200903',
-            'timestamp': 1697805922,
-            'duration': 942,
-            'age_limit': 0,
-            'live_status': 'not_live',
-            'playable_in_embed': True,
-            'availability': 'unlisted',
-            'categories': ['People & Blogs'],
-            'tags': [],
-            'channel_follower_count': int,
-            'like_count': int,
-            'view_count': int,
-            'thumbnail': r're:^https?://.*\.(?:jpg|png|webp)',
-            'thumbnails': 'count:21',
+            'duration': 941,
+            'thumbnail': r're:https?://.+/.+\.(?:jpg|png)',
            'chapters': 'count:20',
+            'timestamp': 1708338974,
+            'upload_date': '20240219',
        },
        'params': {
            'skip_download': 'm3u8',
@@ -425,7 +407,7 @@ class SlidesLiveIE(InfoExtractor):
        player_token = self._search_regex(r'data-player-token="([^"]+)"', webpage, 'player token')
        player_data = self._download_webpage(
-            f'https://ben.slideslive.com/player/{video_id}', video_id,
+            f'https://slideslive.com/player/{video_id}', video_id,
            note='Downloading player info', query={'player_token': player_token})

        player_info = self._extract_custom_m3u8_info(player_data)
@@ -525,7 +507,7 @@ class SlidesLiveIE(InfoExtractor):
            yield info

        service_data = self._download_json(
-            f'https://ben.slideslive.com/player/{video_id}/slides_video_service_data',
+            f'https://slideslive.com/player/{video_id}/slides_video_service_data',
            video_id, fatal=False, query={
                'player_token': player_token,
                'videos': ','.join(video_slides),
diff --git a/yt_dlp/extractor/soundcloud.py b/yt_dlp/extractor/soundcloud.py
index 2cc98c66ce..7833081bfa 100644
--- a/yt_dlp/extractor/soundcloud.py
+++ b/yt_dlp/extractor/soundcloud.py
@@ -438,7 +438,7 @@ class SoundcloudIE(SoundcloudBaseIE):
                    (?P<title>[\w\d-]+)
                    (?:/(?P<token>(?!(?:albums|sets|recommended))[^?]+?))?
                    (?:[?].*)?$)
-                 |(?:api(?:-v2)?\.soundcloud\.com/tracks/(?P<track_id>\d+)
+                 |(?:api(?:-v2)?\.soundcloud\.com/tracks/(?:soundcloud%3Atracks%3A)?(?P<track_id>\d+)
                    (?:/?\?secret_token=(?P<secret_token>[^&]+))?)
                )
                '''
@@ -692,6 +692,9 @@ class SoundcloudIE(SoundcloudBaseIE):
        # Go+ (account with active subscription needed)
        'url': 'https://soundcloud.com/taylorswiftofficial/look-what-you-made-me-do',
        'only_matching': True,
+    }, {
+        'url': 'https://api.soundcloud.com/tracks/soundcloud%3Atracks%3A1083788353',
+        'only_matching': True,
    }]

    def _real_extract(self, url):
diff --git a/yt_dlp/extractor/tenplay.py b/yt_dlp/extractor/tenplay.py
index bf5dddde42..bf82f4bfda 100644
--- a/yt_dlp/extractor/tenplay.py
+++ b/yt_dlp/extractor/tenplay.py
@@ -1,12 +1,20 @@
+import base64
+import datetime as dt
 import itertools
+import json
+import re
+import time

 from .common import InfoExtractor
-from ..networking import HEADRequest
+from ..networking.exceptions import HTTPError
 from ..utils import (
     ExtractorError,
+    encode_data_uri,
+    filter_dict,
     int_or_none,
-    update_url_query,
+    jwt_decode_hs256,
     url_or_none,
+    urlencode_postdata,
     urljoin,
 )
 from ..utils.traversal import traverse_obj
@@ -90,7 +98,7 @@ class TenPlayIE(InfoExtractor):
        'only_matching': True,
    }]
    _GEO_BYPASS = False
-
+    _GEO_COUNTRIES = ['AU']
    _AUS_AGES = {
        'G': 0,
        'PG': 15,
@@ -100,31 +108,155 @@ class TenPlayIE(InfoExtractor):
        'R': 18,
        'X': 18,
    }
+    _TOKEN_CACHE_KEY = 'token_data'
+    _SEGMENT_BITRATE_RE = r'(?m)-(?:300|150|75|55)0000-(\d+(?:-[\da-f]+)?)\.ts$'
+
+    _refresh_token = None
+    _access_token = None
+
+    @staticmethod
+    def _filter_ads_from_m3u8(m3u8_doc):
+        out = []
+        for line in m3u8_doc.splitlines():
+            if line.startswith('https://redirector.googlevideo.com/'):
+                out.pop()
+                continue
+            out.append(line)
+
+        return '\n'.join(out)
+
+    @staticmethod
+    def _generate_xnetwork_ten_auth_token():
+        ts = dt.datetime.now(dt.timezone.utc).strftime('%Y%m%d%H%M%S')
+        return base64.b64encode(ts.encode()).decode()
+
+    @staticmethod
+    def _is_jwt_expired(token):
+        return jwt_decode_hs256(token)['exp'] - time.time() < 300
+
+    def _refresh_access_token(self):
+        try:
+            refresh_data = self._download_json(
+                'https://10.com.au/api/token/refresh', None, 'Refreshing access token',
+                headers={
+                    'Content-Type': 'application/json',
+                }, data=json.dumps({
+                    'accessToken': self._access_token,
+                    'refreshToken': self._refresh_token,
+                }).encode())
+        except ExtractorError as e:
+            if isinstance(e.cause, HTTPError) and e.cause.status == 400:
+                self._refresh_token = self._access_token = None
+                self.cache.store(self._NETRC_MACHINE, self._TOKEN_CACHE_KEY, [None, None])
+                self.report_warning('Refresh token has been invalidated; retrying with credentials')
+                self._perform_login(*self._get_login_info())
+                return
+            raise
+        self._access_token = refresh_data['accessToken']
+        self._refresh_token = refresh_data['refreshToken']
+        self.cache.store(self._NETRC_MACHINE, self._TOKEN_CACHE_KEY, [self._refresh_token, self._access_token])
+
+    def _perform_login(self, username, password):
+        if not self._refresh_token:
self._refresh_token, self._access_token = self.cache.load( + self._NETRC_MACHINE, self._TOKEN_CACHE_KEY, default=[None, None]) + if self._refresh_token and self._access_token: + self.write_debug('Using cached refresh token') + return + + try: + auth_data = self._download_json( + 'https://10.com.au/api/user/auth', None, 'Logging in', + headers={ + 'Content-Type': 'application/json', + 'X-Network-Ten-Auth': self._generate_xnetwork_ten_auth_token(), + 'Referer': 'https://10.com.au/', + }, data=json.dumps({ + 'email': username, + 'password': password, + }).encode()) + except ExtractorError as e: + if isinstance(e.cause, HTTPError) and e.cause.status == 400: + raise ExtractorError('Invalid username/password', expected=True) + raise + + self._refresh_token = auth_data['jwt']['refreshToken'] + self._access_token = auth_data['jwt']['accessToken'] + self.cache.store(self._NETRC_MACHINE, self._TOKEN_CACHE_KEY, [self._refresh_token, self._access_token]) + + def _call_playback_api(self, content_id): + if self._access_token and self._is_jwt_expired(self._access_token): + self._refresh_access_token() + for is_retry in (False, True): + try: + return self._download_json_handle( + f'https://10.com.au/api/v1/videos/playback/{content_id}/', content_id, + note='Downloading video JSON', query={'platform': 'samsung'}, + headers=filter_dict({ + 'TP-AcceptFeature': 'v1/fw;v1/drm', + 'Authorization': f'Bearer {self._access_token}' if self._access_token else None, + })) + except ExtractorError as e: + if not is_retry and isinstance(e.cause, HTTPError) and e.cause.status == 403: + if self._access_token: + self.to_screen('Access token has expired; refreshing') + self._refresh_access_token() + continue + elif not self._get_login_info()[0]: + self.raise_login_required('Login required to access this video', method='password') + raise def _real_extract(self, url): content_id = self._match_id(url) - data = self._download_json( - 'https://10.com.au/api/v1/videos/' + content_id, content_id) + try: + data = self._download_json(f'https://10.com.au/api/v1/videos/{content_id}', content_id) + except ExtractorError as e: + if ( + isinstance(e.cause, HTTPError) and e.cause.status == 403 + and 'Error 54113' in e.cause.response.read().decode() + ): + self.raise_geo_restricted(countries=self._GEO_COUNTRIES) + raise - video_data = self._download_json( - f'https://vod.ten.com.au/api/videos/bcquery?command=find_videos_by_id&video_id={data["altId"]}', - content_id, 'Downloading video JSON') - # Dash URL 404s, changing the m3u8 format works - m3u8_url = self._request_webpage( - HEADRequest(update_url_query(video_data['items'][0]['dashManifestUrl'], { - 'manifest': 'm3u', - })), - content_id, 'Checking stream URL').url - if '10play-not-in-oz' in m3u8_url: - self.raise_geo_restricted(countries=['AU']) - if '10play_unsupported' in m3u8_url: - raise ExtractorError('Unable to extract stream') - # Attempt to get a higher quality stream - formats = self._extract_m3u8_formats( - m3u8_url.replace(',150,75,55,0000', ',500,300,150,75,55,0000'), - content_id, 'mp4', fatal=False) - if not formats: - formats = self._extract_m3u8_formats(m3u8_url, content_id, 'mp4') + video_data, urlh = self._call_playback_api(content_id) + content_source_id = video_data['dai']['contentSourceId'] + video_id = video_data['dai']['videoId'] + auth_token = urlh.get_header('x-dai-auth') + if not auth_token: + raise ExtractorError('Failed to get DAI auth token') + + dai_data = self._download_json( + 
f'https://pubads.g.doubleclick.net/ondemand/hls/content/{content_source_id}/vid/{video_id}/streams', + content_id, note='Downloading DAI JSON', + data=urlencode_postdata({'auth-token': auth_token})) + + # Ignore subs to avoid ad break cleanup + formats, _ = self._extract_m3u8_formats_and_subtitles( + dai_data['stream_manifest'], content_id, 'mp4') + + already_have_1080p = False + for fmt in formats: + m3u8_doc = self._download_webpage( + fmt['url'], content_id, note='Downloading m3u8 information') + m3u8_doc = self._filter_ads_from_m3u8(m3u8_doc) + fmt['hls_media_playlist_data'] = m3u8_doc + if fmt.get('height') == 1080: + already_have_1080p = True + + # Attempt format upgrade + if not already_have_1080p and m3u8_doc and re.search(self._SEGMENT_BITRATE_RE, m3u8_doc): + m3u8_doc = re.sub(self._SEGMENT_BITRATE_RE, r'-5000000-\1.ts', m3u8_doc) + m3u8_doc = re.sub(r'-(?:300|150|75|55)0000\.key"', r'-5000000.key"', m3u8_doc) + formats.append({ + 'format_id': 'upgrade-attempt-1080p', + 'url': encode_data_uri(m3u8_doc.encode(), 'application/x-mpegurl'), + 'hls_media_playlist_data': m3u8_doc, + 'width': 1920, + 'height': 1080, + 'ext': 'mp4', + 'protocol': 'm3u8_native', + '__needs_testing': True, + }) return { 'id': content_id, diff --git a/yt_dlp/extractor/tiktok.py b/yt_dlp/extractor/tiktok.py index e165effd4e..b7e058ebe7 100644 --- a/yt_dlp/extractor/tiktok.py +++ b/yt_dlp/extractor/tiktok.py @@ -81,7 +81,7 @@ class TikTokBaseIE(InfoExtractor): } self._APP_INFO_POOL = [ {**defaults, **dict( - (k, v) for k, v in zip(self._APP_INFO_DEFAULTS, app_info.split('/')) if v + (k, v) for k, v in zip(self._APP_INFO_DEFAULTS, app_info.split('/'), strict=False) if v )} for app_info in self._KNOWN_APP_INFO ] @@ -220,7 +220,7 @@ class TikTokBaseIE(InfoExtractor): def _extract_web_data_and_status(self, url, video_id, fatal=True): video_data, status = {}, -1 - res = self._download_webpage_handle(url, video_id, fatal=fatal, headers={'User-Agent': 'Mozilla/5.0'}) + res = self._download_webpage_handle(url, video_id, fatal=fatal, impersonate=True) if res is False: return video_data, status @@ -1071,12 +1071,15 @@ class TikTokUserIE(TikTokBaseIE): webpage = self._download_webpage( self._UPLOADER_URL_FORMAT % user_name, user_name, 'Downloading user webpage', 'Unable to download user webpage', - fatal=False, headers={'User-Agent': 'Mozilla/5.0'}) or '' + fatal=False, impersonate=True) or '' detail = traverse_obj( self._get_universal_data(webpage, user_name), ('webapp.user-detail', {dict})) or {} - if detail.get('statusCode') == 10222: + video_count = traverse_obj(detail, ('userInfo', ('stats', 'statsV2'), 'videoCount', {int}, any)) + if not video_count and detail.get('statusCode') == 10222: self.raise_login_required( 'This user\'s account is private. 
Log into an account that has access') + elif video_count == 0: + raise ExtractorError('This account does not have any videos posted', expected=True) sec_uid = traverse_obj(detail, ('userInfo', 'user', 'secUid', {str})) if sec_uid: fail_early = not traverse_obj(detail, ('userInfo', 'itemList', ...)) @@ -1520,7 +1523,7 @@ class TikTokLiveIE(TikTokBaseIE): uploader, room_id = self._match_valid_url(url).group('uploader', 'id') if not room_id: webpage = self._download_webpage( - format_field(uploader, None, self._UPLOADER_URL_FORMAT), uploader) + format_field(uploader, None, self._UPLOADER_URL_FORMAT), uploader, impersonate=True) room_id = traverse_obj( self._get_universal_data(webpage, uploader), ('webapp.user-detail', 'userInfo', 'user', 'roomId', {str})) diff --git a/yt_dlp/extractor/tvnoe.py b/yt_dlp/extractor/tvnoe.py index 24a82623f2..b6d9ac6692 100644 --- a/yt_dlp/extractor/tvnoe.py +++ b/yt_dlp/extractor/tvnoe.py @@ -1,46 +1,82 @@ +import re + from .common import InfoExtractor from ..utils import ( clean_html, - get_element_by_class, + extract_attributes, js_to_json, + mimetype2ext, + unified_strdate, + url_or_none, + urljoin, ) +from ..utils.traversal import find_element, traverse_obj class TVNoeIE(InfoExtractor): - _WORKING = False - _VALID_URL = r'https?://(?:www\.)?tvnoe\.cz/video/(?P<id>[0-9]+)' - _TEST = { - 'url': 'http://www.tvnoe.cz/video/10362', - 'md5': 'aee983f279aab96ec45ab6e2abb3c2ca', + IE_NAME = 'tvnoe' + IE_DESC = 'Televize Noe' + + _VALID_URL = r'https?://(?:www\.)?tvnoe\.cz/porad/(?P<id>[\w-]+)' + _TESTS = [{ + 'url': 'https://www.tvnoe.cz/porad/43216-outdoor-films-s-mudr-tomasem-kempnym-pomahat-potrebnym-nejen-u-nas', 'info_dict': { - 'id': '10362', + 'id': '43216-outdoor-films-s-mudr-tomasem-kempnym-pomahat-potrebnym-nejen-u-nas', 'ext': 'mp4', - 'series': 'Noční univerzita', - 'title': 'prof. Tomáš Halík, Th.D. - Návrat náboženství a střet civilizací', - 'description': 'md5:f337bae384e1a531a52c55ebc50fff41', + 'title': 'Pomáhat potřebným nejen u nás', + 'description': 'md5:78b538ee32f7e881ec23b9c278a0ff3a', + 'release_date': '20250531', + 'series': 'Outdoor Films s MUDr. Tomášem Kempným', + 'thumbnail': r're:https?://www\.tvnoe\.cz/.+\.jpg', }, - } + }, { + 'url': 'https://www.tvnoe.cz/porad/43205-zamysleni-tomase-halika-7-nedele-velikonocni', + 'info_dict': { + 'id': '43205-zamysleni-tomase-halika-7-nedele-velikonocni', + 'ext': 'mp4', + 'title': '7. 
neděle velikonoční', + 'description': 'md5:6bb9908efc59abe60e1c8c7c0e9bb6cd', + 'release_date': '20250531', + 'series': 'Zamyšlení Tomáše Halíka', + 'thumbnail': r're:https?://www\.tvnoe\.cz/.+\.jpg', + }, + }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) + player = self._search_json( + r'var\s+INIT_PLAYER\s*=', webpage, 'init player', + video_id, transform_source=js_to_json) - iframe_url = self._search_regex( - r'<iframe[^>]+src="([^"]+)"', webpage, 'iframe URL') + formats = [] + for source in traverse_obj(player, ('tracks', ..., lambda _, v: url_or_none(v['src']))): + src_url = source['src'] + ext = mimetype2ext(source.get('type')) + if ext == 'm3u8': + fmts = self._extract_m3u8_formats( + src_url, video_id, 'mp4', m3u8_id='hls', fatal=False) + elif ext == 'mpd': + fmts = self._extract_mpd_formats( + src_url, video_id, mpd_id='dash', fatal=False) + else: + self.report_warning(f'Unsupported stream type: {ext}') + continue + formats.extend(fmts) - ifs_page = self._download_webpage(iframe_url, video_id) - jwplayer_data = self._find_jwplayer_data( - ifs_page, video_id, transform_source=js_to_json) - info_dict = self._parse_jwplayer_data( - jwplayer_data, video_id, require_title=False, base_url=iframe_url) - - info_dict.update({ + return { 'id': video_id, - 'title': clean_html(get_element_by_class( - 'field-name-field-podnazev', webpage)), - 'description': clean_html(get_element_by_class( - 'field-name-body', webpage)), - 'series': clean_html(get_element_by_class('title', webpage)), - }) - - return info_dict + 'description': clean_html(self._search_regex( + r'<p\s+class="">(.+?)</p>', webpage, 'description', default=None)), + 'formats': formats, + **traverse_obj(webpage, { + 'title': ({find_element(tag='h2')}, {clean_html}), + 'release_date': ( + {clean_html}, {re.compile(r'Premiéra:\s*(\d{1,2}\.\d{1,2}\.\d{4})').findall}, + ..., {str}, {unified_strdate}, any), + 'series': ({find_element(tag='h1')}, {clean_html}), + 'thumbnail': ( + {find_element(id='player-live', html=True)}, {extract_attributes}, + 'poster', {urljoin('https://www.tvnoe.cz/')}), + }), + } diff --git a/yt_dlp/extractor/urplay.py b/yt_dlp/extractor/urplay.py index 79bb8a8055..a0ac2a0bc6 100644 --- a/yt_dlp/extractor/urplay.py +++ b/yt_dlp/extractor/urplay.py @@ -8,6 +8,7 @@ from ..utils import ( try_get, unified_timestamp, ) +from ..utils.traversal import traverse_obj class URPlayIE(InfoExtractor): @@ -25,7 +26,7 @@ class URPlayIE(InfoExtractor): 'upload_date': '20171214', 'series': 'UR Samtiden - Livet, universum och rymdens märkliga musik', 'duration': 2269, - 'categories': ['Vetenskap & teknik'], + 'categories': ['Kultur & historia'], 'tags': ['Kritiskt tänkande', 'Vetenskap', 'Vetenskaplig verksamhet'], 'episode': 'Om vetenskap, kritiskt tänkande och motstånd', 'age_limit': 15, @@ -78,7 +79,7 @@ class URPlayIE(InfoExtractor): webpage = self._download_webpage(url, video_id) urplayer_data = self._search_nextjs_data(webpage, video_id, fatal=False) or {} if urplayer_data: - urplayer_data = try_get(urplayer_data, lambda x: x['props']['pageProps']['program'], dict) + urplayer_data = traverse_obj(urplayer_data, ('props', 'pageProps', 'productData', {dict})) if not urplayer_data: raise ExtractorError('Unable to parse __NEXT_DATA__') else: diff --git a/yt_dlp/extractor/vidyard.py b/yt_dlp/extractor/vidyard.py index 89a89b13f1..42da0e3d1c 100644 --- a/yt_dlp/extractor/vidyard.py +++ b/yt_dlp/extractor/vidyard.py @@ -58,6 +58,20 @@ class VidyardBaseIE(InfoExtractor): 
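# A minimal, self-contained sketch of the find_element/traverse_obj pattern the
# rewritten TVNoeIE above relies on. The HTML snippet and the expected output
# are hypothetical; this only illustrates how the {find_element(...)} and
# {clean_html} steps compose inside a traversal dict.
from yt_dlp.utils import clean_html
from yt_dlp.utils.traversal import find_element, traverse_obj

html = '<h1>Zamyšlení Tomáše Halíka</h1><h2>7. neděle velikonoční</h2>'
meta = traverse_obj(html, {
    'series': ({find_element(tag='h1')}, {clean_html}),
    'title': ({find_element(tag='h2')}, {clean_html}),
})
# meta == {'series': 'Zamyšlení Tomáše Halíka', 'title': '7. neděle velikonoční'}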
return subs + def _get_additional_metadata(self, video_id): + additional_metadata = self._download_json( + f'https://play.vidyard.com/video/{video_id}', video_id, + note='Downloading additional metadata', fatal=False) + return traverse_obj(additional_metadata, { + 'title': ('name', {str}), + 'duration': ('seconds', {int_or_none}), + 'thumbnails': ('thumbnailUrl', {'url': {url_or_none}}, all), + 'chapters': ('videoSections', lambda _, v: float_or_none(v['milliseconds']) is not None, { + 'title': ('title', {str}), + 'start_time': ('milliseconds', {float_or_none(scale=1000)}), + }), + }) + def _fetch_video_json(self, video_id): return self._download_json( f'https://play.vidyard.com/player/{video_id}.json', video_id)['payload'] @@ -67,6 +81,7 @@ class VidyardBaseIE(InfoExtractor): self._merge_subtitles(self._get_direct_subtitles(json_data.get('captions')), target=subtitles) return { + **self._get_additional_metadata(json_data['facadeUuid']), **traverse_obj(json_data, { 'id': ('facadeUuid', {str}), 'display_id': ('videoId', {int}, {str_or_none}), @@ -113,6 +128,29 @@ class VidyardIE(VidyardBaseIE): 'thumbnail': 'https://cdn.vidyard.com/thumbnails/spacer.gif', 'duration': 41.186, }, + }, { + 'url': 'https://share.vidyard.com/watch/wL237MtNgZUHo6e8WPiJbF', + 'info_dict': { + 'id': 'wL237MtNgZUHo6e8WPiJbF', + 'display_id': '25926870', + 'ext': 'mp4', + 'title': 'Adding & Editing Video Chapters', + 'thumbnail': 'https://cdn.vidyard.com/thumbnails/25926870/bvSEZS3dGY7DByQ_bzB57avIZ_hsvhr4_small.jpg', + 'duration': 135.46, + 'chapters': [{ + 'title': 'Adding new chapters', + 'start_time': 0, + }, { + 'title': 'Previewing your video', + 'start_time': 74, + }, { + 'title': 'Editing your chapters', + 'start_time': 91, + }, { + 'title': 'Share a link to a specific chapter', + 'start_time': 105, + }], + }, }, { 'url': 'https://embed.vidyard.com/share/oTDMPlUv--51Th455G5u7Q', 'info_dict': { @@ -132,8 +170,8 @@ class VidyardIE(VidyardBaseIE): 'id': 'SyStyHtYujcBHe5PkZc5DL', 'display_id': '41974005', 'ext': 'mp4', - 'title': 'Prepare the Frame and Track for Palm Beach Polysatin Shutters With BiFold Track', - 'description': r're:In this video, you will learn how to prepare the frame.+', + 'title': 'Install Palm Beach Shutters with a Bi-Fold Track System (Video 1 of 6)', + 'description': r're:In this video, you will learn the first step.+', 'thumbnail': 'https://cdn.vidyard.com/thumbnails/41974005/IJw7oCaJcF1h7WWu3OVZ8A_small.png', 'duration': 258.666, }, @@ -147,42 +185,42 @@ class VidyardIE(VidyardBaseIE): 'id': 'SyStyHtYujcBHe5PkZc5DL', 'display_id': '41974005', 'ext': 'mp4', - 'title': 'Prepare the Frame and Track for Palm Beach Polysatin Shutters With BiFold Track', + 'title': 'Install Palm Beach Shutters with a Bi-Fold Track System (Video 1 of 6)', 'thumbnail': 'https://cdn.vidyard.com/thumbnails/41974005/IJw7oCaJcF1h7WWu3OVZ8A_small.png', 'duration': 258.666, }, { 'id': '1Fw4B84jZTXLXWqkE71RiM', 'display_id': '5861113', 'ext': 'mp4', - 'title': 'Palm Beach - Bi-Fold Track System "Frame Installation"', + 'title': 'Install Palm Beach Shutters with a Bi-Fold Track System (Video 2 of 6)', 'thumbnail': 'https://cdn.vidyard.com/thumbnails/5861113/29CJ54s5g1_aP38zkKLHew_small.jpg', 'duration': 167.858, }, { 'id': 'DqP3wBvLXSpxrcqpT5kEeo', 'display_id': '41976334', 'ext': 'mp4', - 'title': 'Install the Track for Palm Beach Polysatin Shutters With BiFold Track', + 'title': 'Install Palm Beach Shutters with a Bi-Fold Track System (Video 3 of 6)', 'thumbnail': 
'https://cdn.vidyard.com/thumbnails/5861090/RwG2VaTylUa6KhSTED1r1Q_small.png', 'duration': 94.229, }, { 'id': 'opfybfxpzQArxqtQYB6oBU', 'display_id': '41976364', 'ext': 'mp4', - 'title': 'Install the Panel for Palm Beach Polysatin Shutters With BiFold Track', + 'title': 'Install Palm Beach Shutters with a Bi-Fold Track System (Video 4 of 6)', 'thumbnail': 'https://cdn.vidyard.com/thumbnails/5860926/JIOaJR08dM4QgXi_iQ2zGA_small.png', 'duration': 191.467, }, { 'id': 'rWrXvkbTNNaNqD6189HJya', 'display_id': '41976382', 'ext': 'mp4', - 'title': 'Adjust the Panels for Palm Beach Polysatin Shutters With BiFold Track', + 'title': 'Install Palm Beach Shutters with a Bi-Fold Track System (Video 5 of 6)', 'thumbnail': 'https://cdn.vidyard.com/thumbnails/5860687/CwHxBv4UudAhOh43FVB4tw_small.png', 'duration': 138.155, }, { 'id': 'eYPTB521MZ9TPEArSethQ5', 'display_id': '41976409', 'ext': 'mp4', - 'title': 'Assemble and Install the Valance for Palm Beach Polysatin Shutters With BiFold Track', + 'title': 'Install Palm Beach Shutters with a Bi-Fold Track System (Video 6 of 6)', 'thumbnail': 'https://cdn.vidyard.com/thumbnails/5861425/0y68qlMU4O5VKU7bJ8i_AA_small.png', 'duration': 148.224, }], @@ -191,6 +229,7 @@ class VidyardIE(VidyardBaseIE): }, { # Non hubs.vidyard.com playlist 'url': 'https://salesforce.vidyard.com/watch/d4vqPjs7Q5EzVEis5QT3jd', + 'skip': 'URL now 404s. Alternative non hubs.vidyard.com playlist not yet available', 'info_dict': { 'id': 'd4vqPjs7Q5EzVEis5QT3jd', 'title': 'How To: Service Cloud: Import External Content in Lightning Knowledge', @@ -300,6 +339,7 @@ class VidyardIE(VidyardBaseIE): }, { # <script ... id="vidyard_embed_code_DXx2sW4WaLA6hTdGFz7ja8" src="//play.vidyard.com/DXx2sW4WaLA6hTdGFz7ja8.js? 'url': 'http://videos.vivint.com/watch/DXx2sW4WaLA6hTdGFz7ja8', + 'skip': 'URL certificate expired 2025-09-10. Alternative script embed test case not yet available', 'info_dict': { 'id': 'DXx2sW4WaLA6hTdGFz7ja8', 'display_id': '2746529', @@ -317,11 +357,12 @@ class VidyardIE(VidyardBaseIE): 'ext': 'mp4', 'title': 'Lesson 1 - Opening an MT4 Account', 'description': 'Never heard of MetaTrader4? Here\'s the 411 on the popular trading platform!', - 'duration': 168, + 'duration': 168.16, 'thumbnail': 'https://cdn.vidyard.com/thumbnails/20291/IM-G2WXQR9VBLl2Cmzvftg_small.jpg', }, }, { # <iframe ... src="//play.vidyard.com/d61w8EQoZv1LDuPxDkQP2Q/type/background?preview=1" + 'skip': 'URL changed embed method to \'class="vidyard-player-embed"\'. 
An alternative iframe embed test case is not yet available', 'url': 'https://www.avaya.com/en/', 'info_dict': { # These values come from the generic extractor and don't matter @@ -354,46 +395,18 @@ class VidyardIE(VidyardBaseIE): }], 'playlist_count': 2, }, { - # <div class="vidyard-player-embed" data-uuid="vpCWTVHw3qrciLtVY94YkS" - 'url': 'https://www.gogoair.com/', + # <div class="vidyard-player-embed" data-uuid="pMk8eNCYzukzJaEPoo1Hgn" + # URL previously used iframe embeds and was used for that test case + 'url': 'https://www.avaya.com/en/', 'info_dict': { - # These values come from the generic extractor and don't matter - 'id': str, - 'title': str, - 'description': str, - 'age_limit': 0, + 'id': 'pMk8eNCYzukzJaEPoo1Hgn', + 'display_id': '47074153', + 'ext': 'mp4', + 'title': 'Avaya Infinity Helps Redefine the Contact Center as Your Connection Center', + 'description': r're:Our mission is to help you turn single engagements.+', + 'duration': 81.55, + 'thumbnail': 'https://cdn.vidyard.com/thumbnails/47074153/MZOLKhXdbiUWwp2ROnT5HaXL0oau6JtR_small.jpg', }, - 'playlist': [{ - 'info_dict': { - 'id': 'vpCWTVHw3qrciLtVY94YkS', - 'display_id': '40780699', - 'ext': 'mp4', - 'title': 'Upgrade to AVANCE 100% worth it - Jason Talley, Owner and Pilot, Testimonial', - 'description': 'md5:f609824839439a51990cef55ffc472aa', - 'duration': 70.737, - 'thumbnail': 'https://cdn.vidyard.com/thumbnails/40780699/KzjfYZz5MZl2gHF_e-4i2c6ib1cLDweQ_small.jpg', - }, - }, { - 'info_dict': { - 'id': 'xAmV9AsLbnitCw35paLBD8', - 'display_id': '31130867', - 'ext': 'mp4', - 'title': 'Brad Keselowski goes faster with Gogo AVANCE inflight Wi-Fi', - 'duration': 132.565, - 'thumbnail': 'https://cdn.vidyard.com/thumbnails/31130867/HknyDtLdm2Eih9JZ4A5XLjhfBX_6HRw5_small.jpg', - }, - }, { - 'info_dict': { - 'id': 'RkkrFRNxfP79nwCQavecpF', - 'display_id': '39009815', - 'ext': 'mp4', - 'title': 'Live Demo of Gogo Galileo', - 'description': 'md5:e2df497236f4e12c3fef8b392b5f23e0', - 'duration': 112.128, - 'thumbnail': 'https://cdn.vidyard.com/thumbnails/38144873/CWLlxfUbJ4Gh0ThbUum89IsEM4yupzMb_small.jpg', - }, - }], - 'playlist_count': 3, }] @classmethod diff --git a/yt_dlp/extractor/xhamster.py b/yt_dlp/extractor/xhamster.py index 4f9da4c2d8..f7d416dc70 100644 --- a/yt_dlp/extractor/xhamster.py +++ b/yt_dlp/extractor/xhamster.py @@ -2,6 +2,7 @@ import base64 import codecs import itertools import re +import string from .common import InfoExtractor from ..utils import ( @@ -22,6 +23,47 @@ from ..utils import ( ) +def to_signed_32(n): + return n % ((-1 if n < 0 else 1) * 2**32) + + +class _ByteGenerator: + def __init__(self, algo_id, seed): + try: + self._algorithm = getattr(self, f'_algo{algo_id}') + except AttributeError: + raise ExtractorError(f'Unknown algorithm ID: {algo_id}') + self._s = to_signed_32(seed) + + def _algo1(self, s): + # LCG (a=1664525, c=1013904223, m=2^32) + # Ref: https://en.wikipedia.org/wiki/Linear_congruential_generator + s = self._s = to_signed_32(s * 1664525 + 1013904223) + return s + + def _algo2(self, s): + # xorshift32 + # Ref: https://en.wikipedia.org/wiki/Xorshift + s = to_signed_32(s ^ (s << 13)) + s = to_signed_32(s ^ ((s & 0xFFFFFFFF) >> 17)) + s = self._s = to_signed_32(s ^ (s << 5)) + return s + + def _algo3(self, s): + # Weyl Sequence (k≈2^32*φ, m=2^32) + MurmurHash3 (fmix32) + # Ref: https://en.wikipedia.org/wiki/Weyl_sequence + # https://commons.apache.org/proper/commons-codec/jacoco/org.apache.commons.codec.digest/MurmurHash3.java.html + s = self._s = to_signed_32(s + 0x9e3779b9) + s = 
to_signed_32(s ^ ((s & 0xFFFFFFFF) >> 16))
+        s = to_signed_32(s * to_signed_32(0x85ebca6b))
+        s = to_signed_32(s ^ ((s & 0xFFFFFFFF) >> 13))
+        s = to_signed_32(s * to_signed_32(0xc2b2ae35))
+        return to_signed_32(s ^ ((s & 0xFFFFFFFF) >> 16))
+
+    def __next__(self):
+        return self._algorithm(self._s) & 0xFF
+
+
 class XHamsterIE(InfoExtractor):
     _DOMAINS = r'(?:xhamster\.(?:com|one|desi)|xhms\.pro|xhamster\d+\.(?:com|desi)|xhday\.com|xhvid\.com)'
     _VALID_URL = rf'''(?x)
@@ -146,6 +188,12 @@ class XHamsterIE(InfoExtractor):
     _XOR_KEY = b'xh7999'

     def _decipher_format_url(self, format_url, format_id):
+        if all(char in string.hexdigits for char in format_url):
+            byte_data = bytes.fromhex(format_url)
+            seed = int.from_bytes(byte_data[1:5], byteorder='little', signed=True)
+            byte_gen = _ByteGenerator(byte_data[0], seed)
+            return bytearray(byte ^ next(byte_gen) for byte in byte_data[5:]).decode('latin-1')
+
         cipher_type, _, ciphertext = try_call(
             lambda: base64.b64decode(format_url).decode().partition('_')) or [None] * 3
@@ -164,6 +212,16 @@
             self.report_warning(f'Skipping format "{format_id}": unsupported cipher type "{cipher_type}"')
             return None

+    def _fixup_formats(self, formats):
+        for f in formats:
+            if f.get('vcodec'):
+                continue
+            for vcodec in ('av1', 'h264'):
+                if any(f'.{vcodec}.' in f_url for f_url in (f['url'], f.get('manifest_url', ''))):
+                    f['vcodec'] = vcodec
+                    break
+        return formats
+
     def _real_extract(self, url):
         mobj = self._match_valid_url(url)
         video_id = mobj.group('id') or mobj.group('id_2')
@@ -312,7 +370,8 @@
             'comment_count': int_or_none(video.get('comments')),
             'age_limit': age_limit if age_limit is not None else 18,
             'categories': categories,
-            'formats': formats,
+            'formats': self._fixup_formats(formats),
+            '_format_sort_fields': ('res', 'proto', 'tbr'),
         }

         # Old layout fallback
diff --git a/yt_dlp/extractor/youtube/_base.py b/yt_dlp/extractor/youtube/_base.py
index 47e09c4f21..062301b5ff 100644
--- a/yt_dlp/extractor/youtube/_base.py
+++ b/yt_dlp/extractor/youtube/_base.py
@@ -220,6 +220,20 @@ INNERTUBE_CLIENTS = {
         },
         'PLAYER_PO_TOKEN_POLICY': PlayerPoTokenPolicy(required=False, recommended=True),
     },
+    # Doesn't require a PoToken for some reason
+    'android_sdkless': {
+        'INNERTUBE_CONTEXT': {
+            'client': {
+                'clientName': 'ANDROID',
+                'clientVersion': '20.10.38',
+                'userAgent': 'com.google.android.youtube/20.10.38 (Linux; U; Android 11) gzip',
+                'osName': 'Android',
+                'osVersion': '11',
+            },
+        },
+        'INNERTUBE_CONTEXT_CLIENT_NAME': 3,
+        'REQUIRE_JS_PLAYER': False,
+    },
     # YouTube Kids videos aren't returned on this client for some reason
     'android_vr': {
         'INNERTUBE_CONTEXT': {
@@ -366,11 +380,15 @@ def short_client_name(client_name):
     return join_nonempty(main[:4], ''.join(x[0] for x in parts)).upper()


-def build_innertube_clients():
-    THIRD_PARTY = {
+def _fix_embedded_ytcfg(ytcfg):
+    ytcfg['INNERTUBE_CONTEXT'].setdefault('thirdParty', {}).update({
         'embedUrl': 'https://www.youtube.com/',  # Can be any valid URL
-    }
-    BASE_CLIENTS = ('ios', 'web', 'tv', 'mweb', 'android')
+    })
+
+
+def build_innertube_clients():
+    # From highest to lowest priority
+    BASE_CLIENTS = ('tv', 'web', 'mweb', 'android', 'ios')
     priority = qualities(BASE_CLIENTS[::-1])

     for client, ytcfg in tuple(INNERTUBE_CLIENTS.items()):
@@ -391,10 +409,7 @@
         ytcfg['priority'] = 10 * priority(base_client)

         if variant == 'embedded':
-            ytcfg['INNERTUBE_CONTEXT']['thirdParty'] = THIRD_PARTY
-            ytcfg['priority'] -= 2
-        elif variant:
-
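# A round-trip sketch of the hex-encoded format-URL scheme that the new
# _decipher_format_url() branch above handles, using hypothetical values:
# byte 0 selects the keystream algorithm, bytes 1-4 are the little-endian
# signed seed, and the remaining bytes are the URL XORed with the keystream.
# Since XOR-ing twice with the same keystream is the identity, building a
# payload this way and feeding it back through the hex branch recovers the URL.
plain = b'https://example.com/video.mp4'
seed = 42
gen = _ByteGenerator(1, seed)  # algorithm ID 1: the LCG
payload = (bytes([1]) + seed.to_bytes(4, 'little', signed=True)
           + bytes(b ^ next(gen) for b in plain)).hex()
# _decipher_format_url(payload, ...) regenerates the same keystream from the
# stored algorithm ID and seed, so it decodes back to the original URL.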
ytcfg['priority'] -= 3 + _fix_embedded_ytcfg(ytcfg) build_innertube_clients() @@ -977,6 +992,10 @@ class YoutubeBaseInfoExtractor(InfoExtractor): ytcfg = self.extract_ytcfg(video_id, webpage) or {} + # See https://github.com/yt-dlp/yt-dlp/issues/14826 + if _split_innertube_client(client)[2] == 'embedded': + _fix_embedded_ytcfg(ytcfg) + # Workaround for https://github.com/yt-dlp/yt-dlp/issues/12563 # But it's not effective when logged-in if client == 'tv' and not self.is_authenticated: @@ -1196,7 +1215,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): except ValueError: return None - def _parse_time_text(self, text): + def _parse_time_text(self, text, report_failure=True): if not text: return dt_ = self.extract_relative_time(text) @@ -1211,7 +1230,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): (r'([a-z]+\s*\d{1,2},?\s*20\d{2})', r'(?:.+|^)(?:live|premieres|ed|ing)(?:\s*(?:on|for))?\s*(.+\d)'), text.lower(), 'time text', default=None))) - if text and timestamp is None and self._preferred_lang in (None, 'en'): + if report_failure and text and timestamp is None and self._preferred_lang in (None, 'en'): self.report_warning( f'Cannot parse localized time text "{text}"', only_once=True) return timestamp diff --git a/yt_dlp/extractor/youtube/_tab.py b/yt_dlp/extractor/youtube/_tab.py index 5870786978..72a66e0a1a 100644 --- a/yt_dlp/extractor/youtube/_tab.py +++ b/yt_dlp/extractor/youtube/_tab.py @@ -341,7 +341,11 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor): 'contentImage', *thumb_keys, 'thumbnailViewModel', 'image'), final_key='sources'), duration=traverse_obj(view_model, ( 'contentImage', 'thumbnailViewModel', 'overlays', ..., 'thumbnailOverlayBadgeViewModel', - 'thumbnailBadges', ..., 'thumbnailBadgeViewModel', 'text', {parse_duration}, any))) + 'thumbnailBadges', ..., 'thumbnailBadgeViewModel', 'text', {parse_duration}, any)), + timestamp=(traverse_obj(view_model, ( + 'metadata', 'lockupMetadataViewModel', 'metadata', 'contentMetadataViewModel', 'metadataRows', + ..., 'metadataParts', ..., 'text', 'content', {lambda t: self._parse_time_text(t, report_failure=False)}, any)) + if self._configuration_arg('approximate_date', ie_key=YoutubeTabIE) else None)) def _rich_entries(self, rich_grid_renderer): if lockup_view_model := traverse_obj(rich_grid_renderer, ('content', 'lockupViewModel', {dict})): diff --git a/yt_dlp/extractor/youtube/_video.py b/yt_dlp/extractor/youtube/_video.py index 79c183c6a5..8706439ae7 100644 --- a/yt_dlp/extractor/youtube/_video.py +++ b/yt_dlp/extractor/youtube/_video.py @@ -4,9 +4,7 @@ import collections import datetime as dt import functools import itertools -import json import math -import os.path import random import re import sys @@ -26,10 +24,11 @@ from ._base import ( _split_innertube_client, short_client_name, ) +from .jsc._builtin.ejs import _EJS_WIKI_URL +from .jsc._director import initialize_jsc_director +from .jsc.provider import JsChallengeRequest, JsChallengeType, NChallengeInput, SigChallengeInput from .pot._director import initialize_pot_director from .pot.provider import PoTokenContext, PoTokenRequest -from ..openload import PhantomJSwrapper -from ...jsinterp import JSInterpreter, LocalNameSpace from ...networking.exceptions import HTTPError from ...utils import ( NO_DEFAULT, @@ -39,13 +38,11 @@ from ...utils import ( clean_html, datetime_from_str, filesize_from_tbr, - filter_dict, float_or_none, format_field, get_first, int_or_none, join_nonempty, - js_to_json, mimetype2ext, orderedSet, parse_codecs, @@ -147,117 +144,9 @@ class 
YoutubeIE(YoutubeBaseInfoExtractor): r'/(?P<id>[a-zA-Z0-9_-]{8,})/player(?:_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?|-plasma-ias-(?:phone|tablet)-[a-z]{2}_[A-Z]{2}\.vflset)/base\.js$', r'\b(?P<id>vfl[a-zA-Z0-9_-]+)\b.*?\.js$', ) - _formats = { # NB: Used in YoutubeWebArchiveIE and GoogleDriveIE - '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'}, - '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'}, - '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'}, - '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'}, - '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'}, - '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'}, - '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, - '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, - # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well - '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'}, - '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'}, - '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'}, - '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'}, - '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'}, - '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'}, - '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'}, - '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, - '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, - - - # 3D videos - '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20}, - '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20}, - '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20}, - '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20}, - '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20}, - '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20}, - '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20}, - - # Apple HTTP Live Streaming - '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10}, - '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10}, - '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10}, - '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10}, - '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 
'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10}, - '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10}, - '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10}, - '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10}, - - # DASH mp4 video - '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'}, - '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'}, - '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'}, - '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'}, - '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'}, - '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559) - '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'}, - '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'}, - '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'}, - '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60}, - '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60}, - '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'}, - - # Dash mp4 audio - '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'}, - '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'}, - '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'}, - '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'}, - '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'}, - '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'}, - '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'}, - - # Dash webm - '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, - '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, - '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, - '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, - '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, - '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, - '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'}, - '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '246': {'ext': 
'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug) - '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, - '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, - '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, - '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'}, - '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, - - # Dash webm audio - '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128}, - '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256}, - - # Dash webm audio with opus inside - '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50}, - '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70}, - '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160}, - - # RTMP (unnamed) - '_rtmp': {'protocol': 'rtmp'}, - - # av01 video only formats sometimes served with "unknown" codecs - '394': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'av01.0.00M.08'}, - '395': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'av01.0.00M.08'}, - '396': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'av01.0.01M.08'}, - '397': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'av01.0.04M.08'}, - '398': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'av01.0.05M.08'}, - '399': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'av01.0.08M.08'}, - '400': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'av01.0.12M.08'}, - '401': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'av01.0.12M.08'}, - } _SUBTITLE_FORMATS = ('json3', 'srv1', 'srv2', 'srv3', 'ttml', 'srt', 'vtt') - _DEFAULT_CLIENTS = ('tv', 'web_safari', 'web') + _DEFAULT_CLIENTS = ('tv', 'android_sdkless', 'web') + _DEFAULT_JSLESS_CLIENTS = ('android_sdkless', 'web_safari', 'web') _DEFAULT_AUTHED_CLIENTS = ('tv', 'web_safari', 'web') # Premium does not require POT (except for subtitles) _DEFAULT_PREMIUM_CLIENTS = ('tv', 'web_creator', 'web') @@ -1815,7 +1704,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'params': {'skip_download': True}, }] - _DEFAULT_PLAYER_JS_VERSION = '20348@0004de42' # TODO: revert to 'actual' when n/sig is fixed + _DEFAULT_PLAYER_JS_VERSION = 'actual' _DEFAULT_PLAYER_JS_VARIANT = 'main' _PLAYER_JS_VARIANT_MAP = { 'main': 'player_ias.vflset/en_US/base.js', @@ -1829,8 +1718,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'tablet': 'player-plasma-ias-tablet-en_US.vflset/base.js', } _INVERSE_PLAYER_JS_VARIANT_MAP = {v: k for k, v in _PLAYER_JS_VARIANT_MAP.items()} - _NSIG_FUNC_CACHE_ID = 'nsig func' - _DUMMY_STRING = 'dlp_wins' @classmethod def suitable(cls, url): @@ -1850,6 +1737,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): def _real_initialize(self): 
super()._real_initialize() self._pot_director = initialize_pot_director(self) + self._jsc_director = initialize_jsc_director(self) def _prepare_live_from_start_formats(self, formats, video_id, live_start_time, url, webpage_url, smuggled_data, is_live): lock = threading.Lock() @@ -1867,7 +1755,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): microformats = traverse_obj( prs, (..., 'microformat', 'playerMicroformatRenderer'), expected_type=dict) - _, live_status, _, formats, _ = self._list_formats(video_id, microformats, video_details, prs, player_url) + _, live_status, formats, _ = self._list_formats(video_id, microformats, video_details, prs, player_url) is_live = live_status == 'is_live' start_time = time.time() @@ -2115,10 +2003,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): variant = re.sub(r'[^a-zA-Z0-9]', '_', remove_end(player_path, '.js')) return f'{player_id}-{variant}' - def _signature_cache_id(self, example_sig): - """ Return a string representation of a signature """ - return '.'.join(str(len(part)) for part in example_sig.split('.')) - @classmethod def _extract_player_info(cls, player_url): for player_re in cls._PLAYER_INFO_RE: @@ -2140,53 +2024,35 @@ class YoutubeIE(YoutubeBaseInfoExtractor): self._code_cache[player_js_key] = code return self._code_cache.get(player_js_key) - def _extract_signature_function(self, video_id, player_url, example_sig): - # Read from filesystem cache - func_id = join_nonempty( - self._player_js_cache_key(player_url), self._signature_cache_id(example_sig)) - assert os.path.basename(func_id) == func_id + def _sig_spec_cache_id(self, player_url, spec_id): + return join_nonempty(self._player_js_cache_key(player_url), str(spec_id)) - self.write_debug(f'Extracting signature function {func_id}') - cache_spec, code = self.cache.load('youtube-sigfuncs', func_id, min_ver='2025.07.21'), None + def _load_sig_spec_from_cache(self, spec_cache_id): + # This is almost identical to _load_player_data_from_cache + # I hate it + if spec_cache_id in self._player_cache: + return self._player_cache[spec_cache_id] + spec = self.cache.load('youtube-sigfuncs', spec_cache_id, min_ver='2025.07.21') + if spec: + self._player_cache[spec_cache_id] = spec + return spec - if not cache_spec: - code = self._load_player(video_id, player_url) - if code: - res = self._parse_sig_js(code, player_url) - test_string = ''.join(map(chr, range(len(example_sig)))) - cache_spec = [ord(c) for c in res(test_string)] - self.cache.store('youtube-sigfuncs', func_id, cache_spec) + def _store_sig_spec_to_cache(self, spec_cache_id, spec): + if spec_cache_id not in self._player_cache: + self._player_cache[spec_cache_id] = spec + self.cache.store('youtube-sigfuncs', spec_cache_id, spec) - return lambda s: ''.join(s[i] for i in cache_spec) + def _load_player_data_from_cache(self, name, player_url): + cache_id = (f'youtube-{name}', self._player_js_cache_key(player_url)) - def _parse_sig_js(self, jscode, player_url): - # Examples where `sig` is funcname: - # sig=function(a){a=a.split(""); ... ;return a.join("")}; - # ;c&&(c=sig(decodeURIComponent(c)),a.set(b,encodeURIComponent(c)));return a}; - # {var l=f,m=h.sp,n=sig(decodeURIComponent(h.s));l.set(m,encodeURIComponent(n))} - # sig=function(J){J=J.split(""); ... 
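# A sketch of the permutation-spec idea behind the sig-spec cache above: if the
# player's sig routine only shuffles character positions, running it once on an
# "identity string" chr(0)..chr(n-1) records where every index lands, and that
# list of indices can then be replayed on any real signature of the same
# length. `scramble` here is a hypothetical stand-in for the function
# extracted from the player JS.
def scramble(s):
    s = list(s)
    s.reverse()
    s[0], s[2] = s[2], s[0]
    return ''.join(s)

sig = 'abcdefgh'
spec = [ord(c) for c in scramble(''.join(map(chr, range(len(sig)))))]
assert ''.join(sig[i] for i in spec) == scramble(sig)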
;return J.join("")}; - # ;N&&(N=sig(decodeURIComponent(N)),J.set(R,encodeURIComponent(N)));return J}; - # {var H=u,k=f.sp,v=sig(decodeURIComponent(f.s));H.set(k,encodeURIComponent(v))} - funcname = self._search_regex( - (r'\b(?P<var>[a-zA-Z0-9_$]+)&&\((?P=var)=(?P<sig>[a-zA-Z0-9_$]{2,})\(decodeURIComponent\((?P=var)\)\)', - r'(?P<sig>[a-zA-Z0-9_$]+)\s*=\s*function\(\s*(?P<arg>[a-zA-Z0-9_$]+)\s*\)\s*{\s*(?P=arg)\s*=\s*(?P=arg)\.split\(\s*""\s*\)\s*;\s*[^}]+;\s*return\s+(?P=arg)\.join\(\s*""\s*\)', - r'(?:\b|[^a-zA-Z0-9_$])(?P<sig>[a-zA-Z0-9_$]{2,})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)(?:;[a-zA-Z0-9_$]{2}\.[a-zA-Z0-9_$]{2}\(a,\d+\))?', - # Old patterns - r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(', - r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(', - r'\bm=(?P<sig>[a-zA-Z0-9$]{2,})\(decodeURIComponent\(h\.s\)\)', - # Obsolete patterns - r'("|\')signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(', - r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(', - r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[a-zA-Z0-9$]+)\(', - r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(', - r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\('), - jscode, 'Initial JS player signature function name', group='sig') + if data := self._player_cache.get(cache_id): + return data - varname, global_list = self._interpret_player_js_global_var(jscode, player_url) - jsi = JSInterpreter(jscode) - initial_function = jsi.extract_function(funcname, filter_dict({varname: global_list})) - return lambda s: initial_function([s]) + data = self.cache.load(*cache_id, min_ver='2025.07.21') + if data: + self._player_cache[cache_id] = data + + return data def _cached(self, func, *cache_id): def inner(*args, **kwargs): @@ -2204,246 +2070,25 @@ class YoutubeIE(YoutubeBaseInfoExtractor): return ret return inner - def _load_player_data_from_cache(self, name, player_url): - cache_id = (f'youtube-{name}', self._player_js_cache_key(player_url)) - - if data := self._player_cache.get(cache_id): - return data - - data = self.cache.load(*cache_id, min_ver='2025.07.21') - if data: - self._player_cache[cache_id] = data - - return data - def _store_player_data_to_cache(self, name, player_url, data): cache_id = (f'youtube-{name}', self._player_js_cache_key(player_url)) if cache_id not in self._player_cache: self.cache.store(*cache_id, data) self._player_cache[cache_id] = data - def _decrypt_signature(self, s, video_id, player_url): - """Turn the encrypted s field into a working signature""" - extract_sig = self._cached( - self._extract_signature_function, 'sig', player_url, self._signature_cache_id(s)) - func = extract_sig(video_id, player_url, s) - return func(s) - - def _decrypt_nsig(self, s, video_id, player_url): - """Turn the encrypted n field into a working signature""" - if player_url is None: - raise ExtractorError('Cannot decrypt nsig without player_url') - player_url = urljoin('https://www.youtube.com', player_url) - - try: - jsi, _, func_code = self._extract_n_function_code(video_id, player_url) - except ExtractorError as e: - raise ExtractorError('Unable to extract nsig function code', cause=e) - - try: - extract_nsig = self._cached(self._extract_n_function_from_code, self._NSIG_FUNC_CACHE_ID, player_url) - ret = extract_nsig(jsi, func_code)(s) - except JSInterpreter.Exception as e: - try: - jsi = PhantomJSwrapper(self, 
timeout=5000) - except ExtractorError: - raise e - self.report_warning( - f'Native nsig extraction failed: Trying with PhantomJS\n' - f' n = {s} ; player = {player_url}', video_id) - self.write_debug(e, only_once=True) - - args, func_body = func_code - ret = jsi.execute( - f'console.log(function({", ".join(args)}) {{ {func_body} }}({s!r}));', - video_id=video_id, note='Executing signature code').strip() - - self.write_debug(f'Decrypted nsig {s} => {ret}') - # Only cache nsig func JS code to disk if successful, and only once - self._store_player_data_to_cache('nsig', player_url, func_code) - return ret - - def _extract_n_function_name(self, jscode, player_url=None): - varname, global_list = self._interpret_player_js_global_var(jscode, player_url) - if debug_str := traverse_obj(global_list, (lambda _, v: v.endswith('-_w8_'), any)): - pattern = r'''(?x) - \{\s*return\s+%s\[%d\]\s*\+\s*(?P<argname>[a-zA-Z0-9_$]+)\s*\} - ''' % (re.escape(varname), global_list.index(debug_str)) - if match := re.search(pattern, jscode): - pattern = r'''(?x) - \{\s*\)%s\(\s* - (?: - (?P<funcname_a>[a-zA-Z0-9_$]+)\s*noitcnuf\s* - |noitcnuf\s*=\s*(?P<funcname_b>[a-zA-Z0-9_$]+)(?:\s+rav)? - )[;\n] - ''' % re.escape(match.group('argname')[::-1]) - if match := re.search(pattern, jscode[match.start()::-1]): - a, b = match.group('funcname_a', 'funcname_b') - return (a or b)[::-1] - self.write_debug(join_nonempty( - 'Initial search was unable to find nsig function name', - player_url and f' player = {player_url}', delim='\n'), only_once=True) - - # Examples (with placeholders nfunc, narray, idx): - # * .get("n"))&&(b=nfunc(b) - # * .get("n"))&&(b=narray[idx](b) - # * b=String.fromCharCode(110),c=a.get(b))&&c=narray[idx](c) - # * a.D&&(b="nn"[+a.D],c=a.get(b))&&(c=narray[idx](c),a.set(b,c),narray.length||nfunc("") - # * a.D&&(PL(a),b=a.j.n||null)&&(b=narray[0](b),a.set("n",b),narray.length||nfunc("") - # * a.D&&(b="nn"[+a.D],vL(a),c=a.j[b]||null)&&(c=narray[idx](c),a.set(b,c),narray.length||nfunc("") - # * J.J="";J.url="";J.Z&&(R="nn"[+J.Z],mW(J),N=J.K[R]||null)&&(N=narray[idx](N),J.set(R,N))}}; - funcname, idx = self._search_regex( - r'''(?x) - (?: - \.get\("n"\)\)&&\(b=| - (?: - b=String\.fromCharCode\(110\)| - (?P<str_idx>[a-zA-Z0-9_$.]+)&&\(b="nn"\[\+(?P=str_idx)\] - ) - (?: - ,[a-zA-Z0-9_$]+\(a\))?,c=a\. 
- (?: - get\(b\)| - [a-zA-Z0-9_$]+\[b\]\|\|null - )\)&&\(c=| - \b(?P<var>[a-zA-Z0-9_$]+)= - )(?P<nfunc>[a-zA-Z0-9_$]+)(?:\[(?P<idx>\d+)\])?\([a-zA-Z]\) - (?(var),[a-zA-Z0-9_$]+\.set\((?:"n+"|[a-zA-Z0-9_$]+)\,(?P=var)\))''', - jscode, 'n function name', group=('nfunc', 'idx'), default=(None, None)) - if not funcname: - self.report_warning(join_nonempty( - 'Falling back to generic n function search', - player_url and f' player = {player_url}', delim='\n'), only_once=True) - return self._search_regex( - r'''(?xs) - ;\s*(?P<name>[a-zA-Z0-9_$]+)\s*=\s*function\([a-zA-Z0-9_$]+\) - \s*\{(?:(?!};).)+?return\s*(?P<q>["'])[\w-]+_w8_(?P=q)\s*\+\s*[a-zA-Z0-9_$]+''', - jscode, 'Initial JS player n function name', group='name') - elif not idx: - return funcname - - return json.loads(js_to_json(self._search_regex( - rf'var {re.escape(funcname)}\s*=\s*(\[.+?\])\s*[,;]', jscode, - f'Initial JS player n function list ({funcname}.{idx})')))[int(idx)] - - def _interpret_player_js_global_var(self, jscode, player_url): - """Returns tuple of: variable name string, variable value list""" - extract_global_var = self._cached(self._search_regex, 'js global array', player_url) - varcode, varname, varvalue = extract_global_var( - r'''(?x) - (?P<q1>["\'])use\s+strict(?P=q1);\s* - (?P<code> - var\s+(?P<name>[a-zA-Z0-9_$]+)\s*=\s* - (?P<value> - (?P<q2>["\'])(?:(?!(?P=q2)).|\\.)+(?P=q2) - \.split\((?P<q3>["\'])(?:(?!(?P=q3)).)+(?P=q3)\) - |\[\s*(?:(?P<q4>["\'])(?:(?!(?P=q4)).|\\.)*(?P=q4)\s*,?\s*)+\] - ) - )[;,] - ''', jscode, 'global variable', group=('code', 'name', 'value'), default=(None, None, None)) - if not varcode: - self.write_debug(join_nonempty( - 'No global array variable found in player JS', - player_url and f' player = {player_url}', delim='\n'), only_once=True) - return None, None - - jsi = JSInterpreter(varcode) - interpret_global_var = self._cached(jsi.interpret_expression, 'js global list', player_url) - return varname, interpret_global_var(varvalue, LocalNameSpace(), allow_recursion=10) - - def _fixup_n_function_code(self, argnames, nsig_code, jscode, player_url): - # Fixup global array - varname, global_list = self._interpret_player_js_global_var(jscode, player_url) - if varname and global_list: - nsig_code = f'var {varname}={json.dumps(global_list)}; {nsig_code}' - else: - varname = self._DUMMY_STRING - global_list = [] - - # Fixup typeof check - undefined_idx = global_list.index('undefined') if 'undefined' in global_list else r'\d+' - fixed_code = re.sub( - fr'''(?x) - ;\s*if\s*\(\s*typeof\s+[a-zA-Z0-9_$]+\s*===?\s*(?: - (["\'])undefined\1| - {re.escape(varname)}\[{undefined_idx}\] - )\s*\)\s*return\s+{re.escape(argnames[0])}; - ''', ';', nsig_code) - if fixed_code == nsig_code: - self.write_debug(join_nonempty( - 'No typeof statement found in nsig function code', - player_url and f' player = {player_url}', delim='\n'), only_once=True) - - # Fixup global funcs - jsi = JSInterpreter(fixed_code) - cache_id = (self._NSIG_FUNC_CACHE_ID, player_url) - try: - self._cached( - self._extract_n_function_from_code, *cache_id)(jsi, (argnames, fixed_code))(self._DUMMY_STRING) - except JSInterpreter.Exception: - self._player_cache.pop(cache_id, None) - - global_funcnames = jsi._undefined_varnames - debug_names = [] - jsi = JSInterpreter(jscode) - for func_name in global_funcnames: - try: - func_args, func_code = jsi.extract_function_code(func_name) - fixed_code = f'var {func_name} = function({", ".join(func_args)}) {{ {func_code} }}; {fixed_code}' - debug_names.append(func_name) - except Exception: - 
self.report_warning(join_nonempty( - f'Unable to extract global nsig function {func_name} from player JS', - player_url and f' player = {player_url}', delim='\n'), only_once=True) - - if debug_names: - self.write_debug(f'Extracted global nsig functions: {", ".join(debug_names)}') - - return argnames, fixed_code - - def _extract_n_function_code(self, video_id, player_url): - player_id = self._extract_player_info(player_url) - func_code = self._load_player_data_from_cache('nsig', player_url) - jscode = func_code or self._load_player(video_id, player_url) - jsi = JSInterpreter(jscode) - - if func_code: - return jsi, player_id, func_code - - func_name = self._extract_n_function_name(jscode, player_url=player_url) - - # XXX: Work around (a) global array variable, (b) `typeof` short-circuit, (c) global functions - func_code = self._fixup_n_function_code(*jsi.extract_function_code(func_name), jscode, player_url) - - return jsi, player_id, func_code - - def _extract_n_function_from_code(self, jsi, func_code): - func = jsi.extract_function_from_code(*func_code) - - def extract_nsig(s): - try: - ret = func([s]) - except JSInterpreter.Exception: - raise - except Exception as e: - raise JSInterpreter.Exception(traceback.format_exc(), cause=e) - - if ret.startswith('enhanced_except_') or ret.endswith(s): - raise JSInterpreter.Exception('Signature function returned an exception') - return ret - - return extract_nsig - def _extract_signature_timestamp(self, video_id, player_url, ytcfg=None, fatal=False): """ Extract signatureTimestamp (sts) Required to tell API what sig/player version is in use. """ + CACHE_ENABLED = False # TODO: enable when preprocessed player JS cache is solved/enabled + player_sts_override = self._get_player_js_version()[0] if player_sts_override: return int(player_sts_override) - if sts := traverse_obj(ytcfg, ('STS', {int_or_none})): + sts = traverse_obj(ytcfg, ('STS', {int_or_none})) + if sts: return sts if not player_url: @@ -2453,15 +2098,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor): self.report_warning(error_msg) return None - sts = self._load_player_data_from_cache('sts', player_url) - if sts: + if CACHE_ENABLED and (sts := self._load_player_data_from_cache('sts', player_url)): return sts if code := self._load_player(video_id, player_url, fatal=fatal): sts = int_or_none(self._search_regex( r'(?:signatureTimestamp|sts)\s*:\s*(?P<sts>[0-9]{5})', code, 'JS player signature timestamp', group='sts', fatal=fatal)) - if sts: + if CACHE_ENABLED and sts: self._store_player_data_to_cache('sts', player_url, sts) return sts @@ -2955,9 +2599,20 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # TODO(future): This validation should be moved into pot framework. # Some sort of middleware or validation provider perhaps? + gvs_bind_to_video_id = False + experiments = traverse_obj(ytcfg, ( + 'WEB_PLAYER_CONTEXT_CONFIGS', ..., 'serializedExperimentFlags', {urllib.parse.parse_qs})) + if 'true' in traverse_obj(experiments, (..., 'html5_generate_content_po_token', -1)): + self.write_debug( + f'{video_id}: Detected experiment to bind GVS PO Token to video id.', only_once=True) + gvs_bind_to_video_id = True + # GVS WebPO Token is bound to visitor_data / Visitor ID when logged out. # Must have visitor_data for it to function. 
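# How the serializedExperimentFlags check above sees its input (the flag
# string here is hypothetical): the flags arrive as a URL query string, so
# urllib.parse.parse_qs maps each flag name to a list of values, and the
# traversal's trailing -1 reads the last one.
from urllib.parse import parse_qs

flags = parse_qs('html5_generate_content_po_token=true&some_other_flag=0')
assert flags == {'html5_generate_content_po_token': ['true'], 'some_other_flag': ['0']}
assert flags['html5_generate_content_po_token'][-1] == 'true'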
- if player_url and context == _PoTokenContext.GVS and not visitor_data and not self.is_authenticated: + if ( + player_url and context == _PoTokenContext.GVS + and not visitor_data and not self.is_authenticated and not gvs_bind_to_video_id + ): self.report_warning( f'Unable to fetch GVS PO Token for {client} client: Missing required Visitor Data. ' f'You may need to pass Visitor Data with --extractor-args "youtube:visitor_data=XXX"', only_once=True) @@ -2971,7 +2626,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): config_po_token = self._get_config_po_token(client, context) if config_po_token: # GVS WebPO token is bound to data_sync_id / account Session ID when logged in. - if player_url and context == _PoTokenContext.GVS and not data_sync_id and self.is_authenticated: + if ( + player_url and context == _PoTokenContext.GVS + and not data_sync_id and self.is_authenticated and not gvs_bind_to_video_id + ): self.report_warning( f'Got a GVS PO Token for {client} client, but missing Data Sync ID for account. Formats may not work.' f'You may need to pass a Data Sync ID with --extractor-args "youtube:data_sync_id=XXX"') @@ -2997,6 +2655,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): video_id=video_id, video_webpage=webpage, required=required, + _gvs_bind_to_video_id=gvs_bind_to_video_id, **kwargs, ) @@ -3040,6 +2699,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): data_sync_id=kwargs.get('data_sync_id'), video_id=kwargs.get('video_id'), request_cookiejar=self._downloader.cookiejar, + _gvs_bind_to_video_id=kwargs.get('_gvs_bind_to_video_id', False), # All requests that would need to be proxied should be in the # context of www.youtube.com or the innertube host @@ -3113,9 +2773,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): def _get_requested_clients(self, url, smuggled_data, is_premium_subscriber): requested_clients = [] excluded_clients = [] + js_runtime_available = any(p.is_available() for p in self._jsc_director.providers.values()) default_clients = ( self._DEFAULT_PREMIUM_CLIENTS if is_premium_subscriber else self._DEFAULT_AUTHED_CLIENTS if self.is_authenticated + else self._DEFAULT_JSLESS_CLIENTS if not js_runtime_available else self._DEFAULT_CLIENTS ) allowed_clients = sorted( @@ -3132,6 +2794,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor): self.report_warning(f'Skipping unsupported client "{client}"') else: requested_clients.append(client) + + if not (requested_clients or excluded_clients) and default_clients == self._DEFAULT_JSLESS_CLIENTS: + self.report_warning( + f'No supported JavaScript runtime could be found. YouTube extraction without ' + f'a JS runtime has been deprecated, and some formats may be missing. ' + f'See {_EJS_WIKI_URL} for details on installing one. 
To silence this warning, ' + f'you can use --extractor-args "youtube:player_client=default"', only_once=True) + if not requested_clients: requested_clients.extend(default_clients) for excluded_client in excluded_clients: @@ -3266,12 +2936,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor): sd[STREAMING_DATA_INNERTUBE_CONTEXT] = innertube_context sd[STREAMING_DATA_FETCH_SUBS_PO_TOKEN] = fetch_subs_po_token_func sd[STREAMING_DATA_IS_PREMIUM_SUBSCRIBER] = is_premium_subscriber + sd[STREAMING_DATA_FETCHED_TIMESTAMP] = fetched_timestamp for f in traverse_obj(sd, (('formats', 'adaptiveFormats'), ..., {dict})): f[STREAMING_DATA_CLIENT_NAME] = client f[STREAMING_DATA_FETCH_GVS_PO_TOKEN] = fetch_gvs_po_token_func f[STREAMING_DATA_IS_PREMIUM_SUBSCRIBER] = is_premium_subscriber f[STREAMING_DATA_PLAYER_TOKEN_PROVIDED] = bool(player_po_token) - f[STREAMING_DATA_FETCHED_TIMESTAMP] = fetched_timestamp if deprioritize_pr: deprioritized_prs.append(pr) else: @@ -3351,12 +3021,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor): else: self.report_warning(msg, only_once=True) - def _extract_formats_and_subtitles(self, streaming_data, video_id, player_url, live_status, duration): + def _extract_formats_and_subtitles(self, video_id, player_responses, player_url, live_status, duration): CHUNK_SIZE = 10 << 20 PREFERRED_LANG_VALUE = 10 original_language = None itags, stream_ids = collections.defaultdict(set), [] itag_qualities, res_qualities = {}, {0: None} + subtitles = {} q = qualities([ # Normally tiny is the smallest video-only formats. But # audio-only formats with unknown quality may get tagged as tiny @@ -3364,7 +3035,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'audio_quality_ultralow', 'audio_quality_low', 'audio_quality_medium', 'audio_quality_high', # Audio only formats 'small', 'medium', 'large', 'hd720', 'hd1080', 'hd1440', 'hd2160', 'hd2880', 'highres', ]) - streaming_formats = traverse_obj(streaming_data, (..., ('formats', 'adaptiveFormats'), ...)) format_types = self._configuration_arg('formats') all_formats = 'duplicate' in format_types if self._configuration_arg('include_duplicate_formats'): @@ -3372,6 +3042,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): self._downloader.deprecated_feature('[youtube] include_duplicate_formats extractor argument is deprecated. 
' 'Use formats=duplicate extractor argument instead') + def solve_sig(s, spec): + return ''.join(s[i] for i in spec) + def build_fragments(f): return LazyList({ 'url': update_url_query(f['url'], { @@ -3391,279 +3064,361 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # For handling potential pre-playback required waiting period playback_wait = int_or_none(self._configuration_arg('playback_wait', [None])[0], default=6) - for fmt in streaming_formats: - client_name = fmt[STREAMING_DATA_CLIENT_NAME] - available_at = fmt[STREAMING_DATA_FETCHED_TIMESTAMP] + playback_wait - if fmt.get('targetDurationSec'): + for pr in player_responses: + streaming_data = traverse_obj(pr, 'streamingData') + if not streaming_data: continue + fetch_po_token_func = streaming_data[STREAMING_DATA_FETCH_GVS_PO_TOKEN] + is_premium_subscriber = streaming_data[STREAMING_DATA_IS_PREMIUM_SUBSCRIBER] + player_token_provided = streaming_data[STREAMING_DATA_PLAYER_TOKEN_PROVIDED] + client_name = streaming_data.get(STREAMING_DATA_CLIENT_NAME) + available_at = streaming_data[STREAMING_DATA_FETCHED_TIMESTAMP] + playback_wait + streaming_formats = traverse_obj(streaming_data, (('formats', 'adaptiveFormats'), ...)) - itag = str_or_none(fmt.get('itag')) - audio_track = fmt.get('audioTrack') or {} - stream_id = (itag, audio_track.get('id'), fmt.get('isDrc')) - if not all_formats: - if stream_id in stream_ids: - continue + def get_stream_id(fmt_stream): + return str_or_none(fmt_stream.get('itag')), traverse_obj(fmt_stream, 'audioTrack', 'id'), fmt_stream.get('isDrc') - quality = fmt.get('quality') - height = int_or_none(fmt.get('height')) - if quality == 'tiny' or not quality: - quality = fmt.get('audioQuality', '').lower() or quality - # The 3gp format (17) in android client has a quality of "small", - # but is actually worse than other formats - if itag == '17': - quality = 'tiny' - if quality: - if itag: - itag_qualities[itag] = quality - if height: - res_qualities[height] = quality + def process_format_stream(fmt_stream, proto, missing_pot): + nonlocal original_language + itag = str_or_none(fmt_stream.get('itag')) + audio_track = fmt_stream.get('audioTrack') or {} + quality = fmt_stream.get('quality') + height = int_or_none(fmt_stream.get('height')) + if quality == 'tiny' or not quality: + quality = fmt_stream.get('audioQuality', '').lower() or quality + # The 3gp format (17) in android client has a quality of "small", + # but is actually worse than other formats + if itag == '17': + quality = 'tiny' + if quality: + if itag: + itag_qualities[itag] = quality + if height: + res_qualities[height] = quality - display_name = audio_track.get('displayName') or '' - is_original = 'original' in display_name.lower() - is_descriptive = 'descriptive' in display_name.lower() - is_default = audio_track.get('audioIsDefault') - language_code = audio_track.get('id', '').split('.')[0] - if language_code and (is_original or (is_default and not original_language)): - original_language = language_code + display_name = audio_track.get('displayName') or '' + is_original = 'original' in display_name.lower() + is_descriptive = 'descriptive' in display_name.lower() + is_default = audio_track.get('audioIsDefault') + language_code = audio_track.get('id', '').split('.')[0] + if language_code and (is_original or (is_default and not original_language)): + original_language = language_code - has_drm = bool(fmt.get('drmFamilies')) + has_drm = bool(fmt_stream.get('drmFamilies')) - # FORMAT_STREAM_TYPE_OTF(otf=1) requires downloading the init fragment - # (adding 
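# What solve_sig() above computes, shown with a hypothetical cached spec:
# the spec is a list of source indices, one per output position, so applying
# it is a single pass over the scrambled signature.
spec = [2, 0, 3, 1]
assert ''.join('abcd'[i] for i in spec) == 'cadb'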
`&sq=0` to the URL) and parsing emsg box to determine the - # number of fragment that would subsequently requested with (`&sq=N`) - if fmt.get('type') == 'FORMAT_STREAM_TYPE_OTF' and not has_drm: - continue - - if has_drm: - msg = f'Some {client_name} client https formats have been skipped as they are DRM protected. ' - if client_name == 'tv': - msg += ( - f'{"Your account" if self.is_authenticated else "The current session"} may have ' - f'an experiment that applies DRM to all videos on the tv client. ' - f'See https://github.com/yt-dlp/yt-dlp/issues/12563 for more details.' - ) - self.report_warning(msg, video_id, only_once=True) - - fmt_url = fmt.get('url') - if not fmt_url: - sc = urllib.parse.parse_qs(fmt.get('signatureCipher')) - fmt_url = url_or_none(try_get(sc, lambda x: x['url'][0])) - encrypted_sig = try_get(sc, lambda x: x['s'][0]) - if not all((sc, fmt_url, player_url, encrypted_sig)): - msg = f'Some {client_name} client https formats have been skipped as they are missing a url. ' - if client_name in ('web', 'web_safari'): - msg += 'YouTube is forcing SABR streaming for this client. ' - else: + if has_drm: + msg = f'Some {client_name} client {proto} formats have been skipped as they are DRM protected. ' + if client_name == 'tv': msg += ( - f'YouTube may have enabled the SABR-only or Server-Side Ad Placement experiment for ' - f'{"your account" if self.is_authenticated else "the current session"}. ' + f'{"Your account" if self.is_authenticated else "The current session"} may have ' + f'an experiment that applies DRM to all videos on the tv client. ' + f'See https://github.com/yt-dlp/yt-dlp/issues/12563 for more details.' ) - msg += 'See https://github.com/yt-dlp/yt-dlp/issues/12482 for more details' self.report_warning(msg, video_id, only_once=True) - continue - try: - fmt_url += '&{}={}'.format( - traverse_obj(sc, ('sp', -1)) or 'signature', - self._decrypt_signature(encrypted_sig, video_id, player_url), - ) - except ExtractorError as e: + + tbr = float_or_none(fmt_stream.get('averageBitrate') or fmt_stream.get('bitrate'), 1000) + format_duration = traverse_obj(fmt_stream, ('approxDurationMs', {float_or_none(scale=1000)})) + # Some formats may have much smaller duration than others (possibly damaged during encoding) + # E.g. 2-nOtRESiUc Ref: https://github.com/yt-dlp/yt-dlp/issues/2823 + # Make sure to avoid false positives with small duration differences. + # E.g. __2ABJjxzNo, ySuUZEjARPY + is_damaged = try_call(lambda: format_duration < duration // 2) + if is_damaged: self.report_warning( - f'Signature extraction failed: Some formats may be missing\n' - f' player = {player_url}\n' - f' {bug_reports_message(before="")}', - video_id=video_id, only_once=True) - self.write_debug( - f'{video_id}: Signature extraction failure info:\n' - f' encrypted sig = {encrypted_sig}\n' - f' player = {player_url}') - self.write_debug(e, only_once=True) - continue + f'Some {client_name} client {proto} formats are possibly damaged. 
They will be deprioritized', video_id, only_once=True) - query = parse_qs(fmt_url) - if query.get('n'): - try: - decrypt_nsig = self._cached(self._decrypt_nsig, 'nsig', query['n'][0]) - fmt_url = update_url_query(fmt_url, { - 'n': decrypt_nsig(query['n'][0], video_id, player_url), - }) - except ExtractorError as e: - if player_url: - self.report_warning( - f'nsig extraction failed: Some formats may be missing\n' - f' n = {query["n"][0]} ; player = {player_url}\n' - f' {bug_reports_message(before="")}', - video_id=video_id, only_once=True) - self.write_debug(e, only_once=True) - else: - self.report_warning( - 'Cannot decrypt nsig without player_url: Some formats may be missing', - video_id=video_id, only_once=True) - continue + if missing_pot and 'missing_pot' not in self._configuration_arg('formats'): + self._report_pot_format_skipped(video_id, client_name, proto) + return None - tbr = float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) - format_duration = traverse_obj(fmt, ('approxDurationMs', {float_or_none(scale=1000)})) - # Some formats may have much smaller duration than others (possibly damaged during encoding) - # E.g. 2-nOtRESiUc Ref: https://github.com/yt-dlp/yt-dlp/issues/2823 - # Make sure to avoid false positives with small duration differences. - # E.g. __2ABJjxzNo, ySuUZEjARPY - is_damaged = try_call(lambda: format_duration < duration // 2) - if is_damaged: - self.report_warning( - 'Some formats are possibly damaged. They will be deprioritized', video_id, only_once=True) - - fetch_po_token_func = fmt[STREAMING_DATA_FETCH_GVS_PO_TOKEN] - pot_policy: GvsPoTokenPolicy = self._get_default_ytcfg(client_name)['GVS_PO_TOKEN_POLICY'][StreamingProtocol.HTTPS] - - require_po_token = ( - itag not in ['18'] - and gvs_pot_required( - pot_policy, fmt[STREAMING_DATA_IS_PREMIUM_SUBSCRIBER], - fmt[STREAMING_DATA_PLAYER_TOKEN_PROVIDED])) - - po_token = ( - gvs_pots.get(client_name) - or fetch_po_token_func(required=require_po_token or pot_policy.recommended)) - - if po_token: - fmt_url = update_url_query(fmt_url, {'pot': po_token}) - if client_name not in gvs_pots: - gvs_pots[client_name] = po_token - - if not po_token and require_po_token and 'missing_pot' not in self._configuration_arg('formats'): - self._report_pot_format_skipped(video_id, client_name, 'https') - continue - - name = fmt.get('qualityLabel') or quality.replace('audio_quality_', '') or '' - fps = int_or_none(fmt.get('fps')) or 0 - dct = { - 'asr': int_or_none(fmt.get('audioSampleRate')), - 'filesize': int_or_none(fmt.get('contentLength')), - 'format_id': f'{itag}{"-drc" if fmt.get("isDrc") else ""}', - 'format_note': join_nonempty( - join_nonempty(display_name, is_default and ' (default)', delim=''), - name, fmt.get('isDrc') and 'DRC', - try_get(fmt, lambda x: x['projectionType'].replace('RECTANGULAR', '').lower()), - try_get(fmt, lambda x: x['spatialAudioType'].replace('SPATIAL_AUDIO_TYPE_', '').lower()), - is_damaged and 'DAMAGED', require_po_token and not po_token and 'MISSING POT', - (self.get_param('verbose') or all_formats) and short_client_name(client_name), - delim=', '), - # Format 22 is likely to be damaged. 
See https://github.com/yt-dlp/yt-dlp/issues/3372 - 'source_preference': (-5 if itag == '22' else -1) + (100 if 'Premium' in name else 0), - 'fps': fps if fps > 1 else None, # For some formats, fps is wrongly returned as 1 - 'audio_channels': fmt.get('audioChannels'), - 'height': height, - 'quality': q(quality) - bool(fmt.get('isDrc')) / 2, - 'has_drm': has_drm, - 'tbr': tbr, - 'filesize_approx': filesize_from_tbr(tbr, format_duration), - 'url': fmt_url, - 'width': int_or_none(fmt.get('width')), - 'language': join_nonempty(language_code, 'desc' if is_descriptive else '') or None, - 'language_preference': PREFERRED_LANG_VALUE if is_original else 5 if is_default else -10 if is_descriptive else -1, - # Strictly de-prioritize damaged and 3gp formats - 'preference': -10 if is_damaged else -2 if itag == '17' else None, - } - mime_mobj = re.match( - r'((?:[^/]+)/(?:[^;]+))(?:;\s*codecs="([^"]+)")?', fmt.get('mimeType') or '') - if mime_mobj: - dct['ext'] = mimetype2ext(mime_mobj.group(1)) - dct.update(parse_codecs(mime_mobj.group(2))) - if itag: - itags[itag].add(('https', dct.get('language'))) - stream_ids.append(stream_id) - single_stream = 'none' in (dct.get('acodec'), dct.get('vcodec')) - if single_stream and dct.get('ext'): - dct['container'] = dct['ext'] + '_dash' - - # For handling potential pre-playback required waiting period - if live_status not in ('is_live', 'post_live'): - dct['available_at'] = available_at - - if (all_formats or 'dashy' in format_types) and dct['filesize']: - yield { - **dct, - 'format_id': f'{dct["format_id"]}-dashy' if all_formats else dct['format_id'], - 'protocol': 'http_dash_segments', - 'fragments': build_fragments(dct), + name = fmt_stream.get('qualityLabel') or quality.replace('audio_quality_', '') or '' + fps = int_or_none(fmt_stream.get('fps')) or 0 + dct = { + 'asr': int_or_none(fmt_stream.get('audioSampleRate')), + 'filesize': int_or_none(fmt_stream.get('contentLength')), + 'format_id': f'{itag}{"-drc" if fmt_stream.get("isDrc") else ""}', + 'format_note': join_nonempty( + join_nonempty(display_name, is_default and ' (default)', delim=''), + name, fmt_stream.get('isDrc') and 'DRC', + try_get(fmt_stream, lambda x: x['projectionType'].replace('RECTANGULAR', '').lower()), + try_get(fmt_stream, lambda x: x['spatialAudioType'].replace('SPATIAL_AUDIO_TYPE_', '').lower()), + is_damaged and 'DAMAGED', missing_pot and 'MISSING POT', + (self.get_param('verbose') or all_formats) and short_client_name(client_name), + delim=', '), + # Format 22 is likely to be damaged. 
See https://github.com/yt-dlp/yt-dlp/issues/3372 + 'source_preference': (-5 if itag == '22' else -1) + (100 if 'Premium' in name else 0), + 'fps': fps if fps > 1 else None, # For some formats, fps is wrongly returned as 1 + 'audio_channels': fmt_stream.get('audioChannels'), + 'height': height, + 'quality': q(quality) - bool(fmt_stream.get('isDrc')) / 2, + 'has_drm': has_drm, + 'tbr': tbr, + 'filesize_approx': filesize_from_tbr(tbr, format_duration), + 'width': int_or_none(fmt_stream.get('width')), + 'language': join_nonempty(language_code, 'desc' if is_descriptive else '') or None, + 'language_preference': PREFERRED_LANG_VALUE if is_original else 5 if is_default else -10 if is_descriptive else -1, + # Strictly de-prioritize damaged and 3gp formats + 'preference': -10 if is_damaged else -2 if itag == '17' else None, } - if all_formats or 'dashy' not in format_types: - dct['downloader_options'] = {'http_chunk_size': CHUNK_SIZE} - yield dct + mime_mobj = re.match( + r'((?:[^/]+)/(?:[^;]+))(?:;\s*codecs="([^"]+)")?', fmt_stream.get('mimeType') or '') + if mime_mobj: + dct['ext'] = mimetype2ext(mime_mobj.group(1)) + dct.update(parse_codecs(mime_mobj.group(2))) - needs_live_processing = self._needs_live_processing(live_status, duration) - skip_bad_formats = 'incomplete' not in format_types + single_stream = 'none' in (dct.get('acodec'), dct.get('vcodec')) + if single_stream and dct.get('ext'): + dct['container'] = dct['ext'] + '_dash' - skip_manifests = set(self._configuration_arg('skip')) - if (needs_live_processing == 'is_live' # These will be filtered out by YoutubeDL anyway - or (needs_live_processing and skip_bad_formats)): - skip_manifests.add('hls') - if skip_bad_formats and live_status == 'is_live' and needs_live_processing != 'is_live': - skip_manifests.add('dash') + return dct - def process_manifest_format(f, proto, client_name, itag, missing_pot): - key = (proto, f.get('language')) - if not all_formats and key in itags[itag]: - return False + def process_https_formats(): + proto = 'https' + https_fmts = [] + for fmt_stream in streaming_formats: + if fmt_stream.get('targetDurationSec'): + continue - # For handling potential pre-playback required waiting period - if live_status not in ('is_live', 'post_live'): - f['available_at'] = available_at + # FORMAT_STREAM_TYPE_OTF(otf=1) requires downloading the init fragment + # (adding `&sq=0` to the URL) and parsing emsg box to determine the + # number of fragment that would subsequently requested with (`&sq=N`) + if fmt_stream.get('type') == 'FORMAT_STREAM_TYPE_OTF' and not bool(fmt_stream.get('drmFamilies')): + continue - if f.get('source_preference') is None: - f['source_preference'] = -1 + stream_id = get_stream_id(fmt_stream) + if not all_formats: + if stream_id in stream_ids: + continue - # Deprioritize since its pre-merged m3u8 formats may have lower quality audio streams - if client_name == 'web_safari' and proto == 'hls' and live_status != 'is_live': - f['source_preference'] -= 1 + pot_policy: GvsPoTokenPolicy = self._get_default_ytcfg(client_name)['GVS_PO_TOKEN_POLICY'][StreamingProtocol.HTTPS] - if missing_pot: - f['format_note'] = join_nonempty(f.get('format_note'), 'MISSING POT', delim=' ') - f['source_preference'] -= 20 + require_po_token = ( + stream_id[0] not in ['18'] + and gvs_pot_required(pot_policy, is_premium_subscriber, player_token_provided)) - itags[itag].add(key) + po_token = ( + gvs_pots.get(client_name) + or fetch_po_token_func(required=require_po_token or pot_policy.recommended)) + if po_token: + if client_name not 
in gvs_pots: + gvs_pots[client_name] = po_token - if itag and all_formats: - f['format_id'] = f'{itag}-{proto}' - elif any(p != proto for p, _ in itags[itag]): - f['format_id'] = f'{itag}-{proto}' - elif itag: - f['format_id'] = itag + fmt_url = fmt_stream.get('url') + encrypted_sig, sc = None, None + if not fmt_url: + sc = urllib.parse.parse_qs(fmt_stream.get('signatureCipher')) + fmt_url = url_or_none(try_get(sc, lambda x: x['url'][0])) + encrypted_sig = try_get(sc, lambda x: x['s'][0]) + if not all((sc, fmt_url, player_url, encrypted_sig)): + msg = f'Some {client_name} client https formats have been skipped as they are missing a url. ' + if client_name in ('web', 'web_safari'): + msg += 'YouTube is forcing SABR streaming for this client. ' + else: + msg += ( + f'YouTube may have enabled the SABR-only or Server-Side Ad Placement experiment for ' + f'{"your account" if self.is_authenticated else "the current session"}. ' + ) + msg += 'See https://github.com/yt-dlp/yt-dlp/issues/12482 for more details' + self.report_warning(msg, video_id, only_once=True) + continue - if original_language and f.get('language') == original_language: - f['format_note'] = join_nonempty(f.get('format_note'), '(default)', delim=' ') - f['language_preference'] = PREFERRED_LANG_VALUE + fmt = process_format_stream(fmt_stream, proto, missing_pot=require_po_token and not po_token) + if not fmt: + continue - if itag in ('616', '235'): - f['format_note'] = join_nonempty(f.get('format_note'), 'Premium', delim=' ') - f['source_preference'] += 100 + # signature + # Attempt to load sig spec from cache + if encrypted_sig: + spec_cache_id = self._sig_spec_cache_id(player_url, len(encrypted_sig)) + spec = self._load_sig_spec_from_cache(spec_cache_id) + if spec: + self.write_debug(f'Using cached signature function {spec_cache_id}', only_once=True) + fmt_url += '&{}={}'.format(traverse_obj(sc, ('sp', -1)) or 'signature', + solve_sig(encrypted_sig, spec)) + else: + fmt['_jsc_s_challenge'] = encrypted_sig + fmt['_jsc_s_sc'] = sc - f['quality'] = q(itag_qualities.get(try_get(f, lambda f: f['format_id'].split('-')[0]), -1)) - if f['quality'] == -1 and f.get('height'): - f['quality'] = q(res_qualities[min(res_qualities, key=lambda x: abs(x - f['height']))]) - if self.get_param('verbose') or all_formats: - f['format_note'] = join_nonempty( - f.get('format_note'), short_client_name(client_name), delim=', ') - if f.get('fps') and f['fps'] <= 1: - del f['fps'] + # n challenge + query = parse_qs(fmt_url) + if query.get('n'): + n_challenge = query['n'][0] + if n_challenge in self._player_cache: + fmt_url = update_url_query(fmt_url, {'n': self._player_cache[n_challenge]}) + else: + fmt['_jsc_n_challenge'] = n_challenge - if proto == 'hls' and f.get('has_drm'): - f['has_drm'] = 'maybe' - f['source_preference'] -= 5 - return True + if po_token: + fmt_url = update_url_query(fmt_url, {'pot': po_token}) - subtitles = {} - for sd in streaming_data: - client_name = sd[STREAMING_DATA_CLIENT_NAME] - fetch_pot_func = sd[STREAMING_DATA_FETCH_GVS_PO_TOKEN] - is_premium_subscriber = sd[STREAMING_DATA_IS_PREMIUM_SUBSCRIBER] - has_player_token = sd[STREAMING_DATA_PLAYER_TOKEN_PROVIDED] + fmt['url'] = fmt_url - hls_manifest_url = 'hls' not in skip_manifests and sd.get('hlsManifestUrl') + if stream_id[0]: + itags[stream_id[0]].add((proto, fmt.get('language'))) + stream_ids.append(stream_id) + + # For handling potential pre-playback required waiting period + if live_status not in ('is_live', 'post_live'): + fmt['available_at'] = available_at + + 
https_fmts.append(fmt) + + # Bulk process sig/n handling + # Retrieve all JSC Sig and n requests for this player response in one go + n_challenges = {} + s_challenges = {} + for fmt in https_fmts: + # This will de-duplicate requests + n_challenge = fmt.pop('_jsc_n_challenge', None) + if n_challenge is not None: + n_challenges.setdefault(n_challenge, []).append(fmt) + + s_challenge = fmt.pop('_jsc_s_challenge', None) + if s_challenge is not None: + s_challenges.setdefault(len(s_challenge), {}).setdefault(s_challenge, []).append(fmt) + + challenge_requests = [] + if n_challenges: + challenge_requests.append(JsChallengeRequest( + type=JsChallengeType.N, + video_id=video_id, + input=NChallengeInput(challenges=list(n_challenges.keys()), player_url=player_url))) + if s_challenges: + challenge_requests.append(JsChallengeRequest( + type=JsChallengeType.SIG, + video_id=video_id, + input=SigChallengeInput(challenges=[''.join(map(chr, range(spec_id))) for spec_id in s_challenges], player_url=player_url))) + + if challenge_requests: + for _challenge_request, challenge_response in self._jsc_director.bulk_solve(challenge_requests): + if challenge_response.type == JsChallengeType.SIG: + for challenge, result in challenge_response.output.results.items(): + spec_id = len(challenge) + spec = [ord(c) for c in result] + self._store_sig_spec_to_cache(self._sig_spec_cache_id(player_url, spec_id), spec) + s_challenge_data = s_challenges.pop(spec_id, {}) + if not s_challenge_data: + continue + for s_challenge, fmts in s_challenge_data.items(): + solved_challenge = solve_sig(s_challenge, spec) + for fmt in fmts: + sc = fmt.pop('_jsc_s_sc') + fmt['url'] += '&{}={}'.format( + traverse_obj(sc, ('sp', -1)) or 'signature', + solved_challenge) + + elif challenge_response.type == JsChallengeType.N: + for challenge, result in challenge_response.output.results.items(): + fmts = n_challenges.pop(challenge, []) + for fmt in fmts: + self._player_cache[challenge] = result + fmt['url'] = update_url_query(fmt['url'], {'n': result}) + + # Raise warning if any challenge requests remain + # Depending on type of challenge request + + help_message = ( + 'Ensure you have a supported JavaScript runtime and ' + 'challenge solver script distribution installed. ' + 'Review any warnings presented before this message. ' + f'For more details, refer to {_EJS_WIKI_URL}') + + if s_challenges: + self.report_warning( + f'Signature solving failed: Some formats may be missing. {help_message}', + video_id=video_id, only_once=True) + if n_challenges: + self.report_warning( + f'n challenge solving failed: Some formats may be missing. 
{help_message}', + video_id=video_id, only_once=True) + + for cfmts in list(s_challenges.values()) + list(n_challenges.values()): + for fmt in cfmts: + if fmt in https_fmts: + https_fmts.remove(fmt) + + for fmt in https_fmts: + if (all_formats or 'dashy' in format_types) and fmt['filesize']: + yield { + **fmt, + 'format_id': f'{fmt["format_id"]}-dashy' if all_formats else fmt['format_id'], + 'protocol': 'http_dash_segments', + 'fragments': build_fragments(fmt), + } + if all_formats or 'dashy' not in format_types: + fmt['downloader_options'] = {'http_chunk_size': CHUNK_SIZE} + yield fmt + + yield from process_https_formats() + + needs_live_processing = self._needs_live_processing(live_status, duration) + skip_bad_formats = 'incomplete' not in format_types + + skip_manifests = set(self._configuration_arg('skip')) + if (needs_live_processing == 'is_live' # These will be filtered out by YoutubeDL anyway + or (needs_live_processing and skip_bad_formats)): + skip_manifests.add('hls') + + if skip_bad_formats and live_status == 'is_live' and needs_live_processing != 'is_live': + skip_manifests.add('dash') + + def process_manifest_format(f, proto, client_name, itag, missing_pot): + key = (proto, f.get('language')) + if not all_formats and key in itags[itag]: + return False + + # For handling potential pre-playback required waiting period + if live_status not in ('is_live', 'post_live'): + f['available_at'] = available_at + + if f.get('source_preference') is None: + f['source_preference'] = -1 + + # Deprioritize since its pre-merged m3u8 formats may have lower quality audio streams + if client_name == 'web_safari' and proto == 'hls' and live_status != 'is_live': + f['source_preference'] -= 1 + + if missing_pot: + f['format_note'] = join_nonempty(f.get('format_note'), 'MISSING POT', delim=' ') + f['source_preference'] -= 20 + + itags[itag].add(key) + + if itag and all_formats: + f['format_id'] = f'{itag}-{proto}' + elif any(p != proto for p, _ in itags[itag]): + f['format_id'] = f'{itag}-{proto}' + elif itag: + f['format_id'] = itag + + if original_language and f.get('language') == original_language: + f['format_note'] = join_nonempty(f.get('format_note'), '(default)', delim=' ') + f['language_preference'] = PREFERRED_LANG_VALUE + + if itag in ('616', '235'): + f['format_note'] = join_nonempty(f.get('format_note'), 'Premium', delim=' ') + f['source_preference'] += 100 + + f['quality'] = q(itag_qualities.get(try_get(f, lambda f: f['format_id'].split('-')[0]), -1)) + if f['quality'] == -1 and f.get('height'): + f['quality'] = q(res_qualities[min(res_qualities, key=lambda x: abs(x - f['height']))]) + if self.get_param('verbose') or all_formats: + f['format_note'] = join_nonempty( + f.get('format_note'), short_client_name(client_name), delim=', ') + if f.get('fps') and f['fps'] <= 1: + del f['fps'] + + if proto == 'hls' and f.get('has_drm'): + f['has_drm'] = 'maybe' + f['source_preference'] -= 5 + return True + + hls_manifest_url = 'hls' not in skip_manifests and streaming_data.get('hlsManifestUrl') if hls_manifest_url: pot_policy: GvsPoTokenPolicy = self._get_default_ytcfg( client_name)['GVS_PO_TOKEN_POLICY'][StreamingProtocol.HLS] - require_po_token = gvs_pot_required(pot_policy, is_premium_subscriber, has_player_token) - po_token = gvs_pots.get(client_name, fetch_pot_func(required=require_po_token or pot_policy.recommended)) + require_po_token = gvs_pot_required(pot_policy, is_premium_subscriber, player_token_provided) + po_token = gvs_pots.get(client_name, 
fetch_po_token_func(required=require_po_token or pot_policy.recommended)) if po_token: hls_manifest_url = hls_manifest_url.rstrip('/') + f'/pot/{po_token}' if client_name not in gvs_pots: @@ -3683,12 +3438,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor): r'/itag/(\d+)', f['url'], 'itag', default=None), require_po_token and not po_token): yield f - dash_manifest_url = 'dash' not in skip_manifests and sd.get('dashManifestUrl') + dash_manifest_url = 'dash' not in skip_manifests and streaming_data.get('dashManifestUrl') if dash_manifest_url: pot_policy: GvsPoTokenPolicy = self._get_default_ytcfg( client_name)['GVS_PO_TOKEN_POLICY'][StreamingProtocol.DASH] - require_po_token = gvs_pot_required(pot_policy, is_premium_subscriber, has_player_token) - po_token = gvs_pots.get(client_name, fetch_pot_func(required=require_po_token or pot_policy.recommended)) + require_po_token = gvs_pot_required(pot_policy, is_premium_subscriber, player_token_provided) + po_token = gvs_pots.get(client_name, fetch_po_token_func(required=require_po_token or pot_policy.recommended)) if po_token: dash_manifest_url = dash_manifest_url.rstrip('/') + f'/pot/{po_token}' if client_name not in gvs_pots: @@ -3708,7 +3463,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): r'/clen/(\d+)', f.get('fragment_base_url') or f['url'], 'file size', default=None)) if needs_live_processing: f['is_from_start'] = True - yield f yield subtitles @@ -3781,14 +3535,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor): else 'was_live' if live_content else 'not_live' if False in (is_live, live_content) else None) - streaming_data = traverse_obj(player_responses, (..., 'streamingData')) - *formats, subtitles = self._extract_formats_and_subtitles(streaming_data, video_id, player_url, live_status, duration) + *formats, subtitles = self._extract_formats_and_subtitles(video_id, player_responses, player_url, live_status, duration) if all(f.get('has_drm') for f in formats): # If there are no formats that definitely don't have DRM, all have DRM for f in formats: f['has_drm'] = True - return live_broadcast_details, live_status, streaming_data, formats, subtitles + return live_broadcast_details, live_status, formats, subtitles def _download_initial_data(self, video_id, webpage, webpage_client, webpage_ytcfg): initial_data = None @@ -3948,8 +3701,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): or int_or_none(get_first(microformats, 'lengthSeconds')) or parse_duration(search_meta('duration')) or None) - live_broadcast_details, live_status, streaming_data, formats, automatic_captions = \ + live_broadcast_details, live_status, formats, automatic_captions = \ self._list_formats(video_id, microformats, video_details, player_responses, player_url, duration) + streaming_data = traverse_obj(player_responses, (..., 'streamingData')) if live_status == 'post_live': self.write_debug(f'{video_id}: Video is in Post-Live Manifestless mode') @@ -4094,7 +3848,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): else 'video'), 'release_timestamp': live_start_time, '_format_sort_fields': ( # source_preference is lower for potentially damaged formats - 'quality', 'res', 'fps', 'hdr:12', 'source', 'vcodec', 'channels', 'acodec', 'lang', 'proto'), + 'quality', 'res', 'fps', 'hdr:12', 'source', + 'vcodec:vp9.2' if 'prefer-vp9-sort' in self.get_param('compat_opts', []) else 'vcodec', + 'channels', 'acodec', 'lang', 'proto'), } def get_lang_code(track): diff --git a/yt_dlp/extractor/youtube/jsc/README.md b/yt_dlp/extractor/youtube/jsc/README.md new file mode 100644 index 0000000000..1bd7a3ff8a 
--- /dev/null +++ b/yt_dlp/extractor/youtube/jsc/README.md @@ -0,0 +1,132 @@ +# YoutubeIE JS Challenge Provider Framework + +As part of the YouTube extractor, we have a framework for solving n/sig JS Challenges programmatically. This can be used by plugins. + +> [!TIP] +> If publishing a JS Challenge Provider plugin to GitHub, add the [yt-dlp-jsc-provider](https://github.com/topics/yt-dlp-jsc-provider) topic to your repository to help users find it. + + +## Public APIs + +- `yt_dlp.extractor.youtube.jsc.provider` + +Everything else is **internal-only** and no guarantees are made about the API stability. + +> [!WARNING] +> We will try our best to maintain stability with the public APIs. +> However, due to the nature of extractors and YouTube, we may need to remove or change APIs in the future. +> If you are using these APIs outside yt-dlp plugins, please account for this by importing them safely. + +## JS Challenge Provider + +`yt_dlp.extractor.youtube.jsc.provider` + +```python +from yt_dlp.extractor.youtube.jsc.provider import ( + register_provider, + register_preference, + JsChallengeProvider, + JsChallengeRequest, + JsChallengeResponse, + JsChallengeProviderError, + JsChallengeProviderRejectedRequest, + JsChallengeType, + JsChallengeProviderResponse, + NChallengeOutput, +) +from yt_dlp.utils import traverse_obj, Popen +import json +import subprocess +import typing + +@register_provider +class MyJsChallengeProviderJCP(JsChallengeProvider): # Provider class name must end with "JCP" + PROVIDER_VERSION = '0.2.1' + # Define a unique display name for the provider + PROVIDER_NAME = 'my-provider' + BUG_REPORT_LOCATION = 'https://issues.example.com/report' + + # Set supported challenge types. + # If None, the provider will handle all types. + _SUPPORTED_TYPES = [JsChallengeType.N] + + def is_available(self) -> bool: + """ + Check if the provider is available (e.g. all required dependencies are available) + This is used to determine if the provider should be used and to provide debug information. + + IMPORTANT: This method SHOULD NOT make any network requests or perform any expensive operations. + + Since this is called multiple times, we recommend caching the result. + """ + return True + + def close(self): + # Optional close hook, called when YoutubeDL is closed. + pass + + def _real_bulk_solve(self, requests: list[JsChallengeRequest]) -> typing.Generator[JsChallengeProviderResponse, None, None]: + # ℹ️ If you need to do additional validation on the requests. + # Raise yt_dlp.extractor.youtube.jsc.provider.JsChallengeProviderRejectedRequest if the request is not supported. + if len("something") > 255: + raise JsChallengeProviderRejectedRequest('Challenges longer than 255 are not supported', expected=True) + + + # ℹ️ Settings are pulled from extractor args passed to yt-dlp with the key `youtubejsc-<PROVIDER_KEY>`. + # For this example, the extractor arg would be: + # `--extractor-args "youtubejsc-myjschallengeprovider:bin_path=/path/to/bin"` + bin_path = self._configuration_arg( + 'bin_path', default=['/path/to/bin'])[0] + + # See below for logging guidelines + self.logger.trace(f'Using bin path: {bin_path}') + + for request in requests: + # You can use the _get_player method to get the player JS code if needed. + # This shares the same caching as the YouTube extractor, so it will not make unnecessary requests. 
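+            # request.input carries the player URL and the list of raw challenge strings to solve.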
+ player_js = self._get_player(request.video_id, request.input.player_url) + cmd = f'{bin_path} {request.input.challenges} {player_js}' + self.logger.info(f'Executing command: {cmd}') + stdout, _, ret = Popen.run(cmd, text=True, shell=True, stdout=subprocess.PIPE) + if ret != 0: + # ℹ️ If there is an error, raise JsChallengeProviderError. + # The request will be sent to the next provider if there is one. + # You can specify whether it is expected or not. If it is unexpected, + # the log will include a link to the bug report location (BUG_REPORT_LOCATION). + + # raise JsChallengeProviderError(f'Command returned error code {ret}', expected=False) + + # You can also only fail this specific request by returning a JsChallengeProviderResponse with the error. + # This will allow other requests to be processed by this provider. + yield JsChallengeProviderResponse( + request=request, + error=JsChallengeProviderError(f'Command returned error code {ret}', expected=False) + ) + + yield JsChallengeProviderResponse( + request=request, + response=JsChallengeResponse( + type=JsChallengeType.N, + output=NChallengeOutput(results=traverse_obj(json.loads(stdout))), + )) + + +# If there are multiple JS Challenge Providers that can handle the same JsChallengeRequest(s), +# you can define a preference function to increase/decrease the priority of providers. + +@register_preference(MyJsChallengeProviderJCP) +def my_provider_preference(provider: JsChallengeProvider, requests: list[JsChallengeRequest]) -> int: + return 50 +``` + +## Logging Guidelines + +- Use the `self.logger` object to log messages. +- When making HTTP requests or any other time-expensive operation, use `self.logger.info` to log a message to standard non-verbose output. + - This lets users know what is happening when a time-expensive operation is taking place. +- Technical information such as a command being executed should be logged to `self.logger.debug` +- Use `self.logger.trace` for very detailed information that is only useful for debugging to avoid cluttering the debug log. + +## Debugging + +- Use `-v --extractor-args "youtube:jsc_trace=true"` to enable JS Challenge debug output. 
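+
+## Using the public API outside plugins
+
+The import-safety warning above applies here: these names may move or change between yt-dlp releases. A minimal sketch of one defensive import pattern (the guard and fallback shown are illustrative assumptions, not an API contract):
+
+```python
+try:
+    from yt_dlp.extractor.youtube.jsc.provider import JsChallengeType
+except ImportError:
+    JsChallengeType = None  # yt-dlp version without the JSC framework
+
+if JsChallengeType is not None:
+    # Safe to use the JSC public API on this yt-dlp version
+    print(JsChallengeType.N.value)
+```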
diff --git a/yt_dlp/extractor/youtube/jsc/__init__.py b/yt_dlp/extractor/youtube/jsc/__init__.py new file mode 100644 index 0000000000..b0a0f037d0 --- /dev/null +++ b/yt_dlp/extractor/youtube/jsc/__init__.py @@ -0,0 +1,5 @@ +# Trigger import of built-in providers +from ._builtin.bun import BunJCP as _BunJCP # noqa: F401 +from ._builtin.deno import DenoJCP as _DenoJCP # noqa: F401 +from ._builtin.node import NodeJCP as _NodeJCP # noqa: F401 +from ._builtin.quickjs import QuickJSJCP as _QuickJSJCP # noqa: F401 diff --git a/yt_dlp/extractor/youtube/jsc/_builtin/__init__.py b/yt_dlp/extractor/youtube/jsc/_builtin/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/yt_dlp/extractor/youtube/jsc/_builtin/bun.py b/yt_dlp/extractor/youtube/jsc/_builtin/bun.py new file mode 100644 index 0000000000..8b0a6e5510 --- /dev/null +++ b/yt_dlp/extractor/youtube/jsc/_builtin/bun.py @@ -0,0 +1,150 @@ +from __future__ import annotations + +import os +import re +import shlex +import subprocess +import urllib.parse + +from yt_dlp.extractor.youtube.jsc._builtin.ejs import ( + _EJS_WIKI_URL, + EJSBaseJCP, + Script, + ScriptSource, + ScriptType, + ScriptVariant, +) +from yt_dlp.extractor.youtube.jsc._builtin.vendor import load_script +from yt_dlp.extractor.youtube.jsc.provider import ( + JsChallengeProvider, + JsChallengeProviderError, + JsChallengeRequest, + register_preference, + register_provider, +) +from yt_dlp.extractor.youtube.pot._provider import BuiltinIEContentProvider +from yt_dlp.extractor.youtube.pot.provider import provider_bug_report_message +from yt_dlp.utils import Popen +from yt_dlp.utils.networking import HTTPHeaderDict, clean_proxies + +# KNOWN ISSUES: +# - If node_modules is present and includes a requested lib, the version we request is ignored +# and whatever installed in node_modules is used. +# - No way to ignore existing node_modules, lock files, etc. +# - No sandboxing options available +# - Cannot detect if npm packages are cached without potentially downloading them. +# `--no-install` appears to disable the cache. +# - npm auto-install may fail with an integrity error when using HTTP proxies +# - npm auto-install HTTP proxy support may be limited on older Bun versions +# - Cannot disable the transpiler / specify lang for stdin + + +@register_provider +class BunJCP(EJSBaseJCP, BuiltinIEContentProvider): + PROVIDER_NAME = 'bun' + JS_RUNTIME_NAME = 'bun' + BUN_NPM_LIB_FILENAME = 'yt.solver.bun.lib.js' + SUPPORTED_PROXY_SCHEMES = ['http', 'https'] + + def _iter_script_sources(self): + yield from super()._iter_script_sources() + yield ScriptSource.BUILTIN, self._bun_npm_source + + def _bun_npm_source(self, script_type: ScriptType, /): + if script_type != ScriptType.LIB: + return None + if 'ejs:npm' not in self.ie.get_param('remote_components', []): + return self._skip_component('ejs:npm') + + # Check to see if the environment proxies are compatible with Bun npm source + if unsupported_scheme := self._check_env_proxies(self._get_env_options()): + self.logger.warning( + f'Bun NPM package downloads only support HTTP/HTTPS proxies; skipping remote NPM package downloads. ' + f'Provide another distribution of the challenge solver script or use ' + f'another JS runtime that supports "{unsupported_scheme}" proxies. 
' + f'For more information and alternatives, refer to {_EJS_WIKI_URL}') + return None + + # Bun-specific lib scripts that uses Bun autoimport + # https://bun.com/docs/runtime/autoimport + error_hook = lambda e: self.logger.warning( + f'Failed to read bun challenge solver lib script: {e}{provider_bug_report_message(self)}') + code = load_script( + self.BUN_NPM_LIB_FILENAME, error_hook=error_hook) + if code: + return Script(script_type, ScriptVariant.BUN_NPM, ScriptSource.BUILTIN, self._SCRIPT_VERSION, code) + return None + + def _check_env_proxies(self, env): + # check that the schemes of both HTTP_PROXY and HTTPS_PROXY are supported + for key in ('HTTP_PROXY', 'HTTPS_PROXY'): + proxy = env.get(key) + if not proxy: + continue + scheme = urllib.parse.urlparse(proxy).scheme.lower() + if scheme not in self.SUPPORTED_PROXY_SCHEMES: + return scheme + return None + + def _get_env_options(self) -> dict[str, str]: + options = os.environ.copy() # pass through existing bun env vars + request_proxies = self.ie._downloader.proxies.copy() + clean_proxies(request_proxies, HTTPHeaderDict()) + + # Apply 'all' proxy first, then allow per-scheme overrides + if request_proxies.get('all') is not None: + options['HTTP_PROXY'] = options['HTTPS_PROXY'] = request_proxies['all'] + for key, env in (('http', 'HTTP_PROXY'), ('https', 'HTTPS_PROXY')): + val = request_proxies.get(key) + if val is not None: + options[env] = val + if self.ie.get_param('nocheckcertificate'): + options['NODE_TLS_REJECT_UNAUTHORIZED'] = '0' + + # Disable Bun transpiler cache + options['BUN_RUNTIME_TRANSPILER_CACHE_PATH'] = '0' + + # Prevent segfault: <https://github.com/oven-sh/bun/issues/22901> + options.pop('JSC_useJIT', None) + if self.ejs_setting('jitless', ['false']) != ['false']: + options['BUN_JSC_useJIT'] = '0' + + return options + + def _run_js_runtime(self, stdin: str, /) -> str: + # https://bun.com/docs/cli/run + options = ['--no-addons', '--prefer-offline'] + if self._lib_script.variant == ScriptVariant.BUN_NPM: + # Enable auto-install even if node_modules is present + options.append('--install=fallback') + else: + options.append('--no-install') + cmd = [self.runtime_info.path, '--bun', 'run', *options, '-'] + self.logger.debug(f'Running bun: {shlex.join(cmd)}') + + with Popen( + cmd, + text=True, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + env=self._get_env_options(), + ) as proc: + stdout, stderr = proc.communicate_or_kill(stdin) + stderr = self._clean_stderr(stderr) + if proc.returncode or stderr: + msg = f'Error running bun process (returncode: {proc.returncode})' + if stderr: + msg = f'{msg}: {stderr.strip()}' + raise JsChallengeProviderError(msg) + return stdout + + def _clean_stderr(self, stderr): + return '\n'.join( + line for line in stderr.splitlines() + if not re.match(r'^Bun v\d+\.\d+\.\d+ \([\w\s]+\)$', line)) + + +@register_preference(BunJCP) +def preference(provider: JsChallengeProvider, requests: list[JsChallengeRequest]) -> int: + return 800 diff --git a/yt_dlp/extractor/youtube/jsc/_builtin/deno.py b/yt_dlp/extractor/youtube/jsc/_builtin/deno.py new file mode 100644 index 0000000000..55d8fc5ea4 --- /dev/null +++ b/yt_dlp/extractor/youtube/jsc/_builtin/deno.py @@ -0,0 +1,131 @@ +from __future__ import annotations + +import os +import re +import shlex +import subprocess + +from yt_dlp.extractor.youtube.jsc._builtin.ejs import ( + EJSBaseJCP, + Script, + ScriptSource, + ScriptType, + ScriptVariant, +) +from yt_dlp.extractor.youtube.jsc._builtin.vendor import load_script +from 
yt_dlp.extractor.youtube.jsc.provider import ( + JsChallengeProvider, + JsChallengeProviderError, + JsChallengeRequest, + register_preference, + register_provider, +) +from yt_dlp.extractor.youtube.pot._provider import BuiltinIEContentProvider +from yt_dlp.extractor.youtube.pot.provider import provider_bug_report_message +from yt_dlp.utils import Popen, remove_terminal_sequences +from yt_dlp.utils.networking import HTTPHeaderDict, clean_proxies + +# KNOWN ISSUES: +# - Can't avoid analysis cache: https://github.com/yt-dlp/yt-dlp/pull/14849#issuecomment-3475840821 + + +@register_provider +class DenoJCP(EJSBaseJCP, BuiltinIEContentProvider): + PROVIDER_NAME = 'deno' + JS_RUNTIME_NAME = 'deno' + + _DENO_BASE_OPTIONS = [ + '--ext=js', '--no-code-cache', '--no-prompt', '--no-remote', + '--no-lock', '--node-modules-dir=none', '--no-config', + ] + DENO_NPM_LIB_FILENAME = 'yt.solver.deno.lib.js' + _NPM_PACKAGES_CACHED = False + + def _iter_script_sources(self): + yield from super()._iter_script_sources() + yield ScriptSource.BUILTIN, self._deno_npm_source + + def _deno_npm_source(self, script_type: ScriptType, /): + if script_type != ScriptType.LIB: + return None + # Deno-specific lib scripts that use Deno NPM imports + error_hook = lambda e: self.logger.warning( + f'Failed to read deno challenge solver lib script: {e}{provider_bug_report_message(self)}') + code = load_script( + self.DENO_NPM_LIB_FILENAME, error_hook=error_hook) + if not code: + return None + if 'ejs:npm' not in self.ie.get_param('remote_components', []): + # We may still be able to continue if the npm packages are available/cached + self._NPM_PACKAGES_CACHED = self._npm_packages_cached(code) + if not self._NPM_PACKAGES_CACHED: + return self._skip_component('ejs:npm') + return Script(script_type, ScriptVariant.DENO_NPM, ScriptSource.BUILTIN, self._SCRIPT_VERSION, code) + + def _npm_packages_cached(self, stdin: str) -> bool: + # Check if npm packages are cached, so we can run without --remote-components ejs:npm + self.logger.debug('Checking if npm packages are cached') + try: + self._run_deno(stdin, [*self._DENO_BASE_OPTIONS, '--cached-only']) + except JsChallengeProviderError as e: + self.logger.trace(f'Deno npm packages not cached: {e}') + return False + return True + + def _run_js_runtime(self, stdin: str, /) -> str: + options = [*self._DENO_BASE_OPTIONS] + if self._lib_script.variant == ScriptVariant.DENO_NPM and self._NPM_PACKAGES_CACHED: + options.append('--cached-only') + elif self._lib_script.variant != ScriptVariant.DENO_NPM: + options.append('--no-npm') + options.append('--cached-only') + if self.ie.get_param('nocheckcertificate'): + options.append('--unsafely-ignore-certificate-errors') + # XXX: Convert this extractor-arg into a general option if/when a JSI framework is implemented + if self.ejs_setting('jitless', ['false']) != ['false']: + options.append('--v8-flags=--jitless') + return self._run_deno(stdin, options) + + def _get_env_options(self) -> dict[str, str]: + options = os.environ.copy() # pass through existing deno env vars + request_proxies = self.ie._downloader.proxies.copy() + clean_proxies(request_proxies, HTTPHeaderDict()) + # Apply 'all' proxy first, then allow per-scheme overrides + if 'all' in request_proxies and request_proxies['all'] is not None: + options['HTTP_PROXY'] = options['HTTPS_PROXY'] = request_proxies['all'] + for key, env in (('http', 'HTTP_PROXY'), ('https', 'HTTPS_PROXY'), ('no', 'NO_PROXY')): + if key in request_proxies and request_proxies[key] is not None: + options[env] = 
request_proxies[key] + return options + + def _run_deno(self, stdin, options) -> str: + cmd = [self.runtime_info.path, 'run', *options, '-'] + self.logger.debug(f'Running deno: {shlex.join(cmd)}') + with Popen( + cmd, + text=True, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + env=self._get_env_options(), + ) as proc: + stdout, stderr = proc.communicate_or_kill(stdin) + stderr = self._clean_stderr(stderr) + if proc.returncode or stderr: + msg = f'Error running deno process (returncode: {proc.returncode})' + if stderr: + msg = f'{msg}: {stderr.strip()}' + raise JsChallengeProviderError(msg) + return stdout + + def _clean_stderr(self, stderr): + return '\n'.join( + line for line in stderr.splitlines() + if not ( + re.match(r'^Download\s+https\S+$', remove_terminal_sequences(line)) + or re.match(r'DANGER: TLS certificate validation is disabled for all hostnames', remove_terminal_sequences(line)))) + + +@register_preference(DenoJCP) +def preference(provider: JsChallengeProvider, requests: list[JsChallengeRequest]) -> int: + return 1000 diff --git a/yt_dlp/extractor/youtube/jsc/_builtin/ejs.py b/yt_dlp/extractor/youtube/jsc/_builtin/ejs.py new file mode 100644 index 0000000000..52d7ecf170 --- /dev/null +++ b/yt_dlp/extractor/youtube/jsc/_builtin/ejs.py @@ -0,0 +1,326 @@ +from __future__ import annotations + +import collections +import dataclasses +import enum +import functools +import hashlib +import json + +from yt_dlp.dependencies import yt_dlp_ejs as _has_ejs +from yt_dlp.extractor.youtube.jsc._builtin import vendor +from yt_dlp.extractor.youtube.jsc.provider import ( + JsChallengeProvider, + JsChallengeProviderError, + JsChallengeProviderRejectedRequest, + JsChallengeProviderResponse, + JsChallengeResponse, + JsChallengeType, + NChallengeOutput, + SigChallengeOutput, +) +from yt_dlp.extractor.youtube.pot._provider import configuration_arg +from yt_dlp.extractor.youtube.pot.provider import provider_bug_report_message +from yt_dlp.utils._jsruntime import JsRuntimeInfo + +if _has_ejs: + import yt_dlp_ejs.yt.solver + +TYPE_CHECKING = False +if TYPE_CHECKING: + from collections.abc import Callable, Generator + + from yt_dlp.extractor.youtube.jsc.provider import JsChallengeRequest + +_EJS_WIKI_URL = 'https://github.com/yt-dlp/yt-dlp/wiki/EJS' + + +class ScriptType(enum.Enum): + LIB = 'lib' + CORE = 'core' + + +class ScriptVariant(enum.Enum): + UNKNOWN = 'unknown' + MINIFIED = 'minified' + UNMINIFIED = 'unminified' + DENO_NPM = 'deno_npm' + BUN_NPM = 'bun_npm' + + +class ScriptSource(enum.Enum): + PYPACKAGE = 'python package' # PyPI, PyInstaller exe, zipimport binary, etc + CACHE = 'cache' # GitHub release assets (cached) + WEB = 'web' # GitHub release assets (downloaded) + BUILTIN = 'builtin' # vendored (full core script; import-only lib script + NPM cache) + + +@dataclasses.dataclass +class Script: + type: ScriptType + variant: ScriptVariant + source: ScriptSource + version: str + code: str + + @functools.cached_property + def hash(self, /) -> str: + return hashlib.sha3_512(self.code.encode()).hexdigest() + + def __str__(self, /): + return f'<Script {self.type.value!r} v{self.version} (source: {self.source.value}) variant={self.variant.value!r} size={len(self.code)} hash={self.hash[:7]}...>' + + +class EJSBaseJCP(JsChallengeProvider): + JS_RUNTIME_NAME: str + _CACHE_SECTION = 'challenge-solver' + + _REPOSITORY = 'yt-dlp/ejs' + _SUPPORTED_TYPES = [JsChallengeType.N, JsChallengeType.SIG] + _SCRIPT_VERSION = vendor.VERSION + # TODO: Integration tests for each kind of 
scripts source + _ALLOWED_HASHES = { + ScriptType.LIB: { + ScriptVariant.UNMINIFIED: vendor.HASHES['yt.solver.lib.js'], + ScriptVariant.MINIFIED: vendor.HASHES['yt.solver.lib.min.js'], + ScriptVariant.DENO_NPM: vendor.HASHES['yt.solver.deno.lib.js'], + ScriptVariant.BUN_NPM: vendor.HASHES['yt.solver.bun.lib.js'], + }, + ScriptType.CORE: { + ScriptVariant.MINIFIED: vendor.HASHES['yt.solver.core.min.js'], + ScriptVariant.UNMINIFIED: vendor.HASHES['yt.solver.core.js'], + }, + } + + _SCRIPT_FILENAMES = { + ScriptType.LIB: 'yt.solver.lib.js', + ScriptType.CORE: 'yt.solver.core.js', + } + + _MIN_SCRIPT_FILENAMES = { + ScriptType.LIB: 'yt.solver.lib.min.js', + ScriptType.CORE: 'yt.solver.core.min.js', + } + + # currently disabled as files are large and we do not support rotation + _ENABLE_PREPROCESSED_PLAYER_CACHE = False + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._available = True + self.ejs_settings = self.ie.get_param('extractor_args', {}).get('youtube-ejs', {}) + + # Note: The following 3 args are for developer use only & intentionally not documented. + # - dev: bypasses verification of script hashes and versions. + # - repo: use a custom GitHub repository to fetch web script from. + # - script_version: use a custom script version. + # E.g. --extractor-args "youtube-ejs:dev=true;script_version=0.1.4" + + self.is_dev = self.ejs_setting('dev', ['false'])[0] == 'true' + if self.is_dev: + self.report_dev_option('You have enabled dev mode for EJS JCP Providers.') + + custom_repo = self.ejs_setting('repo', [None])[0] + if custom_repo: + self.report_dev_option(f'You have set a custom GitHub repository for EJS JCP Providers ({custom_repo}).') + self._REPOSITORY = custom_repo + + custom_version = self.ejs_setting('script_version', [None])[0] + if custom_version: + self.report_dev_option(f'You have set a custom EJS script version for EJS JCP Providers ({custom_version}).') + self._SCRIPT_VERSION = custom_version + + def ejs_setting(self, key, *args, **kwargs): + return configuration_arg(self.ejs_settings, key, *args, **kwargs) + + def report_dev_option(self, message: str): + self.ie.report_warning( + f'{message} ' + f'This is a developer option intended for debugging. 
\n' + ' If you experience any issues while using this option, ' + f'{self.ie._downloader._format_err("DO NOT", self.ie._downloader.Styles.ERROR)} open a bug report', only_once=True) + + def _run_js_runtime(self, stdin: str, /) -> str: + """To be implemented by subclasses""" + raise NotImplementedError + + def _real_bulk_solve(self, /, requests: list[JsChallengeRequest]): + grouped: dict[str, list[JsChallengeRequest]] = collections.defaultdict(list) + for request in requests: + grouped[request.input.player_url].append(request) + + for player_url, grouped_requests in grouped.items(): + player = None + if self._ENABLE_PREPROCESSED_PLAYER_CACHE: + player = self.ie.cache.load(self._CACHE_SECTION, f'player:{player_url}') + + if player: + cached = True + else: + cached = False + video_id = next((request.video_id for request in grouped_requests), None) + player = self._get_player(video_id, player_url) + + # NB: This output belongs after the player request + self.logger.info(f'Solving JS challenges using {self.JS_RUNTIME_NAME}') + + stdin = self._construct_stdin(player, cached, grouped_requests) + stdout = self._run_js_runtime(stdin) + output = json.loads(stdout) + if output['type'] == 'error': + raise JsChallengeProviderError(output['error']) + + if self._ENABLE_PREPROCESSED_PLAYER_CACHE and (preprocessed := output.get('preprocessed_player')): + self.ie.cache.store(self._CACHE_SECTION, f'player:{player_url}', preprocessed) + + for request, response_data in zip(grouped_requests, output['responses'], strict=True): + if response_data['type'] == 'error': + yield JsChallengeProviderResponse(request, None, response_data['error']) + else: + yield JsChallengeProviderResponse(request, JsChallengeResponse(request.type, ( + NChallengeOutput(response_data['data']) if request.type is JsChallengeType.N + else SigChallengeOutput(response_data['data'])))) + + def _construct_stdin(self, player: str, preprocessed: bool, requests: list[JsChallengeRequest], /) -> str: + json_requests = [{ + 'type': request.type.value, + 'challenges': request.input.challenges, + } for request in requests] + data = { + 'type': 'preprocessed', + 'preprocessed_player': player, + 'requests': json_requests, + } if preprocessed else { + 'type': 'player', + 'player': player, + 'requests': json_requests, + 'output_preprocessed': True, + } + return f'''\ + {self._lib_script.code} + Object.assign(globalThis, lib); + {self._core_script.code} + console.log(JSON.stringify(jsc({json.dumps(data)}))); + ''' + + # region: challenge solver script + + @functools.cached_property + def _lib_script(self, /): + return self._get_script(ScriptType.LIB) + + @functools.cached_property + def _core_script(self, /): + return self._get_script(ScriptType.CORE) + + def _get_script(self, script_type: ScriptType, /) -> Script: + skipped_components: list[_SkippedComponent] = [] + for _, from_source in self._iter_script_sources(): + script = from_source(script_type) + if not script: + continue + if isinstance(script, _SkippedComponent): + skipped_components.append(script) + continue + if not self.is_dev: + if script.version != self._SCRIPT_VERSION: + self.logger.warning( + f'Challenge solver {script_type.value} script version {script.version} ' + f'is not supported (source: {script.source.value}, variant: {script.variant}, supported version: {self._SCRIPT_VERSION})') + if script.source is ScriptSource.CACHE: + self.logger.debug('Clearing outdated cached script') + self.ie.cache.store(self._CACHE_SECTION, script_type.value, None) + continue + script_hashes = 
self._ALLOWED_HASHES[script.type].get(script.variant, []) + if script_hashes and script.hash not in script_hashes: + self.logger.warning( + f'Hash mismatch on challenge solver {script.type.value} script ' + f'(source: {script.source.value}, variant: {script.variant}, hash: {script.hash})!{provider_bug_report_message(self)}') + if script.source is ScriptSource.CACHE: + self.logger.debug('Clearing invalid cached script') + self.ie.cache.store(self._CACHE_SECTION, script_type.value, None) + continue + self.logger.debug( + f'Using challenge solver {script.type.value} script v{script.version} ' + f'(source: {script.source.value}, variant: {script.variant.value})') + break + + else: + self._available = False + raise JsChallengeProviderRejectedRequest( + f'No usable challenge solver {script_type.value} script available', + _skipped_components=skipped_components or None, + ) + + return script + + def _iter_script_sources(self) -> Generator[tuple[ScriptSource, Callable[[ScriptType], Script | None]]]: + yield from [ + (ScriptSource.PYPACKAGE, self._pypackage_source), + (ScriptSource.CACHE, self._cached_source), + (ScriptSource.BUILTIN, self._builtin_source), + (ScriptSource.WEB, self._web_release_source)] + + def _pypackage_source(self, script_type: ScriptType, /) -> Script | None: + if not _has_ejs: + return None + try: + code = yt_dlp_ejs.yt.solver.core() if script_type is ScriptType.CORE else yt_dlp_ejs.yt.solver.lib() + except Exception as e: + self.logger.warning( + f'Failed to load challenge solver {script_type.value} script from python package: {e}{provider_bug_report_message(self)}') + return None + return Script(script_type, ScriptVariant.MINIFIED, ScriptSource.PYPACKAGE, yt_dlp_ejs.version, code) + + def _cached_source(self, script_type: ScriptType, /) -> Script | None: + if data := self.ie.cache.load(self._CACHE_SECTION, script_type.value): + return Script(script_type, ScriptVariant(data['variant']), ScriptSource.CACHE, data['version'], data['code']) + return None + + def _builtin_source(self, script_type: ScriptType, /) -> Script | None: + error_hook = lambda _: self.logger.warning( + f'Failed to read builtin challenge solver {script_type.value} script{provider_bug_report_message(self)}') + code = vendor.load_script( + self._SCRIPT_FILENAMES[script_type], error_hook=error_hook) + if code: + return Script(script_type, ScriptVariant.UNMINIFIED, ScriptSource.BUILTIN, self._SCRIPT_VERSION, code) + return None + + def _web_release_source(self, script_type: ScriptType, /): + if 'ejs:github' not in (self.ie.get_param('remote_components') or ()): + return self._skip_component('ejs:github') + url = f'https://github.com/{self._REPOSITORY}/releases/download/{self._SCRIPT_VERSION}/{self._MIN_SCRIPT_FILENAMES[script_type]}' + if code := self.ie._download_webpage_with_retries( + url, None, f'[{self.logger.prefix}] Downloading challenge solver {script_type.value} script from {url}', + f'[{self.logger.prefix}] Failed to download challenge solver {script_type.value} script', fatal=False, + ): + self.ie.cache.store(self._CACHE_SECTION, script_type.value, { + 'version': self._SCRIPT_VERSION, + 'variant': ScriptVariant.MINIFIED.value, + 'code': code, + }) + return Script(script_type, ScriptVariant.MINIFIED, ScriptSource.WEB, self._SCRIPT_VERSION, code) + return None + + # endregion: challenge solver script + + @property + def runtime_info(self) -> JsRuntimeInfo | None: + runtime = self.ie._downloader._js_runtimes.get(self.JS_RUNTIME_NAME) + if not runtime or not runtime.info or not runtime.info.supported: + 
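+            # Runtime binary not found, version not detected, or version unsupported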
return None + return runtime.info + + def is_available(self, /) -> bool: + if not self.runtime_info: + return False + return self._available + + def _skip_component(self, component: str, /): + return _SkippedComponent(component, self.JS_RUNTIME_NAME) + + +@dataclasses.dataclass +class _SkippedComponent: + component: str + runtime: str diff --git a/yt_dlp/extractor/youtube/jsc/_builtin/node.py b/yt_dlp/extractor/youtube/jsc/_builtin/node.py new file mode 100644 index 0000000000..4294e91229 --- /dev/null +++ b/yt_dlp/extractor/youtube/jsc/_builtin/node.py @@ -0,0 +1,70 @@ +from __future__ import annotations + +import re +import shlex +import subprocess + +from yt_dlp.extractor.youtube.jsc._builtin.ejs import EJSBaseJCP +from yt_dlp.extractor.youtube.jsc.provider import ( + JsChallengeProvider, + JsChallengeProviderError, + JsChallengeRequest, + register_preference, + register_provider, +) +from yt_dlp.extractor.youtube.pot._provider import BuiltinIEContentProvider +from yt_dlp.utils import Popen + + +@register_provider +class NodeJCP(EJSBaseJCP, BuiltinIEContentProvider): + PROVIDER_NAME = 'node' + JS_RUNTIME_NAME = 'node' + + _ARGS = ['-'] + + def _run_js_runtime(self, stdin: str, /) -> str: + args = [] + + if self.ejs_setting('jitless', ['false']) != ['false']: + args.append('--v8-flags=--jitless') + + # Node permission flag changed from experimental to stable in v23.5.0 + if self.runtime_info.version_tuple < (23, 5, 0): + args.append('--experimental-permission') + args.append('--no-warnings=ExperimentalWarning') + else: + args.append('--permission') + + cmd = [self.runtime_info.path, *args, *self._ARGS] + self.logger.debug(f'Running node: {shlex.join(cmd)}') + with Popen( + cmd, + text=True, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) as proc: + stdout, stderr = proc.communicate_or_kill(stdin) + stderr = self._clean_stderr(stderr) + if proc.returncode or stderr: + msg = f'Error running node process (returncode: {proc.returncode})' + if stderr: + msg = f'{msg}: {stderr.strip()}' + raise JsChallengeProviderError(msg) + + return stdout + + def _clean_stderr(self, stderr): + return '\n'.join( + line for line in stderr.splitlines() + if not ( + re.match(r'^\[stdin\]:', line) + or re.match(r'^var jsc', line) + or '(Use `node --trace-uncaught ...` to show where the exception was thrown)' == line + or re.match(r'^Node\.js v\d+\.\d+\.\d+$', line))) + + +@register_preference(NodeJCP) +def preference(provider: JsChallengeProvider, requests: list[JsChallengeRequest]) -> int: + return 900 diff --git a/yt_dlp/extractor/youtube/jsc/_builtin/quickjs.py b/yt_dlp/extractor/youtube/jsc/_builtin/quickjs.py new file mode 100644 index 0000000000..f87725baed --- /dev/null +++ b/yt_dlp/extractor/youtube/jsc/_builtin/quickjs.py @@ -0,0 +1,59 @@ +from __future__ import annotations + +import pathlib +import shlex +import subprocess +import tempfile + +from yt_dlp.extractor.youtube.jsc._builtin.ejs import EJSBaseJCP +from yt_dlp.extractor.youtube.jsc.provider import ( + JsChallengeProvider, + JsChallengeProviderError, + JsChallengeRequest, + register_preference, + register_provider, +) +from yt_dlp.extractor.youtube.pot._provider import BuiltinIEContentProvider +from yt_dlp.utils import Popen + + +@register_provider +class QuickJSJCP(EJSBaseJCP, BuiltinIEContentProvider): + PROVIDER_NAME = 'quickjs' + JS_RUNTIME_NAME = 'quickjs' + + def _run_js_runtime(self, stdin: str, /) -> str: + if self.runtime_info.name == 'quickjs-ng': + self.logger.warning('QuickJS-NG is missing some 
optimizations making this very slow. Consider using upstream QuickJS instead.') + elif self.runtime_info.version_tuple < (2025, 4, 26): + self.logger.warning('Older QuickJS versions are missing optimizations making this very slow. Consider upgrading.') + + # QuickJS does not support reading from stdin, so we have to use a temp file + temp_file = tempfile.NamedTemporaryFile(mode='w', suffix='.js', delete=False, encoding='utf-8') + try: + temp_file.write(stdin) + temp_file.close() + cmd = [self.runtime_info.path, '--script', temp_file.name] + self.logger.debug(f'Running QuickJS: {shlex.join(cmd)}') + with Popen( + cmd, + text=True, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) as proc: + stdout, stderr = proc.communicate_or_kill() + if proc.returncode or stderr: + msg = f'Error running QuickJS process (returncode: {proc.returncode})' + if stderr: + msg = f'{msg}: {stderr.strip()}' + raise JsChallengeProviderError(msg) + finally: + pathlib.Path(temp_file.name).unlink(missing_ok=True) + + return stdout + + +@register_preference(QuickJSJCP) +def preference(provider: JsChallengeProvider, requests: list[JsChallengeRequest]) -> int: + return 850 diff --git a/yt_dlp/extractor/youtube/jsc/_builtin/vendor/__init__.py b/yt_dlp/extractor/youtube/jsc/_builtin/vendor/__init__.py new file mode 100644 index 0000000000..1a208ae975 --- /dev/null +++ b/yt_dlp/extractor/youtube/jsc/_builtin/vendor/__init__.py @@ -0,0 +1,17 @@ +import importlib.resources + +from yt_dlp.extractor.youtube.jsc._builtin.vendor._info import HASHES, VERSION + +__all__ = ['HASHES', 'VERSION', 'load_script'] + + +def load_script(filename, error_hook=None): + file = importlib.resources.files(__package__) / filename + if file.is_file(): + try: + return file.read_text(encoding='utf-8') + except (OSError, FileNotFoundError, ModuleNotFoundError) as e: + if error_hook: + error_hook(e) + return None + return None diff --git a/yt_dlp/extractor/youtube/jsc/_builtin/vendor/_info.py b/yt_dlp/extractor/youtube/jsc/_builtin/vendor/_info.py new file mode 100644 index 0000000000..f117438eb3 --- /dev/null +++ b/yt_dlp/extractor/youtube/jsc/_builtin/vendor/_info.py @@ -0,0 +1,11 @@ +# This file is generated by devscripts/update_ejs.py. DO NOT MODIFY! + +VERSION = '0.3.0' +HASHES = { + 'yt.solver.bun.lib.js': '6ff45e94de9f0ea936a183c48173cfa9ce526ee4b7544cd556428427c1dd53c8073ef0174e79b320252bf0e7c64b0032cc1cf9c4358f3fda59033b7caa01c241', + 'yt.solver.core.js': '0cd96b2d3f319dfa62cae689efa7d930ef1706e95f5921794db5089b2262957ec0a17d73938d8975ea35d0309cbfb4c8e4418d5e219837215eee242890c8b64d', + 'yt.solver.core.min.js': '370d627703002b4a73b10027702734a3de9484f6b56b739942be1dc2b60fee49dee2aa86ed117d1c8ae1ac55181d326481f1fe2e2e8d5211154d48e2a55dac51', + 'yt.solver.deno.lib.js': '9c8ee3ab6c23e443a5a951e3ac73c6b8c1c8fb34335e7058a07bf99d349be5573611de00536dcd03ecd3cf34014c4e9b536081de37af3637c5390c6a6fd6a0f0', + 'yt.solver.lib.js': '1ee3753a8222fc855f5c39db30a9ccbb7967dbe1fb810e86dc9a89aa073a0907f294c720e9b65427d560a35aa1ce6af19ef854d9126a05ca00afe03f72047733', + 'yt.solver.lib.min.js': '8420c259ad16e99ce004e4651ac1bcabb53b4457bf5668a97a9359be9a998a789fee8ab124ee17f91a2ea8fd84e0f2b2fc8eabcaf0b16a186ba734cf422ad053', +} diff --git a/yt_dlp/extractor/youtube/jsc/_builtin/vendor/yt.solver.bun.lib.js b/yt_dlp/extractor/youtube/jsc/_builtin/vendor/yt.solver.bun.lib.js new file mode 100644 index 0000000000..13cfd1539c --- /dev/null +++ b/yt_dlp/extractor/youtube/jsc/_builtin/vendor/yt.solver.bun.lib.js @@ -0,0 +1,9 @@ +/*! 
+ * SPDX-License-Identifier: Unlicense + * This file was automatically generated by https://github.com/yt-dlp/ejs + */ +const lib = { + meriyah: await import('meriyah@6.1.4'), + astring: await import('astring@1.9.0'), +}; +export { lib }; diff --git a/yt_dlp/extractor/youtube/jsc/_builtin/vendor/yt.solver.core.js b/yt_dlp/extractor/youtube/jsc/_builtin/vendor/yt.solver.core.js new file mode 100644 index 0000000000..1cfa0d6a6e --- /dev/null +++ b/yt_dlp/extractor/youtube/jsc/_builtin/vendor/yt.solver.core.js @@ -0,0 +1,550 @@ +/*! + * SPDX-License-Identifier: Unlicense + * This file was automatically generated by https://github.com/yt-dlp/ejs + */ +var jsc = (function (meriyah, astring) { + 'use strict'; + function matchesStructure(obj, structure) { + if (Array.isArray(structure)) { + if (!Array.isArray(obj)) { + return false; + } + return ( + structure.length === obj.length && + structure.every((value, index) => matchesStructure(obj[index], value)) + ); + } + if (typeof structure === 'object') { + if (!obj) { + return !structure; + } + if ('or' in structure) { + return structure.or.some((node) => matchesStructure(obj, node)); + } + if ('anykey' in structure && Array.isArray(structure.anykey)) { + const haystack = Array.isArray(obj) ? obj : Object.values(obj); + return structure.anykey.every((value) => + haystack.some((el) => matchesStructure(el, value)), + ); + } + for (const [key, value] of Object.entries(structure)) { + if (!matchesStructure(obj[key], value)) { + return false; + } + } + return true; + } + return structure === obj; + } + function isOneOf(value, ...of) { + return of.includes(value); + } + function _optionalChain$2(ops) { + let lastAccessLHS = undefined; + let value = ops[0]; + let i = 1; + while (i < ops.length) { + const op = ops[i]; + const fn = ops[i + 1]; + i += 2; + if ((op === 'optionalAccess' || op === 'optionalCall') && value == null) { + return undefined; + } + if (op === 'access' || op === 'optionalAccess') { + lastAccessLHS = value; + value = fn(value); + } else if (op === 'call' || op === 'optionalCall') { + value = fn((...args) => value.call(lastAccessLHS, ...args)); + lastAccessLHS = undefined; + } + } + return value; + } + const logicalExpression = { + type: 'ExpressionStatement', + expression: { + type: 'LogicalExpression', + left: { type: 'Identifier' }, + right: { + type: 'SequenceExpression', + expressions: [ + { + type: 'AssignmentExpression', + left: { type: 'Identifier' }, + operator: '=', + right: { + type: 'CallExpression', + callee: { type: 'Identifier' }, + arguments: { + or: [ + [ + { type: 'Literal' }, + { + type: 'CallExpression', + callee: { + type: 'Identifier', + name: 'decodeURIComponent', + }, + arguments: [{ type: 'Identifier' }], + optional: false, + }, + ], + [ + { + type: 'CallExpression', + callee: { + type: 'Identifier', + name: 'decodeURIComponent', + }, + arguments: [{ type: 'Identifier' }], + optional: false, + }, + ], + ], + }, + optional: false, + }, + }, + { type: 'CallExpression' }, + ], + }, + operator: '&&', + }, + }; + const identifier$1 = { + or: [ + { + type: 'ExpressionStatement', + expression: { + type: 'AssignmentExpression', + operator: '=', + left: { type: 'Identifier' }, + right: { type: 'FunctionExpression', params: [{}, {}, {}] }, + }, + }, + { type: 'FunctionDeclaration', params: [{}, {}, {}] }, + { + type: 'VariableDeclaration', + declarations: { + anykey: [ + { + type: 'VariableDeclarator', + init: { type: 'FunctionExpression', params: [{}, {}, {}] }, + }, + ], + }, + }, + ], + }; + function extract$1(node) { + 
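+    // Roughly: identifier$1 matches the three shapes the 3-parameter sig function
+    // takes in player code (assignment, declaration, or var-initialised function
+    // expression), and the code below rebuilds it as an AST for `sig => fn(sig)`
+    // (or `fn(arg0, sig)` when the call site passes two arguments).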
if (!matchesStructure(node, identifier$1)) { + return null; + } + let block; + if ( + node.type === 'ExpressionStatement' && + node.expression.type === 'AssignmentExpression' && + node.expression.right.type === 'FunctionExpression' + ) { + block = node.expression.right.body; + } else if (node.type === 'VariableDeclaration') { + for (const decl of node.declarations) { + if ( + decl.type === 'VariableDeclarator' && + _optionalChain$2([ + decl, + 'access', + (_) => _.init, + 'optionalAccess', + (_2) => _2.type, + ]) === 'FunctionExpression' && + _optionalChain$2([ + decl, + 'access', + (_3) => _3.init, + 'optionalAccess', + (_4) => _4.params, + 'access', + (_5) => _5.length, + ]) === 3 + ) { + block = decl.init.body; + break; + } + } + } else if (node.type === 'FunctionDeclaration') { + block = node.body; + } else { + return null; + } + const relevantExpression = _optionalChain$2([ + block, + 'optionalAccess', + (_6) => _6.body, + 'access', + (_7) => _7.at, + 'call', + (_8) => _8(-2), + ]); + if (!matchesStructure(relevantExpression, logicalExpression)) { + return null; + } + if ( + _optionalChain$2([ + relevantExpression, + 'optionalAccess', + (_9) => _9.type, + ]) !== 'ExpressionStatement' || + relevantExpression.expression.type !== 'LogicalExpression' || + relevantExpression.expression.right.type !== 'SequenceExpression' || + relevantExpression.expression.right.expressions[0].type !== + 'AssignmentExpression' + ) { + return null; + } + const call = relevantExpression.expression.right.expressions[0].right; + if (call.type !== 'CallExpression' || call.callee.type !== 'Identifier') { + return null; + } + return { + type: 'ArrowFunctionExpression', + params: [{ type: 'Identifier', name: 'sig' }], + body: { + type: 'CallExpression', + callee: { type: 'Identifier', name: call.callee.name }, + arguments: + call.arguments.length === 1 + ? 
[{ type: 'Identifier', name: 'sig' }] + : [call.arguments[0], { type: 'Identifier', name: 'sig' }], + optional: false, + }, + async: false, + expression: false, + generator: false, + }; + } + function _optionalChain$1(ops) { + let lastAccessLHS = undefined; + let value = ops[0]; + let i = 1; + while (i < ops.length) { + const op = ops[i]; + const fn = ops[i + 1]; + i += 2; + if ((op === 'optionalAccess' || op === 'optionalCall') && value == null) { + return undefined; + } + if (op === 'access' || op === 'optionalAccess') { + lastAccessLHS = value; + value = fn(value); + } else if (op === 'call' || op === 'optionalCall') { + value = fn((...args) => value.call(lastAccessLHS, ...args)); + lastAccessLHS = undefined; + } + } + return value; + } + const identifier = { + or: [ + { + type: 'VariableDeclaration', + kind: 'var', + declarations: { + anykey: [ + { + type: 'VariableDeclarator', + id: { type: 'Identifier' }, + init: { + type: 'ArrayExpression', + elements: [{ type: 'Identifier' }], + }, + }, + ], + }, + }, + { + type: 'ExpressionStatement', + expression: { + type: 'AssignmentExpression', + left: { type: 'Identifier' }, + operator: '=', + right: { + type: 'ArrayExpression', + elements: [{ type: 'Identifier' }], + }, + }, + }, + ], + }; + const catchBlockBody = [ + { + type: 'ReturnStatement', + argument: { + type: 'BinaryExpression', + left: { + type: 'MemberExpression', + object: { type: 'Identifier' }, + computed: true, + property: { type: 'Literal' }, + optional: false, + }, + right: { type: 'Identifier' }, + operator: '+', + }, + }, + ]; + function extract(node) { + if (!matchesStructure(node, identifier)) { + let name = null; + let block = null; + switch (node.type) { + case 'ExpressionStatement': { + if ( + node.expression.type === 'AssignmentExpression' && + node.expression.left.type === 'Identifier' && + node.expression.right.type === 'FunctionExpression' && + node.expression.right.params.length === 1 + ) { + name = node.expression.left.name; + block = node.expression.right.body; + } + break; + } + case 'FunctionDeclaration': { + if (node.params.length === 1) { + name = _optionalChain$1([ + node, + 'access', + (_) => _.id, + 'optionalAccess', + (_2) => _2.name, + ]); + block = node.body; + } + break; + } + } + if (!block || !name) { + return null; + } + const tryNode = block.body.at(-2); + if ( + _optionalChain$1([tryNode, 'optionalAccess', (_3) => _3.type]) !== + 'TryStatement' || + _optionalChain$1([ + tryNode, + 'access', + (_4) => _4.handler, + 'optionalAccess', + (_5) => _5.type, + ]) !== 'CatchClause' + ) { + return null; + } + const catchBody = tryNode.handler.body.body; + if (matchesStructure(catchBody, catchBlockBody)) { + return makeSolverFuncFromName(name); + } + return null; + } + if (node.type === 'VariableDeclaration') { + for (const declaration of node.declarations) { + if ( + declaration.type !== 'VariableDeclarator' || + !declaration.init || + declaration.init.type !== 'ArrayExpression' || + declaration.init.elements.length !== 1 + ) { + continue; + } + const [firstElement] = declaration.init.elements; + if (firstElement && firstElement.type === 'Identifier') { + return makeSolverFuncFromName(firstElement.name); + } + } + } else if (node.type === 'ExpressionStatement') { + const expr = node.expression; + if ( + expr.type === 'AssignmentExpression' && + expr.left.type === 'Identifier' && + expr.operator === '=' && + expr.right.type === 'ArrayExpression' && + expr.right.elements.length === 1 + ) { + const [firstElement] = expr.right.elements; + if (firstElement && 
firstElement.type === 'Identifier') { + return makeSolverFuncFromName(firstElement.name); + } + } + } + return null; + } + function makeSolverFuncFromName(name) { + return { + type: 'ArrowFunctionExpression', + params: [{ type: 'Identifier', name: 'n' }], + body: { + type: 'CallExpression', + callee: { type: 'Identifier', name: name }, + arguments: [{ type: 'Identifier', name: 'n' }], + optional: false, + }, + async: false, + expression: false, + generator: false, + }; + } + const setupNodes = meriyah.parse( + `\nif (typeof globalThis.XMLHttpRequest === "undefined") {\n globalThis.XMLHttpRequest = { prototype: {} };\n}\nconst window = Object.create(null);\nif (typeof URL === "undefined") {\n window.location = {\n hash: "",\n host: "www.youtube.com",\n hostname: "www.youtube.com",\n href: "https://www.youtube.com/watch?v=yt-dlp-wins",\n origin: "https://www.youtube.com",\n password: "",\n pathname: "/watch",\n port: "",\n protocol: "https:",\n search: "?v=yt-dlp-wins",\n username: "",\n };\n} else {\n window.location = new URL("https://www.youtube.com/watch?v=yt-dlp-wins");\n}\nif (typeof globalThis.document === "undefined") {\n globalThis.document = Object.create(null);\n}\nif (typeof globalThis.navigator === "undefined") {\n globalThis.navigator = Object.create(null);\n}\nif (typeof globalThis.self === "undefined") {\n globalThis.self = globalThis;\n}\n`, + ).body; + function _optionalChain(ops) { + let lastAccessLHS = undefined; + let value = ops[0]; + let i = 1; + while (i < ops.length) { + const op = ops[i]; + const fn = ops[i + 1]; + i += 2; + if ((op === 'optionalAccess' || op === 'optionalCall') && value == null) { + return undefined; + } + if (op === 'access' || op === 'optionalAccess') { + lastAccessLHS = value; + value = fn(value); + } else if (op === 'call' || op === 'optionalCall') { + value = fn((...args) => value.call(lastAccessLHS, ...args)); + lastAccessLHS = undefined; + } + } + return value; + } + function preprocessPlayer(data) { + const ast = meriyah.parse(data); + const body = ast.body; + const block = (() => { + switch (body.length) { + case 1: { + const func = body[0]; + if ( + _optionalChain([func, 'optionalAccess', (_) => _.type]) === + 'ExpressionStatement' && + func.expression.type === 'CallExpression' && + func.expression.callee.type === 'MemberExpression' && + func.expression.callee.object.type === 'FunctionExpression' + ) { + return func.expression.callee.object.body; + } + break; + } + case 2: { + const func = body[1]; + if ( + _optionalChain([func, 'optionalAccess', (_2) => _2.type]) === + 'ExpressionStatement' && + func.expression.type === 'CallExpression' && + func.expression.callee.type === 'FunctionExpression' + ) { + const block = func.expression.callee.body; + block.body.splice(0, 1); + return block; + } + break; + } + } + throw 'unexpected structure'; + })(); + const found = { n: [], sig: [] }; + const plainExpressions = block.body.filter((node) => { + const n = extract(node); + if (n) { + found.n.push(n); + } + const sig = extract$1(node); + if (sig) { + found.sig.push(sig); + } + if (node.type === 'ExpressionStatement') { + if (node.expression.type === 'AssignmentExpression') { + return true; + } + return node.expression.type === 'Literal'; + } + return true; + }); + block.body = plainExpressions; + for (const [name, options] of Object.entries(found)) { + const unique = new Set(options.map((x) => JSON.stringify(x))); + if (unique.size !== 1) { + const message = `found ${unique.size} ${name} function possibilities`; + throw ( + message + + 
(unique.size + ? `: ${options.map((x) => astring.generate(x)).join(', ')}` + : '') + ); + } + plainExpressions.push({ + type: 'ExpressionStatement', + expression: { + type: 'AssignmentExpression', + operator: '=', + left: { + type: 'MemberExpression', + computed: false, + object: { type: 'Identifier', name: '_result' }, + property: { type: 'Identifier', name: name }, + }, + right: options[0], + }, + }); + } + ast.body.splice(0, 0, ...setupNodes); + return astring.generate(ast); + } + function getFromPrepared(code) { + const resultObj = { n: null, sig: null }; + Function('_result', code)(resultObj); + return resultObj; + } + function main(input) { + const preprocessedPlayer = + input.type === 'player' + ? preprocessPlayer(input.player) + : input.preprocessed_player; + const solvers = getFromPrepared(preprocessedPlayer); + const responses = input.requests.map((input) => { + if (!isOneOf(input.type, 'n', 'sig')) { + return { type: 'error', error: `Unknown request type: ${input.type}` }; + } + const solver = solvers[input.type]; + if (!solver) { + return { + type: 'error', + error: `Failed to extract ${input.type} function`, + }; + } + try { + return { + type: 'result', + data: Object.fromEntries( + input.challenges.map((challenge) => [challenge, solver(challenge)]), + ), + }; + } catch (error) { + return { + type: 'error', + error: + error instanceof Error + ? `${error.message}\n${error.stack}` + : `${error}`, + }; + } + }); + const output = { type: 'result', responses: responses }; + if (input.type === 'player' && input.output_preprocessed) { + output.preprocessed_player = preprocessedPlayer; + } + return output; + } + return main; +})(meriyah, astring); diff --git a/yt_dlp/extractor/youtube/jsc/_builtin/vendor/yt.solver.deno.lib.js b/yt_dlp/extractor/youtube/jsc/_builtin/vendor/yt.solver.deno.lib.js new file mode 100644 index 0000000000..6a0062325f --- /dev/null +++ b/yt_dlp/extractor/youtube/jsc/_builtin/vendor/yt.solver.deno.lib.js @@ -0,0 +1,9 @@ +/*! 
+ * SPDX-License-Identifier: Unlicense + * This file was automatically generated by https://github.com/yt-dlp/ejs + */ +const lib = { + meriyah: await import('npm:meriyah@6.1.4'), + astring: await import('npm:astring@1.9.0'), +}; +export { lib }; diff --git a/yt_dlp/extractor/youtube/jsc/_director.py b/yt_dlp/extractor/youtube/jsc/_director.py new file mode 100644 index 0000000000..77857edfd4 --- /dev/null +++ b/yt_dlp/extractor/youtube/jsc/_director.py @@ -0,0 +1,287 @@ +from __future__ import annotations + +import collections +import dataclasses +import typing + +from yt_dlp.extractor.youtube.jsc._builtin.ejs import _EJS_WIKI_URL +from yt_dlp.extractor.youtube.jsc._registry import ( + _jsc_preferences, + _jsc_providers, +) +from yt_dlp.extractor.youtube.jsc.provider import ( + JsChallengeProvider, + JsChallengeProviderError, + JsChallengeProviderRejectedRequest, + JsChallengeProviderResponse, + JsChallengeRequest, + JsChallengeResponse, + JsChallengeType, + NChallengeInput, + NChallengeOutput, + SigChallengeInput, + SigChallengeOutput, +) +from yt_dlp.extractor.youtube.pot._director import YoutubeIEContentProviderLogger, provider_display_list +from yt_dlp.extractor.youtube.pot._provider import ( + IEContentProviderLogger, +) +from yt_dlp.extractor.youtube.pot.provider import ( + provider_bug_report_message, +) + +if typing.TYPE_CHECKING: + from collections.abc import Iterable + + from yt_dlp.extractor.youtube.jsc._builtin.ejs import _SkippedComponent + from yt_dlp.extractor.youtube.jsc.provider import Preference as JsChallengePreference + + +class JsChallengeRequestDirector: + + def __init__(self, logger: IEContentProviderLogger): + self.providers: dict[str, JsChallengeProvider] = {} + self.preferences: list[JsChallengePreference] = [] + self.logger = logger + + def register_provider(self, provider: JsChallengeProvider): + self.providers[provider.PROVIDER_KEY] = provider + + def register_preference(self, preference: JsChallengePreference): + self.preferences.append(preference) + + def _get_providers(self, requests: list[JsChallengeRequest]) -> Iterable[JsChallengeProvider]: + """Sorts available providers by preference, given a request""" + preferences = { + provider: sum(pref(provider, requests) for pref in self.preferences) + for provider in self.providers.values() + } + if self.logger.log_level <= self.logger.LogLevel.TRACE: + # calling is_available() for every JS Challenge provider upfront may have some overhead + self.logger.trace(f'JS Challenge Providers: {provider_display_list(self.providers.values())}') + self.logger.trace('JS Challenge Provider preferences for this request: {}'.format(', '.join( + f'{provider.PROVIDER_NAME}={pref}' for provider, pref in preferences.items()))) + + return ( + provider for provider in sorted( + self.providers.values(), key=preferences.get, reverse=True) + if provider.is_available() + ) + + def _handle_error(self, e: Exception, provider: JsChallengeProvider, requests: list[JsChallengeRequest]): + if isinstance(e, JsChallengeProviderRejectedRequest): + self.logger.trace( + f'JS Challenge Provider "{provider.PROVIDER_NAME}" rejected ' + f'{"this request" if len(requests) == 1 else f"{len(requests)} requests"}, ' + f'trying next available provider. 
Reason: {e}', + ) + elif isinstance(e, JsChallengeProviderError): + if len(requests) == 1: + self.logger.warning( + f'Error solving {requests[0].type.value} challenge request using "{provider.PROVIDER_NAME}" provider: {e}.\n' + f' input = {requests[0].input}\n' + f' {(provider_bug_report_message(provider, before="") if not e.expected else "")}') + else: + self.logger.warning( + f'Error solving {len(requests)} challenge requests using "{provider.PROVIDER_NAME}" provider: {e}.\n' + f' requests = {requests}\n' + f' {(provider_bug_report_message(provider, before="") if not e.expected else "")}') + else: + self.logger.error( + f'Unexpected error solving {len(requests)} challenge request(s) using "{provider.PROVIDER_NAME}" provider: {e!r}\n' + f' requests = {requests}\n' + f' {provider_bug_report_message(provider, before="")}', cause=e) + + def bulk_solve(self, requests: list[JsChallengeRequest]) -> list[tuple[JsChallengeRequest, JsChallengeResponse]]: + """Solves multiple JS Challenges in bulk, returning a list of responses""" + if not self.providers: + self.logger.trace('No JS Challenge providers registered') + return [] + + results = [] + next_requests = requests[:] + + skipped_components = [] + for provider in self._get_providers(next_requests): + if not next_requests: + break + self.logger.trace( + f'Attempting to solve {len(next_requests)} challenges using "{provider.PROVIDER_NAME}" provider') + try: + for response in provider.bulk_solve([dataclasses.replace(request) for request in next_requests]): + if not validate_provider_response(response): + self.logger.warning( + f'JS Challenge Provider "{provider.PROVIDER_NAME}" returned an invalid response:' + f' response = {response!r}\n' + f' {provider_bug_report_message(provider, before="")}') + continue + if response.error: + self._handle_error(response.error, provider, [response.request]) + continue + if (vr_msg := validate_response(response.response, response.request)) is not True: + self.logger.warning( + f'Invalid JS Challenge response received from "{provider.PROVIDER_NAME}" provider: {vr_msg or ""}\n' + f' response = {response.response}\n' + f' request = {response.request}\n' + f' {provider_bug_report_message(provider, before="")}') + continue + try: + next_requests.remove(response.request) + except ValueError: + self.logger.warning( + f'JS Challenge Provider "{provider.PROVIDER_NAME}" returned a response for an unknown request:\n' + f' request = {response.request}\n' + f' {provider_bug_report_message(provider, before="")}') + continue + results.append((response.request, response.response)) + except Exception as e: + if isinstance(e, JsChallengeProviderRejectedRequest) and e._skipped_components: + skipped_components.extend(e._skipped_components) + self._handle_error(e, provider, next_requests) + continue + + if skipped_components: + self.__report_skipped_components(skipped_components) + + if len(results) != len(requests): + self.logger.trace( + f'Not all JS Challenges were solved, expected {len(requests)} responses, got {len(results)}') + self.logger.trace(f'Unsolved requests: {next_requests}') + else: + self.logger.trace(f'Solved all {len(requests)} requested JS Challenges') + return results + + def __report_skipped_components(self, components: list[_SkippedComponent], /): + runtime_components = collections.defaultdict(list) + for component in components: + runtime_components[component.component].append(component.runtime) + for runtimes in runtime_components.values(): + runtimes.sort() + + description_lookup = { + 'ejs:npm': 'NPM 
package',
+            'ejs:github': 'challenge solver script',
+        }
+
+        descriptions = [
+            f'{description_lookup.get(component, component)} ({", ".join(runtimes)})'
+            for component, runtimes in runtime_components.items()
+            if runtimes
+        ]
+        flags = [
+            f'--remote-components {component}{" (recommended)" if component == "ejs:github" else ""}'
+            for component, runtimes in runtime_components.items()
+            if runtimes
+        ]
+
+        def join_parts(parts, joiner):
+            if not parts:
+                return ''
+            if len(parts) == 1:
+                return parts[0]
+            return f'{", ".join(parts[:-1])} {joiner} {parts[-1]}'
+
+        if len(descriptions) == 1:
+            msg = (
+                f'Remote component {descriptions[0]} was skipped. '
+                f'It may be required to solve JS challenges. '
+                f'You can enable the download with {flags[0]}')
+        else:
+            msg = (
+                f'Remote components {join_parts(descriptions, "and")} were skipped. '
+                f'These may be required to solve JS challenges. '
+                f'You can enable these downloads with {join_parts(flags, "or")}, respectively')
+
+        self.logger.warning(f'{msg}. For more information and alternatives, refer to {_EJS_WIKI_URL}')
+
+    def close(self):
+        for provider in self.providers.values():
+            provider.close()
+
+
+EXTRACTOR_ARG_PREFIX = 'youtubejsc'
+
+
+def initialize_jsc_director(ie):
+    assert ie._downloader is not None, 'Downloader not set'
+
+    enable_trace = ie._configuration_arg(
+        'jsc_trace', ['false'], ie_key='youtube', casesense=False)[0] == 'true'
+
+    if enable_trace:
+        log_level = IEContentProviderLogger.LogLevel.TRACE
+    elif ie.get_param('verbose', False):
+        log_level = IEContentProviderLogger.LogLevel.DEBUG
+    else:
+        log_level = IEContentProviderLogger.LogLevel.INFO
+
+    def get_provider_logger_and_settings(provider, logger_key):
+        logger_prefix = f'{logger_key}:{provider.PROVIDER_NAME}'
+        extractor_key = f'{EXTRACTOR_ARG_PREFIX}-{provider.PROVIDER_KEY.lower()}'
+        return (
+            YoutubeIEContentProviderLogger(ie, logger_prefix, log_level=log_level),
+            ie.get_param('extractor_args', {}).get(extractor_key, {}))
+
+    director = JsChallengeRequestDirector(
+        logger=YoutubeIEContentProviderLogger(ie, 'jsc', log_level=log_level),
+    )
+
+    ie._downloader.add_close_hook(director.close)
+
+    for provider in _jsc_providers.value.values():
+        logger, settings = get_provider_logger_and_settings(provider, 'jsc')
+        director.register_provider(provider(ie, logger, settings))
+
+    for preference in _jsc_preferences.value:
+        director.register_preference(preference)
+
+    if director.logger.log_level <= director.logger.LogLevel.DEBUG:
+        # calling is_available() for every JS Challenge provider upfront may have some overhead
+        director.logger.debug(f'JS Challenge Providers: {provider_display_list(director.providers.values())}')
+        director.logger.trace(f'Registered {len(director.preferences)} JS Challenge provider preferences')
+
+    return director
+
+
+def validate_provider_response(response: JsChallengeProviderResponse) -> bool:
+    return (
+        isinstance(response, JsChallengeProviderResponse)
+        and isinstance(response.request, JsChallengeRequest)
+        and (
+            isinstance(response.response, JsChallengeResponse)
+            or (response.error is not None and isinstance(response.error, Exception)))
+    )
+
+
+def validate_response(response: JsChallengeResponse, request: JsChallengeRequest) -> bool | str:
+    if not isinstance(response, JsChallengeResponse):
+        return 'Response is not a JsChallengeResponse'
+    if request.type == JsChallengeType.N:
+        return validate_nsig_challenge_output(response.output, request.input)
+    else:
+        return validate_sig_challenge_output(response.output, 
request.input)
+
+
+def validate_nsig_challenge_output(challenge_output: NChallengeOutput, challenge_input: NChallengeInput) -> bool | str:
+    if not (
+        isinstance(challenge_output, NChallengeOutput)
+        and len(challenge_output.results) == len(challenge_input.challenges)
+        and all(isinstance(k, str) and isinstance(v, str) for k, v in challenge_output.results.items())
+        and all(challenge in challenge_output.results for challenge in challenge_input.challenges)
+    ):
+        return 'Invalid NChallengeOutput'
+
+    # Validate n results are valid - if they end with the input challenge then the js function returned with an exception.
+    for challenge, result in challenge_output.results.items():
+        if result.endswith(challenge):
+            return f'n result is invalid for {challenge!r}: {result!r}'
+    return True
+
+
+def validate_sig_challenge_output(challenge_output: SigChallengeOutput, challenge_input: SigChallengeInput) -> bool | str:
+    return (
+        isinstance(challenge_output, SigChallengeOutput)
+        and len(challenge_output.results) == len(challenge_input.challenges)
+        and all(isinstance(k, str) and isinstance(v, str) for k, v in challenge_output.results.items())
+        and all(challenge in challenge_output.results for challenge in challenge_input.challenges)
+    ) or 'Invalid SigChallengeOutput'
diff --git a/yt_dlp/extractor/youtube/jsc/_registry.py b/yt_dlp/extractor/youtube/jsc/_registry.py
new file mode 100644
index 0000000000..e1bbed62de
--- /dev/null
+++ b/yt_dlp/extractor/youtube/jsc/_registry.py
@@ -0,0 +1,4 @@
+from yt_dlp.globals import Indirect
+
+_jsc_providers = Indirect({})
+_jsc_preferences = Indirect(set())
diff --git a/yt_dlp/extractor/youtube/jsc/provider.py b/yt_dlp/extractor/youtube/jsc/provider.py
new file mode 100644
index 0000000000..c368d95df7
--- /dev/null
+++ b/yt_dlp/extractor/youtube/jsc/provider.py
@@ -0,0 +1,161 @@
+"""PUBLIC API"""
+
+from __future__ import annotations
+
+import abc
+import dataclasses
+import enum
+import typing
+
+from yt_dlp.extractor.youtube.jsc._registry import _jsc_preferences, _jsc_providers
+from yt_dlp.extractor.youtube.pot._provider import (
+    IEContentProvider,
+    IEContentProviderError,
+    register_preference_generic,
+    register_provider_generic,
+)
+from yt_dlp.utils import ExtractorError
+
+__all__ = [
+    'JsChallengeProvider',
+    'JsChallengeProviderError',
+    'JsChallengeProviderRejectedRequest',
+    'JsChallengeProviderResponse',
+    'JsChallengeRequest',
+    'JsChallengeResponse',
+    'JsChallengeType',
+    'NChallengeInput',
+    'NChallengeOutput',
+    'SigChallengeInput',
+    'SigChallengeOutput',
+    'register_preference',
+    'register_provider',
+]
+
+
+class JsChallengeType(enum.Enum):
+    N = 'n'
+    SIG = 'sig'
+
+
+@dataclasses.dataclass(frozen=True)
+class JsChallengeRequest:
+    type: JsChallengeType
+    input: NChallengeInput | SigChallengeInput
+    video_id: str | None = None
+
+
+@dataclasses.dataclass(frozen=True)
+class NChallengeInput:
+    player_url: str
+    challenges: list[str] = dataclasses.field(default_factory=list)
+
+
+@dataclasses.dataclass(frozen=True)
+class SigChallengeInput:
+    player_url: str
+    challenges: list[str] = dataclasses.field(default_factory=list)
+
+
+@dataclasses.dataclass(frozen=True)
+class NChallengeOutput:
+    results: dict[str, str] = dataclasses.field(default_factory=dict)
+
+
+@dataclasses.dataclass(frozen=True)
+class SigChallengeOutput:
+    results: dict[str, str] = dataclasses.field(default_factory=dict)
+
+
+@dataclasses.dataclass
+class JsChallengeProviderResponse:
+    request: JsChallengeRequest
+    response: JsChallengeResponse | None = None
+    error: 
Exception | None = None
+
+
+@dataclasses.dataclass
+class JsChallengeResponse:
+    type: JsChallengeType
+    output: NChallengeOutput | SigChallengeOutput
+
+
+class JsChallengeProviderRejectedRequest(IEContentProviderError):
+    """Reject the JsChallengeRequest (cannot handle the request)"""
+
+    def __init__(self, msg=None, expected: bool = False, *, _skipped_components=None):
+        super().__init__(msg, expected)
+        self._skipped_components = _skipped_components
+
+
+class JsChallengeProviderError(IEContentProviderError):
+    """An error occurred while solving the challenge"""
+
+
+class JsChallengeProvider(IEContentProvider, abc.ABC, suffix='JCP'):
+
+    # Set to None to disable the check
+    _SUPPORTED_TYPES: tuple[JsChallengeType, ...] | None = ()
+
+    def __validate_request(self, request: JsChallengeRequest):
+        if not self.is_available():
+            raise JsChallengeProviderRejectedRequest(f'{self.PROVIDER_NAME} is not available')
+
+        # Validate request using built-in settings
+        if (
+            self._SUPPORTED_TYPES is not None
+            and request.type not in self._SUPPORTED_TYPES
+        ):
+            raise JsChallengeProviderRejectedRequest(
+                f'JS Challenge type "{request.type}" is not supported by {self.PROVIDER_NAME}')
+
+    def bulk_solve(self, requests: list[JsChallengeRequest]) -> typing.Generator[JsChallengeProviderResponse, None, None]:
+        """Solve multiple JS challenges and return the results"""
+        validated_requests = []
+        for request in requests:
+            try:
+                self.__validate_request(request)
+                validated_requests.append(request)
+            except JsChallengeProviderRejectedRequest as e:
+                yield JsChallengeProviderResponse(request=request, error=e)
+                continue
+        yield from self._real_bulk_solve(validated_requests)
+
+    @abc.abstractmethod
+    def _real_bulk_solve(self, requests: list[JsChallengeRequest]) -> typing.Generator[JsChallengeProviderResponse, None, None]:
+        """Subclasses must implement this method to handle bulk solving"""
+        raise NotImplementedError(f'{self.PROVIDER_NAME} does not implement bulk solving')
+
+    def _get_player(self, video_id, player_url):
+        try:
+            return self.ie._load_player(
+                video_id=video_id,
+                player_url=player_url,
+                fatal=True,
+            )
+        except ExtractorError as e:
+            raise JsChallengeProviderError(
+                f'Failed to load player for JS challenge: {e}') from e
+
+
+def register_provider(provider: type[JsChallengeProvider]):
+    """Register a JsChallengeProvider class"""
+    return register_provider_generic(
+        provider=provider,
+        base_class=JsChallengeProvider,
+        registry=_jsc_providers.value,
+    )
+
+
+def register_preference(*providers: type[JsChallengeProvider]) -> typing.Callable[[Preference], Preference]:
+    """Register a preference for a JsChallengeProvider class."""
+    return register_preference_generic(
+        JsChallengeProvider,
+        _jsc_preferences.value,
+        *providers,
+    )
+
+
+if typing.TYPE_CHECKING:
+    Preference = typing.Callable[[JsChallengeProvider, list[JsChallengeRequest]], int]
+    __all__.append('Preference')
diff --git a/yt_dlp/extractor/youtube/pot/_director.py b/yt_dlp/extractor/youtube/pot/_director.py
index aaf1d5290a..26e7a6ac1f 100644
--- a/yt_dlp/extractor/youtube/pot/_director.py
+++ b/yt_dlp/extractor/youtube/pot/_director.py
@@ -6,6 +6,7 @@ import dataclasses
 import datetime as dt
 import hashlib
 import json
+import traceback
 import typing
 import urllib.parse
 from collections.abc import Iterable
@@ -58,9 +59,9 @@ class YoutubeIEContentProviderLogger(IEContentProviderLogger):
         if self.log_level <= self.LogLevel.TRACE:
             self.__ie.write_debug(self._format_msg('TRACE: ' + message))
 
-    def debug(self, message: str):
+    def 
debug(self, message: str, *, once=False): if self.log_level <= self.LogLevel.DEBUG: - self.__ie.write_debug(self._format_msg(message)) + self.__ie.write_debug(self._format_msg(message), only_once=once) def info(self, message: str): if self.log_level <= self.LogLevel.INFO: @@ -70,9 +71,11 @@ class YoutubeIEContentProviderLogger(IEContentProviderLogger): if self.log_level <= self.LogLevel.WARNING: self.__ie.report_warning(self._format_msg(message), only_once=once) - def error(self, message: str): + def error(self, message: str, cause=None): if self.log_level <= self.LogLevel.ERROR: - self.__ie._downloader.report_error(self._format_msg(message), is_error=False) + self.__ie._downloader.report_error( + self._format_msg(message), is_error=False, + tb=''.join(traceback.format_exception(None, cause, cause.__traceback__)) if cause else None) class PoTokenCache: diff --git a/yt_dlp/extractor/youtube/pot/_provider.py b/yt_dlp/extractor/youtube/pot/_provider.py index af7034d227..3aa467342f 100644 --- a/yt_dlp/extractor/youtube/pot/_provider.py +++ b/yt_dlp/extractor/youtube/pot/_provider.py @@ -36,7 +36,7 @@ class IEContentProviderLogger(abc.ABC): pass @abc.abstractmethod - def debug(self, message: str): + def debug(self, message: str, *, once=False): pass @abc.abstractmethod @@ -48,7 +48,7 @@ class IEContentProviderLogger(abc.ABC): pass @abc.abstractmethod - def error(self, message: str): + def error(self, message: str, cause=None): pass @@ -90,7 +90,7 @@ class IEContentProvider(abc.ABC): @classproperty def PROVIDER_KEY(cls) -> str: assert hasattr(cls, '_PROVIDER_KEY_SUFFIX'), 'Content Provider implementation must define a suffix for the provider key' - assert cls.__name__.endswith(cls._PROVIDER_KEY_SUFFIX), f'PoTokenProvider class names must end with "{cls._PROVIDER_KEY_SUFFIX}"' + assert cls.__name__.endswith(cls._PROVIDER_KEY_SUFFIX), f'Class name must end with "{cls._PROVIDER_KEY_SUFFIX}"' return cls.__name__[:-len(cls._PROVIDER_KEY_SUFFIX)] @abc.abstractmethod @@ -114,10 +114,7 @@ class IEContentProvider(abc.ABC): @param default The default value to return when the key is not present (default: []) @param casesense When false, the values are converted to lower case """ - val = traverse_obj(self.settings, key) - if val is None: - return [] if default is NO_DEFAULT else default - return list(val) if casesense else [x.lower() for x in val] + return configuration_arg(self.settings, key, default=default, casesense=casesense) class BuiltinIEContentProvider(IEContentProvider, abc.ABC): @@ -125,6 +122,20 @@ class BuiltinIEContentProvider(IEContentProvider, abc.ABC): BUG_REPORT_MESSAGE = bug_reports_message(before='') +def configuration_arg(config, key, default=NO_DEFAULT, *, casesense=False): + """ + @returns A list of values for the setting given by "key" + or "default" if no such key is present + @param config The configuration dictionary + @param default The default value to return when the key is not present (default: []) + @param casesense When false, the values are converted to lower case + """ + val = traverse_obj(config, key) + if val is None: + return [] if default is NO_DEFAULT else default + return list(val) if casesense else [x.lower() for x in val] + + def register_provider_generic( provider, base_class, diff --git a/yt_dlp/extractor/youtube/pot/provider.py b/yt_dlp/extractor/youtube/pot/provider.py index 13b3b1f9bb..2511edf015 100644 --- a/yt_dlp/extractor/youtube/pot/provider.py +++ b/yt_dlp/extractor/youtube/pot/provider.py @@ -58,6 +58,8 @@ class PoTokenRequest: visitor_data: str | None 
= None data_sync_id: str | None = None video_id: str | None = None + # Internal, YouTube experiment on whether to bind GVS PO Token to video_id. + _gvs_bind_to_video_id: bool = False # Networking parameters request_cookiejar: YoutubeDLCookieJar = dataclasses.field(default_factory=YoutubeDLCookieJar) diff --git a/yt_dlp/extractor/youtube/pot/utils.py b/yt_dlp/extractor/youtube/pot/utils.py index a27921d4af..7f9ca078d6 100644 --- a/yt_dlp/extractor/youtube/pot/utils.py +++ b/yt_dlp/extractor/youtube/pot/utils.py @@ -42,6 +42,9 @@ def get_webpo_content_binding( if not client_name or client_name not in webpo_clients: return None, None + if request.context == PoTokenContext.GVS and request._gvs_bind_to_video_id: + return request.video_id, ContentBindingType.VIDEO_ID + if request.context == PoTokenContext.GVS or client_name in ('WEB_REMIX', ): if request.is_authenticated: return request.data_sync_id, ContentBindingType.DATASYNC_ID diff --git a/yt_dlp/globals.py b/yt_dlp/globals.py index 81ad004480..42f345e039 100644 --- a/yt_dlp/globals.py +++ b/yt_dlp/globals.py @@ -1,3 +1,4 @@ +from __future__ import annotations import os from collections import defaultdict @@ -30,3 +31,11 @@ plugin_ies_overrides = Indirect(defaultdict(list)) IN_CLI = Indirect(False) LAZY_EXTRACTORS = Indirect(None) # `False`=force, `None`=disabled, `True`=enabled WINDOWS_VT_MODE = Indirect(False if os.name == 'nt' else None) + +# JS Runtimes +# If adding support for another runtime, register it here to allow `js_runtimes` option to accept it. +# key is the runtime name, value a JsRuntime subclass (internal-only) or None +supported_js_runtimes = Indirect({}) + +# List of remote components supported with --remote-components option +supported_remote_components = Indirect([]) diff --git a/yt_dlp/jsinterp.py b/yt_dlp/jsinterp.py index 460bc2c03e..d22d176d2f 100644 --- a/yt_dlp/jsinterp.py +++ b/yt_dlp/jsinterp.py @@ -186,7 +186,7 @@ _OPERATORS = { # None => Defined in JSInterpreter._operator _COMP_OPERATORS = {'===', '!==', '==', '!=', '<=', '>=', '<', '>'} _NAME_RE = r'[a-zA-Z_$][\w$]*' -_MATCHING_PARENS = dict(zip(*zip('()', '{}', '[]'))) +_MATCHING_PARENS = dict(zip(*zip('()', '{}', '[]', strict=True), strict=True)) _QUOTES = '\'"/' _NESTED_BRACKETS = r'[^[\]]+(?:\[[^[\]]+(?:\[[^\]]+\])?\])?' diff --git a/yt_dlp/networking/_requests.py b/yt_dlp/networking/_requests.py index 1526d2a599..06d3220f4a 100644 --- a/yt_dlp/networking/_requests.py +++ b/yt_dlp/networking/_requests.py @@ -4,7 +4,6 @@ import functools import http.client import logging import re -import socket import warnings from ..dependencies import brotli, requests, urllib3 @@ -125,7 +124,7 @@ class RequestsResponseAdapter(Response): # Work around issue with `.read(amt)` then `.read()` # See: https://github.com/urllib3/urllib3/issues/3636 if amt is None: - # Python 3.9 preallocates the whole read buffer, read in chunks + # compat: py3.9: Python 3.9 preallocates the whole read buffer, read in chunks read_chunk = functools.partial(self.fp.read, 1 << 20, decode_content=True) return b''.join(iter(read_chunk, b'')) # Interact with urllib3 response directly. @@ -378,7 +377,7 @@ class SocksHTTPConnection(urllib3.connection.HTTPConnection): source_address=self.source_address, _create_socket_func=functools.partial( create_socks_proxy_socket, (self.host, self.port), self._proxy_args)) - except (socket.timeout, TimeoutError) as e: + except TimeoutError as e: raise urllib3.exceptions.ConnectTimeoutError( self, f'Connection to {self.host} timed out. 
(connect timeout={self.timeout})') from e except SocksProxyError as e: diff --git a/yt_dlp/networking/common.py b/yt_dlp/networking/common.py index e33769422b..6680d1c7c8 100644 --- a/yt_dlp/networking/common.py +++ b/yt_dlp/networking/common.py @@ -12,6 +12,7 @@ import urllib.response from collections.abc import Iterable, Mapping from email.message import Message from http import HTTPStatus +from types import NoneType from ._helper import make_ssl_context, wrap_request_errors from .exceptions import ( @@ -20,7 +21,6 @@ from .exceptions import ( TransportError, UnsupportedRequest, ) -from ..compat.types import NoneType from ..cookies import YoutubeDLCookieJar from ..utils import ( bug_reports_message, diff --git a/yt_dlp/networking/impersonate.py b/yt_dlp/networking/impersonate.py index b90d10b760..458ec04a19 100644 --- a/yt_dlp/networking/impersonate.py +++ b/yt_dlp/networking/impersonate.py @@ -3,11 +3,11 @@ from __future__ import annotations import re from abc import ABC from dataclasses import dataclass +from types import NoneType from typing import Any from .common import RequestHandler, register_preference, Request from .exceptions import UnsupportedRequest -from ..compat.types import NoneType from ..utils import classproperty, join_nonempty from ..utils.networking import std_headers, HTTPHeaderDict diff --git a/yt_dlp/options.py b/yt_dlp/options.py index eaa4a7305f..43fa2f84b9 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -456,6 +456,41 @@ def create_parser(): '--no-plugin-dirs', dest='plugin_dirs', action='store_const', const=[], help='Clear plugin directories to search, including defaults and those provided by previous --plugin-dirs') + general.add_option( + '--js-runtimes', + metavar='RUNTIME[:PATH]', + dest='js_runtimes', + action='callback', + callback=_list_from_options_callback, + type='str', + callback_kwargs={'delim': None}, + default=['deno'], + help=( + 'Additional JavaScript runtime to enable, with an optional path to the runtime location. ' + 'This option can be used multiple times to enable multiple runtimes. ' + 'Supported runtimes: deno, node, bun, quickjs. By default, only "deno" runtime is enabled.')) + general.add_option( + '--no-js-runtimes', + dest='js_runtimes', action='store_const', const=[], + help='Clear JavaScript runtimes to enable, including defaults and those provided by previous --js-runtimes') + general.add_option( + '--remote-components', + metavar='COMPONENT', + dest='remote_components', + action='callback', + callback=_list_from_options_callback, + type='str', + callback_kwargs={'delim': None}, + default=[], + help=( + 'Remote components to allow yt-dlp to fetch when required. ' + 'You can use this option multiple times to allow multiple components. ' + 'Supported values: ejs:npm (external JavaScript components from npm), ejs:github (external JavaScript components from yt-dlp-ejs GitHub). 
' + 'By default, no remote components are allowed.')) + general.add_option( + '--no-remote-components', + dest='remote_components', action='store_const', const=[], + help='Disallow fetching of all remote components, including any previously allowed by --remote-components or defaults.') general.add_option( '--flat-playlist', action='store_const', dest='extract_flat', const='in_playlist', default=False, diff --git a/yt_dlp/plugins.py b/yt_dlp/plugins.py index 941709b21d..f9c1ceaa6b 100644 --- a/yt_dlp/plugins.py +++ b/yt_dlp/plugins.py @@ -11,7 +11,6 @@ import os import pkgutil import sys import traceback -import zipimport from pathlib import Path from zipfile import ZipFile @@ -202,16 +201,10 @@ def load_plugins(plugin_spec: PluginSpec): if any(x.startswith('_') for x in module_name.split('.')): continue try: - if sys.version_info < (3, 10) and isinstance(finder, zipimport.zipimporter): - # zipimporter.load_module() is deprecated in 3.10 and removed in 3.12 - # The exec_module branch below is the replacement for >= 3.10 - # See: https://docs.python.org/3/library/zipimport.html#zipimport.zipimporter.exec_module - module = finder.load_module(module_name) - else: - spec = finder.find_spec(module_name) - module = importlib.util.module_from_spec(spec) - sys.modules[module_name] = module - spec.loader.exec_module(module) + spec = finder.find_spec(module_name) + module = importlib.util.module_from_spec(spec) + sys.modules[module_name] = module + spec.loader.exec_module(module) except Exception: write_string( f'Error while importing module {module_name!r}\n{traceback.format_exc(limit=-1)}', diff --git a/yt_dlp/postprocessor/ffmpeg.py b/yt_dlp/postprocessor/ffmpeg.py index 774625660a..5c9c0390d9 100644 --- a/yt_dlp/postprocessor/ffmpeg.py +++ b/yt_dlp/postprocessor/ffmpeg.py @@ -418,7 +418,7 @@ class FFmpegPostProcessor(PostProcessor): if concat_opts is None: concat_opts = [{}] * len(in_files) yield 'ffconcat version 1.0\n' - for file, opts in zip(in_files, concat_opts): + for file, opts in zip(in_files, concat_opts, strict=True): yield f'file {cls._quote_for_ffmpeg(cls._ffmpeg_filename_argument(file))}\n' # Iterate explicitly to yield the following directives in order, ignoring the rest. for directive in 'inpoint', 'outpoint', 'duration': @@ -639,7 +639,7 @@ class FFmpegEmbedSubtitlePP(FFmpegPostProcessor): # postprocessor a second time '-map', '-0:s', ] - for i, (lang, name) in enumerate(zip(sub_langs, sub_names)): + for i, (lang, name) in enumerate(zip(sub_langs, sub_names, strict=True)): opts.extend(['-map', f'{i + 1}:0']) lang_code = ISO639Utils.short2long(lang) or lang opts.extend([f'-metadata:s:s:{i}', f'language={lang_code}']) diff --git a/yt_dlp/postprocessor/sponsorblock.py b/yt_dlp/postprocessor/sponsorblock.py index b3fc8b54a8..2aadbd9527 100644 --- a/yt_dlp/postprocessor/sponsorblock.py +++ b/yt_dlp/postprocessor/sponsorblock.py @@ -27,6 +27,7 @@ class SponsorBlockPP(FFmpegPostProcessor): 'filler': 'Filler Tangent', 'interaction': 'Interaction Reminder', 'music_offtopic': 'Non-Music Section', + 'hook': 'Hook/Greetings', **NON_SKIPPABLE_CATEGORIES, } diff --git a/yt_dlp/update.py b/yt_dlp/update.py index e33be3f7b3..c957e0d07d 100644 --- a/yt_dlp/update.py +++ b/yt_dlp/update.py @@ -70,7 +70,17 @@ def _get_variant_and_executable_path(): return 'linux_static_exe', static_exe_path # We know it's a PyInstaller bundle, but is it "onedir" or "onefile"? 
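+    # e.g. (hypothetical paths) onefile: path=/tmp/yt-dlp, sys._MEIPASS=/tmp/_MEIxxxxxx;
+    # onedir with PyInstaller >= 6: path=/app/yt-dlp, sys._MEIPASS=/app/_internal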
- suffix = 'dir' if sys._MEIPASS == os.path.dirname(path) else 'exe' + if ( + # PyInstaller >= 6.0.0 sets sys._MEIPASS for onedir to its `_internal` subdirectory + # Ref: https://pyinstaller.org/en/v6.0.0/CHANGES.html#incompatible-changes + sys._MEIPASS == f'{os.path.dirname(path)}/_internal' + # compat: PyInstaller < 6.0.0 + or sys._MEIPASS == os.path.dirname(path) + ): + suffix = 'dir' + else: + suffix = 'exe' + system_platform = remove_end(sys.platform, '32') if system_platform == 'darwin': @@ -133,8 +143,9 @@ _FILE_SUFFIXES = { _NON_UPDATEABLE_REASONS = { **dict.fromkeys(_FILE_SUFFIXES), # Updatable - **{variant: f'Auto-update is not supported for unpackaged {name} executable; Re-download the latest release' - for variant, name in {'win32_dir': 'Windows', 'darwin_dir': 'MacOS', 'linux_dir': 'Linux'}.items()}, + **dict.fromkeys( + ['linux_armv7l_dir', *(f'{variant[:-4]}_dir' for variant in _FILE_SUFFIXES if variant.endswith('_exe'))], + 'Auto-update is not supported for unpackaged executables; Re-download the latest release'), 'py2exe': 'py2exe is no longer supported by yt-dlp; This executable cannot be updated', 'source': 'You cannot update when running from source code; Use git to pull the latest changes', 'unknown': 'You installed yt-dlp from a manual build or with a package manager; Use that to update', @@ -154,7 +165,7 @@ def _get_binary_name(): def _get_system_deprecation(): - MIN_SUPPORTED, MIN_RECOMMENDED = (3, 9), (3, 10) + MIN_SUPPORTED, MIN_RECOMMENDED = (3, 10), (3, 10) if sys.version_info > MIN_RECOMMENDED: return None @@ -559,11 +570,9 @@ class Updater: @functools.cached_property def cmd(self): """The command-line to run the executable, if known""" - argv = None - # There is no sys.orig_argv in py < 3.10. Also, it can be [] when frozen - if getattr(sys, 'orig_argv', None): - argv = sys.orig_argv - elif getattr(sys, 'frozen', False): + argv = sys.orig_argv + # sys.orig_argv can be [] when frozen + if not argv and getattr(sys, 'frozen', False): argv = sys.argv # linux_static exe's argv[0] will be /tmp/staticx-NNNN/yt-dlp_linux if we don't fixup here if argv and os.getenv('STATICX_PROG_PATH'): @@ -572,7 +581,7 @@ class Updater: def restart(self): """Restart the executable""" - assert self.cmd, 'Must be frozen or Py >= 3.10' + assert self.cmd, 'Unable to determine argv' self.ydl.write_debug(f'Restarting: {shell_quote(self.cmd)}') _, _, returncode = Popen.run(self.cmd) return returncode diff --git a/yt_dlp/utils/_jsruntime.py b/yt_dlp/utils/_jsruntime.py new file mode 100644 index 0000000000..6ee299fde1 --- /dev/null +++ b/yt_dlp/utils/_jsruntime.py @@ -0,0 +1,101 @@ +from __future__ import annotations +import abc +import dataclasses +import functools + +from ._utils import _get_exe_version_output, detect_exe_version, int_or_none + + +# NOT public API +def runtime_version_tuple(v): + # NB: will return (0,) if `v` is an invalid version string + return tuple(int_or_none(x, default=0) for x in v.split('.')) + + +@dataclasses.dataclass(frozen=True) +class JsRuntimeInfo: + name: str + path: str + version: str + version_tuple: tuple[int, ...] 
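+    # e.g. runtime_version_tuple('2.1.4') == (2, 1, 4); an unparseable version
+    # collapses to (0,), so MIN_SUPPORTED_VERSION comparisons never raise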
+ supported: bool = True + + +class JsRuntime(abc.ABC): + def __init__(self, path=None): + self._path = path + + @functools.cached_property + def info(self) -> JsRuntimeInfo | None: + return self._info() + + @abc.abstractmethod + def _info(self) -> JsRuntimeInfo | None: + raise NotImplementedError + + +class DenoJsRuntime(JsRuntime): + MIN_SUPPORTED_VERSION = (2, 0, 0) + + def _info(self): + path = self._path or 'deno' + out = _get_exe_version_output(path, ['--version']) + if not out: + return None + version = detect_exe_version(out, r'^deno (\S+)', 'unknown') + vt = runtime_version_tuple(version) + return JsRuntimeInfo( + name='deno', path=path, version=version, version_tuple=vt, + supported=vt >= self.MIN_SUPPORTED_VERSION) + + +class BunJsRuntime(JsRuntime): + MIN_SUPPORTED_VERSION = (1, 0, 31) + + def _info(self): + path = self._path or 'bun' + out = _get_exe_version_output(path, ['--version']) + if not out: + return None + version = detect_exe_version(out, r'^(\S+)', 'unknown') + vt = runtime_version_tuple(version) + return JsRuntimeInfo( + name='bun', path=path, version=version, version_tuple=vt, + supported=vt >= self.MIN_SUPPORTED_VERSION) + + +class NodeJsRuntime(JsRuntime): + MIN_SUPPORTED_VERSION = (20, 0, 0) + + def _info(self): + path = self._path or 'node' + out = _get_exe_version_output(path, ['--version']) + if not out: + return None + version = detect_exe_version(out, r'^v(\S+)', 'unknown') + vt = runtime_version_tuple(version) + return JsRuntimeInfo( + name='node', path=path, version=version, version_tuple=vt, + supported=vt >= self.MIN_SUPPORTED_VERSION) + + +class QuickJsRuntime(JsRuntime): + MIN_SUPPORTED_VERSION = (2023, 12, 9) + + def _info(self): + path = self._path or 'qjs' + # quickjs does not have --version and --help returns a status code of 1 + out = _get_exe_version_output(path, ['--help'], ignore_return_code=True) + if not out: + return None + is_ng = 'QuickJS-ng' in out + + version = detect_exe_version(out, r'^QuickJS(?:-ng)?\s+version\s+(\S+)', 'unknown') + vt = runtime_version_tuple(version.replace('-', '.')) + if is_ng: + return JsRuntimeInfo( + name='quickjs-ng', path=path, version=version, version_tuple=vt, + supported=vt > (0,)) + return JsRuntimeInfo( + name='quickjs', path=path, version=version, version_tuple=vt, + supported=vt >= self.MIN_SUPPORTED_VERSION) diff --git a/yt_dlp/utils/_legacy.py b/yt_dlp/utils/_legacy.py index d65b135d9d..baa8162370 100644 --- a/yt_dlp/utils/_legacy.py +++ b/yt_dlp/utils/_legacy.py @@ -1,6 +1,4 @@ """No longer used and new code should not use. 
Exists only for API compat.""" -import asyncio -import atexit import platform import struct import sys @@ -34,77 +32,6 @@ has_certifi = bool(certifi) has_websockets = bool(websockets) -class WebSocketsWrapper: - """Wraps websockets module to use in non-async scopes""" - pool = None - - def __init__(self, url, headers=None, connect=True, **ws_kwargs): - self.loop = asyncio.new_event_loop() - # XXX: "loop" is deprecated - self.conn = websockets.connect( - url, extra_headers=headers, ping_interval=None, - close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'), **ws_kwargs) - if connect: - self.__enter__() - atexit.register(self.__exit__, None, None, None) - - def __enter__(self): - if not self.pool: - self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop) - return self - - def send(self, *args): - self.run_with_loop(self.pool.send(*args), self.loop) - - def recv(self, *args): - return self.run_with_loop(self.pool.recv(*args), self.loop) - - def __exit__(self, type, value, traceback): - try: - return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop) - finally: - self.loop.close() - self._cancel_all_tasks(self.loop) - - # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications - # for contributors: If there's any new library using asyncio needs to be run in non-async, move these function out of this class - @staticmethod - def run_with_loop(main, loop): - if not asyncio.iscoroutine(main): - raise ValueError(f'a coroutine was expected, got {main!r}') - - try: - return loop.run_until_complete(main) - finally: - loop.run_until_complete(loop.shutdown_asyncgens()) - if hasattr(loop, 'shutdown_default_executor'): - loop.run_until_complete(loop.shutdown_default_executor()) - - @staticmethod - def _cancel_all_tasks(loop): - to_cancel = asyncio.all_tasks(loop) - - if not to_cancel: - return - - for task in to_cancel: - task.cancel() - - # XXX: "loop" is removed in Python 3.10+ - loop.run_until_complete( - asyncio.gather(*to_cancel, loop=loop, return_exceptions=True)) - - for task in to_cancel: - if task.cancelled(): - continue - if task.exception() is not None: - loop.call_exception_handler({ - 'message': 'unhandled exception during asyncio.run() shutdown', - 'exception': task.exception(), - 'task': task, - }) - - def load_plugins(name, suffix, namespace): from ..plugins import load_plugins ret = load_plugins(name, suffix) diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index 50652c1e45..c6ae21f6c7 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -95,7 +95,7 @@ TIMEZONE_NAMES = { # needed for sanitizing filenames in restricted mode ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ', itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'], - 'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y'))) + 'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y'), strict=True)) DATE_FORMATS = ( '%d %B %Y', @@ -2150,14 +2150,14 @@ def check_executable(exe, args=[]): return exe -def _get_exe_version_output(exe, args): +def _get_exe_version_output(exe, args, ignore_return_code=False): try: # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers # SIGTTOU if yt-dlp is run in the background. 
# See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656 stdout, _, ret = Popen.run([encodeArgument(exe), *args], text=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) - if ret: + if not ignore_return_code and ret: return None except OSError: return False @@ -2415,7 +2415,7 @@ class PlaylistEntries: if self.is_incomplete: assert self.is_exhausted self._entries = [self.MissingEntry] * max(requested_entries or [0]) - for i, entry in zip(requested_entries, entries): + for i, entry in zip(requested_entries, entries): # noqa: B905 self._entries[i - 1] = entry elif isinstance(entries, (list, PagedList, LazyList)): self._entries = entries @@ -3184,7 +3184,7 @@ def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False): return len(remove_terminal_sequences(string).replace('\t', '')) def get_max_lens(table): - return [max(width(str(v)) for v in col) for col in zip(*table)] + return [max(width(str(v)) for v in col) for col in zip(*table, strict=True)] def filter_using_list(row, filter_array): return [col for take, col in itertools.zip_longest(filter_array, row, fillvalue=True) if take] @@ -3540,7 +3540,7 @@ def dfxp2srt(dfxp_data): continue default_style.update(style) - for para, index in zip(paras, itertools.count(1)): + for para, index in zip(paras, itertools.count(1), strict=False): begin_time = parse_dfxp_time_expr(para.attrib.get('begin')) end_time = parse_dfxp_time_expr(para.attrib.get('end')) dur = parse_dfxp_time_expr(para.attrib.get('dur')) @@ -4854,7 +4854,7 @@ def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re): return [ merge_dicts( {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])}, - dict(zip(_keys, max_dimensions)), thumbnail) + dict(zip(_keys, max_dimensions, strict=True)), thumbnail) for thumbnail in thumbnails ] diff --git a/yt_dlp/utils/jslib/devalue.py b/yt_dlp/utils/jslib/devalue.py index d82880d921..ac5bd68577 100644 --- a/yt_dlp/utils/jslib/devalue.py +++ b/yt_dlp/utils/jslib/devalue.py @@ -110,7 +110,7 @@ def parse_iter(parsed: typing.Any, /, *, revivers: dict[str, collections.abc.Cal elif value[0] == 'Map': result = [] - for key, new_source in zip(*(iter(value[1:]),) * 2): + for key, new_source in zip(*(iter(value[1:]),) * 2, strict=True): pair = [None, None] stack.append((pair, 0, key)) stack.append((pair, 1, new_source)) @@ -129,7 +129,7 @@ def parse_iter(parsed: typing.Any, /, *, revivers: dict[str, collections.abc.Cal elif value[0] == 'null': result = {} - for key, new_source in zip(*(iter(value[1:]),) * 2): + for key, new_source in zip(*(iter(value[1:]),) * 2, strict=True): stack.append((result, key, new_source)) elif value[0] in _ARRAY_TYPE_LOOKUP: diff --git a/yt_dlp/version.py b/yt_dlp/version.py index 748e818939..b068f0f2fa 100644 --- a/yt_dlp/version.py +++ b/yt_dlp/version.py @@ -1,8 +1,8 @@ # Autogenerated by devscripts/update-version.py -__version__ = '2025.09.26' +__version__ = '2025.10.22' -RELEASE_GIT_HEAD = '12b57d2858845c0c7fb33bf9aa8ed7be6905535d' +RELEASE_GIT_HEAD = 'c9356f308dd3c5f9f494cb40ed14c5df017b4fe0' VARIANT = None @@ -12,4 +12,4 @@ CHANNEL = 'stable' ORIGIN = 'yt-dlp/yt-dlp' -_pkg_version = '2025.09.26' +_pkg_version = '2025.10.22' diff --git a/yt_dlp/webvtt.py b/yt_dlp/webvtt.py index 9f1a5086b8..a86ddc7ce0 100644 --- a/yt_dlp/webvtt.py +++ b/yt_dlp/webvtt.py @@ -103,7 +103,7 @@ def _parse_ts(ts): into an MPEG PES timestamp: a tick counter at 90 kHz resolution. 
""" return 90 * sum( - int(part or 0) * mult for part, mult in zip(ts.groups(), (3600_000, 60_000, 1000, 1))) + int(part or 0) * mult for part, mult in zip(ts.groups(), (3600_000, 60_000, 1000, 1), strict=True)) def _format_ts(ts):