mirror of
https://github.com/yt-dlp/yt-dlp.git
synced 2025-12-06 06:45:00 +01:00
[rh:curlcffi] Add support for curl_cffi
Authored by: coletdjnz, Grub4K, pukkandan, bashonly Co-authored-by: Simon Sawicki <contact@grub4k.xyz> Co-authored-by: pukkandan <pukkandan.ytdlp@gmail.com> Co-authored-by: bashonly <bashonly@protonmail.com>
This commit is contained in:
parent
0b81d4d252
commit
52f5be1f1e
14 changed files with 628 additions and 140 deletions
|
|
@ -28,3 +28,10 @@ except ImportError:
|
|||
pass
|
||||
except Exception as e:
|
||||
warnings.warn(f'Failed to import "websockets" request handler: {e}' + bug_reports_message())
|
||||
|
||||
# Importing the module is what registers the curl_cffi handler. An ImportError
# is treated as "handler unavailable" and ignored; any other failure is
# surfaced as a warning instead of aborting the package import.
try:
    from . import _curlcffi  # noqa: F401
except ImportError:
    pass
except Exception as e:
    message = f'Failed to import "curl_cffi" request handler: {e}'
    warnings.warn(message + bug_reports_message())
|
||||
|
|
|
|||
221
yt_dlp/networking/_curlcffi.py
Normal file
221
yt_dlp/networking/_curlcffi.py
Normal file
|
|
@ -0,0 +1,221 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import io
|
||||
import math
|
||||
import urllib.parse
|
||||
|
||||
from ._helper import InstanceStoreMixin, select_proxy
|
||||
from .common import (
|
||||
Features,
|
||||
Request,
|
||||
Response,
|
||||
register_preference,
|
||||
register_rh,
|
||||
)
|
||||
from .exceptions import (
|
||||
CertificateVerifyError,
|
||||
HTTPError,
|
||||
IncompleteRead,
|
||||
ProxyError,
|
||||
SSLError,
|
||||
TransportError,
|
||||
)
|
||||
from .impersonate import ImpersonateRequestHandler, ImpersonateTarget
|
||||
from ..dependencies import curl_cffi
|
||||
from ..utils import int_or_none
|
||||
|
||||
# Refuse to load this module (and thus to register the handler) when the
# optional curl_cffi dependency could not be imported.
if curl_cffi is None:
    raise ImportError('curl_cffi is not installed')

# Version as a tuple of ints for comparison.
# NOTE(review): presumably int_or_none maps non-numeric components to 0 via `default` - confirm.
curl_cffi_version = tuple(
    int_or_none(component, default=0)
    for component in curl_cffi.__version__.split('.'))

if curl_cffi_version != (0, 5, 10):
    # Record the rejected version on the module object before bailing out
    curl_cffi._yt_dlp__version = f'{curl_cffi.__version__} (unsupported)'
    raise ImportError('Only curl_cffi 0.5.10 is supported')
|
||||
|
||||
import curl_cffi.requests
|
||||
from curl_cffi.const import CurlECode, CurlOpt
|
||||
|
||||
|
||||
class CurlCFFIResponseReader(io.IOBase):
    """Readable file-like object over the streamed body of a curl_cffi response.

    Chunks are pulled lazily from ``response.iter_content()`` and buffered so
    that callers can request an arbitrary number of bytes at a time.
    ``bytes_read`` counts every byte received from the response so far,
    including bytes that are still sitting in the internal buffer.
    """

    def __init__(self, response: curl_cffi.requests.Response):
        self._response = response
        self._iterator = response.iter_content()
        self._buffer = b''
        self.bytes_read = 0

    def readable(self):
        return True

    def read(self, size=None):
        """Read and return up to *size* bytes of the response body.

        @param size: Maximum number of bytes to return. ``None`` or a negative
                     value reads until the response is exhausted.
        @returns:    The bytes read; ``b''`` once the body is exhausted.

        The underlying response is closed as soon as the body has been fully
        consumed, and also when reading raises an exception.
        """
        # Follow the io convention that a negative size means "read everything".
        # Without this, read(-1) would fetch nothing and slice off the last buffered byte.
        if size is not None and size < 0:
            size = None

        exception_raised = True
        try:
            while self._iterator and (size is None or len(self._buffer) < size):
                chunk = next(self._iterator, None)
                if chunk is None:
                    # Body exhausted: drop the iterator so later reads only drain the buffer
                    self._iterator = None
                    break
                self._buffer += chunk
                self.bytes_read += len(chunk)

            if size is None:
                size = len(self._buffer)
            data = self._buffer[:size]
            self._buffer = self._buffer[size:]

            # "free" the curl instance if the response is fully read.
            # curl_cffi doesn't do this automatically and only allows one open response per thread
            if not self._iterator and not self._buffer:
                self.close()
            exception_raised = False
            return data
        finally:
            # Any failure while reading leaves the response unusable; release it
            if exception_raised:
                self.close()

    def close(self):
        """Close the underlying response and discard any buffered data."""
        if not self.closed:
            self._response.close()
            self._buffer = b''
        super().close()
|
||||
|
||||
|
||||
class CurlCFFIResponseAdapter(Response):
    """Response built from a streamed curl_cffi response.

    The body is exposed through a CurlCFFIResponseReader, and curl_cffi errors
    raised while reading are translated into this package's exceptions.
    """
    fp: CurlCFFIResponseReader

    def __init__(self, response: curl_cffi.requests.Response):
        super().__init__(
            fp=CurlCFFIResponseReader(response),
            headers=response.headers,
            url=response.url,
            status=response.status_code)

    def read(self, amt=None):
        """Read up to *amt* bytes from the response body.

        @param amt: Number of bytes to read, passed through to the reader.
        @raises IncompleteRead: if curl reports a partial transfer.
        @raises TransportError: for any other curl_cffi request error.
        """
        try:
            return self.fp.read(amt)
        except curl_cffi.requests.errors.RequestsError as e:
            if e.code == CurlECode.PARTIAL_FILE:
                # The error does not necessarily carry a response object;
                # only consult its headers when it does
                content_length = None
                if e.response is not None:
                    content_length = int_or_none(e.response.headers.get('Content-Length'))
                raise IncompleteRead(
                    partial=self.fp.bytes_read,
                    expected=content_length - self.fp.bytes_read if content_length is not None else None,
                    cause=e) from e
            raise TransportError(cause=e) from e
|
||||
|
||||
|
||||
@register_rh
class CurlCFFIRH(ImpersonateRequestHandler, InstanceStoreMixin):
    """Request handler that sends HTTP(S) requests through curl_cffi sessions."""

    RH_NAME = 'curl_cffi'
    _SUPPORTED_URL_SCHEMES = ('http', 'https')
    _SUPPORTED_FEATURES = (Features.NO_PROXY, Features.ALL_PROXY)
    _SUPPORTED_PROXY_SCHEMES = ('http', 'https', 'socks4', 'socks4a', 'socks5', 'socks5h')
    # Impersonation target -> curl_cffi browser profile passed as `impersonate`
    _SUPPORTED_IMPERSONATE_TARGET_MAP = {
        ImpersonateTarget('chrome', '110', 'windows', '10'): curl_cffi.requests.BrowserType.chrome110,
        ImpersonateTarget('chrome', '107', 'windows', '10'): curl_cffi.requests.BrowserType.chrome107,
        ImpersonateTarget('chrome', '104', 'windows', '10'): curl_cffi.requests.BrowserType.chrome104,
        ImpersonateTarget('chrome', '101', 'windows', '10'): curl_cffi.requests.BrowserType.chrome101,
        ImpersonateTarget('chrome', '100', 'windows', '10'): curl_cffi.requests.BrowserType.chrome100,
        ImpersonateTarget('chrome', '99', 'windows', '10'): curl_cffi.requests.BrowserType.chrome99,
        ImpersonateTarget('edge', '101', 'windows', '10'): curl_cffi.requests.BrowserType.edge101,
        ImpersonateTarget('edge', '99', 'windows', '10'): curl_cffi.requests.BrowserType.edge99,
        ImpersonateTarget('safari', '15.5', 'macos', '12'): curl_cffi.requests.BrowserType.safari15_5,
        ImpersonateTarget('safari', '15.3', 'macos', '11'): curl_cffi.requests.BrowserType.safari15_3,
        ImpersonateTarget('chrome', '99', 'android', '12'): curl_cffi.requests.BrowserType.chrome99_android,
    }

    def _create_instance(self, cookiejar=None):
        """Create a curl_cffi session that uses *cookiejar* for its cookies."""
        return curl_cffi.requests.Session(cookies=cookiejar)

    def _check_extensions(self, extensions):
        """Remove the extensions this handler deals with from *extensions*.

        NOTE(review): presumably whatever is left in the mapping is treated as
        unsupported by the caller - confirm against the base class.
        """
        super()._check_extensions(extensions)
        for handled in ('impersonate', 'cookiejar', 'timeout'):
            extensions.pop(handled, None)

    def _send(self, request: Request):
        """Perform *request* with curl_cffi and return the adapted response.

        @raises CertificateVerifyError: curl failed to verify the peer.
        @raises SSLError:               curl reported an SSL connect error.
        @raises ProxyError:             curl reported a proxy error.
        @raises TransportError:         any other curl_cffi request error.
        @raises HTTPError:              the final status is outside 2xx, or the
                                        redirect limit was exceeded.
        """
        max_redirects_exceeded = False

        # An explicit Cookie header takes the place of the cookie jar
        if 'cookie' in request.headers:
            cookiejar = None
        else:
            cookiejar = self._get_cookiejar(request)
        session: curl_cffi.requests.Session = self._get_instance(cookiejar=cookiejar)

        if self.verbose:
            session.curl.setopt(CurlOpt.VERBOSE, 1)

        proxies = self._get_proxies(request)
        if 'no' in proxies:
            # Hand the no-proxy list to curl and keep it out of proxy selection
            session.curl.setopt(CurlOpt.NOPROXY, proxies.pop('no'))

        # curl has no notion of per-protocol proxies; pick the one matching the request URL
        proxy = select_proxy(request.url, proxies=proxies)
        if proxy:
            session.curl.setopt(CurlOpt.PROXY, proxy)
            url_scheme = urllib.parse.urlparse(request.url).scheme.lower()
            if url_scheme != 'http':
                # Tunnel through the proxy with HTTP CONNECT for HTTPS urls.
                # Plain http is not tunnelled, for compatibility with urllib behaviour.
                # See: https://curl.se/libcurl/c/CURLOPT_HTTPPROXYTUNNEL.html
                session.curl.setopt(CurlOpt.HTTPPROXYTUNNEL, 1)

        headers = self._get_impersonate_headers(request)

        if self._client_cert:
            session.curl.setopt(CurlOpt.SSLCERT, self._client_cert['client_certificate'])
            key_path = self._client_cert.get('client_certificate_key')
            key_password = self._client_cert.get('client_certificate_password')
            if key_path:
                session.curl.setopt(CurlOpt.SSLKEY, key_path)
            if key_password:
                session.curl.setopt(CurlOpt.KEYPASSWD, key_password)

        timeout = self._calculate_timeout(request)

        # Emulate a read timeout with CURLOPT_LOW_SPEED_LIMIT / CURLOPT_LOW_SPEED_TIME [1],
        # since curl_cffi does not currently do this itself [2].
        # CURLOPT_LOW_SPEED_TIME takes whole seconds, hence rounding the timeout up [3].
        # [1] https://unix.stackexchange.com/a/305311
        # [2] https://github.com/yifeikong/curl_cffi/issues/156
        # [3] https://curl.se/libcurl/c/CURLOPT_LOW_SPEED_TIME.html
        session.curl.setopt(CurlOpt.LOW_SPEED_LIMIT, 1)  # 1 byte per second
        session.curl.setopt(CurlOpt.LOW_SPEED_TIME, math.ceil(timeout))

        try:
            curl_response = session.request(
                method=request.method,
                url=request.url,
                headers=headers,
                data=request.data,
                verify=self.verify,
                max_redirects=5,
                timeout=timeout,
                impersonate=self._SUPPORTED_IMPERSONATE_TARGET_MAP.get(
                    self._get_request_target(request)),
                interface=self.source_address,
                stream=True
            )
        except curl_cffi.requests.errors.RequestsError as e:
            if e.code == CurlECode.PEER_FAILED_VERIFICATION:
                raise CertificateVerifyError(cause=e) from e
            if e.code == CurlECode.SSL_CONNECT_ERROR:
                raise SSLError(cause=e) from e
            if e.code == CurlECode.PROXY:
                raise ProxyError(cause=e) from e
            if e.code != CurlECode.TOO_MANY_REDIRECTS:
                raise TransportError(cause=e) from e

            # Too many redirects: carry on with the response attached to the
            # error so that it is reported as an HTTPError below
            max_redirects_exceeded = True
            curl_response = e.response

        response = CurlCFFIResponseAdapter(curl_response)

        if not (200 <= response.status < 300):
            raise HTTPError(response, redirect_loop=max_redirects_exceeded)

        return response
|
||||
|
||||
|
||||
@register_preference(CurlCFFIRH)
def curl_cffi_preference(rh, request):
    """Return a fixed preference of -100 for the curl_cffi handler.

    Neither *rh* nor *request* influences the result.
    NOTE(review): presumably a negative value ranks this handler below the
    others - confirm against register_preference.
    """
    return -100
|
||||
|
|
@ -307,8 +307,7 @@ class RequestsRH(RequestHandler, InstanceStoreMixin):
|
|||
|
||||
max_redirects_exceeded = False
|
||||
|
||||
session = self._get_instance(
|
||||
cookiejar=request.extensions.get('cookiejar') or self.cookiejar)
|
||||
session = self._get_instance(cookiejar=self._get_cookiejar(request))
|
||||
|
||||
try:
|
||||
requests_res = session.request(
|
||||
|
|
@ -316,8 +315,8 @@ class RequestsRH(RequestHandler, InstanceStoreMixin):
|
|||
url=request.url,
|
||||
data=request.data,
|
||||
headers=headers,
|
||||
timeout=float(request.extensions.get('timeout') or self.timeout),
|
||||
proxies=request.proxies or self.proxies,
|
||||
timeout=self._calculate_timeout(request),
|
||||
proxies=self._get_proxies(request),
|
||||
allow_redirects=True,
|
||||
stream=True
|
||||
)
|
||||
|
|
|
|||
|
|
@ -389,11 +389,11 @@ class UrllibRH(RequestHandler, InstanceStoreMixin):
|
|||
)
|
||||
|
||||
opener = self._get_instance(
|
||||
proxies=request.proxies or self.proxies,
|
||||
cookiejar=request.extensions.get('cookiejar') or self.cookiejar
|
||||
proxies=self._get_proxies(request),
|
||||
cookiejar=self._get_cookiejar(request)
|
||||
)
|
||||
try:
|
||||
res = opener.open(urllib_req, timeout=float(request.extensions.get('timeout') or self.timeout))
|
||||
res = opener.open(urllib_req, timeout=self._calculate_timeout(request))
|
||||
except urllib.error.HTTPError as e:
|
||||
if isinstance(e.fp, (http.client.HTTPResponse, urllib.response.addinfourl)):
|
||||
# Prevent file object from being closed when urllib.error.HTTPError is destroyed.
|
||||
|
|
|
|||
|
|
@ -112,10 +112,10 @@ class WebsocketsRH(WebSocketRequestHandler):
|
|||
logging.getLogger(name).removeHandler(handler)
|
||||
|
||||
def _send(self, request):
|
||||
timeout = float(request.extensions.get('timeout') or self.timeout)
|
||||
timeout = self._calculate_timeout(request)
|
||||
headers = self._merge_headers(request.headers)
|
||||
if 'cookie' not in headers:
|
||||
cookiejar = request.extensions.get('cookiejar') or self.cookiejar
|
||||
cookiejar = self._get_cookiejar(request)
|
||||
cookie_header = cookiejar.get_cookie_header(request.url)
|
||||
if cookie_header:
|
||||
headers['cookie'] = cookie_header
|
||||
|
|
@ -125,7 +125,7 @@ class WebsocketsRH(WebSocketRequestHandler):
|
|||
'source_address': (self.source_address, 0) if self.source_address else None,
|
||||
'timeout': timeout
|
||||
}
|
||||
proxy = select_proxy(request.url, request.proxies or self.proxies or {})
|
||||
proxy = select_proxy(request.url, self._get_proxies(request))
|
||||
try:
|
||||
if proxy:
|
||||
socks_proxy_options = make_socks_proxy_opts(proxy)
|
||||
|
|
|
|||
|
|
@ -256,6 +256,15 @@ class RequestHandler(abc.ABC):
|
|||
def _merge_headers(self, request_headers):
    """Return an HTTPHeaderDict built from the handler headers and *request_headers*.

    NOTE(review): presumably entries from *request_headers* take precedence
    over the handler's own headers - confirm against HTTPHeaderDict.
    """
    merged = HTTPHeaderDict(self.headers, request_headers)
    return merged
|
||||
|
||||
def _calculate_timeout(self, request):
    """Return the timeout for *request* as a float.

    The request's 'timeout' extension is used when it is set to a truthy
    value; otherwise the handler-wide ``self.timeout`` applies.
    """
    timeout = request.extensions.get('timeout')
    if not timeout:
        timeout = self.timeout
    return float(timeout)
|
||||
|
||||
def _get_cookiejar(self, request):
    """Return the cookie jar to use for *request*.

    The request's 'cookiejar' extension wins whenever it is present;
    ``self.cookiejar`` is only used when the extension is missing or None.
    """
    cookiejar = request.extensions.get('cookiejar')
    # Compare against None rather than relying on truthiness: an empty
    # cookie jar has length 0 and is therefore falsy, but must still be honoured
    return self.cookiejar if cookiejar is None else cookiejar
|
||||
|
||||
def _get_proxies(self, request):
    """Return a copy of the proxy mapping that applies to *request*.

    Proxies set on the request take precedence over the handler-wide
    ``self.proxies``. A copy is returned so that callers may modify it
    without affecting the request or the handler.
    """
    # Fall back to an empty mapping so that a handler and request that both
    # have no proxies configured yield {} instead of failing on `.copy()`
    return (request.proxies or self.proxies or {}).copy()
|
||||
|
||||
def _check_url_scheme(self, request: Request):
|
||||
scheme = urllib.parse.urlparse(request.url).scheme.lower()
|
||||
if self._SUPPORTED_URL_SCHEMES is not None and scheme not in self._SUPPORTED_URL_SCHEMES:
|
||||
|
|
@ -491,7 +500,7 @@ class Response(io.IOBase):
|
|||
|
||||
def __init__(
|
||||
self,
|
||||
fp: typing.IO,
|
||||
fp: io.IOBase,
|
||||
url: str,
|
||||
headers: Mapping[str, str],
|
||||
status: int = 200,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue