[rh:curlcffi] Add support for curl_cffi

Authored by: coletdjnz, Grub4K, pukkandan, bashonly

Co-authored-by: Simon Sawicki <contact@grub4k.xyz>
Co-authored-by: pukkandan <pukkandan.ytdlp@gmail.com>
Co-authored-by: bashonly <bashonly@protonmail.com>
This commit is contained in:
coletdjnz 2024-03-16 22:52:38 -05:00 committed by bashonly
parent 0b81d4d252
commit 52f5be1f1e
No known key found for this signature in database
GPG key ID: 783F096F253D15B0
14 changed files with 628 additions and 140 deletions

View file

@ -28,3 +28,10 @@ except ImportError:
pass
except Exception as e:
warnings.warn(f'Failed to import "websockets" request handler: {e}' + bug_reports_message())
# Optionally load the curl_cffi-based request handler. Importing the module is
# done purely for its import-time side effects, hence the unused-import noqa.
try:
    from . import _curlcffi  # noqa: F401
except ImportError:
    # The handler module raises ImportError itself when curl_cffi is missing or
    # is an unsupported version; in that case the handler is silently skipped.
    pass
except Exception as e:
    # Any other failure is unexpected: report it as a warning instead of aborting the package import.
    warnings.warn(f'Failed to import "curl_cffi" request handler: {e}' + bug_reports_message())

View file

@ -0,0 +1,221 @@
from __future__ import annotations
import io
import math
import urllib.parse
from ._helper import InstanceStoreMixin, select_proxy
from .common import (
Features,
Request,
Response,
register_preference,
register_rh,
)
from .exceptions import (
CertificateVerifyError,
HTTPError,
IncompleteRead,
ProxyError,
SSLError,
TransportError,
)
from .impersonate import ImpersonateRequestHandler, ImpersonateTarget
from ..dependencies import curl_cffi
from ..utils import int_or_none
# Refuse to load this module (by raising ImportError) unless a usable curl_cffi is present.
if curl_cffi is None:
    raise ImportError('curl_cffi is not installed')

# Parse the dotted version string into a tuple of ints; components that
# int_or_none cannot convert are replaced by its default of 0.
curl_cffi_version = tuple(
    int_or_none(component, default=0)
    for component in curl_cffi.__version__.split('.'))

if curl_cffi_version != (0, 5, 10):
    # Record the rejected version on the module object before bailing out.
    curl_cffi._yt_dlp__version = f'{curl_cffi.__version__} (unsupported)'
    raise ImportError('Only curl_cffi 0.5.10 is supported')
import curl_cffi.requests
from curl_cffi.const import CurlECode, CurlOpt
class CurlCFFIResponseReader(io.IOBase):
    """Readable file-like object over the content iterator of a streamed curl_cffi response.

    Chunks are pulled from ``response.iter_content()`` on demand and buffered
    until enough data is available to satisfy a ``read()`` call.
    ``bytes_read`` counts every byte pulled from the iterator so far.
    """

    def __init__(self, response: curl_cffi.requests.Response):
        self._response = response
        self._iterator = response.iter_content()
        self._buffer = b''
        self.bytes_read = 0

    def readable(self):
        return True

    def read(self, size=None):
        """Read and return up to ``size`` bytes.

        ``size=None`` or a negative ``size`` reads until the response is
        exhausted. The underlying response is closed once all data has been
        consumed, or if reading raises an exception.
        """
        # Follow the io convention that a negative size means "read everything".
        # Without this, read(-1) would never pull from the iterator and would
        # return b'' as if the stream had ended.
        if size is not None and size < 0:
            size = None

        exception_raised = True
        try:
            while self._iterator and (size is None or len(self._buffer) < size):
                chunk = next(self._iterator, None)
                if chunk is None:
                    # Iterator exhausted: remember that so we stop polling it
                    self._iterator = None
                    break
                self._buffer += chunk
                self.bytes_read += len(chunk)

            if size is None:
                size = len(self._buffer)
            data = self._buffer[:size]
            self._buffer = self._buffer[size:]

            # "free" the curl instance if the response is fully read.
            # curl_cffi doesn't do this automatically and only allows one open response per thread
            if not self._iterator and not self._buffer:
                self.close()
            exception_raised = False
            return data
        finally:
            # Do not leave the curl instance occupied if reading failed
            if exception_raised:
                self.close()

    def close(self):
        """Close the underlying response and drop any buffered data (idempotent)."""
        if not self.closed:
            self._response.close()
            self._buffer = b''
        super().close()
class CurlCFFIResponseAdapter(Response):
    """Response that wraps a streamed curl_cffi response.

    The body is exposed through a CurlCFFIResponseReader, and curl_cffi read
    errors are translated into this package's networking exceptions.
    """
    fp: CurlCFFIResponseReader

    def __init__(self, response: curl_cffi.requests.Response):
        super().__init__(
            fp=CurlCFFIResponseReader(response),
            headers=response.headers,
            url=response.url,
            status=response.status_code)

    def read(self, amt=None):
        """Read up to ``amt`` bytes of the body (everything if ``amt`` is None).

        Raises:
            IncompleteRead: curl reported a partial transfer (PARTIAL_FILE).
            TransportError: any other curl_cffi request error.
        """
        try:
            return self.fp.read(amt)
        except curl_cffi.requests.errors.RequestsError as e:
            if e.code == CurlECode.PARTIAL_FILE:
                # The error may not carry a response object; in that case the
                # expected length is simply unknown rather than a crash.
                error_response = e.response
                content_length = (
                    int_or_none(error_response.headers.get('Content-Length'))
                    if error_response is not None else None)
                raise IncompleteRead(
                    partial=self.fp.bytes_read,
                    expected=content_length - self.fp.bytes_read if content_length is not None else None,
                    cause=e) from e
            raise TransportError(cause=e) from e
@register_rh
class CurlCFFIRH(ImpersonateRequestHandler, InstanceStoreMixin):
    """Request handler that sends http/https requests through curl_cffi sessions."""

    RH_NAME = 'curl_cffi'
    _SUPPORTED_URL_SCHEMES = ('http', 'https')
    _SUPPORTED_FEATURES = (Features.NO_PROXY, Features.ALL_PROXY)
    _SUPPORTED_PROXY_SCHEMES = ('http', 'https', 'socks4', 'socks4a', 'socks5', 'socks5h')
    # Maps each supported impersonation target to the curl_cffi browser type used for it
    _SUPPORTED_IMPERSONATE_TARGET_MAP = {
        ImpersonateTarget('chrome', '110', 'windows', '10'): curl_cffi.requests.BrowserType.chrome110,
        ImpersonateTarget('chrome', '107', 'windows', '10'): curl_cffi.requests.BrowserType.chrome107,
        ImpersonateTarget('chrome', '104', 'windows', '10'): curl_cffi.requests.BrowserType.chrome104,
        ImpersonateTarget('chrome', '101', 'windows', '10'): curl_cffi.requests.BrowserType.chrome101,
        ImpersonateTarget('chrome', '100', 'windows', '10'): curl_cffi.requests.BrowserType.chrome100,
        ImpersonateTarget('chrome', '99', 'windows', '10'): curl_cffi.requests.BrowserType.chrome99,
        ImpersonateTarget('edge', '101', 'windows', '10'): curl_cffi.requests.BrowserType.edge101,
        ImpersonateTarget('edge', '99', 'windows', '10'): curl_cffi.requests.BrowserType.edge99,
        ImpersonateTarget('safari', '15.5', 'macos', '12'): curl_cffi.requests.BrowserType.safari15_5,
        ImpersonateTarget('safari', '15.3', 'macos', '11'): curl_cffi.requests.BrowserType.safari15_3,
        ImpersonateTarget('chrome', '99', 'android', '12'): curl_cffi.requests.BrowserType.chrome99_android,
    }

    def _create_instance(self, cookiejar=None):
        """Create a new curl_cffi session that uses *cookiejar* for its cookies."""
        new_session = curl_cffi.requests.Session(cookies=cookiejar)
        return new_session

    def _check_extensions(self, extensions):
        """Run the parent checks, then remove the extensions this handler deals with itself."""
        super()._check_extensions(extensions)
        for handled_extension in ('impersonate', 'cookiejar', 'timeout'):
            extensions.pop(handled_extension, None)

    def _send(self, request: Request):
        """Send *request* with curl_cffi and return a CurlCFFIResponseAdapter.

        Raises:
            CertificateVerifyError, SSLError, ProxyError, TransportError:
                translated from the corresponding curl error codes.
            HTTPError: the response status is outside the 2xx range
                (flagged as a redirect loop if the redirect limit was hit).
        """
        redirect_limit_hit = False

        # No cookiejar is handed to the session when the request carries its own Cookie header
        if 'cookie' in request.headers:
            cookiejar = None
        else:
            cookiejar = self._get_cookiejar(request)
        session: curl_cffi.requests.Session = self._get_instance(cookiejar=cookiejar)

        if self.verbose:
            session.curl.setopt(CurlOpt.VERBOSE, 1)

        proxies = self._get_proxies(request)
        if 'no' in proxies:
            session.curl.setopt(CurlOpt.NOPROXY, proxies.pop('no'))

        # curl doesn't support per protocol proxies, so we select the one that matches the request protocol
        proxy = select_proxy(request.url, proxies=proxies)
        if proxy:
            session.curl.setopt(CurlOpt.PROXY, proxy)
            if urllib.parse.urlparse(request.url).scheme.lower() != 'http':
                # Enable HTTP CONNECT for HTTPS urls.
                # Don't use CONNECT for http for compatibility with urllib behaviour.
                # See: https://curl.se/libcurl/c/CURLOPT_HTTPPROXYTUNNEL.html
                session.curl.setopt(CurlOpt.HTTPPROXYTUNNEL, 1)

        headers = self._get_impersonate_headers(request)

        client_cert = self._client_cert
        if client_cert:
            session.curl.setopt(CurlOpt.SSLCERT, client_cert['client_certificate'])
            cert_key = client_cert.get('client_certificate_key')
            cert_password = client_cert.get('client_certificate_password')
            if cert_key:
                session.curl.setopt(CurlOpt.SSLKEY, cert_key)
            if cert_password:
                session.curl.setopt(CurlOpt.KEYPASSWD, cert_password)

        timeout = self._calculate_timeout(request)

        # set CURLOPT_LOW_SPEED_LIMIT and CURLOPT_LOW_SPEED_TIME to act as a read timeout. [1]
        # curl_cffi does not currently do this. [2]
        # Note: CURLOPT_LOW_SPEED_TIME is in seconds, so we need to round up to the nearest second. [3]
        # [1] https://unix.stackexchange.com/a/305311
        # [2] https://github.com/yifeikong/curl_cffi/issues/156
        # [3] https://curl.se/libcurl/c/CURLOPT_LOW_SPEED_TIME.html
        session.curl.setopt(CurlOpt.LOW_SPEED_LIMIT, 1)  # 1 byte per second
        session.curl.setopt(CurlOpt.LOW_SPEED_TIME, math.ceil(timeout))

        try:
            curl_response = session.request(
                method=request.method,
                url=request.url,
                headers=headers,
                data=request.data,
                verify=self.verify,
                max_redirects=5,
                timeout=timeout,
                impersonate=self._SUPPORTED_IMPERSONATE_TARGET_MAP.get(
                    self._get_request_target(request)),
                interface=self.source_address,
                stream=True,
            )
        except curl_cffi.requests.errors.RequestsError as error:
            error_code = error.code
            if error_code == CurlECode.TOO_MANY_REDIRECTS:
                # Not raised here: the response attached to the error is
                # reported through the HTTPError below instead.
                redirect_limit_hit = True
                curl_response = error.response
            else:
                if error_code == CurlECode.PEER_FAILED_VERIFICATION:
                    error_type = CertificateVerifyError
                elif error_code == CurlECode.SSL_CONNECT_ERROR:
                    error_type = SSLError
                elif error_code == CurlECode.PROXY:
                    error_type = ProxyError
                else:
                    error_type = TransportError
                raise error_type(cause=error) from error

        response = CurlCFFIResponseAdapter(curl_response)
        if 200 <= response.status < 300:
            return response
        raise HTTPError(response, redirect_loop=redirect_limit_hit)
@register_preference(CurlCFFIRH)
def curl_cffi_preference(rh, request):
    """Return the constant preference value (-100) registered for CurlCFFIRH.

    Neither the handler nor the request influences the result.
    """
    preference = -100
    return preference

View file

@ -307,8 +307,7 @@ class RequestsRH(RequestHandler, InstanceStoreMixin):
max_redirects_exceeded = False
session = self._get_instance(
cookiejar=request.extensions.get('cookiejar') or self.cookiejar)
session = self._get_instance(cookiejar=self._get_cookiejar(request))
try:
requests_res = session.request(
@ -316,8 +315,8 @@ class RequestsRH(RequestHandler, InstanceStoreMixin):
url=request.url,
data=request.data,
headers=headers,
timeout=float(request.extensions.get('timeout') or self.timeout),
proxies=request.proxies or self.proxies,
timeout=self._calculate_timeout(request),
proxies=self._get_proxies(request),
allow_redirects=True,
stream=True
)

View file

@ -389,11 +389,11 @@ class UrllibRH(RequestHandler, InstanceStoreMixin):
)
opener = self._get_instance(
proxies=request.proxies or self.proxies,
cookiejar=request.extensions.get('cookiejar') or self.cookiejar
proxies=self._get_proxies(request),
cookiejar=self._get_cookiejar(request)
)
try:
res = opener.open(urllib_req, timeout=float(request.extensions.get('timeout') or self.timeout))
res = opener.open(urllib_req, timeout=self._calculate_timeout(request))
except urllib.error.HTTPError as e:
if isinstance(e.fp, (http.client.HTTPResponse, urllib.response.addinfourl)):
# Prevent file object from being closed when urllib.error.HTTPError is destroyed.

View file

@ -112,10 +112,10 @@ class WebsocketsRH(WebSocketRequestHandler):
logging.getLogger(name).removeHandler(handler)
def _send(self, request):
timeout = float(request.extensions.get('timeout') or self.timeout)
timeout = self._calculate_timeout(request)
headers = self._merge_headers(request.headers)
if 'cookie' not in headers:
cookiejar = request.extensions.get('cookiejar') or self.cookiejar
cookiejar = self._get_cookiejar(request)
cookie_header = cookiejar.get_cookie_header(request.url)
if cookie_header:
headers['cookie'] = cookie_header
@ -125,7 +125,7 @@ class WebsocketsRH(WebSocketRequestHandler):
'source_address': (self.source_address, 0) if self.source_address else None,
'timeout': timeout
}
proxy = select_proxy(request.url, request.proxies or self.proxies or {})
proxy = select_proxy(request.url, self._get_proxies(request))
try:
if proxy:
socks_proxy_options = make_socks_proxy_opts(proxy)

View file

@ -256,6 +256,15 @@ class RequestHandler(abc.ABC):
def _merge_headers(self, request_headers):
    """Return an HTTPHeaderDict built from the handler's headers followed by *request_headers*."""
    merged_headers = HTTPHeaderDict(self.headers, request_headers)
    return merged_headers
def _calculate_timeout(self, request):
    """Return the timeout to use for *request* as a float.

    The request's 'timeout' extension is used when it is set to a truthy
    value; otherwise the handler's own timeout applies.
    """
    chosen_timeout = request.extensions.get('timeout')
    if not chosen_timeout:
        chosen_timeout = self.timeout
    return float(chosen_timeout)
def _get_cookiejar(self, request):
    """Return the cookiejar to use for *request*.

    The request's 'cookiejar' extension takes priority; the handler's own
    cookiejar is used only when the extension is absent or None.
    """
    cookiejar = request.extensions.get('cookiejar')
    # Compare against None explicitly: cookiejars define __len__, so an empty
    # request-supplied jar is falsy and must not be replaced by the handler's jar.
    return self.cookiejar if cookiejar is None else cookiejar
def _get_proxies(self, request):
    """Return a copy of the proxy mapping to use for *request*.

    The request's proxies are used when non-empty, otherwise the handler's.
    A copy is returned so callers can modify it without affecting the
    request or the handler.
    """
    # Fall back to an empty mapping so that .copy() cannot fail when neither
    # the request nor the handler has proxies set.
    return (request.proxies or self.proxies or {}).copy()
def _check_url_scheme(self, request: Request):
scheme = urllib.parse.urlparse(request.url).scheme.lower()
if self._SUPPORTED_URL_SCHEMES is not None and scheme not in self._SUPPORTED_URL_SCHEMES:
@ -491,7 +500,7 @@ class Response(io.IOBase):
def __init__(
self,
fp: typing.IO,
fp: io.IOBase,
url: str,
headers: Mapping[str, str],
status: int = 200,