Merge 8057c858ba into c8680b65f7

2026-01-23 13:37:08 +01:00 · 2026-01-21 04:07:55 +00:00 · 2026-01-21 04:07:55 +00:00 · 84b3764113
commit 84b3764113
parent c8680b65f7 8057c858ba
2 changed files with 44 additions and 6 deletions
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -1814,6 +1814,9 @@ Line 1
    GET_ELEMENT_BY_ATTRIBUTE_TEST_STRING = '''
        <div itemprop="author" itemscope>foo</div>
    '''
+    VOID_ELEMENT_TEST_STRING = '''
+        <img alt="foo" src="bar.png"><img alt="foobar" src="baz.jpg"><img alt="foo"/>
+    '''  # three <img> tags without closing tags: alt="foo", alt="foobar", self-closed alt="foo"

    def test_get_element_by_attribute(self):
        html = self.GET_ELEMENT_BY_CLASS_TEST_STRING
@@ -1826,6 +1829,10 @@ Line 1

        self.assertEqual(get_element_by_attribute('itemprop', 'author', html), 'foo')

+        html = self.VOID_ELEMENT_TEST_STRING
+
+        self.assertEqual(get_element_by_attribute('alt', 'foo', html), '')
+
    def test_get_element_html_by_attribute(self):
        html = self.GET_ELEMENT_BY_CLASS_TEST_STRING

@@ -1837,6 +1844,10 @@ Line 1

        self.assertEqual(get_element_html_by_attribute('itemprop', 'author', html), html.strip())

+        html = self.VOID_ELEMENT_TEST_STRING
+
+        self.assertEqual(get_element_html_by_attribute('alt', 'foo', html), '<img alt="foo" src="bar.png">')
+
    GET_ELEMENTS_BY_CLASS_TEST_STRING = '''
        <span class="foo bar">nice</span><span class="foo bar">also nice</span>
    '''
@@ -1861,6 +1872,10 @@ Line 1
        self.assertEqual(get_elements_by_attribute('class', 'foo', html), [])
        self.assertEqual(get_elements_by_attribute('class', 'no-such-foo', html), [])

+        html = self.VOID_ELEMENT_TEST_STRING
+
+        self.assertEqual(get_elements_by_attribute('alt', 'foo', html), ['', ''])
+
    def test_get_elements_html_by_attribute(self):
        html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING

@@ -1868,6 +1883,11 @@ Line 1
        self.assertEqual(get_elements_html_by_attribute('class', 'foo', html), [])
        self.assertEqual(get_elements_html_by_attribute('class', 'no-such-foo', html), [])

+        html = self.VOID_ELEMENT_TEST_STRING
+
+        self.assertEqual(get_elements_html_by_attribute(
+            'alt', 'foo', html), ['<img alt="foo" src="bar.png">', '<img alt="foo"/>'])
+
    def test_get_elements_text_and_html_by_attribute(self):
        html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING

@@ -1880,6 +1900,11 @@ Line 1
        self.assertEqual(list(get_elements_text_and_html_by_attribute(
            'class', 'foo', '<a class="foo">nice</a><span class="foo">nice</span>', tag='a')), [('nice', '<a class="foo">nice</a>')])

+        html = self.VOID_ELEMENT_TEST_STRING
+
+        self.assertEqual(list(get_elements_text_and_html_by_attribute(
+            'alt', 'foo', html, tag='img')), [('', '<img alt="foo" src="bar.png">'), ('', '<img alt="foo"/>')])
+
    GET_ELEMENT_BY_TAG_TEST_STRING = '''
    random text lorem ipsum</p>
    <div>
@@ -1908,6 +1933,10 @@ Line 1
            (self.GET_ELEMENT_BY_TAG_RES_INNERSPAN_TEXT, self.GET_ELEMENT_BY_TAG_RES_INNERSPAN_HTML))
        self.assertRaises(compat_HTMLParseError, get_element_text_and_html_by_tag, 'article', html)

+        html = self.VOID_ELEMENT_TEST_STRING
+
+        self.assertEqual(get_element_text_and_html_by_tag('img', html), ('', '<img alt="foo" src="bar.png">'))
+
    def test_iri_to_uri(self):
        self.assertEqual(
            iri_to_uri('https://www.google.com/search?q=foo&ie=utf-8&oe=utf-8&client=firefox-b'),
--- a/yt_dlp/utils/_utils.py
+++ b/yt_dlp/utils/_utils.py
@@ -168,6 +168,12 @@ JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<

 NUMBER_RE = r'\d+(?:\.\d+)?'

+VOID_ELEMENTS = frozenset((  # HTML void elements (no content, no end tag); immutable, hashed lookup
+    'area', 'base', 'br', 'col', 'embed',
+    'hr', 'img', 'input', 'link', 'meta',
+    'param', 'source', 'track', 'wbr',
+))
+

@functools.cache
 def preferredencoding():
@@ -365,15 +371,13 @@ def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w
    if not value:
        return

-    quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'
-
    value = re.escape(value) if escape_value else value

    partial_element_re = rf'''(?x)
        <(?P<tag>{tag})
-         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
-         \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
-        '''
+        (?:\s[^>"']*|"[^"]*"|'[^']*')*?
+        \s{re.escape(attribute)}\s*=\s*(?P<q>['"])?(?-x:{value})(?(q)(?P=q)|(?=[\s/>]))
+    '''

    for m in re.finditer(partial_element_re, html):
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])
@@ -437,12 +441,17 @@ def get_element_text_and_html_by_tag(tag, html):
            return haystack.index(needle)
        except ValueError:
            raise exc
-    closing_tag = f'</{tag}>'
+
    whole_start = find_or_raise(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    content_start = find_or_raise(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
    content_start += whole_start + 1
+
+    if tag in VOID_ELEMENTS:
+        return '', html[whole_start:content_start]
+
+    closing_tag = f'</{tag}>'
    with HTMLBreakOnClosingTagParser() as parser:
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag: