Skip to content

gh-135661: Fix CDATA section parsing in HTMLParser #135665

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions Doc/library/html.parser.rst
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,17 @@ The output will then be:
attributes can be preserved, etc.).


.. method:: HTMLParser.support_cdata(flag=True)

Sets how the parser will parse CDATA sections.
If *flag* is true, then the :meth:`unknown_decl` method will be called
for the CDATA section ``<![CDATA[...]]>``.
If *flag* is false (the initial state after :meth:`reset`), then
``<![CDATA[...>`` is parsed as a bogus comment that ends at the first
``>``, and the :meth:`handle_comment` method will be called for it.

.. versionadded:: 3.13.6


The following methods are called when data or markup elements are encountered
and they are meant to be overridden in a subclass. The base class
implementations do nothing (except for :meth:`~HTMLParser.handle_startendtag`):
Expand Down
18 changes: 16 additions & 2 deletions Lib/html/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,7 @@ def reset(self):
self.lasttag = '???'
self.interesting = interesting_normal
self.cdata_elem = None
self._support_cdata = False
super().reset()

def feed(self, data):
Expand Down Expand Up @@ -174,6 +175,9 @@ def clear_cdata_mode(self):
self.interesting = interesting_normal
self.cdata_elem = None

def support_cdata(self, flag=True):
    """Choose how ``<![CDATA[`` markup is reported.

    The value is stored unchanged in ``self._support_cdata``.  When it is
    true the parser reports the section through ``unknown_decl``;
    otherwise the markup is reported as a comment.
    """
    self._support_cdata = flag

# Internal -- handle data as far as reasonable. May leave state
# and data to be processed by a subsequent call. If 'end' is
# true, force handling all data as if followed by EOF marker.
Expand Down Expand Up @@ -249,7 +253,10 @@ def goahead(self, end):
break
self.handle_comment(rawdata[i+4:j])
elif startswith("<![CDATA[", i):
self.unknown_decl(rawdata[i+3:])
if self._support_cdata:
self.unknown_decl(rawdata[i+3:])
else:
self.handle_comment(rawdata[i+1:])
elif rawdata[i:i+9].lower() == '<!doctype':
self.handle_decl(rawdata[i+2:])
elif startswith("<!", i):
Expand Down Expand Up @@ -325,7 +332,14 @@ def parse_html_declaration(self, i):
# this case is actually already handled in goahead()
return self.parse_comment(i)
elif rawdata[i:i+9] == '<![CDATA[':
return self.parse_marked_section(i)
if self._support_cdata:
j = rawdata.find(']]>', i+9)
if j < 0:
return -1
self.unknown_decl(rawdata[i+3: j])
return j + 3
else:
return self.parse_bogus_comment(i)
elif rawdata[i:i+9].lower() == '<!doctype':
# find the closing >
gtpos = rawdata.find('>', i+9)
Expand Down
82 changes: 61 additions & 21 deletions Lib/test/test_htmlparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,12 +34,16 @@ def get_events(self):

def handle_starttag(self, tag, attrs):
    """Record a ``starttag`` event; an ``svg`` tag switches CDATA support on."""
    event = ("starttag", tag, attrs)
    self.append(event)
    if tag != 'svg':
        return
    # The collector enables CDATA handling once an <svg> element is opened.
    self.support_cdata(True)

def handle_startendtag(self, tag, attrs):
    """Record a ``startendtag`` event as a ``(kind, tag, attrs)`` tuple."""
    event = ("startendtag", tag, attrs)
    self.append(event)

def handle_endtag(self, tag):
    """Record an ``endtag`` event; a closing ``svg`` tag switches CDATA support off."""
    event = ("endtag", tag)
    self.append(event)
    if tag != 'svg':
        return
    # Leaving the <svg> element: the collector disables CDATA handling again.
    self.support_cdata(False)

# all other markup

Expand Down Expand Up @@ -643,10 +647,22 @@ def test_eof_in_declarations(self):
('<!', [('comment', '')]),
('<!-', [('comment', '-')]),
('<![', [('comment', '[')]),
('<![CDATA[', [('unknown decl', 'CDATA[')]),
('<![CDATA[x', [('unknown decl', 'CDATA[x')]),
('<![CDATA[x]', [('unknown decl', 'CDATA[x]')]),
('<![CDATA[x]]', [('unknown decl', 'CDATA[x]]')]),
('<![CDATA[', [('comment', '![CDATA[')]),
('<![CDATA[x', [('comment', '![CDATA[x')]),
('<![CDATA[x]', [('comment', '![CDATA[x]')]),
('<![CDATA[x]]', [('comment', '![CDATA[x]]')]),
('<svg><text y="100"><![CDATA[',
[('starttag', 'svg', []), ('starttag', 'text', [('y', '100')]),
('unknown decl', 'CDATA[')]),
('<svg><text y="100"><![CDATA[x',
[('starttag', 'svg', []), ('starttag', 'text', [('y', '100')]),
('unknown decl', 'CDATA[x')]),
('<svg><text y="100"><![CDATA[x]',
[('starttag', 'svg', []), ('starttag', 'text', [('y', '100')]),
('unknown decl', 'CDATA[x]')]),
('<svg><text y="100"><![CDATA[x]]',
[('starttag', 'svg', []), ('starttag', 'text', [('y', '100')]),
('unknown decl', 'CDATA[x]]')]),
('<!DOCTYPE', [('decl', 'DOCTYPE')]),
('<!DOCTYPE ', [('decl', 'DOCTYPE ')]),
('<!DOCTYPE html', [('decl', 'DOCTYPE html')]),
Expand Down Expand Up @@ -721,26 +737,50 @@ def test_broken_condcoms(self):
]
self._run_check(html, expected)

def test_cdata_declarations(self):
# More tests should be added. See also "8.2.4.42. Markup
# declaration open state", "8.2.4.69. CDATA section state",
# and issue 32876
html = ('<![CDATA[just some plain text]]>')
expected = [('unknown decl', 'CDATA[just some plain text')]
# Payloads placed inside a CDATA section: markup-like text, an empty
# payload, stray brackets and '>' characters, and the near-terminators
# '] ]>' and ']] >' that must not end the section.
@support.subTests('content', [
'just some plain text',
'<!-- not a comment -->',
'&not-an-entity-ref;',
"<not a='start tag'>",
'',
'[[I have many brackets]]',
'I have a > in the middle',
'I have a ]] in the middle',
'] ]>',
']] >',
('\n'
' if (a < b && a > b) {\n'
' printf("[<marquee>How?</marquee>]");\n'
' }\n'),
])
def test_cdata_section_content(self, content):
# Wrap *content* in <![CDATA[...]]> inside <svg><text> and expect the
# whole payload back, unmodified, as a single 'unknown decl' event
# prefixed with 'CDATA['.
# See "13.2.5.42 Markup declaration open state",
# "13.2.5.69 CDATA section state", and issue bpo-32876.
html = f'<svg><text y="100"><![CDATA[{content}]]></text></svg>'
expected = [
('starttag', 'svg', []),
('starttag', 'text', [('y', '100')]),
('unknown decl', 'CDATA[' + content),
('endtag', 'text'),
('endtag', 'svg'),
]
self._run_check(html, expected)

def test_cdata_declarations_multiline(self):
html = ('<code><![CDATA['
' if (a < b && a > b) {'
' printf("[<marquee>How?</marquee>]");'
' }'
']]></code>')
def test_cdata_section(self):
# See "13.2.5.42 Markup declaration open state".
html = ('<![CDATA[foo<br>bar]]>'
'<svg><text y="100"><![CDATA[foo<br>bar]]></text></svg>'
'<![CDATA[foo<br>bar]]>')
expected = [
('starttag', 'code', []),
('unknown decl',
'CDATA[ if (a < b && a > b) { '
'printf("[<marquee>How?</marquee>]"); }'),
('endtag', 'code')
('comment', '[CDATA[foo<br'),
('data', 'bar]]>'),
('starttag', 'svg', []),
('starttag', 'text', [('y', '100')]),
('unknown decl', 'CDATA[foo<br>bar'),
('endtag', 'text'),
('endtag', 'svg'),
('comment', '[CDATA[foo<br'),
('data', 'bar]]>'),
]
self._run_check(html, expected)

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Fix CDATA section parsing in :class:`html.parser.HTMLParser` according to
the HTML5 standard: ``] ]>`` and ``]] >`` no longer end the CDATA section.
Loading