From f7f9f562f1b31c2130e26269cf4f196f378d80f2 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Wed, 18 Jun 2025 13:34:58 +0300 Subject: [PATCH 1/4] gh-135661: Fix CDATA section parsing in HTMLParser "] ]>" and "]] >" no longer end the CDATA section. --- Lib/html/parser.py | 6 ++- Lib/test/test_htmlparser.py | 42 +++++++++---------- ...-06-18-13-34-55.gh-issue-135661.NZlpWf.rst | 2 + 3 files changed, 28 insertions(+), 22 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2025-06-18-13-34-55.gh-issue-135661.NZlpWf.rst diff --git a/Lib/html/parser.py b/Lib/html/parser.py index ba416e7fa6e3fe..99aebc19d4a2e3 100644 --- a/Lib/html/parser.py +++ b/Lib/html/parser.py @@ -298,7 +298,11 @@ def parse_html_declaration(self, i): # this case is actually already handled in goahead() return self.parse_comment(i) elif rawdata[i:i+9] == '') + if j < 0: + return -1 + self.unknown_decl(rawdata[i+3: j]) + return j + 3 elif rawdata[i:i+9].lower() == ' gtpos = rawdata.find('>', i+9) diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py index 65a4bee72b9775..b75b4c711ccac5 100644 --- a/Lib/test/test_htmlparser.py +++ b/Lib/test/test_htmlparser.py @@ -686,27 +686,27 @@ def test_broken_condcoms(self): ] self._run_check(html, expected) - def test_cdata_declarations(self): - # More tests should be added. See also "8.2.4.42. Markup - # declaration open state", "8.2.4.69. 
CDATA section state", - # and issue 32876 - html = ('') - expected = [('unknown decl', 'CDATA[just some plain text')] - self._run_check(html, expected) - - def test_cdata_declarations_multiline(self): - html = (' b) {' - ' printf("[How?]");' - ' }' - ']]>') - expected = [ - ('starttag', 'code', []), - ('unknown decl', - 'CDATA[ if (a < b && a > b) { ' - 'printf("[How?]"); }'), - ('endtag', 'code') - ] + @support.subTests('content', [ + 'just some plain text', + '', + '¬-an-entity-ref;', + "", + '', + '[[I have many brackets]]', + 'I have a > in the middle', + 'I have a ]] in the middle', + '] ]>', + ']] >', + ('\n' + ' if (a < b && a > b) {\n' + ' printf("[How?]");\n' + ' }\n'), + ]) + def test_cdata_section(self, content): + # See "13.2.5.42 Markup declaration open state", + # "13.2.5.69 CDATA section state", and issue bpo-32876. + html = f'' + expected = [('unknown decl', 'CDATA[' + content)] self._run_check(html, expected) def test_convert_charrefs_dropped_text(self): diff --git a/Misc/NEWS.d/next/Library/2025-06-18-13-34-55.gh-issue-135661.NZlpWf.rst b/Misc/NEWS.d/next/Library/2025-06-18-13-34-55.gh-issue-135661.NZlpWf.rst new file mode 100644 index 00000000000000..7a07e8535bb497 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-06-18-13-34-55.gh-issue-135661.NZlpWf.rst @@ -0,0 +1,2 @@ +Fix CDATA section parsing in :class:`html.parser.HTMLParser`: ``] ]>`` and +``]] >`` no longer end the CDATA section. From cf918e3718227dbdda763720f2553b41711dda0e Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Thu, 3 Jul 2025 18:17:26 +0300 Subject: [PATCH 2/4] Move to Security. 
--- .../2025-06-18-13-34-55.gh-issue-135661.NZlpWf.rst | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename Misc/NEWS.d/next/{Library => Security}/2025-06-18-13-34-55.gh-issue-135661.NZlpWf.rst (100%) diff --git a/Misc/NEWS.d/next/Library/2025-06-18-13-34-55.gh-issue-135661.NZlpWf.rst b/Misc/NEWS.d/next/Security/2025-06-18-13-34-55.gh-issue-135661.NZlpWf.rst similarity index 100% rename from Misc/NEWS.d/next/Library/2025-06-18-13-34-55.gh-issue-135661.NZlpWf.rst rename to Misc/NEWS.d/next/Security/2025-06-18-13-34-55.gh-issue-135661.NZlpWf.rst From d346c10f25179eaf333cbb38a7b86dd937556da7 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Fri, 4 Jul 2025 09:10:35 +0300 Subject: [PATCH 3/4] Update 2025-06-18-13-34-55.gh-issue-135661.NZlpWf.rst --- .../Security/2025-06-18-13-34-55.gh-issue-135661.NZlpWf.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Misc/NEWS.d/next/Security/2025-06-18-13-34-55.gh-issue-135661.NZlpWf.rst b/Misc/NEWS.d/next/Security/2025-06-18-13-34-55.gh-issue-135661.NZlpWf.rst index 7a07e8535bb497..59c76d50f79443 100644 --- a/Misc/NEWS.d/next/Security/2025-06-18-13-34-55.gh-issue-135661.NZlpWf.rst +++ b/Misc/NEWS.d/next/Security/2025-06-18-13-34-55.gh-issue-135661.NZlpWf.rst @@ -1,2 +1,2 @@ -Fix CDATA section parsing in :class:`html.parser.HTMLParser`: ``] ]>`` and -``]] >`` no longer end the CDATA section. +Fix CDATA section parsing in :class:`html.parser.HTMLParser` according to +the HTML5 standard: ``] ]>`` and ``]] >`` no longer end the CDATA section. From 524cac599dc5554650e6f1a8c81d808fa8ef54d6 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Sat, 5 Jul 2025 15:54:03 +0300 Subject: [PATCH 4/4] * Make CDATA section parsing context-dependent. * Add HTMLParser.support_cdata(). 
--- Doc/library/html.parser.rst | 11 ++++++++ Lib/html/parser.py | 22 ++++++++++----- Lib/test/test_htmlparser.py | 54 ++++++++++++++++++++++++++++++++----- 3 files changed, 74 insertions(+), 13 deletions(-) diff --git a/Doc/library/html.parser.rst b/Doc/library/html.parser.rst index dd67fc34e856f1..b2fa043625c7e1 100644 --- a/Doc/library/html.parser.rst +++ b/Doc/library/html.parser.rst @@ -121,6 +121,17 @@ The output will then be: attributes can be preserved, etc.). +.. method:: HTMLParser.support_cdata(flag) + + Sets how the parser will parse CDATA declarations. + If *flag* is true, then the :meth:`unknown_decl` method will be called + for the CDATA section ``<![CDATA[...]]>``. + If *flag* is false, then the :meth:`handle_comment` method will be called + for ``<![CDATA[...]]>``. + + .. versionadded:: 3.13.6 + + The following methods are called when data or markup elements are encountered and they are meant to be overridden in a subclass. The base class implementations do nothing (except for :meth:`~HTMLParser.handle_startendtag`): diff --git a/Lib/html/parser.py b/Lib/html/parser.py index d405d653f45270..88a084dcf1ce7d 100644 --- a/Lib/html/parser.py +++ b/Lib/html/parser.py @@ -144,6 +144,7 @@ def reset(self): self.lasttag = '???' self.interesting = interesting_normal self.cdata_elem = None + self._support_cdata = False super().reset() def feed(self, data): @@ -174,6 +175,9 @@ def clear_cdata_mode(self): self.interesting = interesting_normal self.cdata_elem = None + def support_cdata(self, flag=True): + self._support_cdata = flag + # Internal -- handle data as far as reasonable. May leave state # and data to be processed by a subsequent call. If 'end' is # true, force handling all data as if followed by EOF marker. 
@@ -249,7 +253,10 @@ def goahead(self, end): break self.handle_comment(rawdata[i+4:j]) elif startswith("') - if j < 0: - return -1 - self.unknown_decl(rawdata[i+3: j]) - return j + 3 + if self._support_cdata: + j = rawdata.find(']]>', i+9) + if j < 0: + return -1 + self.unknown_decl(rawdata[i+3: j]) + return j + 3 + else: + return self.parse_bogus_comment(i) elif rawdata[i:i+9].lower() == ' gtpos = rawdata.find('>', i+9) diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py index f913732c0b13d1..65fbf5d7b618fd 100644 --- a/Lib/test/test_htmlparser.py +++ b/Lib/test/test_htmlparser.py @@ -34,12 +34,16 @@ def get_events(self): def handle_starttag(self, tag, attrs): self.append(("starttag", tag, attrs)) + if tag == 'svg': + self.support_cdata(True) def handle_startendtag(self, tag, attrs): self.append(("startendtag", tag, attrs)) def handle_endtag(self, tag): self.append(("endtag", tag)) + if tag == 'svg': + self.support_cdata(False) # all other markup @@ -643,10 +647,22 @@ def test_eof_in_declarations(self): ('How?]");\n' ' }\n'), ]) - def test_cdata_section(self, content): + def test_cdata_section_content(self, content): # See "13.2.5.42 Markup declaration open state", # "13.2.5.69 CDATA section state", and issue bpo-32876. - html = f'' - expected = [('unknown decl', 'CDATA[' + content)] + html = f'{content}' + expected = [ + ('starttag', 'svg', []), + ('starttag', 'text', [('y', '100')]), + ('unknown decl', 'CDATA[' + content), + ('endtag', 'text'), + ('endtag', 'svg'), + ] + self._run_check(html, expected) + + def test_cdata_section(self): + # See "13.2.5.42 Markup declaration open state". + html = ('bar]]>' + 'foo<br>bar' + 'bar]]>') + expected = [ + ('comment', '[CDATA[foo'), + ('starttag', 'svg', []), + ('starttag', 'text', [('y', '100')]), + ('unknown decl', 'CDATA[foo
bar'), + ('endtag', 'text'), + ('endtag', 'svg'), + ('comment', '[CDATA[foo'), + ] self._run_check(html, expected) def test_convert_charrefs_dropped_text(self):