From 16d508d3af886777f76d76ecf6d6fb9d9ac28cbb Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Fri, 13 Oct 2023 15:54:15 +0200 Subject: [PATCH 1/6] gh-107450: Check for overflow in the tokenizer and fix overflow test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Filipe LaĆ­ns --- Include/errcode.h | 37 +++++++++++++++++++------------------ Lib/test/test_exceptions.py | 20 ++++++++++++++++---- Parser/lexer/lexer.c | 4 ++++ Parser/pegen_errors.c | 5 +++++ 4 files changed, 44 insertions(+), 22 deletions(-) diff --git a/Include/errcode.h b/Include/errcode.h index 8d44e9ae559193..dac5cf068c99d6 100644 --- a/Include/errcode.h +++ b/Include/errcode.h @@ -19,24 +19,25 @@ extern "C" { #endif -#define E_OK 10 /* No error */ -#define E_EOF 11 /* End Of File */ -#define E_INTR 12 /* Interrupted */ -#define E_TOKEN 13 /* Bad token */ -#define E_SYNTAX 14 /* Syntax error */ -#define E_NOMEM 15 /* Ran out of memory */ -#define E_DONE 16 /* Parsing complete */ -#define E_ERROR 17 /* Execution error */ -#define E_TABSPACE 18 /* Inconsistent mixing of tabs and spaces */ -#define E_OVERFLOW 19 /* Node had too many children */ -#define E_TOODEEP 20 /* Too many indentation levels */ -#define E_DEDENT 21 /* No matching outer block for dedent */ -#define E_DECODE 22 /* Error in decoding into Unicode */ -#define E_EOFS 23 /* EOF in triple-quoted string */ -#define E_EOLS 24 /* EOL in single-quoted string */ -#define E_LINECONT 25 /* Unexpected characters after a line continuation */ -#define E_BADSINGLE 27 /* Ill-formed single statement input */ -#define E_INTERACT_STOP 28 /* Interactive mode stopped tokenization */ +#define E_OK 10 /* No error */ +#define E_EOF 11 /* End Of File */ +#define E_INTR 12 /* Interrupted */ +#define E_TOKEN 13 /* Bad token */ +#define E_SYNTAX 14 /* Syntax error */ +#define E_NOMEM 15 /* Ran out of memory */ +#define E_DONE 16 /* Parsing complete */ +#define E_ERROR 17 /* Execution error */ +#define E_TABSPACE 18 /* Inconsistent mixing of tabs and spaces */ +#define E_OVERFLOW 19 /* Node had too many children */ +#define E_TOODEEP 20 /* Too many indentation levels */ +#define E_DEDENT 21 /* No matching outer block for dedent */ +#define E_DECODE 22 /* Error in decoding into Unicode */ +#define E_EOFS 23 /* EOF in triple-quoted string */ +#define E_EOLS 24 /* EOL in single-quoted string */ +#define E_LINECONT 25 /* Unexpected characters after a line continuation */ +#define E_BADSINGLE 27 /* Ill-formed single statement input */ +#define E_INTERACT_STOP 28 /* Interactive mode stopped tokenization */ +#define E_COLUMNOVERFLOW 29 /* Column offset overflow */ #ifdef __cplusplus } diff --git a/Lib/test/test_exceptions.py b/Lib/test/test_exceptions.py index eafa7d84638b76..1d2282536ad714 100644 --- a/Lib/test/test_exceptions.py +++ b/Lib/test/test_exceptions.py @@ -1,6 +1,7 @@ # Python test set -- part 5, built-in exceptions import copy +import ctypes import os import sys import unittest @@ -318,11 +319,22 @@ def baz(): check('(yield i) = 2', 1, 2) check('def f(*):\n pass', 1, 7) + @unittest.skipIf(ctypes.sizeof(ctypes.c_int) >= ctypes.sizeof(ctypes.c_ssize_t), + "Downcasting to int is safe for col_offset") @support.requires_resource('cpu') - @support.bigmemtest(support._2G, memuse=1.5) - def testMemoryErrorBigSource(self, _size): - with self.assertRaises(OverflowError): - exec(f"if True:\n {' ' * 2**31}print('hello world')") + @support.bigmemtest(2**(ctypes.sizeof(ctypes.c_int)*8-1)-1-len("pass"), memuse=1) + def testMemoryErrorBigSource(self, size): + if size < 2**(ctypes.sizeof(ctypes.c_int)*8-1)-1-len("pass"): + self.skipTest('Not enough memory for overflow to occur') + + # Construct buffer to hold just enough characters so that the tokenizer offset overflows. + # This makes sure that we don't overflow in the string creation itself + distance_to_prev_divisible_by_8 = size & 7 + padding = ' ' * distance_to_prev_divisible_by_8 + padding += ' ' * ((size - distance_to_prev_divisible_by_8) // 8) + + with self.assertRaisesRegex(OverflowError, "Parser column offset overflow"): + exec(f"if True:\n{padding}pass") @cpython_only def testSettingException(self): diff --git a/Parser/lexer/lexer.c b/Parser/lexer/lexer.c index c7134ab868bfbd..849dd60fb339e1 100644 --- a/Parser/lexer/lexer.c +++ b/Parser/lexer/lexer.c @@ -59,6 +59,10 @@ tok_nextc(struct tok_state *tok) int rc; for (;;) { if (tok->cur != tok->inp) { + if (INT_MAX - tok->col_offset - 1 < 0) { + tok->done = E_COLUMNOVERFLOW; + return EOF; + } tok->col_offset++; return Py_CHARMASK(*tok->cur++); /* Fast path */ } diff --git a/Parser/pegen_errors.c b/Parser/pegen_errors.c index 15e99e23d8490f..057bf551519935 100644 --- a/Parser/pegen_errors.c +++ b/Parser/pegen_errors.c @@ -68,6 +68,7 @@ _Pypegen_tokenizer_error(Parser *p) const char *msg = NULL; PyObject* errtype = PyExc_SyntaxError; Py_ssize_t col_offset = -1; + p->error_indicator = 1; switch (p->tok->done) { case E_TOKEN: msg = "invalid token"; @@ -103,6 +104,10 @@ _Pypegen_tokenizer_error(Parser *p) msg = "unexpected character after line continuation character"; break; } + case E_COLUMNOVERFLOW: + PyErr_SetString(PyExc_OverflowError, + "Parser column offset overflow - source line is too big"); + return -1; default: msg = "unknown parsing error"; } From c8433ca03c5065711c7b431dbdc196d567080543 Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Mon, 16 Oct 2023 14:20:35 +0200 Subject: [PATCH 2/6] Address some feedback --- Lib/test/test_exceptions.py | 12 ++++++------ Parser/lexer/lexer.c | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/Lib/test/test_exceptions.py b/Lib/test/test_exceptions.py index 1d2282536ad714..b8c8bc7e225c73 100644 --- a/Lib/test/test_exceptions.py +++ b/Lib/test/test_exceptions.py @@ -10,6 +10,7 @@ import errno from textwrap import dedent +from _testcapi import INT_MAX from test.support import (captured_stderr, check_impl_detail, cpython_only, gc_collect, no_tracing, script_helper, @@ -322,19 +323,18 @@ def baz(): @unittest.skipIf(ctypes.sizeof(ctypes.c_int) >= ctypes.sizeof(ctypes.c_ssize_t), "Downcasting to int is safe for col_offset") @support.requires_resource('cpu') - @support.bigmemtest(2**(ctypes.sizeof(ctypes.c_int)*8-1)-1-len("pass"), memuse=1) + @support.bigmemtest(INT_MAX, memuse=2, dry_run=False) def testMemoryErrorBigSource(self, size): - if size < 2**(ctypes.sizeof(ctypes.c_int)*8-1)-1-len("pass"): + padding_needed = INT_MAX-len("pass") + if size < padding_needed: self.skipTest('Not enough memory for overflow to occur') # Construct buffer to hold just enough characters so that the tokenizer offset overflows. # This makes sure that we don't overflow in the string creation itself - distance_to_prev_divisible_by_8 = size & 7 - padding = ' ' * distance_to_prev_divisible_by_8 - padding += ' ' * ((size - distance_to_prev_divisible_by_8) // 8) + src = f"if True:\n{' ' * padding_needed}pass" with self.assertRaisesRegex(OverflowError, "Parser column offset overflow"): - exec(f"if True:\n{padding}pass") + compile(src, '', 'exec') @cpython_only def testSettingException(self): diff --git a/Parser/lexer/lexer.c b/Parser/lexer/lexer.c index 849dd60fb339e1..1a01bb0352a7b1 100644 --- a/Parser/lexer/lexer.c +++ b/Parser/lexer/lexer.c @@ -59,7 +59,7 @@ tok_nextc(struct tok_state *tok) int rc; for (;;) { if (tok->cur != tok->inp) { - if (INT_MAX - tok->col_offset - 1 < 0) { + if ((unsigned int) tok->col_offset >= (unsigned int) INT_MAX) { tok->done = E_COLUMNOVERFLOW; return EOF; } From e4a059e1db318425c24748afa4618f80bec89217 Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Mon, 16 Oct 2023 14:22:17 +0200 Subject: [PATCH 3/6] Address more feedback --- Lib/test/test_exceptions.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/Lib/test/test_exceptions.py b/Lib/test/test_exceptions.py index b8c8bc7e225c73..0b188ff8d29f86 100644 --- a/Lib/test/test_exceptions.py +++ b/Lib/test/test_exceptions.py @@ -1,7 +1,6 @@ # Python test set -- part 5, built-in exceptions import copy -import ctypes import os import sys import unittest @@ -10,7 +9,7 @@ import errno from textwrap import dedent -from _testcapi import INT_MAX +from _testcapi import INT_MAX, PY_SSIZE_T_MAX from test.support import (captured_stderr, check_impl_detail, cpython_only, gc_collect, no_tracing, script_helper, @@ -320,8 +319,7 @@ def baz(): check('(yield i) = 2', 1, 2) check('def f(*):\n pass', 1, 7) - @unittest.skipIf(ctypes.sizeof(ctypes.c_int) >= ctypes.sizeof(ctypes.c_ssize_t), - "Downcasting to int is safe for col_offset") + @unittest.skipIf(INT_MAX >= PY_SSIZE_T_MAX, "Downcasting to int is safe for col_offset") @support.requires_resource('cpu') @support.bigmemtest(INT_MAX, memuse=2, dry_run=False) def testMemoryErrorBigSource(self, size): @@ -329,10 +327,7 @@ def testMemoryErrorBigSource(self, size): if size < padding_needed: self.skipTest('Not enough memory for overflow to occur') - # Construct buffer to hold just enough characters so that the tokenizer offset overflows. - # This makes sure that we don't overflow in the string creation itself src = f"if True:\n{' ' * padding_needed}pass" - with self.assertRaisesRegex(OverflowError, "Parser column offset overflow"): compile(src, '', 'exec') From 5d7c5a665f63e8c9e1bc1f6847f5a167e1087ed7 Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Mon, 16 Oct 2023 14:31:25 +0200 Subject: [PATCH 4/6] Formatting --- Lib/test/test_exceptions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/test/test_exceptions.py b/Lib/test/test_exceptions.py index 0b188ff8d29f86..d4fc5381f03de0 100644 --- a/Lib/test/test_exceptions.py +++ b/Lib/test/test_exceptions.py @@ -323,7 +323,7 @@ def baz(): @support.requires_resource('cpu') @support.bigmemtest(INT_MAX, memuse=2, dry_run=False) def testMemoryErrorBigSource(self, size): - padding_needed = INT_MAX-len("pass") + padding_needed = INT_MAX - len("pass") if size < padding_needed: self.skipTest('Not enough memory for overflow to occur') From 66d94be0d3ae4df5f35369fd31f47ab91e2f6550 Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Mon, 16 Oct 2023 14:38:04 +0200 Subject: [PATCH 5/6] More feedback --- Lib/test/test_exceptions.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/Lib/test/test_exceptions.py b/Lib/test/test_exceptions.py index d4fc5381f03de0..1ed714c7f727e1 100644 --- a/Lib/test/test_exceptions.py +++ b/Lib/test/test_exceptions.py @@ -9,7 +9,6 @@ import errno from textwrap import dedent -from _testcapi import INT_MAX, PY_SSIZE_T_MAX from test.support import (captured_stderr, check_impl_detail, cpython_only, gc_collect, no_tracing, script_helper, @@ -19,6 +18,12 @@ from test.support.warnings_helper import check_warnings from test import support +try: + from _testcapi import INT_MAX +except ImportError: + INT_MAX = 2**31 - 1 + + class NaiveException(Exception): def __init__(self, x): @@ -319,14 +324,11 @@ def baz(): check('(yield i) = 2', 1, 2) check('def f(*):\n pass', 1, 7) - @unittest.skipIf(INT_MAX >= PY_SSIZE_T_MAX, "Downcasting to int is safe for col_offset") + @unittest.skipIf(INT_MAX >= sys.maxsize, "Downcasting to int is safe for col_offset") @support.requires_resource('cpu') @support.bigmemtest(INT_MAX, memuse=2, dry_run=False) def testMemoryErrorBigSource(self, size): padding_needed = INT_MAX - len("pass") - if size < padding_needed: - self.skipTest('Not enough memory for overflow to occur') - src = f"if True:\n{' ' * padding_needed}pass" with self.assertRaisesRegex(OverflowError, "Parser column offset overflow"): compile(src, '', 'exec') From f20b95f679d215d6e5b99929b721f6ef50d407b8 Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Mon, 16 Oct 2023 15:02:48 +0200 Subject: [PATCH 6/6] Use bytes for source --- Lib/test/test_exceptions.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Lib/test/test_exceptions.py b/Lib/test/test_exceptions.py index 1ed714c7f727e1..4031c5ca76c705 100644 --- a/Lib/test/test_exceptions.py +++ b/Lib/test/test_exceptions.py @@ -328,8 +328,7 @@ def baz(): @support.requires_resource('cpu') @support.bigmemtest(INT_MAX, memuse=2, dry_run=False) def testMemoryErrorBigSource(self, size): - padding_needed = INT_MAX - len("pass") - src = f"if True:\n{' ' * padding_needed}pass" + src = b"if True:\n%*s" % (size, b"pass") with self.assertRaisesRegex(OverflowError, "Parser column offset overflow"): compile(src, '', 'exec')