From dbe8857b51b734949c9e410abe6893475373ce3a Mon Sep 17 00:00:00 2001 From: Pablo Galindo Date: Fri, 8 Jan 2021 03:03:42 +0000 Subject: [PATCH 1/5] bpo-42864: Improve error messages regarding unclosed parentheses --- Lib/test/test_codeop.py | 1 - Lib/test/test_grammar.py | 2 +- Lib/test/test_pdb.py | 4 +-- Parser/pegen.c | 72 ++++++++++++++++++++++++++++++++++++++-- Parser/tokenizer.c | 5 ++- Parser/tokenizer.h | 1 + 6 files changed, 77 insertions(+), 8 deletions(-) diff --git a/Lib/test/test_codeop.py b/Lib/test/test_codeop.py index 45d0a7de9d9253..1da6ca55c48f72 100644 --- a/Lib/test/test_codeop.py +++ b/Lib/test/test_codeop.py @@ -160,7 +160,6 @@ def test_incomplete(self): ai("","eval") ai("\n","eval") ai("(","eval") - ai("(\n\n\n","eval") ai("(9+","eval") ai("9+ \\","eval") ai("lambda z: \\","eval") diff --git a/Lib/test/test_grammar.py b/Lib/test/test_grammar.py index 2f6716dfc9a130..0be869ef69b7ce 100644 --- a/Lib/test/test_grammar.py +++ b/Lib/test/test_grammar.py @@ -260,7 +260,7 @@ def test_eof_error(self): for s in samples: with self.assertRaises(SyntaxError) as cm: compile(s, "", "exec") - self.assertIn("unexpected EOF", str(cm.exception)) + self.assertIn("was never closed", str(cm.exception)) var_annot_global: int # a global annotated is necessary for test_var_annot diff --git a/Lib/test/test_pdb.py b/Lib/test/test_pdb.py index 4bb574fc5b7bff..93b61dcfbcdd8d 100644 --- a/Lib/test/test_pdb.py +++ b/Lib/test/test_pdb.py @@ -1649,10 +1649,10 @@ def test_errors_in_command(self): self.assertEqual(stdout.splitlines()[1:], [ '-> pass', - '(Pdb) *** SyntaxError: unexpected EOF while parsing', + '(Pdb) *** SyntaxError: \'(\' was never closed', '(Pdb) ENTERING RECURSIVE DEBUGGER', - '*** SyntaxError: unexpected EOF while parsing', + '*** SyntaxError: \'(\' was never closed', 'LEAVING RECURSIVE DEBUGGER', '(Pdb) ENTERING RECURSIVE DEBUGGER', diff --git a/Parser/pegen.c b/Parser/pegen.c index a6f97929255ac2..bb04d959f5bbad 100644 --- a/Parser/pegen.c +++ b/Parser/pegen.c @@ -324,7 +324,15 @@ tokenizer_error(Parser *p) RAISE_SYNTAX_ERROR("EOL while scanning string literal"); return -1; case E_EOF: - RAISE_SYNTAX_ERROR("unexpected EOF while parsing"); + if (p->tok->level) { + RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError, + p->tok->parenlinenostack[p->tok->level-1], + p->tok->parencolstack[p->tok->level-1], + "'%c' was never closed", + p->tok->parenstack[p->tok->level-1]); + } else { + RAISE_SYNTAX_ERROR("unexpected EOF while parsing"); + } return -1; case E_DEDENT: RAISE_INDENTATION_ERROR("unindent does not match any outer indentation level"); @@ -1151,6 +1159,53 @@ reset_parser_state(Parser *p) p->call_invalid_rules = 1; } +int +_PyPegen_check_tokenizer_errors(Parser *p) { + // Tokenize the whole input to see if there are any tokenization + // errors such as mistmatching parentheses. These will get priority + // over generic syntax errors only if the line number of the error is + // before the one that we had for the generic error. + + // We don't want to tokenize to the end for interactive input + if (p->tok->prompt != NULL) { + return 0; + } + + const char *start; + const char *end; + int type; + + Token *current_token = p->known_err_token != NULL ? 
p->known_err_token : p->tokens[p->fill - 1]; + Py_ssize_t current_err_line = current_token->lineno; + + // Save the tokenizer buffers to restore them later in case we found nothing + struct tok_state saved_tok; + memcpy(&saved_tok, p->tok, sizeof(struct tok_state)); + + while (1) { + type = PyTokenizer_Get(p->tok, &start, &end); + if (type == ERRORTOKEN) { + if (p->tok->level != 0) { + int error_lineno = p->tok->parenlinenostack[p->tok->level-1]; + int error_col = p->tok->parencolstack[p->tok->level-1]; + if (current_err_line > error_lineno) { + RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError, + error_lineno, error_col, + "'%c' was never closed", + p->tok->parenstack[p->tok->level-1]); + return -1; + } + } + break; + } + if (type == ENDMARKER) { + break; + } + } + memcpy(p->tok, &saved_tok, sizeof(struct tok_state)); + return 0; +} + void * _PyPegen_run_parser(Parser *p) { @@ -1164,8 +1219,16 @@ _PyPegen_run_parser(Parser *p) if (p->fill == 0) { RAISE_SYNTAX_ERROR("error at start before reading any input"); } - else if (p->tok->done == E_EOF) { - RAISE_SYNTAX_ERROR("unexpected EOF while parsing"); + else if (p->tok->done == E_EOF) { + if (p->tok->level) { + RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError, + p->tok->parenlinenostack[p->tok->level-1], + p->tok->parencolstack[p->tok->level-1], + "'%c' was never closed", + p->tok->parenstack[p->tok->level-1]); + } else { + RAISE_SYNTAX_ERROR("unexpected EOF while parsing"); + } } else { if (p->tokens[p->fill-1]->type == INDENT) { @@ -1175,6 +1238,9 @@ _PyPegen_run_parser(Parser *p) RAISE_INDENTATION_ERROR("unexpected unindent"); } else { + if (_PyPegen_check_tokenizer_errors(p)) { + return NULL; + } RAISE_SYNTAX_ERROR("invalid syntax"); } } diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index 62cd2966231b8a..f9c8bf652cdfb3 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -64,7 +64,6 @@ tok_new(void) tok->tabsize = TABSIZE; tok->indent = 0; tok->indstack[0] = 0; - tok->atbol = 1; tok->pendin = 0; tok->prompt = tok->nextprompt = NULL; @@ -1396,6 +1395,9 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) /* Check for EOF and errors now */ if (c == EOF) { + if (tok->level) { + return ERRORTOKEN; + } return tok->done == E_EOF ? 
ENDMARKER : ERRORTOKEN; } @@ -1818,6 +1820,7 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end) } tok->parenstack[tok->level] = c; tok->parenlinenostack[tok->level] = tok->lineno; + tok->parencolstack[tok->level] = tok->start - tok->line_start; tok->level++; break; case ')': diff --git a/Parser/tokenizer.h b/Parser/tokenizer.h index b659f34796e424..56074b61ae100e 100644 --- a/Parser/tokenizer.h +++ b/Parser/tokenizer.h @@ -45,6 +45,7 @@ struct tok_state { /* Used to allow free continuations inside them */ char parenstack[MAXLEVEL]; int parenlinenostack[MAXLEVEL]; + int parencolstack[MAXLEVEL]; PyObject *filename; /* Stuff for checking on different tab sizes */ int altindstack[MAXINDENT]; /* Stack of alternate indents */ From 4c8f1c2220828269420a66b880bce9dcc5bad829 Mon Sep 17 00:00:00 2001 From: Pablo Galindo Date: Thu, 14 Jan 2021 23:14:56 +0000 Subject: [PATCH 2/5] Add an extra test --- Lib/test/test_syntax.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Lib/test/test_syntax.py b/Lib/test/test_syntax.py index d8255607dcfd5c..c8d191df4cc495 100644 --- a/Lib/test/test_syntax.py +++ b/Lib/test/test_syntax.py @@ -987,6 +987,14 @@ def test_invalid_line_continuation_left_recursive(self): self._check_error("A.\u03bc\\\n", "unexpected EOF while parsing") + def test_error_parenthesis(self): + for paren in "([{": + self._check_error(paren + "1 + 2", f"\\{paren}' was never closed") + + for paren in ")]}": + self._check_error(paren + "1 + 2", f"unmatched '\\{paren}'") + + def test_main(): support.run_unittest(SyntaxTestCase) from test import test_syntax From e8bf6b9a1d7997dac4eaea30e0101c62b9f448d1 Mon Sep 17 00:00:00 2001 From: Pablo Galindo Date: Thu, 14 Jan 2021 23:15:40 +0000 Subject: [PATCH 3/5] Add NEWS entry --- .../Core and Builtins/2021-01-14-23-15-34.bpo-42864.QgOAQ1.rst | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 Misc/NEWS.d/next/Core and Builtins/2021-01-14-23-15-34.bpo-42864.QgOAQ1.rst diff --git a/Misc/NEWS.d/next/Core and Builtins/2021-01-14-23-15-34.bpo-42864.QgOAQ1.rst b/Misc/NEWS.d/next/Core and Builtins/2021-01-14-23-15-34.bpo-42864.QgOAQ1.rst new file mode 100644 index 00000000000000..127a29f518d798 --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2021-01-14-23-15-34.bpo-42864.QgOAQ1.rst @@ -0,0 +1,2 @@ +Improve error messages in the parser when parentheses are not closed. Patch +by Pablo Galindo. 
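As a quick illustration of the behaviour exercised by the new test_error_parenthesis
test above, the following sketch (illustrative only, not part of the diff) shows the
messages the patched parser is expected to produce. The wording comes from the test
expectations; the compile() calls are simply an assumed way to trigger them:

    # Unclosed opening brackets now report which bracket was never closed.
    for paren in "([{":
        try:
            compile(paren + "1 + 2", "<string>", "eval")
        except SyntaxError as exc:
            print(exc.msg)   # e.g. "'(' was never closed"

    # Stray closing brackets keep the pre-existing "unmatched" message.
    for paren in ")]}":
        try:
            compile(paren + "1 + 2", "<string>", "eval")
        except SyntaxError as exc:
            print(exc.msg)   # e.g. "unmatched ')'"
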
From f63ac52c730268bcc4ec71c717f909cbce1c9209 Mon Sep 17 00:00:00 2001 From: Pablo Galindo Date: Thu, 14 Jan 2021 23:26:29 +0000 Subject: [PATCH 4/5] Refactor error function --- Parser/pegen.c | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/Parser/pegen.c b/Parser/pegen.c index bb04d959f5bbad..e38c11f38e3be4 100644 --- a/Parser/pegen.c +++ b/Parser/pegen.c @@ -265,6 +265,16 @@ raise_decode_error(Parser *p) return -1; } +static inline void +raise_unclosed_parentheses_error(Parser *p) { + int error_lineno = p->tok->parenlinenostack[p->tok->level-1]; + int error_col = p->tok->parencolstack[p->tok->level-1]; + RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError, + error_lineno, error_col, + "'%c' was never closed", + p->tok->parenstack[p->tok->level-1]); +} + static void raise_tokenizer_init_error(PyObject *filename) { @@ -325,11 +335,7 @@ tokenizer_error(Parser *p) return -1; case E_EOF: if (p->tok->level) { - RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError, - p->tok->parenlinenostack[p->tok->level-1], - p->tok->parencolstack[p->tok->level-1], - "'%c' was never closed", - p->tok->parenstack[p->tok->level-1]); + raise_unclosed_parentheses_error(p); } else { RAISE_SYNTAX_ERROR("unexpected EOF while parsing"); } @@ -1159,7 +1165,7 @@ reset_parser_state(Parser *p) p->call_invalid_rules = 1; } -int +static int _PyPegen_check_tokenizer_errors(Parser *p) { // Tokenize the whole input to see if there are any tokenization // errors such as mistmatching parentheses. These will get priority @@ -1187,12 +1193,8 @@ _PyPegen_check_tokenizer_errors(Parser *p) { if (type == ERRORTOKEN) { if (p->tok->level != 0) { int error_lineno = p->tok->parenlinenostack[p->tok->level-1]; - int error_col = p->tok->parencolstack[p->tok->level-1]; if (current_err_line > error_lineno) { - RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError, - error_lineno, error_col, - "'%c' was never closed", - p->tok->parenstack[p->tok->level-1]); + raise_unclosed_parentheses_error(p); return -1; } } @@ -1221,11 +1223,7 @@ _PyPegen_run_parser(Parser *p) } else if (p->tok->done == E_EOF) { if (p->tok->level) { - RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError, - p->tok->parenlinenostack[p->tok->level-1], - p->tok->parencolstack[p->tok->level-1], - "'%c' was never closed", - p->tok->parenstack[p->tok->level-1]); + raise_unclosed_parentheses_error(p); } else { RAISE_SYNTAX_ERROR("unexpected EOF while parsing"); } From 241807e22497acdf5e3f4c4e33bda1ca1c478b92 Mon Sep 17 00:00:00 2001 From: Pablo Galindo Date: Thu, 14 Jan 2021 23:44:11 +0000 Subject: [PATCH 5/5] Minor cleanup --- Parser/pegen.c | 37 ++++++++++++++++++++----------------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/Parser/pegen.c b/Parser/pegen.c index e38c11f38e3be4..6c279806021057 100644 --- a/Parser/pegen.c +++ b/Parser/pegen.c @@ -1177,33 +1177,36 @@ _PyPegen_check_tokenizer_errors(Parser *p) { return 0; } - const char *start; - const char *end; - int type; Token *current_token = p->known_err_token != NULL ? 
p->known_err_token : p->tokens[p->fill - 1]; Py_ssize_t current_err_line = current_token->lineno; - // Save the tokenizer buffers to restore them later in case we found nothing + // Save the tokenizer state to restore them later in case we found nothing struct tok_state saved_tok; memcpy(&saved_tok, p->tok, sizeof(struct tok_state)); - while (1) { - type = PyTokenizer_Get(p->tok, &start, &end); - if (type == ERRORTOKEN) { - if (p->tok->level != 0) { - int error_lineno = p->tok->parenlinenostack[p->tok->level-1]; - if (current_err_line > error_lineno) { - raise_unclosed_parentheses_error(p); - return -1; + for (;;) { + const char *start; + const char *end; + switch (PyTokenizer_Get(p->tok, &start, &end)) { + case ERRORTOKEN: + if (p->tok->level != 0) { + int error_lineno = p->tok->parenlinenostack[p->tok->level-1]; + if (current_err_line > error_lineno) { + raise_unclosed_parentheses_error(p); + return -1; + } } - } - break; - } - if (type == ENDMARKER) { - break; + break; + case ENDMARKER: + break; + default: + continue; } + break; } + + // Restore the tokenizer state memcpy(p->tok, &saved_tok, sizeof(struct tok_state)); return 0; }
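
Because the tokenizer now records both the line (parenlinenostack) and the column
(parencolstack) of every opening bracket, the resulting SyntaxError points at the
bracket that was left open instead of at the end of the input. A minimal sketch of
how that surfaces (illustrative only; the tests above only check the message text,
so the exact offset convention shown here is an assumption):

    try:
        compile("x = [1,\n     2,\n", "<example>", "exec")
    except SyntaxError as exc:
        # Expected with this patch: exc.msg == "'[' was never closed" and
        # exc.lineno == 1 (the line of the opening '['); exc.offset should
        # land near the '[' column, though the exact value is assumed.
        print(exc.msg, exc.lineno, exc.offset)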