From 7129f00d952ace3634bc08a2931a00aa59d47815 Mon Sep 17 00:00:00 2001
From: Petr Viktorin
Date: Wed, 22 Jan 2025 17:55:21 +0100
Subject: [PATCH 01/12] generate_token.py: Only generate docs for 'literal' tokens; check the rest

---
 Doc/library/token-list.inc | 37 ------------------------
 Doc/library/token.rst | 51 +++++++++++++++++++++++++--------
 Tools/build/generate_token.py | 53 +++++++++++++++++++++++++++--------
 3 files changed, 82 insertions(+), 59 deletions(-)

diff --git a/Doc/library/token-list.inc b/Doc/library/token-list.inc
index 39df2927a0b7f2..128bb218dfb02c 100644
--- a/Doc/library/token-list.inc
+++ b/Doc/library/token-list.inc
@@ -1,18 +1,4 @@
 .. Auto-generated by Tools/build/generate_token.py
-.. data:: ENDMARKER
-
-.. data:: NAME
-
-.. data:: NUMBER
-
-.. data:: STRING
-
-.. data:: NEWLINE
-
-.. data:: INDENT
-
-.. data:: DEDENT
-
 .. data:: LPAR
 
    Token value for ``"("``.
@@ -205,26 +191,3 @@
    Token value for ``"!"``.
 
-.. data:: OP
-
-.. data:: TYPE_IGNORE
-
-.. data:: TYPE_COMMENT
-
-.. data:: SOFT_KEYWORD
-
-.. data:: FSTRING_START
-
-.. data:: FSTRING_MIDDLE
-
-.. data:: FSTRING_END
-
-.. data:: COMMENT
-
-.. data:: NL
-
-.. data:: ERRORTOKEN
-
-.. data:: N_TOKENS
-
-.. data:: NT_OFFSET
diff --git a/Doc/library/token.rst b/Doc/library/token.rst
index 40982f32b4beee..73a5fff3f04ce8 100644
--- a/Doc/library/token.rst
+++ b/Doc/library/token.rst
@@ -44,25 +44,51 @@ functions. The functions mirror definitions in the Python C header files.
 
 The token constants are:
 
-.. include:: token-list.inc
+.. data:: ENDMARKER
+
+.. data:: NAME
+
+.. data:: NUMBER
+
+.. data:: STRING
+
+.. data:: NEWLINE
+
+.. data:: INDENT
+
+.. data:: DEDENT
+
+.. data:: OP
+
+.. data:: TYPE_IGNORE
+
+.. data:: TYPE_COMMENT
+
+   Token value indicating that a type comment was recognized. Such
+   tokens are only produced when :func:`ast.parse` is invoked with
+   ``type_comments=True``.
+
+.. data:: SOFT_KEYWORD
+
+.. data:: FSTRING_START
 
-The following token type values aren't used by the C tokenizer but are needed for
-the :mod:`tokenize` module.
+.. data:: FSTRING_MIDDLE
+
+.. data:: FSTRING_END
 
 .. data:: COMMENT
-   :noindex:
 
    Token value used to indicate a comment.
 
-
 .. data:: NL
-   :noindex:
 
    Token value used to indicate a non-terminating newline. The
    :data:`NEWLINE` token indicates the end of a logical line of Python code;
    ``NL`` tokens are generated when a logical line of code is continued over
    multiple physical lines.
 
+.. data:: ERRORTOKEN
+
 
 .. data:: ENCODING
 
@@ -70,14 +96,17 @@ the :mod:`tokenize` module.
    into text. The first token returned by :func:`tokenize.tokenize` will
    always be an ``ENCODING`` token.
 
+   This token type isn't used by the C tokenizer but is needed for
+   the :mod:`tokenize` module.
 
-.. data:: TYPE_COMMENT
-   :noindex:
 
-   Token value indicating that a type comment was recognized. Such
-   tokens are only produced when :func:`ast.parse` is invoked with
-   ``type_comments=True``.
+The remaining tokens represent literal text; most are :ref:`operators`
+and :ref:`delimiters`:
 
+.. include:: token-list.inc
+
+.. data:: N_TOKENS
 
+.. data:: NT_OFFSET
 
 .. data:: EXACT_TOKEN_TYPES
diff --git a/Tools/build/generate_token.py b/Tools/build/generate_token.py
index d32747f19945d8..2672c9054d3518 100755
--- a/Tools/build/generate_token.py
+++ b/Tools/build/generate_token.py
@@ -1,10 +1,17 @@
 #!/usr/bin/env python3
 # This script generates token related files from Grammar/Tokens:
 #
-# Doc/library/token-list.inc
-# Include/token.h
-# Parser/token.c
-# Lib/token.py
+# make_rst:
+# Doc/library/token-list.inc
+# Doc/library/token.rst (checked, not generated)
+# make_h:
+# Include/token.h
+# make_c:
+# Parser/token.c
+# make_py:
+# Lib/token.py
+
+import re
 
 SCRIPT_NAME = 'Tools/build/generate_token.py'
@@ -200,22 +207,46 @@ def make_c(infile, outfile='Parser/token.c'):
 
 token_inc_template = f"""\
 .. {AUTO_GENERATED_BY_SCRIPT}
 
 %s
-.. data:: N_TOKENS
-
-.. data:: NT_OFFSET
 """
 
-def make_rst(infile, outfile='Doc/library/token-list.inc'):
+def make_rst(infile, outfile='Doc/library/token-list.inc',
+             rstfile='Doc/library/token.rst'):
     tok_names, ERRORTOKEN, string_to_tok = load_tokens(infile)
     tok_to_string = {value: s for s, value in string_to_tok.items()}
 
+    needs_handwritten_doc = set()
+
     names = []
-    for value, name in enumerate(tok_names[:ERRORTOKEN + 1]):
-        names.append('.. data:: %s' % (name,))
+    for value, name in enumerate(tok_names):
         if value in tok_to_string:
+            assert name.isupper()
+            names.append('.. data:: %s' % (name,))
             names.append('')
             names.append('   Token value for ``"%s"``.' % tok_to_string[value])
-            names.append('')
+            names.append('')
+        else:
+            needs_handwritten_doc.add(name)
+
+    has_handwritten_doc = set()
+    with open(rstfile) as fileobj:
+        tokendef_re = re.compile(r'.. data:: (\w+)')
+        for line in fileobj:
+            if match := tokendef_re.fullmatch(line.strip()):
+                if match[1].isupper():
+                    has_handwritten_doc.add(match[1])
+
+    # Exclude non-token constants in token.py
+    has_handwritten_doc -= {'N_TOKENS', 'NT_OFFSET', 'EXACT_TOKEN_TYPES'}
+
+    if needs_handwritten_doc != has_handwritten_doc:
+        message_parts = [f'ERROR: {rstfile} does not document all tokens!']
+        undocumented = needs_handwritten_doc - has_handwritten_doc
+        extra = has_handwritten_doc - needs_handwritten_doc
+        if undocumented:
+            message_parts.append(f'Undocumented tokens: {undocumented}')
+        if extra:
+            message_parts.append(f'Documented nonexistent tokens: {extra}')
+        exit('\n'.join(message_parts))
 
     if update_file(outfile, token_inc_template % '\n'.join(names)):
         print("%s regenerated from %s" % (outfile, infile))

From d400ae7c62ff17bc92f190a7a083e80ef6030acf Mon Sep 17 00:00:00 2001
From: Petr Viktorin
Date: Wed, 22 Jan 2025 18:01:54 +0100
Subject: [PATCH 02/12] Change docs for the "literal" tokens to a list-table

---
 Doc/library/token-list.inc | 292 ++++++++++++----------------------
 Tools/build/generate_token.py | 12 +-
 2 files changed, 109 insertions(+), 195 deletions(-)

diff --git a/Doc/library/token-list.inc b/Doc/library/token-list.inc
index 128bb218dfb02c..357638aed2714a 100644
--- a/Doc/library/token-list.inc
+++ b/Doc/library/token-list.inc
@@ -1,193 +1,103 @@
 .. Auto-generated by Tools/build/generate_token.py
 
-.. data:: LPAR
-
-   Token value for ``"("``.
-
-.. data:: RPAR
-
-   Token value for ``")"``.
-
-.. data:: LSQB
-
-   Token value for ``"["``.
-
-.. data:: RSQB
-
-   Token value for ``"]"``.
-
-.. data:: COLON
-
-   Token value for ``":"``.
-
-.. data:: COMMA
-
-   Token value for ``","``.
-
-.. data:: SEMI
-
-   Token value for ``";"``.
-
-.. data:: PLUS
-
-   Token value for ``"+"``.
-
-.. data:: MINUS
-
-   Token value for ``"-"``.
-
-.. data:: STAR
-
-   Token value for ``"*"``.
-
-.. data:: SLASH
-
-   Token value for ``"/"``.
-
-.. data:: VBAR
-
-   Token value for ``"|"``.
-
-.. data:: AMPER
-
-   Token value for ``"&"``.
-
-.. data:: LESS
-
-   Token value for ``"<"``.
-
-.. data:: GREATER
-
-   Token value for ``">"``.
- -.. data:: EQUAL - - Token value for ``"="``. - -.. data:: DOT - - Token value for ``"."``. - -.. data:: PERCENT - - Token value for ``"%"``. - -.. data:: LBRACE - - Token value for ``"{"``. - -.. data:: RBRACE - - Token value for ``"}"``. - -.. data:: EQEQUAL - - Token value for ``"=="``. - -.. data:: NOTEQUAL - - Token value for ``"!="``. - -.. data:: LESSEQUAL - - Token value for ``"<="``. - -.. data:: GREATEREQUAL - - Token value for ``">="``. - -.. data:: TILDE - - Token value for ``"~"``. - -.. data:: CIRCUMFLEX - - Token value for ``"^"``. - -.. data:: LEFTSHIFT - - Token value for ``"<<"``. - -.. data:: RIGHTSHIFT - - Token value for ``">>"``. - -.. data:: DOUBLESTAR - - Token value for ``"**"``. - -.. data:: PLUSEQUAL - - Token value for ``"+="``. - -.. data:: MINEQUAL - - Token value for ``"-="``. - -.. data:: STAREQUAL - - Token value for ``"*="``. - -.. data:: SLASHEQUAL - - Token value for ``"/="``. - -.. data:: PERCENTEQUAL - - Token value for ``"%="``. - -.. data:: AMPEREQUAL - - Token value for ``"&="``. - -.. data:: VBAREQUAL - - Token value for ``"|="``. - -.. data:: CIRCUMFLEXEQUAL - - Token value for ``"^="``. - -.. data:: LEFTSHIFTEQUAL - - Token value for ``"<<="``. - -.. data:: RIGHTSHIFTEQUAL - - Token value for ``">>="``. - -.. data:: DOUBLESTAREQUAL - - Token value for ``"**="``. - -.. data:: DOUBLESLASH - - Token value for ``"//"``. - -.. data:: DOUBLESLASHEQUAL - - Token value for ``"//="``. - -.. data:: AT - - Token value for ``"@"``. - -.. data:: ATEQUAL - - Token value for ``"@="``. - -.. data:: RARROW - - Token value for ``"->"``. - -.. data:: ELLIPSIS - - Token value for ``"..."``. - -.. data:: COLONEQUAL - - Token value for ``":="``. - -.. data:: EXCLAMATION - - Token value for ``"!"``. +.. list-table:: + :header-rows: 1 + + * - Token + - Value + * - .. data:: LPAR + - ``"("`` + * - .. data:: RPAR + - ``")"`` + * - .. data:: LSQB + - ``"["`` + * - .. data:: RSQB + - ``"]"`` + * - .. data:: COLON + - ``":"`` + * - .. data:: COMMA + - ``","`` + * - .. data:: SEMI + - ``";"`` + * - .. data:: PLUS + - ``"+"`` + * - .. data:: MINUS + - ``"-"`` + * - .. data:: STAR + - ``"*"`` + * - .. data:: SLASH + - ``"/"`` + * - .. data:: VBAR + - ``"|"`` + * - .. data:: AMPER + - ``"&"`` + * - .. data:: LESS + - ``"<"`` + * - .. data:: GREATER + - ``">"`` + * - .. data:: EQUAL + - ``"="`` + * - .. data:: DOT + - ``"."`` + * - .. data:: PERCENT + - ``"%"`` + * - .. data:: LBRACE + - ``"{"`` + * - .. data:: RBRACE + - ``"}"`` + * - .. data:: EQEQUAL + - ``"=="`` + * - .. data:: NOTEQUAL + - ``"!="`` + * - .. data:: LESSEQUAL + - ``"<="`` + * - .. data:: GREATEREQUAL + - ``">="`` + * - .. data:: TILDE + - ``"~"`` + * - .. data:: CIRCUMFLEX + - ``"^"`` + * - .. data:: LEFTSHIFT + - ``"<<"`` + * - .. data:: RIGHTSHIFT + - ``">>"`` + * - .. data:: DOUBLESTAR + - ``"**"`` + * - .. data:: PLUSEQUAL + - ``"+="`` + * - .. data:: MINEQUAL + - ``"-="`` + * - .. data:: STAREQUAL + - ``"*="`` + * - .. data:: SLASHEQUAL + - ``"/="`` + * - .. data:: PERCENTEQUAL + - ``"%="`` + * - .. data:: AMPEREQUAL + - ``"&="`` + * - .. data:: VBAREQUAL + - ``"|="`` + * - .. data:: CIRCUMFLEXEQUAL + - ``"^="`` + * - .. data:: LEFTSHIFTEQUAL + - ``"<<="`` + * - .. data:: RIGHTSHIFTEQUAL + - ``">>="`` + * - .. data:: DOUBLESTAREQUAL + - ``"**="`` + * - .. data:: DOUBLESLASH + - ``"//"`` + * - .. data:: DOUBLESLASHEQUAL + - ``"//="`` + * - .. data:: AT + - ``"@"`` + * - .. data:: ATEQUAL + - ``"@="`` + * - .. data:: RARROW + - ``"->"`` + * - .. data:: ELLIPSIS + - ``"..."`` + * - .. 
data:: COLONEQUAL
+     - ``":="``
+   * - .. data:: EXCLAMATION
+     - ``"!"``
diff --git a/Tools/build/generate_token.py b/Tools/build/generate_token.py
index 2672c9054d3518..e25837eb715bc0 100755
--- a/Tools/build/generate_token.py
+++ b/Tools/build/generate_token.py
@@ -206,6 +206,12 @@ def make_c(infile, outfile='Parser/token.c'):
 
 token_inc_template = f"""\
 .. {AUTO_GENERATED_BY_SCRIPT}
+
+.. list-table::
+   :header-rows: 1
+
+   * - Token
+     - Value
 %s
 """
 
@@ -220,10 +226,8 @@ def make_rst(infile, outfile='Doc/library/token-list.inc',
     for value, name in enumerate(tok_names):
         if value in tok_to_string:
             assert name.isupper()
-            names.append('.. data:: %s' % (name,))
-            names.append('')
-            names.append('   Token value for ``"%s"``.' % tok_to_string[value])
-            names.append('')
+            names.append(f'   * - .. data:: {name}')
+            names.append(f'     - ``"{tok_to_string[value]}"``')
         else:
             needs_handwritten_doc.add(name)

From 4411c370437cc7c929c69f872d635687ce4e9bcd Mon Sep 17 00:00:00 2001
From: Petr Viktorin
Date: Wed, 12 Feb 2025 18:04:12 +0100
Subject: [PATCH 03/12] Document most of the tokens; improve top-level grammar docs; link ENDMARKER to that

---
 Doc/library/token.rst | 75 ++++++++++++++++++++++++---
 Doc/reference/toplevel_components.rst | 11 ++--
 2 files changed, 74 insertions(+), 12 deletions(-)

diff --git a/Doc/library/token.rst b/Doc/library/token.rst
index 73a5fff3f04ce8..cdba4652d725e6 100644
--- a/Doc/library/token.rst
+++ b/Doc/library/token.rst
@@ -46,22 +46,58 @@ The token constants are:
 
 .. data:: ENDMARKER
 
+   Token value that indicates the end of input.
+   Used in :ref:`top-level grammar rules `.
+
 .. data:: NAME
 
+   Token value that indicates an :ref:`identifier `.
+   Note that keywords are also identifiers.
+
 .. data:: NUMBER
 
+   Token value that indicates a :ref:`numeric literal `.
+
 .. data:: STRING
 
+   Token value that indicates a :ref:`string or byte literal `.
+   The token string is not interpreted: it includes the prefix (if any)
+   and the quote characters; escape sequences are included with their
+   initial backslash.
+
 .. data:: NEWLINE
 
+   Token value that indicates the end of a :ref:`logical line `
+   of Python code.
+
+.. data:: NL
+
+   Token value used to indicate a non-terminating newline.
+   :data:`!NL` tokens are generated when a logical line of code is continued
+   over multiple physical lines. The parser ignores :data:`!NL` tokens.
+
 .. data:: INDENT
 
+   Token value used at the beginning of a :ref:`logical line `
+   to indicate the start of an :ref:`indented block `.
+
 .. data:: DEDENT
 
+   Token value used at the beginning of a :ref:`logical line `
+   to indicate the end of an :ref:`indented block `.
+
 .. data:: OP
 
+   A generic token value returned by the :mod:`tokenize` module for
+   :ref:`operator ` and :ref:`delimiter `.
+   See the :mod:`tokenize` module documentation for details.
+
 .. data:: TYPE_IGNORE
 
+   Token value indicating that a ``type: ignore`` comment was recognized.
+   Such tokens are only produced when :func:`ast.parse` is invoked with
+   ``type_comments=True``.
+
 .. data:: TYPE_COMMENT
 
    Token value indicating that a type comment was recognized. Such
@@ -72,23 +108,44 @@ The token constants are:
 
 .. data:: FSTRING_START
 
+   .. impl-detail::
+
+      Token value used to indicate the beginning of a
+      :ref:`f-string `.
+      The token string includes the prefix and the opening quote, but none
+      of the contents of the literal.
+
 .. data:: FSTRING_MIDDLE
 
+   .. impl-detail::
+
+      Token value used for literal text inside an :ref:`f-string `,
+      including format specifications.
+ + Replacement fields (that is, the non-literal parts of f-strings) use + the same tokens as other expressions, and are delimited by :data:`LBRACE` + and :data:`RBRACE` tokens. + .. data:: FSTRING_END + .. impl-detail:: + + Token value used to indicate the end of a :ref:`f-string `. + The token string contains the closing quote. + .. data:: COMMENT Token value used to indicate a comment. + The parser ignores :data:`!COMMENT` tokens. -.. data:: NL +.. data:: ERRORTOKEN - Token value used to indicate a non-terminating newline. The - :data:`NEWLINE` token indicates the end of a logical line of Python code; - ``NL`` tokens are generated when a logical line of code is continued over - multiple physical lines. + Token value used to indicate wrong input. -.. data:: ERRORTOKEN + .. impl-detail:: + The :mod:`tokenize` module generally indicates errors by + raising exceptions instead of emitting this token. .. data:: ENCODING @@ -96,8 +153,10 @@ The token constants are: into text. The first token returned by :func:`tokenize.tokenize` will always be an ``ENCODING`` token. - This token type isn't used by the C tokenizer but is needed for - the :mod:`tokenize` module. + .. impl-detail:: + + This token type isn't used by the C tokenizer but is needed for + the :mod:`tokenize` module. The remaining tokens represent literal text; most are :ref:`operators` and :ref:`delimiters`: diff --git a/Doc/reference/toplevel_components.rst b/Doc/reference/toplevel_components.rst index f155fafbe4d738..a5fd320c2cb308 100644 --- a/Doc/reference/toplevel_components.rst +++ b/Doc/reference/toplevel_components.rst @@ -69,7 +69,7 @@ All input read from non-interactive files has the same form: .. grammar-snippet:: :group: python-grammar - file_input: (NEWLINE | `statement`)* + file: `statement`* ENDMARKER This syntax is used in the following situations: @@ -90,7 +90,7 @@ Input in interactive mode is parsed using the following grammar: .. grammar-snippet:: :group: python-grammar - interactive_input: [`stmt_list`] NEWLINE | `compound_stmt` NEWLINE + interactive: [`stmt_list`] NEWLINE | `compound_stmt` NEWLINE | ENDMARKER Note that a (top-level) compound statement must be followed by a blank line in interactive mode; this is needed to help the parser detect the end of the input. @@ -107,5 +107,8 @@ Expression input :func:`eval` is used for expression input. It ignores leading whitespace. The string argument to :func:`eval` must have the following form: -.. productionlist:: python-grammar - eval_input: `expression_list` NEWLINE* +.. productionlist:: + :group: python-grammar + + eval: `expressions` NEWLINE* ENDMARKER + expressions: ','.`expression`+ [','] From 1bf55112465c60053faa143ae0f6fae137d46c84 Mon Sep 17 00:00:00 2001 From: Petr Viktorin Date: Wed, 19 Feb 2025 17:25:13 +0100 Subject: [PATCH 04/12] Write prose; reorganize the token list --- Doc/library/token.rst | 101 +++++++++++++++++++++++++----------------- 1 file changed, 61 insertions(+), 40 deletions(-) diff --git a/Doc/library/token.rst b/Doc/library/token.rst index cdba4652d725e6..5fe3bec4e9b1f8 100644 --- a/Doc/library/token.rst +++ b/Doc/library/token.rst @@ -44,11 +44,6 @@ functions. The functions mirror definitions in the Python C header files. The token constants are: -.. data:: ENDMARKER - - Token value that indicates the end of input. - Used in :ref:`top-level grammar rules `. - .. data:: NAME Token value that indicates an :ref:`identifier `. 
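
.. editor's note (not part of the patch): The NEWLINE/NL and COMMENT
   behaviour documented in this patch is easy to verify with the stdlib
   ``tokenize`` module. A minimal sketch; the sample source string and
   variable names below are illustrative only, not taken from the patch::

       import io
       import tokenize

       # One logical line continued over two physical lines, plus a comment.
       # The continuation inside the parentheses yields an NL token; the end
       # of the whole statement yields a NEWLINE token.
       source = "total = (1 +\n         2)  # sum\n"

       for tok in tokenize.generate_tokens(io.StringIO(source).readline):
           print(tokenize.tok_name[tok.type], repr(tok.string))

       # Prints, in order: NAME, OP, OP, NUMBER, OP, NL, NUMBER, OP,
       # COMMENT, NEWLINE, ENDMARKER (each with its matched string).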
@@ -65,6 +60,17 @@ The token constants are: and the quote characters; escape sequences are included with their initial backslash. +.. data:: OP + + A generic token value returned by the :mod:`tokenize` module for + :ref:`operators ` and :ref:`delimiters `. + See the :mod:`tokenize` module documentation for details. + +.. data:: COMMENT + + Token value used to indicate a comment. + The parser ignores :data:`!COMMENT` tokens. + .. data:: NEWLINE Token value that indicates the end of a :ref:`logical line ` @@ -86,26 +92,6 @@ The token constants are: Token value used at the beginning of a :ref:`logical line ` to indicate the end of an :ref:`indented block `. -.. data:: OP - - A generic token value returned by the :mod:`tokenize` module for - :ref:`operator ` and :ref:`delimiter `. - See the :mod:`tokenize` module documentation for details. - -.. data:: TYPE_IGNORE - - Token value indicating that a ``type: ignore`` comment was recognized. - Such tokens are only produced when :func:`ast.parse` is invoked with - ``type_comments=True``. - -.. data:: TYPE_COMMENT - - Token value indicating that a type comment was recognized. Such - tokens are only produced when :func:`ast.parse` is invoked with - ``type_comments=True``. - -.. data:: SOFT_KEYWORD - .. data:: FSTRING_START .. impl-detail:: @@ -133,19 +119,10 @@ The token constants are: Token value used to indicate the end of a :ref:`f-string `. The token string contains the closing quote. -.. data:: COMMENT - - Token value used to indicate a comment. - The parser ignores :data:`!COMMENT` tokens. - -.. data:: ERRORTOKEN - - Token value used to indicate wrong input. - - .. impl-detail:: +.. data:: ENDMARKER - The :mod:`tokenize` module generally indicates errors by - raising exceptions instead of emitting this token. + Token value that indicates the end of input. + Used in :ref:`top-level grammar rules `. .. data:: ENCODING @@ -158,14 +135,55 @@ The token constants are: This token type isn't used by the C tokenizer but is needed for the :mod:`tokenize` module. -The remaining tokens represent literal text; most are :ref:`operators` -and :ref:`delimiters`: + +The following token types are not produced by the :mod:`tokenize` module, +and are defined for special uses in the tokenizer or parser: + +.. data:: TYPE_IGNORE + + Token value indicating that a ``type: ignore`` comment was recognized. + Such tokens are produced instead of regular :data:`COMMENT` tokens only when + :func:`ast.parse` is invoked with ``type_comments=True``. + +.. data:: TYPE_COMMENT + + Token value indicating that a type comment was recognized. + Such tokens are produced instead of regular :data:`COMMENT` tokens only when + :func:`ast.parse` is invoked with ``type_comments=True``. + +.. data:: SOFT_KEYWORD + + Token value indicating a :ref:`soft keyword `. + + The tokenizer never produces this value. + To check for a soft keyword, pass a :data:`NAME` token's string to + :func:`keyword.issoftkeyword`. + +.. data:: ERRORTOKEN + + Token value used to indicate wrong input. + + The :mod:`tokenize` module generally indicates errors by + raising exceptions instead of emitting this token. + It can also emit tokens such as :data:`OP` or :data:`NAME` with strings that + are later rejected by the parser. + + +The remaining tokens represent specific :ref:`operators` and :ref:`delimiters`. +(The :mod:`tokenize` module reports these as :data:`OP`; see ``exact_type`` +in the :mod:`tokenize` documentation for details.) .. include:: token-list.inc + +The following are non-token constants: + .. 
data:: N_TOKENS -.. data:: NT_OFFSET + The number of token types defined in this module. + +.. NT_OFFSET is deliberately undocumented; if you need it you should be + reading the source .. data:: EXACT_TOKEN_TYPES @@ -190,6 +208,9 @@ and :ref:`delimiters`: to support parsing older Python versions for :func:`ast.parse` with ``feature_version`` set to 6 or lower). +.. versionchanged:: 3.12 + Added :data:`EXCLAMATION`. + .. versionchanged:: 3.13 Removed :data:`!AWAIT` and :data:`!ASYNC` tokens again. From e1e498b0b4c0810821321ceeab7cd156d68849b2 Mon Sep 17 00:00:00 2001 From: Petr Viktorin Date: Wed, 19 Feb 2025 17:38:57 +0100 Subject: [PATCH 05/12] Fixups --- Doc/library/token.rst | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/Doc/library/token.rst b/Doc/library/token.rst index 5fe3bec4e9b1f8..de01fc020197c1 100644 --- a/Doc/library/token.rst +++ b/Doc/library/token.rst @@ -55,7 +55,8 @@ The token constants are: .. data:: STRING - Token value that indicates a :ref:`string or byte literal `. + Token value that indicates a :ref:`string or byte literal `, + excluding :ref:`f-strings `. The token string is not interpreted: it includes the prefix (if any) and the quote characters; escape sequences are included with their initial backslash. @@ -94,29 +95,32 @@ The token constants are: .. data:: FSTRING_START + Token value used to indicate the beginning of a + :ref:`f-string `. + .. impl-detail:: - Token value used to indicate the beginning of a - :ref:`f-string `. The token string includes the prefix and the opening quote, but none of the contents of the literal. .. data:: FSTRING_MIDDLE - .. impl-detail:: + Token value used for literal text inside an :ref:`f-string `, + including format specifications. - Token value used for literal text inside an :ref:`f-string `, - including format specifications. + .. impl-detail:: Replacement fields (that is, the non-literal parts of f-strings) use - the same tokens as other expressions, and are delimited by :data:`LBRACE` - and :data:`RBRACE` tokens. + the same tokens as other expressions, and are delimited by + :data:`LBRACE`, :data:`RBRACE`, :data:`EXCLAMATION` and :data:`COLON` + tokens. .. data:: FSTRING_END + Token value used to indicate the end of a :ref:`f-string `. + .. impl-detail:: - Token value used to indicate the end of a :ref:`f-string `. The token string contains the closing quote. .. data:: ENDMARKER @@ -169,14 +173,15 @@ and are defined for special uses in the tokenizer or parser: are later rejected by the parser. -The remaining tokens represent specific :ref:`operators` and :ref:`delimiters`. +The remaining tokens represent specific :ref:`operators ` and +:ref:`delimiters `. (The :mod:`tokenize` module reports these as :data:`OP`; see ``exact_type`` in the :mod:`tokenize` documentation for details.) .. include:: token-list.inc -The following are non-token constants: +The following non-token constants are provided: .. 
data:: N_TOKENS From 8bbbb0f8e5c2f802e3d3c1cb7aac38d039c3ae01 Mon Sep 17 00:00:00 2001 From: Petr Viktorin Date: Wed, 19 Feb 2025 17:42:39 +0100 Subject: [PATCH 06/12] Correct directive name Co-authored-by: Blaise Pabon --- Doc/reference/toplevel_components.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Doc/reference/toplevel_components.rst b/Doc/reference/toplevel_components.rst index a5fd320c2cb308..48fd84e559e66d 100644 --- a/Doc/reference/toplevel_components.rst +++ b/Doc/reference/toplevel_components.rst @@ -107,7 +107,7 @@ Expression input :func:`eval` is used for expression input. It ignores leading whitespace. The string argument to :func:`eval` must have the following form: -.. productionlist:: +.. grammar-snippet:: :group: python-grammar eval: `expressions` NEWLINE* ENDMARKER From eed407e00df75f5c8eac4072551612a619e9d81a Mon Sep 17 00:00:00 2001 From: Petr Viktorin Date: Wed, 19 Feb 2025 18:03:37 +0100 Subject: [PATCH 07/12] Don't use the Gather syntax Co-authored-by: Blaise Pabon --- Doc/reference/toplevel_components.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Doc/reference/toplevel_components.rst b/Doc/reference/toplevel_components.rst index 48fd84e559e66d..786eacbd013df9 100644 --- a/Doc/reference/toplevel_components.rst +++ b/Doc/reference/toplevel_components.rst @@ -111,4 +111,4 @@ string argument to :func:`eval` must have the following form: :group: python-grammar eval: `expressions` NEWLINE* ENDMARKER - expressions: ','.`expression`+ [','] + expressions: `expression` (',' `expression` )* [','] From 0dd236f25a3c27a1eccd99fe31fe27257254925d Mon Sep 17 00:00:00 2001 From: Petr Viktorin Date: Wed, 26 Feb 2025 16:10:08 +0100 Subject: [PATCH 08/12] Revert some changes to the toplevel_components --- Doc/reference/toplevel_components.rst | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/Doc/reference/toplevel_components.rst b/Doc/reference/toplevel_components.rst index 786eacbd013df9..9e8579b1bda69f 100644 --- a/Doc/reference/toplevel_components.rst +++ b/Doc/reference/toplevel_components.rst @@ -69,7 +69,7 @@ All input read from non-interactive files has the same form: .. grammar-snippet:: :group: python-grammar - file: `statement`* ENDMARKER + file_input: (`statement` | NEWLINE)* ENDMARKER This syntax is used in the following situations: @@ -90,7 +90,7 @@ Input in interactive mode is parsed using the following grammar: .. grammar-snippet:: :group: python-grammar - interactive: [`stmt_list`] NEWLINE | `compound_stmt` NEWLINE | ENDMARKER + interactive_input: [`stmt_list`] NEWLINE | `compound_stmt` NEWLINE | ENDMARKER Note that a (top-level) compound statement must be followed by a blank line in interactive mode; this is needed to help the parser detect the end of the input. @@ -110,5 +110,4 @@ string argument to :func:`eval` must have the following form: .. 
grammar-snippet::
   :group: python-grammar
 
-   eval: `expressions` NEWLINE* ENDMARKER
-   expressions: `expression` (',' `expression` )* [',']
+   eval_input: `expression_list` NEWLINE* ENDMARKER

From dca268ed98cbc8ae1278d6dfede0efb86cbc8c5f Mon Sep 17 00:00:00 2001
From: Petr Viktorin
Date: Wed, 26 Feb 2025 16:11:13 +0100
Subject: [PATCH 09/12] Revert an order change

---
 Doc/reference/toplevel_components.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Doc/reference/toplevel_components.rst b/Doc/reference/toplevel_components.rst
index 9e8579b1bda69f..bd64b1c08bd1ff 100644
--- a/Doc/reference/toplevel_components.rst
+++ b/Doc/reference/toplevel_components.rst
@@ -69,7 +69,7 @@ All input read from non-interactive files has the same form:
 .. grammar-snippet::
    :group: python-grammar
 
-   file_input: (`statement` | NEWLINE)* ENDMARKER
+   file_input: (NEWLINE | `statement`)* ENDMARKER
 
 This syntax is used in the following situations:

From e02ced8eadfa4edad70130c76c7597acc2723b14 Mon Sep 17 00:00:00 2001
From: Petr Viktorin
Date: Thu, 27 Feb 2025 11:59:42 +0100
Subject: [PATCH 10/12] Apply suggestions from code review

Co-authored-by: Adam Turner <9087854+AA-Turner@users.noreply.github.com>
Co-authored-by: Lysandros Nikolaou
---
 Doc/library/token-list.inc | 1 +
 Doc/library/token.rst | 31 +++++++++++++++----------------
 Tools/build/generate_token.py | 1 +
 3 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/Doc/library/token-list.inc b/Doc/library/token-list.inc
index 357638aed2714a..655758c4a400cc 100644
--- a/Doc/library/token-list.inc
+++ b/Doc/library/token-list.inc
@@ -1,6 +1,7 @@
 .. Auto-generated by Tools/build/generate_token.py
 
 .. list-table::
+   :align: left
    :header-rows: 1
 
    * - Token
     - Value

diff --git a/Doc/library/token.rst b/Doc/library/token.rst
index de01fc020197c1..a94d45c5b8b36a 100644
--- a/Doc/library/token.rst
+++ b/Doc/library/token.rst
@@ -47,7 +47,7 @@ The token constants are:
 .. data:: NAME
 
    Token value that indicates an :ref:`identifier `.
-   Note that keywords are also identifiers.
+   Note that keywords are also initially tokenized as ``NAME`` tokens.
 
 .. data:: NUMBER
 
@@ -56,10 +56,10 @@ The token constants are:
 .. data:: STRING
 
    Token value that indicates a :ref:`string or byte literal `,
-   excluding :ref:`f-strings `.
-   The token string is not interpreted: it includes the prefix (if any)
-   and the quote characters; escape sequences are included with their
-   initial backslash.
+   excluding :ref:`formatted string literals `.
+   The token string is not interpreted:
+   it includes the surrounding quotation marks and the prefix (if given);
+   backslashes are included literally, without processing escape sequences.
 
 .. data:: OP
 
@@ -94,17 +94,17 @@ The token constants are:
 
 .. data:: FSTRING_START
 
-   Token value used to indicate the beginning of a
-   :ref:`f-string `.
+   Token value used to indicate the beginning of an
+   :ref:`f-string literal `.
 
    .. impl-detail::
 
-      The token string includes the prefix and the opening quote, but none
+      The token string includes the prefix and the opening quote(s), but none
       of the contents of the literal.
 
 .. data:: FSTRING_MIDDLE
 
-   Token value used for literal text inside an :ref:`f-string `,
+   Token value used for literal text inside an :ref:`f-string literal `,
    including format specifications.
 
   ..
impl-detail:: @@ -121,7 +120,7 @@ The token constants are: .. impl-detail:: - The token string contains the closing quote. + The token string contains the closing quote(s). .. data:: ENDMARKER @@ -146,14 +145,14 @@ and are defined for special uses in the tokenizer or parser: .. data:: TYPE_IGNORE Token value indicating that a ``type: ignore`` comment was recognized. - Such tokens are produced instead of regular :data:`COMMENT` tokens only when - :func:`ast.parse` is invoked with ``type_comments=True``. + Such tokens are produced instead of regular :data:`COMMENT` tokens only + with the :data:`~ast.PyCF_TYPE_COMMENTS` flag. .. data:: TYPE_COMMENT Token value indicating that a type comment was recognized. - Such tokens are produced instead of regular :data:`COMMENT` tokens only when - :func:`ast.parse` is invoked with ``type_comments=True``. + Such tokens are produced instead of regular :data:`COMMENT` tokens only + with the :data:`~ast.PyCF_TYPE_COMMENTS` flag. .. data:: SOFT_KEYWORD diff --git a/Tools/build/generate_token.py b/Tools/build/generate_token.py index e25837eb715bc0..5577998029e3cb 100755 --- a/Tools/build/generate_token.py +++ b/Tools/build/generate_token.py @@ -208,6 +208,7 @@ def make_c(infile, outfile='Parser/token.c'): .. {AUTO_GENERATED_BY_SCRIPT} .. list-table:: + :align: left :header-rows: 1 * - Token From 02fff75da34c460c58ea0b6cba96590192bc9792 Mon Sep 17 00:00:00 2001 From: Petr Viktorin Date: Wed, 5 Mar 2025 16:31:55 +0100 Subject: [PATCH 11/12] Add a note and reword OP docs --- Doc/library/token.rst | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/Doc/library/token.rst b/Doc/library/token.rst index a94d45c5b8b36a..24455b1ef77893 100644 --- a/Doc/library/token.rst +++ b/Doc/library/token.rst @@ -19,6 +19,10 @@ change between Python versions. The module also provides a mapping from numeric codes to names and some functions. The functions mirror definitions in the Python C header files. +Note that a token's value may depend on tokenizer options. For example, a +``"+"`` token may be reported as either :data:`PLUS` or :data:`OP`, or +a ``"match"`` token may be either :data:`NAME` or :data:`SOFT_KEYWORD`. + .. data:: tok_name @@ -63,9 +67,14 @@ The token constants are: .. data:: OP - A generic token value returned by the :mod:`tokenize` module for - :ref:`operators ` and :ref:`delimiters `. - See the :mod:`tokenize` module documentation for details. + A generic token value that indicates an + :ref:`operator ` or :ref:`delimiter `. + + .. impl-detail:: + + This value is only reported by the :mod:`tokenize` module. + Internally, the tokenizer uses + :ref:`exact token types ` instead. .. data:: COMMENT @@ -172,6 +181,8 @@ and are defined for special uses in the tokenizer or parser: are later rejected by the parser. +.. _token_operators_delimiters: + The remaining tokens represent specific :ref:`operators ` and :ref:`delimiters `. 
(The :mod:`tokenize` module reports these as :data:`OP`; see ``exact_type`` From 9b6eb2abd69d1281717390c6e8431c125ffb6b3c Mon Sep 17 00:00:00 2001 From: Petr Viktorin Date: Wed, 5 Mar 2025 18:07:26 +0100 Subject: [PATCH 12/12] Apply suggestions from code review Co-authored-by: Adam Turner <9087854+AA-Turner@users.noreply.github.com> --- Tools/build/generate_token.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/Tools/build/generate_token.py b/Tools/build/generate_token.py index 5577998029e3cb..a5f9828c466eda 100755 --- a/Tools/build/generate_token.py +++ b/Tools/build/generate_token.py @@ -234,11 +234,10 @@ def make_rst(infile, outfile='Doc/library/token-list.inc', has_handwritten_doc = set() with open(rstfile) as fileobj: - tokendef_re = re.compile(r'.. data:: (\w+)') + tokendef_re = re.compile(r'.. data:: ([0-9A-Z_]+)\s*') for line in fileobj: - if match := tokendef_re.fullmatch(line.strip()): - if match[1].isupper(): - has_handwritten_doc.add(match[1]) + if match := tokendef_re.fullmatch(line): + has_handwritten_doc.add(match[1]) # Exclude non-token constants in token.py has_handwritten_doc -= {'N_TOKENS', 'NT_OFFSET', 'EXACT_TOKEN_TYPES'}
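
.. editor's note (not part of the patches): The OP/``exact_type`` split and
   the soft-keyword handling documented earlier in this series can be
   observed with documented APIs only. A minimal sketch; the sample source
   line is made up for illustration::

       import io
       import keyword
       import token
       import tokenize

       source = "match = 1 + 2\n"

       for tok in tokenize.generate_tokens(io.StringIO(source).readline):
           if tok.type == token.OP:
               # tokenize reports a generic OP; exact_type recovers the
               # specific constant (EQUAL, PLUS, ...) via EXACT_TOKEN_TYPES.
               print("OP ->", token.tok_name[tok.exact_type], repr(tok.string))
           elif tok.type == token.NAME:
               # The tokenizer emits NAME even for soft keywords like "match";
               # keyword.issoftkeyword() tells them apart after the fact.
               print("NAME", repr(tok.string),
                     "soft keyword:", keyword.issoftkeyword(tok.string))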