diff --git a/Doc/library/re.rst b/Doc/library/re.rst index 0ee2d68bcbe006..a02888e43bb68c 100644 --- a/Doc/library/re.rst +++ b/Doc/library/re.rst @@ -1378,6 +1378,27 @@ when there is no match, you can test whether there was a match with a simple if match: process(match) +Match objects are proper :class:`~collections.abc.Sequence` types. You can access +match groups via subscripting ``match[...]`` and use familiar +:class:`~collections.abc.Sequence` idioms to iterate over and extract match groups:: + + >>> m = re.match(r"(\w+) (\w+)", "Isaac Newton, physicist") + >>> m[1] + 'Isaac' + >>> list(m) + ['Isaac Newton', 'Isaac', 'Newton'] + >>> _, first_name, last_name = m + >>> last_name + 'Newton' + +You can also destructure match objects with Python's ``match`` statement:: + + >>> match re.match(r"(\d+)-(\d+)-(\d+)", "2000-10-16"): + ... case [_, year, month, day]: + ... year + ... + '2000' + .. class:: Match Match object returned by successful ``match``\ es and ``search``\ es. @@ -1473,6 +1494,37 @@ when there is no match, you can test whether there was a match with a simple .. versionadded:: 3.6 + .. versionchanged:: next + + Negative indexing is now supported. This allows accessing match groups + from the end, starting from the last group defined in the pattern:: + + >>> m = re.match(r"(\w+) (\w+)", "Isaac Newton, physicist") + >>> m[-1] # The first parenthesized subgroup starting from the end. + 'Newton' + >>> m[-2] # The second parenthesized subgroup starting from the end. + 'Isaac' + >>> m[-3] # The entire match, equivalent to m[0]. + 'Isaac Newton' + + You can also use slicing to extract multiple groups as a tuple:: + + >>> m = re.match(r"(\w+) (\w+)", "Isaac Newton, physicist") + >>> m[1:] + ('Isaac', 'Newton') + + +.. method:: Match.__len__() + + Return the number of groups accessible through the subscript syntax provided by + :meth:`~Match.__getitem__`. 
This includes group ``0`` representing the entire match:: + + >>> m = re.match(r"(\w+) (\w+)", "Isaac Newton, physicist") + >>> len(m) + 3 + + .. versionadded:: next + .. method:: Match.groups(default=None) @@ -1539,6 +1591,22 @@ when there is no match, you can test whether there was a match with a simple *group* defaults to zero, the entire match. +.. method:: Match.index(value, start=0, stop=sys.maxsize, /) + + Return the index of the first occurrence of the value among the matched groups. + + Raises :exc:`ValueError` if the value is not present. + + .. versionadded:: next + + +.. method:: Match.count(value, /) + + Return the number of occurrences of the value among the matched groups. + + .. versionadded:: next + + .. attribute:: Match.pos The value of *pos* which was passed to the :meth:`~Pattern.search` or diff --git a/Lib/re/__init__.py b/Lib/re/__init__.py index af2808a77da691..a9aed8de4b093e 100644 --- a/Lib/re/__init__.py +++ b/Lib/re/__init__.py @@ -125,6 +125,7 @@ import enum from . 
import _compiler, _parser import functools +import _collections_abc import _sre @@ -315,6 +316,8 @@ def escape(pattern): Pattern = type(_compiler.compile('', 0)) Match = type(_compiler.compile('', 0).match('')) +_collections_abc.Sequence.register(Match) + # -------------------------------------------------------------------- # internals diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index f79a6149078996..101f38f9bfde33 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -2,12 +2,14 @@ cpython_only, captured_stdout, check_disallow_instantiation, linked_to_musl, warnings_helper, SHORT_TIMEOUT, Stopwatch, requires_resource) +import itertools import locale import re import string import sys import unittest import warnings +from collections.abc import Sequence from re import Scanner from weakref import proxy @@ -570,10 +572,14 @@ def test_match_getitem(self): self.assertEqual(m[1], 'a') self.assertEqual(m[2], None) self.assertEqual(m[3], None) + self.assertEqual(m[-1], None) + self.assertEqual(m[-2], None) + self.assertEqual(m[-3], 'a') + self.assertEqual(m[-4], 'a') with self.assertRaisesRegex(IndexError, 'no such group'): m['X'] with self.assertRaisesRegex(IndexError, 'no such group'): - m[-1] + m[-5] with self.assertRaisesRegex(IndexError, 'no such group'): m[4] with self.assertRaisesRegex(IndexError, 'no such group'): @@ -594,13 +600,144 @@ def test_match_getitem(self): self.assertEqual(m[1], 'a') self.assertEqual(m[2], None) self.assertEqual(m[3], 'c') + self.assertEqual(m[-1], 'c') + self.assertEqual(m[-2], None) + self.assertEqual(m[-3], 'a') + self.assertEqual(m[-4], 'ac') # Cannot assign. with self.assertRaises(TypeError): m[0] = 1 - # No len(). 
- self.assertRaises(TypeError, len, m) + def test_match_getitem_slice(self): + m = re.match(r"(a)(b)(c)", "abc") + seq = ("abc", "a", "b", "c") + indices = [None, *range(-len(seq), len(seq) + 1)] + for start, end, step in itertools.product( + indices, + indices, + filter(lambda x: x != 0, indices), # slice step cannot be zero + ): + with self.subTest(start=start, end=end, step=step): + self.assertEqual(m[start:end:step], seq[start:end:step]) + + def test_match_sequence(self): + m = re.match(r"(a)(b)(c)", "abc") + self.assertIsInstance(m, Sequence) + self.assertEqual(len(m), 4) + + self.assertEqual(tuple(m), ("abc", "a", "b", "c")) + self.assertEqual(list(m), ["abc", "a", "b", "c"]) + + abc, a, b, c = m + self.assertEqual(abc, "abc") + self.assertEqual(a, "a") + self.assertEqual(b, "b") + self.assertEqual(c, "c") + + self.assertIn("abc", m) + self.assertIn("a", m) + self.assertIn("b", m) + self.assertIn("c", m) + self.assertNotIn("123", m) + + self.assertEqual(list(reversed(m)), ["c", "b", "a", "abc"]) + + for s, k, v in re.finditer(r"(\w+):(\w+)", "abc:123"): + self.assertEqual(s, "abc:123") + self.assertEqual(k, "abc") + self.assertEqual(v, "123") + + def test_match_iter(self): + it = iter(re.match(r"(a)(b)(c)", "abc")) + self.assertEqual(next(it), "abc") + self.assertEqual(next(it), "a") + self.assertEqual(next(it), "b") + self.assertEqual(next(it), "c") + self.assertRaises(StopIteration, next, it) + + def test_match_index(self): + m = re.match(r"(a)(b)(c)(b)", "abcb") + self.assertEqual(m.index("abcb"), 0) + self.assertEqual(m.index("a"), 1) + self.assertEqual(m.index("b"), 2) + self.assertEqual(m.index("c"), 3) + self.assertRaises(ValueError, m.index, "123") + + # With start index. 
+ self.assertEqual(m.index("a", 1), 1) + self.assertEqual(m.index("b", 1), 2) + self.assertEqual(m.index("c", 1), 3) + self.assertRaises(ValueError, m.index, "abcb", 1) + self.assertRaises(ValueError, m.index, "123", 1) + + self.assertEqual(m.index("b", 2), 2) + self.assertEqual(m.index("c", 2), 3) + self.assertRaises(ValueError, m.index, "abcb", 2) + self.assertRaises(ValueError, m.index, "a", 2) + self.assertRaises(ValueError, m.index, "123", 2) + + self.assertEqual(m.index("b", 3), 4) + self.assertEqual(m.index("c", 3), 3) + self.assertRaises(ValueError, m.index, "abcb", 3) + self.assertRaises(ValueError, m.index, "a", 3) + self.assertRaises(ValueError, m.index, "123", 3) + + self.assertEqual(m.index("b", 4), 4) + self.assertRaises(ValueError, m.index, "abcb", 4) + self.assertRaises(ValueError, m.index, "a", 4) + self.assertRaises(ValueError, m.index, "c", 4) + self.assertRaises(ValueError, m.index, "123", 4) + + self.assertRaises(ValueError, m.index, "abcb", 5) + self.assertRaises(ValueError, m.index, "a", 5) + self.assertRaises(ValueError, m.index, "b", 5) + self.assertRaises(ValueError, m.index, "c", 5) + self.assertRaises(ValueError, m.index, "123", 5) + + # With start index and stop index. + self.assertEqual(m.index("b", 1, 3), 2) + self.assertEqual(m.index("b", 2, 4), 2) + self.assertEqual(m.index("b", 3, 5), 4) + self.assertRaises(ValueError, m.index, "b", 0, 2) + self.assertRaises(ValueError, m.index, "b", 3, 4) + self.assertRaises(ValueError, m.index, "b", -1, 0) + + # Non-string objects. + self.assertRaises(ValueError, m.index, 123) + self.assertRaises(ValueError, m.index, [1, 2, 3]) + self.assertRaises(ValueError, m.index, object()) + + def test_match_count(self): + m = re.match(r"(a)(b)(c)", "abc") + self.assertEqual(m.count("abc"), 1) + self.assertEqual(m.count("a"), 1) + self.assertEqual(m.count("b"), 1) + self.assertEqual(m.count("c"), 1) + self.assertEqual(m.count("123"), 0) + + # Non-string objects. 
+ self.assertEqual(m.count(123), 0) + self.assertEqual(m.count([1, 2, 3]), 0) + self.assertEqual(m.count(object()), 0) + + def test_match_match_case(self): + m = re.match(r"(a)(b)(c)", "abc") + + match m: + case [abc, "a", "b", "c"]: + self.assertEqual(abc, "abc") + case _: + self.fail() + + match re.match(r"(\d+)-(\d+)-(\d+)", "2025-05-07"): + case [date, year, month, day]: + self.assertEqual(date, "2025-05-07") + self.assertEqual(year, "2025") + self.assertEqual(month, "05") + self.assertEqual(day, "07") + case _: + self.fail() def test_re_fullmatch(self): # Issue 16203: Proposal: add re.fullmatch() method. diff --git a/Misc/ACKS b/Misc/ACKS index 610dcf9f4238de..e25379ffe696c3 100644 --- a/Misc/ACKS +++ b/Misc/ACKS @@ -157,6 +157,7 @@ Bennett Benson Ezra Berch Stuart Berg Michel Van den Bergh +Valentin Berlier Julian Berman Brice Berna Olivier Bernard diff --git a/Misc/NEWS.d/next/Library/2025-05-12-02-49-18.gh-issue-133546.Gk0Qct.rst b/Misc/NEWS.d/next/Library/2025-05-12-02-49-18.gh-issue-133546.Gk0Qct.rst new file mode 100644 index 00000000000000..dcbbbb92ca4b53 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-05-12-02-49-18.gh-issue-133546.Gk0Qct.rst @@ -0,0 +1,2 @@ +Make :class:`re.Match` a well-rounded :class:`~collections.abc.Sequence` +type. 
diff --git a/Modules/_sre/clinic/sre.c.h b/Modules/_sre/clinic/sre.c.h index d2f25a71495cda..5cd5ed2ab5c405 100644 --- a/Modules/_sre/clinic/sre.c.h +++ b/Modules/_sre/clinic/sre.c.h @@ -1485,6 +1485,74 @@ _sre_SRE_Match_span(PyObject *self, PyObject *const *args, Py_ssize_t nargs) return return_value; } +PyDoc_STRVAR(_sre_SRE_Match_index__doc__, +"index($self, value, start=0, stop=sys.maxsize, /)\n" +"--\n" +"\n" +"Return the index of the first occurrence of the value among the matched groups.\n" +"\n" +"Raises ValueError if the value is not present."); + +#define _SRE_SRE_MATCH_INDEX_METHODDEF \ + {"index", _PyCFunction_CAST(_sre_SRE_Match_index), METH_FASTCALL, _sre_SRE_Match_index__doc__}, + +static PyObject * +_sre_SRE_Match_index_impl(MatchObject *self, PyObject *value, + Py_ssize_t start, Py_ssize_t stop); + +static PyObject * +_sre_SRE_Match_index(PyObject *self, PyObject *const *args, Py_ssize_t nargs) +{ + PyObject *return_value = NULL; + PyObject *value; + Py_ssize_t start = 0; + Py_ssize_t stop = PY_SSIZE_T_MAX; + + if (!_PyArg_CheckPositional("index", nargs, 1, 3)) { + goto exit; + } + value = args[0]; + if (nargs < 2) { + goto skip_optional; + } + if (!_PyEval_SliceIndexNotNone(args[1], &start)) { + goto exit; + } + if (nargs < 3) { + goto skip_optional; + } + if (!_PyEval_SliceIndexNotNone(args[2], &stop)) { + goto exit; + } +skip_optional: + return_value = _sre_SRE_Match_index_impl((MatchObject *)self, value, start, stop); + +exit: + return return_value; +} + +PyDoc_STRVAR(_sre_SRE_Match_count__doc__, +"count($self, value, /)\n" +"--\n" +"\n" +"Return the number of occurrences of the value among the matched groups."); + +#define _SRE_SRE_MATCH_COUNT_METHODDEF \ + {"count", (PyCFunction)_sre_SRE_Match_count, METH_O, _sre_SRE_Match_count__doc__}, + +static PyObject * +_sre_SRE_Match_count_impl(MatchObject *self, PyObject *value); + +static PyObject * +_sre_SRE_Match_count(PyObject *self, PyObject *value) +{ + PyObject *return_value = NULL; + + 
return_value = _sre_SRE_Match_count_impl((MatchObject *)self, value); + + return return_value; +} + PyDoc_STRVAR(_sre_SRE_Match___copy____doc__, "__copy__($self, /)\n" "--\n" @@ -1568,4 +1636,4 @@ _sre_SRE_Scanner_search(PyObject *self, PyTypeObject *cls, PyObject *const *args #ifndef _SRE_SRE_PATTERN__FAIL_AFTER_METHODDEF #define _SRE_SRE_PATTERN__FAIL_AFTER_METHODDEF #endif /* !defined(_SRE_SRE_PATTERN__FAIL_AFTER_METHODDEF) */ -/*[clinic end generated code: output=bbf42e1de3bdd3ae input=a9049054013a1b77]*/ +/*[clinic end generated code: output=eb27b8f0b871277f input=a9049054013a1b77]*/ diff --git a/Modules/_sre/sre.c b/Modules/_sre/sre.c index 602d0ab8588f62..bc3f30ae781811 100644 --- a/Modules/_sre/sre.c +++ b/Modules/_sre/sre.c @@ -2431,11 +2431,74 @@ match_group(PyObject *op, PyObject* args) return result; } +static Py_ssize_t +match_length(PyObject *op) +{ + MatchObject *self = _MatchObject_CAST(op); + return self->groups; +} + static PyObject* -match_getitem(PyObject *op, PyObject* name) +match_item(PyObject *op, Py_ssize_t index) { MatchObject *self = _MatchObject_CAST(op); - return match_getslice(self, name, Py_None); + if (index < 0 || index >= self->groups) { + PyErr_SetString(PyExc_IndexError, "no such group"); + return NULL; + } + return match_getslice_by_index(self, index, Py_None); +} + +static PyObject* +match_subscript(PyObject *op, PyObject* item) +{ + MatchObject *self = _MatchObject_CAST(op); + + if (PyIndex_Check(item)) { + Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError); + if (i == -1 && PyErr_Occurred()) { + return NULL; + } + if (i < 0) { + i += self->groups; + } + return match_item(op, i); + } + else if (PySlice_Check(item)) { + Py_ssize_t start, stop, step; + if (PySlice_Unpack(item, &start, &stop, &step) < 0) { + return NULL; + } + Py_ssize_t slicelength = PySlice_AdjustIndices(self->groups, &start, &stop, step); + PyObject* result = PyTuple_New(slicelength); + if (!result) { + return NULL; + } + for (Py_ssize_t cur = start, i 
= 0; i < slicelength; cur += step, i++) { + PyObject* group = match_getslice_by_index(self, cur, Py_None); + if (!group) { + Py_DECREF(result); + return NULL; + } + PyTuple_SET_ITEM(result, i, group); + } + return result; + } + else { + if (self->pattern->groupindex) { + PyObject* index = PyDict_GetItemWithError(self->pattern->groupindex, item); + if (index && PyLong_Check(index)) { + Py_ssize_t i = PyLong_AsSsize_t(index); + if (i != -1 || !PyErr_Occurred()) { + return match_item(op, i); + } + } + } + if (!PyErr_Occurred()) { + PyErr_SetString(PyExc_IndexError, "no such group"); + } + return NULL; + } } /*[clinic input] @@ -2614,6 +2677,76 @@ _sre_SRE_Match_span_impl(MatchObject *self, PyObject *group) return _pair(self->mark[index*2], self->mark[index*2+1]); } +/*[clinic input] +_sre.SRE_Match.index + + value: object + start: slice_index(accept={int}) = 0 + stop: slice_index(accept={int}, c_default="PY_SSIZE_T_MAX") = sys.maxsize + / + +Return the index of the first occurrence of the value among the matched groups. + +Raises ValueError if the value is not present. +[clinic start generated code]*/ + +static PyObject * +_sre_SRE_Match_index_impl(MatchObject *self, PyObject *value, + Py_ssize_t start, Py_ssize_t stop) +/*[clinic end generated code: output=846597f6f96f829c input=7f41b5a99e0ad88e]*/ +{ + (void)PySlice_AdjustIndices(self->groups, &start, &stop, 1); + + for (Py_ssize_t i = start; i < stop; i++) { + PyObject* group = match_getslice_by_index(self, i, Py_None); + if (group == NULL) { + return NULL; + } + int cmp = PyObject_RichCompareBool(group, value, Py_EQ); + Py_DECREF(group); + if (cmp < 0) { + return NULL; + } + else if (cmp > 0) { + return PyLong_FromSsize_t(i); + } + } + PyErr_SetString(PyExc_ValueError, "match.index(x): x not in match"); + return NULL; +} + +/*[clinic input] +_sre.SRE_Match.count + + value: object + / + +Return the number of occurrences of the value among the matched groups. 
+[clinic start generated code]*/ + +static PyObject * +_sre_SRE_Match_count_impl(MatchObject *self, PyObject *value) +/*[clinic end generated code: output=c0b81bdce5872620 input=b1f3372cfb4b8c74]*/ +{ + Py_ssize_t count = 0; + + for (Py_ssize_t i = 0; i < self->groups; i++) { + PyObject* group = match_getslice_by_index(self, i, Py_None); + if (group == NULL) { + return NULL; + } + int cmp = PyObject_RichCompareBool(group, value, Py_EQ); + Py_DECREF(group); + if (cmp < 0) { + return NULL; + } + else if (cmp > 0) { + count++; + } + } + return PyLong_FromSsize_t(count); +} + static PyObject* match_regs(MatchObject* self) { @@ -3224,6 +3357,8 @@ static PyMethodDef match_methods[] = { _SRE_SRE_MATCH_START_METHODDEF _SRE_SRE_MATCH_END_METHODDEF _SRE_SRE_MATCH_SPAN_METHODDEF + _SRE_SRE_MATCH_INDEX_METHODDEF + _SRE_SRE_MATCH_COUNT_METHODDEF _SRE_SRE_MATCH_GROUPS_METHODDEF _SRE_SRE_MATCH_GROUPDICT_METHODDEF _SRE_SRE_MATCH_EXPAND_METHODDEF @@ -3268,12 +3403,12 @@ static PyType_Slot match_slots[] = { {Py_tp_traverse, match_traverse}, {Py_tp_clear, match_clear}, - /* As mapping. - * - * Match objects do not support length or assignment, but do support - * __getitem__. - */ - {Py_mp_subscript, match_getitem}, + // Sequence protocol + {Py_sq_length, match_length}, + {Py_sq_item, match_item}, + + // Support group names provided as subscripts + {Py_mp_subscript, match_subscript}, {0, NULL}, }; @@ -3282,7 +3417,7 @@ static PyType_Spec match_spec = { .name = "re.Match", .basicsize = sizeof(MatchObject), .itemsize = sizeof(Py_ssize_t), - .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE | + .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_SEQUENCE | Py_TPFLAGS_IMMUTABLETYPE | Py_TPFLAGS_DISALLOW_INSTANTIATION | Py_TPFLAGS_HAVE_GC), .slots = match_slots, };