From 8c4d75f48ebbef10623db9313dd6e09d5ead46d4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?=
 <10796600+picnixz@users.noreply.github.com>
Date: Fri, 6 Dec 2024 11:22:37 +0100
Subject: [PATCH 1/9] fix `codecs.xmlcharrefreplace_errors` handler

---
 Lib/test/test_capi/test_codecs.py |   7 +-
 Python/codecs.c                   | 193 ++++++++++++++++--------------
 2 files changed, 109 insertions(+), 91 deletions(-)

diff --git a/Lib/test/test_capi/test_codecs.py b/Lib/test/test_capi/test_codecs.py
index a557e35e68915d..3e79dd2f7ca2fa 100644
--- a/Lib/test/test_capi/test_codecs.py
+++ b/Lib/test/test_capi/test_codecs.py
@@ -843,7 +843,8 @@ def test_codec_replace_errors_handler(self):
 
     def test_codec_xmlcharrefreplace_errors_handler(self):
         handler = _testcapi.codec_xmlcharrefreplace_errors
-        self.do_test_codec_errors_handler(handler, self.unicode_encode_errors)
+        self.do_test_codec_errors_handler(handler, self.unicode_encode_errors,
+                                          safe=True)
 
     def test_codec_backslashreplace_errors_handler(self):
         handler = _testcapi.codec_backslashreplace_errors
@@ -853,12 +854,12 @@ def test_codec_namereplace_errors_handler(self):
         handler = _testlimitedcapi.codec_namereplace_errors
         self.do_test_codec_errors_handler(handler, self.unicode_encode_errors)
 
-    def do_test_codec_errors_handler(self, handler, exceptions):
+    def do_test_codec_errors_handler(self, handler, exceptions, *, safe=False):
         at_least_one = False
         for exc in exceptions:
             # See https://github.com/python/cpython/issues/123378 and related
             # discussion and issues for details.
-            if self._exception_may_crash(exc):
+            if not safe and self._exception_may_crash(exc):
                 continue
 
             at_least_one = True
diff --git a/Python/codecs.c b/Python/codecs.c
index 2cb3875db35058..fe4209f7579fd8 100644
--- a/Python/codecs.c
+++ b/Python/codecs.c
@@ -755,100 +755,117 @@ PyObject *PyCodec_ReplaceErrors(PyObject *exc)
 
 PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
 {
-    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
-        PyObject *restuple;
-        PyObject *object;
-        Py_ssize_t i;
-        Py_ssize_t start;
-        Py_ssize_t end;
-        PyObject *res;
-        Py_UCS1 *outp;
-        Py_ssize_t ressize;
-        Py_UCS4 ch;
-        if (PyUnicodeEncodeError_GetStart(exc, &start))
-            return NULL;
-        if (PyUnicodeEncodeError_GetEnd(exc, &end))
-            return NULL;
-        if (!(object = PyUnicodeEncodeError_GetObject(exc)))
-            return NULL;
-        if (end - start > PY_SSIZE_T_MAX / (2+7+1))
-            end = start + PY_SSIZE_T_MAX / (2+7+1);
-        for (i = start, ressize = 0; i < end; ++i) {
-            /* object is guaranteed to be "ready" */
-            ch = PyUnicode_READ_CHAR(object, i);
-            if (ch<10)
-                ressize += 2+1+1;
-            else if (ch<100)
-                ressize += 2+2+1;
-            else if (ch<1000)
-                ressize += 2+3+1;
-            else if (ch<10000)
-                ressize += 2+4+1;
-            else if (ch<100000)
-                ressize += 2+5+1;
-            else if (ch<1000000)
-                ressize += 2+6+1;
-            else
-                ressize += 2+7+1;
+    if (!PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
+        wrong_exception_type(exc);
+        return NULL;
+    }
+
+    Py_ssize_t start, end;
+    if (PyUnicodeEncodeError_GetStart(exc, &start)) {
+        return NULL;
+    }
+    if (PyUnicodeEncodeError_GetEnd(exc, &end)) {
+        return NULL;
+    }
+    if (end <= start) {
+        // gh-12337 will handle negative end or start (for now we crash)
+        return Py_BuildValue("(Nn)", Py_GetConstant(Py_CONSTANT_EMPTY_STR), end);
+    }
+
+    PyObject *obj = PyUnicodeEncodeError_GetObject(exc);
+    if (obj == NULL) {
+        return NULL;
+    }
+
+    if (end - start > PY_SSIZE_T_MAX / 10) {
+        end = start + PY_SSIZE_T_MAX / 10;
+    }
+
+    end = Py_MIN(end, PyUnicode_GET_LENGTH(obj));
+
+    Py_ssize_t ressize = 0;
+    for (Py_ssize_t i = start; i < end; ++i) {
+        /* object is guaranteed to be "ready" */
+        Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i);
+        // The number of characters that each character 'ch' contributes
+        // in the result is 2 + k + 1, where k = min{t >= 1 | 10^t > ch}.
+        if (ch < 10) {
+            ressize += 4;
         }
-        /* allocate replacement */
-        res = PyUnicode_New(ressize, 127);
-        if (res == NULL) {
-            Py_DECREF(object);
-            return NULL;
+        else if (ch < 100) {
+            ressize += 5;
         }
-        outp = PyUnicode_1BYTE_DATA(res);
-        /* generate replacement */
-        for (i = start; i < end; ++i) {
-            int digits;
-            int base;
-            ch = PyUnicode_READ_CHAR(object, i);
-            *outp++ = '&';
-            *outp++ = '#';
-            if (ch<10) {
-                digits = 1;
-                base = 1;
-            }
-            else if (ch<100) {
-                digits = 2;
-                base = 10;
-            }
-            else if (ch<1000) {
-                digits = 3;
-                base = 100;
-            }
-            else if (ch<10000) {
-                digits = 4;
-                base = 1000;
-            }
-            else if (ch<100000) {
-                digits = 5;
-                base = 10000;
-            }
-            else if (ch<1000000) {
-                digits = 6;
-                base = 100000;
-            }
-            else {
-                digits = 7;
-                base = 1000000;
-            }
-            while (digits-->0) {
-                *outp++ = '0' + ch/base;
-                ch %= base;
-                base /= 10;
-            }
-            *outp++ = ';';
+        else if (ch < 1000) {
+            ressize += 6;
+        }
+        else if (ch < 10000) {
+            ressize += 7;
+        }
+        else if (ch < 100000) {
+            ressize += 8;
+        }
+        else if (ch < 1000000) {
+            ressize += 9;
+        }
+        else {
+            assert(ch < 10000000);
+            ressize += 10;
         }
-        assert(_PyUnicode_CheckConsistency(res, 1));
-        restuple = Py_BuildValue("(Nn)", res, end);
-        Py_DECREF(object);
-        return restuple;
     }
-    else {
-        wrong_exception_type(exc);
+
+    /* allocate replacement */
+    PyObject *res = PyUnicode_New(ressize, 127);
+    if (res == NULL) {
+        Py_DECREF(obj);
         return NULL;
     }
+    Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res);
+    /* generate replacement */
+    for (Py_ssize_t i = start; i < end; ++i) {
+        int digits, base;
+        Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i);
+        if (ch < 10) {
+            digits = 1;
+            base = 1;
+        }
+        else if (ch < 100) {
+            digits = 2;
+            base = 10;
+        }
+        else if (ch < 1000) {
+            digits = 3;
+            base = 100;
+        }
+        else if (ch < 10000) {
+            digits = 4;
+            base = 1000;
+        }
+        else if (ch < 100000) {
+            digits = 5;
+            base = 10000;
+        }
+        else if (ch < 1000000) {
+            digits = 6;
+            base = 100000;
+        }
+        else {
+            assert(ch < 10000000);
+            digits = 7;
+            base = 1000000;
+        }
+        *outp++ = '&';
+        *outp++ = '#';
+        while (digits-- > 0) {
+            *outp++ = '0' + ch / base;
+            ch %= base;
+            base /= 10;
+        }
+        *outp++ = ';';
+    }
+    assert(_PyUnicode_CheckConsistency(res, 1));
+    PyObject *restuple = Py_BuildValue("(Nn)", res, end);
+    Py_DECREF(obj);
+    return restuple;
 }
 
 PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)

From 4bee6352c2101d891ce0af560370862f523d3413 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?=
 <10796600+picnixz@users.noreply.github.com>
Date: Fri, 6 Dec 2024 11:22:38 +0100
Subject: [PATCH 2/9] blurb

---
 .../2024-12-06-11-17-46.gh-issue-126004.-p8MAS.rst             | 3 +++
 1 file changed, 3 insertions(+)
 create mode 100644 Misc/NEWS.d/next/Core_and_Builtins/2024-12-06-11-17-46.gh-issue-126004.-p8MAS.rst

diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2024-12-06-11-17-46.gh-issue-126004.-p8MAS.rst b/Misc/NEWS.d/next/Core_and_Builtins/2024-12-06-11-17-46.gh-issue-126004.-p8MAS.rst
new file mode 100644
index 00000000000000..60b1c5d8b80793
--- /dev/null
+++ b/Misc/NEWS.d/next/Core_and_Builtins/2024-12-06-11-17-46.gh-issue-126004.-p8MAS.rst
@@ -0,0 +1,3 @@
+Fix handling of :attr:`UnicodeError.start` and :attr:`UnicodeError.end`
+values in the :func:`codecs.xmlcharrefreplace_errors` error handler.
+Patch by Bénédikt Tran.

From 3820ecad928b003b5358245a469823b5af62370f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?=
 <10796600+picnixz@users.noreply.github.com>
Date: Fri, 6 Dec 2024 14:56:23 +0100
Subject: [PATCH 3/9] cosmetic changes

---
 Python/codecs.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Python/codecs.c b/Python/codecs.c
index fe4209f7579fd8..7dd77da7e892e2 100644
--- a/Python/codecs.c
+++ b/Python/codecs.c
@@ -761,10 +761,10 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
     }
 
     Py_ssize_t start, end;
-    if (PyUnicodeEncodeError_GetStart(exc, &start)) {
+    if (PyUnicodeEncodeError_GetStart(exc, &start) < 0) {
         return NULL;
     }
-    if (PyUnicodeEncodeError_GetEnd(exc, &end)) {
+    if (PyUnicodeEncodeError_GetEnd(exc, &end) < 0) {
         return NULL;
     }
     if (end <= start) {

From e178ddfb63053bf6b8d37314e3cfbfba30113960 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?=
 <10796600+picnixz@users.noreply.github.com>
Date: Fri, 6 Dec 2024 17:12:10 +0100
Subject: [PATCH 4/9] address Petr's review

---
 Python/codecs.c | 26 ++++++++++++++------------
 1 file changed, 14 insertions(+), 12 deletions(-)

diff --git a/Python/codecs.c b/Python/codecs.c
index 7dd77da7e892e2..233515366213d1 100644
--- a/Python/codecs.c
+++ b/Python/codecs.c
@@ -768,7 +768,6 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
         return NULL;
     }
     if (end <= start) {
-        // gh-12337 will handle negative end or start (for now we crash)
         return Py_BuildValue("(Nn)", Py_GetConstant(Py_CONSTANT_EMPTY_STR), end);
     }
 
@@ -777,8 +776,13 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
         return NULL;
     }
 
-    if (end - start > PY_SSIZE_T_MAX / 10) {
-        end = start + PY_SSIZE_T_MAX / 10;
+    // The number of characters that each character 'ch' contributes
+    // in the result is 2 + k + 1, where k = min{t >= 1 | 10^t > ch}
+    // and will be formatted as "&#" + DIGITS + ";". Since the Unicode
+    // range is below 10^7, each "block" requires at most 2 + 7 + 1
+    // characters.
+    if (end - start > PY_SSIZE_T_MAX / (2 + 7 + 1)) {
+        end = start + PY_SSIZE_T_MAX / (2 + 7 + 1);
     }
 
     end = Py_MIN(end, PyUnicode_GET_LENGTH(obj));
@@ -787,29 +791,27 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
     for (Py_ssize_t i = start; i < end; ++i) {
         /* object is guaranteed to be "ready" */
         Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i);
-        // The number of characters that each character 'ch' contributes
-        // in the result is 2 + k + 1, where k = min{t >= 1 | 10^t > ch}.
         if (ch < 10) {
-            ressize += 4;
+            ressize += 2 + 1 + 1;
         }
         else if (ch < 100) {
-            ressize += 5;
+            ressize += 2 + 2 + 1;
         }
         else if (ch < 1000) {
-            ressize += 6;
+            ressize += 2 + 3 + 1;
         }
         else if (ch < 10000) {
-            ressize += 7;
+            ressize += 2 + 4 + 1;
         }
         else if (ch < 100000) {
-            ressize += 8;
+            ressize += 2 + 5 + 1;
         }
         else if (ch < 1000000) {
-            ressize += 9;
+            ressize += 2 + 6 + 1;
         }
         else {
             assert(ch < 10000000);
-            ressize += 10;
+            ressize += 2 + 7 + 1;
         }
     }
 

From e3cec4f423f43b264855460a45d2ded827cbb114 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?=
 <10796600+picnixz@users.noreply.github.com>
Date: Fri, 3 Jan 2025 14:16:06 +0100
Subject: [PATCH 5/9] use internal `_PyUnicodeError_GetParams` helper

---
 Python/codecs.c | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

diff --git a/Python/codecs.c b/Python/codecs.c
index 233515366213d1..1aed51254f3c6e 100644
--- a/Python/codecs.c
+++ b/Python/codecs.c
@@ -760,22 +760,16 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
         return NULL;
     }
 
-    Py_ssize_t start, end;
-    if (PyUnicodeEncodeError_GetStart(exc, &start) < 0) {
-        return NULL;
-    }
-    if (PyUnicodeEncodeError_GetEnd(exc, &end) < 0) {
+    PyObject *obj;
+    Py_ssize_t objlen, start, end;
+    if (_PyUnicodeError_GetParams(exc, &obj, &objlen, &start, &end, false) < 0) {
         return NULL;
     }
     if (end <= start) {
+        Py_DECREF(obj);
         return Py_BuildValue("(Nn)", Py_GetConstant(Py_CONSTANT_EMPTY_STR), end);
     }
 
-    PyObject *obj = PyUnicodeEncodeError_GetObject(exc);
-    if (obj == NULL) {
-        return NULL;
-    }
-
     // The number of characters that each character 'ch' contributes
     // in the result is 2 + k + 1, where k = min{t >= 1 | 10^t > ch}
     // and will be formatted as "&#" + DIGITS + ";". Since the Unicode
@@ -785,7 +779,7 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
         end = start + PY_SSIZE_T_MAX / (2 + 7 + 1);
     }
 
-    end = Py_MIN(end, PyUnicode_GET_LENGTH(obj));
+    end = Py_MIN(end, objlen);
 
     Py_ssize_t ressize = 0;
     for (Py_ssize_t i = start; i < end; ++i) {

From b1ba1097479edd776958ecfd2544a2611706730d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?=
 <10796600+picnixz@users.noreply.github.com>
Date: Tue, 21 Jan 2025 12:58:48 +0100
Subject: [PATCH 6/9] update usages of `_PyUnicodeError_GetParams`

---
 Python/codecs.c | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/Python/codecs.c b/Python/codecs.c
index 1aed51254f3c6e..627f0426412f4e 100644
--- a/Python/codecs.c
+++ b/Python/codecs.c
@@ -753,7 +753,8 @@ PyObject *PyCodec_ReplaceErrors(PyObject *exc)
     }
 }
 
-PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
+PyObject *
+PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
 {
     if (!PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
         wrong_exception_type(exc);
@@ -761,24 +762,22 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
     }
 
     PyObject *obj;
-    Py_ssize_t objlen, start, end;
-    if (_PyUnicodeError_GetParams(exc, &obj, &objlen, &start, &end, false) < 0) {
+    Py_ssize_t objlen, start, end, slen;
+    if (_PyUnicodeError_GetParams(exc,
+                                  &obj, &objlen,
+                                  &start, &end, &slen, false) < 0)
+    {
         return NULL;
     }
-    if (end <= start) {
-        Py_DECREF(obj);
-        return Py_BuildValue("(Nn)", Py_GetConstant(Py_CONSTANT_EMPTY_STR), end);
-    }
 
     // The number of characters that each character 'ch' contributes
     // in the result is 2 + k + 1, where k = min{t >= 1 | 10^t > ch}
     // and will be formatted as "&#" + DIGITS + ";". Since the Unicode
     // range is below 10^7, each "block" requires at most 2 + 7 + 1
     // characters.
-    if (end - start > PY_SSIZE_T_MAX / (2 + 7 + 1)) {
+    if (slen > PY_SSIZE_T_MAX / (2 + 7 + 1)) {
         end = start + PY_SSIZE_T_MAX / (2 + 7 + 1);
     }
-
     end = Py_MIN(end, objlen);
 
     Py_ssize_t ressize = 0;

From debf20fb2a07fe0f15e6234ed462842a3c8d3f88 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?=
 <10796600+picnixz@users.noreply.github.com>
Date: Tue, 21 Jan 2025 13:00:36 +0100
Subject: [PATCH 7/9] amend some cosmetic changes to be consistent

---
 Python/codecs.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/Python/codecs.c b/Python/codecs.c
index 627f0426412f4e..e8710bf011c1e6 100644
--- a/Python/codecs.c
+++ b/Python/codecs.c
@@ -753,8 +753,7 @@ PyObject *PyCodec_ReplaceErrors(PyObject *exc)
     }
 }
 
-PyObject *
-PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
+PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
 {
     if (!PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
         wrong_exception_type(exc);

From 2ac9464d5e69771632f5709b9a0f6b77e6fbe4b6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?=
 <10796600+picnixz@users.noreply.github.com>
Date: Tue, 21 Jan 2025 15:28:08 +0100
Subject: [PATCH 8/9] fix bounds

---
 Python/codecs.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/Python/codecs.c b/Python/codecs.c
index e8710bf011c1e6..3f09a6dc137989 100644
--- a/Python/codecs.c
+++ b/Python/codecs.c
@@ -776,8 +776,9 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
     // characters.
     if (slen > PY_SSIZE_T_MAX / (2 + 7 + 1)) {
         end = start + PY_SSIZE_T_MAX / (2 + 7 + 1);
+        end = Py_MIN(end, objlen);
+        slen = Py_MAX(0, end - start);
     }
-    end = Py_MIN(end, objlen);
 
     Py_ssize_t ressize = 0;
     for (Py_ssize_t i = start; i < end; ++i) {

From 50463586ff6d4738a788a0e8c4b5c40a44b35d91 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?=
 <10796600+picnixz@users.noreply.github.com>
Date: Wed, 22 Jan 2025 14:11:57 +0100
Subject: [PATCH 9/9] add assertion per Victor's suggestion

---
 Python/codecs.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Python/codecs.c b/Python/codecs.c
index 3f09a6dc137989..11eaca175abf13 100644
--- a/Python/codecs.c
+++ b/Python/codecs.c
@@ -851,6 +851,7 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
         *outp++ = '&';
         *outp++ = '#';
         while (digits-- > 0) {
+            assert(base >= 1);
             *outp++ = '0' + ch / base;
             ch %= base;
             base /= 10;