From 8c4d75f48ebbef10623db9313dd6e09d5ead46d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Fri, 6 Dec 2024 11:22:37 +0100 Subject: [PATCH 1/9] fix `codecs.xmlcharrefreplace_errors` handler --- Lib/test/test_capi/test_codecs.py | 7 +- Python/codecs.c | 193 ++++++++++++++++-------------- 2 files changed, 109 insertions(+), 91 deletions(-) diff --git a/Lib/test/test_capi/test_codecs.py b/Lib/test/test_capi/test_codecs.py index a557e35e68915d..3e79dd2f7ca2fa 100644 --- a/Lib/test/test_capi/test_codecs.py +++ b/Lib/test/test_capi/test_codecs.py @@ -843,7 +843,8 @@ def test_codec_replace_errors_handler(self): def test_codec_xmlcharrefreplace_errors_handler(self): handler = _testcapi.codec_xmlcharrefreplace_errors - self.do_test_codec_errors_handler(handler, self.unicode_encode_errors) + self.do_test_codec_errors_handler(handler, self.unicode_encode_errors, + safe=True) def test_codec_backslashreplace_errors_handler(self): handler = _testcapi.codec_backslashreplace_errors @@ -853,12 +854,12 @@ def test_codec_namereplace_errors_handler(self): handler = _testlimitedcapi.codec_namereplace_errors self.do_test_codec_errors_handler(handler, self.unicode_encode_errors) - def do_test_codec_errors_handler(self, handler, exceptions): + def do_test_codec_errors_handler(self, handler, exceptions, *, safe=False): at_least_one = False for exc in exceptions: # See https://github.com/python/cpython/issues/123378 and related # discussion and issues for details. - if self._exception_may_crash(exc): + if not safe and self._exception_may_crash(exc): continue at_least_one = True diff --git a/Python/codecs.c b/Python/codecs.c index 2cb3875db35058..fe4209f7579fd8 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -755,100 +755,117 @@ PyObject *PyCodec_ReplaceErrors(PyObject *exc) PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc) { - if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) { - PyObject *restuple; - PyObject *object; - Py_ssize_t i; - Py_ssize_t start; - Py_ssize_t end; - PyObject *res; - Py_UCS1 *outp; - Py_ssize_t ressize; - Py_UCS4 ch; - if (PyUnicodeEncodeError_GetStart(exc, &start)) - return NULL; - if (PyUnicodeEncodeError_GetEnd(exc, &end)) - return NULL; - if (!(object = PyUnicodeEncodeError_GetObject(exc))) - return NULL; - if (end - start > PY_SSIZE_T_MAX / (2+7+1)) - end = start + PY_SSIZE_T_MAX / (2+7+1); - for (i = start, ressize = 0; i < end; ++i) { - /* object is guaranteed to be "ready" */ - ch = PyUnicode_READ_CHAR(object, i); - if (ch<10) - ressize += 2+1+1; - else if (ch<100) - ressize += 2+2+1; - else if (ch<1000) - ressize += 2+3+1; - else if (ch<10000) - ressize += 2+4+1; - else if (ch<100000) - ressize += 2+5+1; - else if (ch<1000000) - ressize += 2+6+1; - else - ressize += 2+7+1; + if (!PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) { + wrong_exception_type(exc); + return NULL; + } + + Py_ssize_t start, end; + if (PyUnicodeEncodeError_GetStart(exc, &start)) { + return NULL; + } + if (PyUnicodeEncodeError_GetEnd(exc, &end)) { + return NULL; + } + if (end <= start) { + // gh-12337 will handle negative end or start (for now we crash) + return Py_BuildValue("(Nn)", Py_GetConstant(Py_CONSTANT_EMPTY_STR), end); + } + + PyObject *obj = PyUnicodeEncodeError_GetObject(exc); + if (obj == NULL) { + return NULL; + } + + if (end - start > PY_SSIZE_T_MAX / 10) { + end = start + PY_SSIZE_T_MAX / 10; + } + + end = Py_MIN(end, PyUnicode_GET_LENGTH(obj)); + + Py_ssize_t ressize = 0; + for (Py_ssize_t i = start; i < end; ++i) { + /* object is guaranteed to be "ready" */ + Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i); + // The number of characters that each character 'ch' contributes + // in the result is 2 + k + 1, where k = min{t >= 1 | 10^t > ch}. + if (ch < 10) { + ressize += 4; } - /* allocate replacement */ - res = PyUnicode_New(ressize, 127); - if (res == NULL) { - Py_DECREF(object); - return NULL; + else if (ch < 100) { + ressize += 5; } - outp = PyUnicode_1BYTE_DATA(res); - /* generate replacement */ - for (i = start; i < end; ++i) { - int digits; - int base; - ch = PyUnicode_READ_CHAR(object, i); - *outp++ = '&'; - *outp++ = '#'; - if (ch<10) { - digits = 1; - base = 1; - } - else if (ch<100) { - digits = 2; - base = 10; - } - else if (ch<1000) { - digits = 3; - base = 100; - } - else if (ch<10000) { - digits = 4; - base = 1000; - } - else if (ch<100000) { - digits = 5; - base = 10000; - } - else if (ch<1000000) { - digits = 6; - base = 100000; - } - else { - digits = 7; - base = 1000000; - } - while (digits-->0) { - *outp++ = '0' + ch/base; - ch %= base; - base /= 10; - } - *outp++ = ';'; + else if (ch < 1000) { + ressize += 6; + } + else if (ch < 10000) { + ressize += 7; + } + else if (ch < 100000) { + ressize += 8; + } + else if (ch < 1000000) { + ressize += 9; + } + else { + assert(ch < 10000000); + ressize += 10; } - assert(_PyUnicode_CheckConsistency(res, 1)); - restuple = Py_BuildValue("(Nn)", res, end); - Py_DECREF(object); - return restuple; } - else { - wrong_exception_type(exc); + + /* allocate replacement */ + PyObject *res = PyUnicode_New(ressize, 127); + if (res == NULL) { + Py_DECREF(obj); return NULL; } + Py_UCS1 *outp = PyUnicode_1BYTE_DATA(res); + /* generate replacement */ + for (Py_ssize_t i = start; i < end; ++i) { + int digits, base; + Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i); + if (ch < 10) { + digits = 1; + base = 1; + } + else if (ch < 100) { + digits = 2; + base = 10; + } + else if (ch < 1000) { + digits = 3; + base = 100; + } + else if (ch < 10000) { + digits = 4; + base = 1000; + } + else if (ch < 100000) { + digits = 5; + base = 10000; + } + else if (ch < 1000000) { + digits = 6; + base = 100000; + } + else { + assert(ch < 10000000); + digits = 7; + base = 1000000; + } + *outp++ = '&'; + *outp++ = '#'; + while (digits-- > 0) { + *outp++ = '0' + ch / base; + ch %= base; + base /= 10; + } + *outp++ = ';'; + } + assert(_PyUnicode_CheckConsistency(res, 1)); + PyObject *restuple = Py_BuildValue("(Nn)", res, end); + Py_DECREF(obj); + return restuple; } PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc) From 4bee6352c2101d891ce0af560370862f523d3413 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Fri, 6 Dec 2024 11:22:38 +0100 Subject: [PATCH 2/9] blurb --- .../2024-12-06-11-17-46.gh-issue-126004.-p8MAS.rst | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 Misc/NEWS.d/next/Core_and_Builtins/2024-12-06-11-17-46.gh-issue-126004.-p8MAS.rst diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2024-12-06-11-17-46.gh-issue-126004.-p8MAS.rst b/Misc/NEWS.d/next/Core_and_Builtins/2024-12-06-11-17-46.gh-issue-126004.-p8MAS.rst new file mode 100644 index 00000000000000..60b1c5d8b80793 --- /dev/null +++ b/Misc/NEWS.d/next/Core_and_Builtins/2024-12-06-11-17-46.gh-issue-126004.-p8MAS.rst @@ -0,0 +1,3 @@ +Fix handling of :attr:`UnicodeError.start` and :attr:`UnicodeError.end` +values in the :func:`codecs.xmlcharrefreplace_errors` error handler. +Patch by Bénédikt Tran. From 3820ecad928b003b5358245a469823b5af62370f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Fri, 6 Dec 2024 14:56:23 +0100 Subject: [PATCH 3/9] cosmetic changes --- Python/codecs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Python/codecs.c b/Python/codecs.c index fe4209f7579fd8..7dd77da7e892e2 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -761,10 +761,10 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc) } Py_ssize_t start, end; - if (PyUnicodeEncodeError_GetStart(exc, &start)) { + if (PyUnicodeEncodeError_GetStart(exc, &start) < 0) { return NULL; } - if (PyUnicodeEncodeError_GetEnd(exc, &end)) { + if (PyUnicodeEncodeError_GetEnd(exc, &end) < 0) { return NULL; } if (end <= start) { From e178ddfb63053bf6b8d37314e3cfbfba30113960 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Fri, 6 Dec 2024 17:12:10 +0100 Subject: [PATCH 4/9] address Petr's review --- Python/codecs.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/Python/codecs.c b/Python/codecs.c index 7dd77da7e892e2..233515366213d1 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -768,7 +768,6 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc) return NULL; } if (end <= start) { - // gh-12337 will handle negative end or start (for now we crash) return Py_BuildValue("(Nn)", Py_GetConstant(Py_CONSTANT_EMPTY_STR), end); } @@ -777,8 +776,13 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc) return NULL; } - if (end - start > PY_SSIZE_T_MAX / 10) { - end = start + PY_SSIZE_T_MAX / 10; + // The number of characters that each character 'ch' contributes + // in the result is 2 + k + 1, where k = min{t >= 1 | 10^t > ch} + // and will be formatted as "&#" + DIGITS + ";". Since the Unicode + // range is below 10^7, each "block" requires at most 2 + 7 + 1 + // characters. + if (end - start > PY_SSIZE_T_MAX / (2 + 7 + 1)) { + end = start + PY_SSIZE_T_MAX / (2 + 7 + 1); } end = Py_MIN(end, PyUnicode_GET_LENGTH(obj)); @@ -787,29 +791,27 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc) for (Py_ssize_t i = start; i < end; ++i) { /* object is guaranteed to be "ready" */ Py_UCS4 ch = PyUnicode_READ_CHAR(obj, i); - // The number of characters that each character 'ch' contributes - // in the result is 2 + k + 1, where k = min{t >= 1 | 10^t > ch}. if (ch < 10) { - ressize += 4; + ressize += 2 + 1 + 1; } else if (ch < 100) { - ressize += 5; + ressize += 2 + 2 + 1; } else if (ch < 1000) { - ressize += 6; + ressize += 2 + 3 + 1; } else if (ch < 10000) { - ressize += 7; + ressize += 2 + 4 + 1; } else if (ch < 100000) { - ressize += 8; + ressize += 2 + 5 + 1; } else if (ch < 1000000) { - ressize += 9; + ressize += 2 + 6 + 1; } else { assert(ch < 10000000); - ressize += 10; + ressize += 2 + 7 + 1; } } From e3cec4f423f43b264855460a45d2ded827cbb114 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Fri, 3 Jan 2025 14:16:06 +0100 Subject: [PATCH 5/9] use internal `_PyUnicodeError_GetParams` helper --- Python/codecs.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/Python/codecs.c b/Python/codecs.c index 233515366213d1..1aed51254f3c6e 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -760,22 +760,16 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc) return NULL; } - Py_ssize_t start, end; - if (PyUnicodeEncodeError_GetStart(exc, &start) < 0) { - return NULL; - } - if (PyUnicodeEncodeError_GetEnd(exc, &end) < 0) { + PyObject *obj; + Py_ssize_t objlen, start, end; + if (_PyUnicodeError_GetParams(exc, &obj, &objlen, &start, &end, false) < 0) { return NULL; } if (end <= start) { + Py_DECREF(obj); return Py_BuildValue("(Nn)", Py_GetConstant(Py_CONSTANT_EMPTY_STR), end); } - PyObject *obj = PyUnicodeEncodeError_GetObject(exc); - if (obj == NULL) { - return NULL; - } - // The number of characters that each character 'ch' contributes // in the result is 2 + k + 1, where k = min{t >= 1 | 10^t > ch} // and will be formatted as "&#" + DIGITS + ";". Since the Unicode @@ -785,7 +779,7 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc) end = start + PY_SSIZE_T_MAX / (2 + 7 + 1); } - end = Py_MIN(end, PyUnicode_GET_LENGTH(obj)); + end = Py_MIN(end, objlen); Py_ssize_t ressize = 0; for (Py_ssize_t i = start; i < end; ++i) { From b1ba1097479edd776958ecfd2544a2611706730d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Tue, 21 Jan 2025 12:58:48 +0100 Subject: [PATCH 6/9] update usages of `_PyUnicodeError_GetParams` --- Python/codecs.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/Python/codecs.c b/Python/codecs.c index 1aed51254f3c6e..627f0426412f4e 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -753,7 +753,8 @@ PyObject *PyCodec_ReplaceErrors(PyObject *exc) } } -PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc) +PyObject * +PyCodec_XMLCharRefReplaceErrors(PyObject *exc) { if (!PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) { wrong_exception_type(exc); @@ -761,24 +762,22 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc) } PyObject *obj; - Py_ssize_t objlen, start, end; - if (_PyUnicodeError_GetParams(exc, &obj, &objlen, &start, &end, false) < 0) { + Py_ssize_t objlen, start, end, slen; + if (_PyUnicodeError_GetParams(exc, + &obj, &objlen, + &start, &end, &slen, false) < 0) + { return NULL; } - if (end <= start) { - Py_DECREF(obj); - return Py_BuildValue("(Nn)", Py_GetConstant(Py_CONSTANT_EMPTY_STR), end); - } // The number of characters that each character 'ch' contributes // in the result is 2 + k + 1, where k = min{t >= 1 | 10^t > ch} // and will be formatted as "&#" + DIGITS + ";". Since the Unicode // range is below 10^7, each "block" requires at most 2 + 7 + 1 // characters. - if (end - start > PY_SSIZE_T_MAX / (2 + 7 + 1)) { + if (slen > PY_SSIZE_T_MAX / (2 + 7 + 1)) { end = start + PY_SSIZE_T_MAX / (2 + 7 + 1); } - end = Py_MIN(end, objlen); Py_ssize_t ressize = 0; From debf20fb2a07fe0f15e6234ed462842a3c8d3f88 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Tue, 21 Jan 2025 13:00:36 +0100 Subject: [PATCH 7/9] amend some cosmetic changes to be consistent --- Python/codecs.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Python/codecs.c b/Python/codecs.c index 627f0426412f4e..e8710bf011c1e6 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -753,8 +753,7 @@ PyObject *PyCodec_ReplaceErrors(PyObject *exc) } } -PyObject * -PyCodec_XMLCharRefReplaceErrors(PyObject *exc) +PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc) { if (!PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) { wrong_exception_type(exc); From 2ac9464d5e69771632f5709b9a0f6b77e6fbe4b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Tue, 21 Jan 2025 15:28:08 +0100 Subject: [PATCH 8/9] fix bounds --- Python/codecs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Python/codecs.c b/Python/codecs.c index e8710bf011c1e6..3f09a6dc137989 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -776,8 +776,9 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc) // characters. if (slen > PY_SSIZE_T_MAX / (2 + 7 + 1)) { end = start + PY_SSIZE_T_MAX / (2 + 7 + 1); + end = Py_MIN(end, objlen); + slen = Py_MAX(0, end - start); } - end = Py_MIN(end, objlen); Py_ssize_t ressize = 0; for (Py_ssize_t i = start; i < end; ++i) { From 50463586ff6d4738a788a0e8c4b5c40a44b35d91 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Wed, 22 Jan 2025 14:11:57 +0100 Subject: [PATCH 9/9] add assertion per Victor's suggestion --- Python/codecs.c | 1 + 1 file changed, 1 insertion(+) diff --git a/Python/codecs.c b/Python/codecs.c index 3f09a6dc137989..11eaca175abf13 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -851,6 +851,7 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc) *outp++ = '&'; *outp++ = '#'; while (digits-- > 0) { + assert(base >= 1); *outp++ = '0' + ch / base; ch %= base; base /= 10;