From e3afcb8440d0c7d41339fad4decafc587ba647c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20=C4=8Ciha=C5=99?= Date: Thu, 27 Feb 2025 08:56:40 +0100 Subject: [PATCH] gh-130567: fix strxfrm memory allocation The POSIX specification does not define that wcsxfrm should return the needed buffer size; it just says: If the value returned is n or more, the contents of the array pointed to by ws1 are unspecified. Therefore double the allocation when the original call has failed and repeat that until it works. --- Lib/test/test_locale.py | 11 +++++++++ Modules/_localemodule.c | 55 ++++++++++++++++++++++++----------------- 2 files changed, 43 insertions(+), 23 deletions(-) diff --git a/Lib/test/test_locale.py b/Lib/test/test_locale.py index c025ed4108fb58..c3e5b4312aa469 100644 --- a/Lib/test/test_locale.py +++ b/Lib/test/test_locale.py @@ -371,6 +371,17 @@ def test_strcoll_with_diacritic(self): def test_strxfrm_with_diacritic(self): self.assertLess(locale.strxfrm('à'), locale.strxfrm('b')) + @unittest.skipIf(sys.platform.startswith('aix'), + 'bpo-29972: broken test on AIX') + @unittest.skipIf( + is_emscripten or is_wasi, + "musl libc issue on Emscripten/WASI, bpo-46390" + ) + @unittest.skipIf(sys.platform.startswith("netbsd"), + "gh-124108: NetBSD doesn't support UTF-8 for LC_COLLATE") + def test_strxfrm_non_latin_1(self): + self.assertLess(locale.strxfrm('s'), locale.strxfrm('š')) + class NormalizeTest(unittest.TestCase): def check(self, localename, expected): diff --git a/Modules/_localemodule.c b/Modules/_localemodule.c index ad618398d5b824..f91fc3fe8e0169 100644 --- a/Modules/_localemodule.c +++ b/Modules/_localemodule.c @@ -394,49 +394,58 @@ static PyObject * _locale_strxfrm_impl(PyObject *module, PyObject *str) /*[clinic end generated code: output=3081866ebffc01af input=1378bbe6a88b4780]*/ { - Py_ssize_t n1; + Py_ssize_t buf_len; wchar_t *s = NULL, *buf = NULL; - size_t n2; + size_t xfrm_result; PyObject *result = NULL; - s = PyUnicode_AsWideCharString(str, &n1); + s = 
PyUnicode_AsWideCharString(str, &buf_len); if (s == NULL) goto exit; - if (wcslen(s) != (size_t)n1) { + if (wcslen(s) != (size_t)buf_len) { PyErr_SetString(PyExc_ValueError, "embedded null character"); goto exit; } /* assume no change in size, first */ - n1 = n1 + 1; - buf = PyMem_New(wchar_t, n1); + buf_len = buf_len + 1; + buf = PyMem_New(wchar_t, buf_len); if (!buf) { PyErr_NoMemory(); goto exit; } - errno = 0; - n2 = wcsxfrm(buf, s, n1); - if (errno && errno != ERANGE) { - PyErr_SetFromErrno(PyExc_OSError); - goto exit; - } - if (n2 >= (size_t)n1) { - /* more space needed */ - wchar_t * new_buf = PyMem_Realloc(buf, (n2+1)*sizeof(wchar_t)); + for (;;) { + errno = 0; + xfrm_result = wcsxfrm(buf, s, buf_len); + if (errno && errno != ERANGE) { + PyErr_SetFromErrno(PyExc_OSError); + break; + } + + if (xfrm_result < (size_t)buf_len) { + // wcsxfrm succeeded, return result + result = PyUnicode_FromWideChar(buf, xfrm_result); + break; + } + + if (xfrm_result > (size_t)buf_len) { + // Assume this is the desired buffer size + buf_len = xfrm_result + 1; + } else { + // Some platforms, such as macOS 15, don't return the desired + // buffer size, so it is up to the caller to figure out the needed + // buffer size (gh-130567). + buf_len = buf_len * 2; + } + + wchar_t * new_buf = PyMem_Realloc(buf, buf_len * sizeof(wchar_t)); if (!new_buf) { PyErr_NoMemory(); - goto exit; + break; } buf = new_buf; - errno = 0; - n2 = wcsxfrm(buf, s, n2+1); - if (errno) { - PyErr_SetFromErrno(PyExc_OSError); - goto exit; - } } - result = PyUnicode_FromWideChar(buf, n2); exit: PyMem_Free(buf); PyMem_Free(s);