From 2b0bf5bd7b93d480a1c2634059101a6265913f81 Mon Sep 17 00:00:00 2001 From: Tamas Hegedus Date: Tue, 17 Oct 2023 23:02:02 +0200 Subject: [PATCH 1/4] Fix UTF-8 sequence boundary search --- Modules/_io/winconsoleio.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/Modules/_io/winconsoleio.c b/Modules/_io/winconsoleio.c index 50b8818aad410b..7d7cfe9f5ff50f 100644 --- a/Modules/_io/winconsoleio.c +++ b/Modules/_io/winconsoleio.c @@ -134,6 +134,18 @@ char _PyIO_get_console_type(PyObject *path_or_fd) { return m; } +DWORD _find_last_utf8_boundary(char *buf, DWORD len) { + /* This function never returns 0, returns the original len instead */ + DWORD count = 1; + if (len == 0 || (buf[len - 1] & 0x80) == 0) + return len; + for (;; count++) { + if (count > 3 || count >= len) + return len; + if ((buf[len - count] & 0xc0) != 0x80) + return len - count; + } +} /*[clinic input] module _io @@ -975,7 +987,7 @@ _io__WindowsConsoleIO_write_impl(winconsoleio *self, PyTypeObject *cls, { BOOL res = TRUE; wchar_t *wbuf; - DWORD len, wlen, orig_len, n = 0; + DWORD len, wlen, n = 0; HANDLE handle; if (self->fd == -1) @@ -1007,21 +1019,8 @@ _io__WindowsConsoleIO_write_impl(winconsoleio *self, PyTypeObject *cls, have to reduce and recalculate. */ while (wlen > 32766 / sizeof(wchar_t)) { len /= 2; - orig_len = len; - /* Reduce the length until we hit the final byte of a UTF-8 sequence - * (top bit is unset). Fix for github issue 82052. - */ - while (len > 0 && (((char *)b->buf)[len-1] & 0x80) != 0) - --len; - /* If we hit a length of 0, something has gone wrong. This shouldn't - * be possible, as valid UTF-8 can have at most 3 non-final bytes - * before a final one, and our buffer is way longer than that. - * But to be on the safe side, if we hit this issue we just restore - * the original length and let the console API sort it out. - */ - if (len == 0) { - len = orig_len; - } + /* Fix for github issues gh-110913 and gh-82052. */ + len = _find_last_utf8_boundary(b->buf, len); wlen = MultiByteToWideChar(CP_UTF8, 0, b->buf, len, NULL, 0); } Py_END_ALLOW_THREADS From fae514784f2b25991ba8b6a0136d0c12906f8012 Mon Sep 17 00:00:00 2001 From: "blurb-it[bot]" <43283697+blurb-it[bot]@users.noreply.github.com> Date: Thu, 19 Oct 2023 21:46:19 +0000 Subject: [PATCH 2/4] =?UTF-8?q?=F0=9F=93=9C=F0=9F=A4=96=20Added=20by=20blu?= =?UTF-8?q?rb=5Fit.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../next/Windows/2023-10-19-21-46-18.gh-issue-110913.CWlPfg.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 Misc/NEWS.d/next/Windows/2023-10-19-21-46-18.gh-issue-110913.CWlPfg.rst diff --git a/Misc/NEWS.d/next/Windows/2023-10-19-21-46-18.gh-issue-110913.CWlPfg.rst b/Misc/NEWS.d/next/Windows/2023-10-19-21-46-18.gh-issue-110913.CWlPfg.rst new file mode 100644 index 00000000000000..faf23f7ba17269 --- /dev/null +++ b/Misc/NEWS.d/next/Windows/2023-10-19-21-46-18.gh-issue-110913.CWlPfg.rst @@ -0,0 +1 @@ +WindowsConsoleIO now correctly chunks large buffers without splitting up UTF-8 sequences From cd641bfa999f6061b4159daaee1d96f5f66b05ed Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Fri, 20 Oct 2023 00:51:15 +0300 Subject: [PATCH 3/4] Style fix. --- Modules/_io/winconsoleio.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/Modules/_io/winconsoleio.c b/Modules/_io/winconsoleio.c index 7d7cfe9f5ff50f..6680488b740cfc 100644 --- a/Modules/_io/winconsoleio.c +++ b/Modules/_io/winconsoleio.c @@ -134,16 +134,21 @@ char _PyIO_get_console_type(PyObject *path_or_fd) { return m; } -DWORD _find_last_utf8_boundary(char *buf, DWORD len) { +static DWORD +_find_last_utf8_boundary(const char *buf, DWORD len) +{ /* This function never returns 0, returns the original len instead */ DWORD count = 1; - if (len == 0 || (buf[len - 1] & 0x80) == 0) + if (len == 0 || (buf[len - 1] & 0x80) == 0) { return len; + } for (;; count++) { - if (count > 3 || count >= len) + if (count > 3 || count >= len) { return len; - if ((buf[len - count] & 0xc0) != 0x80) + } + if ((buf[len - count] & 0xc0) != 0x80) { return len - count; + } } } From 0c8ca337033770baa6f40b656d21fc89ff65bdcf Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Fri, 20 Oct 2023 00:53:18 +0300 Subject: [PATCH 4/4] Period. --- .../next/Windows/2023-10-19-21-46-18.gh-issue-110913.CWlPfg.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Misc/NEWS.d/next/Windows/2023-10-19-21-46-18.gh-issue-110913.CWlPfg.rst b/Misc/NEWS.d/next/Windows/2023-10-19-21-46-18.gh-issue-110913.CWlPfg.rst index faf23f7ba17269..d4c1b56d98ef0e 100644 --- a/Misc/NEWS.d/next/Windows/2023-10-19-21-46-18.gh-issue-110913.CWlPfg.rst +++ b/Misc/NEWS.d/next/Windows/2023-10-19-21-46-18.gh-issue-110913.CWlPfg.rst @@ -1 +1 @@ -WindowsConsoleIO now correctly chunks large buffers without splitting up UTF-8 sequences +WindowsConsoleIO now correctly chunks large buffers without splitting up UTF-8 sequences.