From 2f48e5381900cf37c2b98a9cab51f91d718e7256 Mon Sep 17 00:00:00 2001 From: Paul Ganssle Date: Wed, 22 Aug 2018 12:15:43 -0400 Subject: [PATCH 1/7] Fix issue with non-UTF8 separator strings It is possible to pass a non-UTF-8 string as a separator in datetime.isoformat, but the current implementation starts by decoding to UTF-8, which will fail even for some valid strings. In the special case of non-UTF-8 separators, we take a performance hit by encoding the string as ASCII and replacing any invalid characters with ?. --- Modules/_datetimemodule.c | 43 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 40 insertions(+), 3 deletions(-) diff --git a/Modules/_datetimemodule.c b/Modules/_datetimemodule.c index 076912d58f4af8..7e9459b7eb642b 100644 --- a/Modules/_datetimemodule.c +++ b/Modules/_datetimemodule.c @@ -4848,8 +4848,33 @@ datetime_fromisoformat(PyObject* cls, PyObject *dtstr) { return NULL; } + const PyObject * dtstr_bytes = NULL; + unsigned char bytes_needs_decref = 0; + Py_ssize_t len; const char * dt_ptr = PyUnicode_AsUTF8AndSize(dtstr, &len); + + if (dt_ptr == NULL) { + len = PyUnicode_GET_LENGTH(dtstr); + if (len == 10) { + goto invalid_string_error; + } + PyErr_Clear(); + + // If the datetime string cannot be encoded as UTF8 because the + // separator character is an invalid character, this could still + // be a valid isoformat, so we decode it and ignore. + dtstr_bytes = PyUnicode_AsEncodedString(dtstr, "ascii", "replace"); + if (dtstr_bytes == NULL) { + goto finally; + } + bytes_needs_decref = 1; + dt_ptr = PyBytes_AS_STRING(dtstr_bytes); + if (dt_ptr == NULL) { + goto finally; + } + } + const char * p = dt_ptr; int year = 0, month = 0, day = 0; @@ -4883,20 +4908,32 @@ datetime_fromisoformat(PyObject* cls, PyObject *dtstr) { &tzoffset, &tzusec); } if (rv < 0) { - PyErr_Format(PyExc_ValueError, "Invalid isoformat string: %s", dt_ptr); - return NULL; + goto invalid_string_error; } PyObject* tzinfo = tzinfo_from_isoformat_results(rv, tzoffset, tzusec); if (tzinfo == NULL) { - return NULL; + goto finally; } PyObject *dt = new_datetime_subclass_ex(year, month, day, hour, minute, second, microsecond, tzinfo, cls); Py_DECREF(tzinfo); + if (bytes_needs_decref) { + Py_DECREF(dtstr_bytes); + } return dt; + +invalid_string_error: + PyErr_Format(PyExc_ValueError, "Invalid isoformat string: %R", dtstr); + +finally: + if (bytes_needs_decref) { + Py_DECREF(dtstr_bytes); + } + + return NULL; } From 85a99ca97a90c8b91b4901a7deea7b444303aa94 Mon Sep 17 00:00:00 2001 From: Paul Ganssle Date: Wed, 22 Aug 2018 12:50:19 -0400 Subject: [PATCH 2/7] Fix non-UTF8 crash for (date|time)_fromisoformat Previously this would end up dereferencing a NULL pointer if the PyUnicode_AsUTF8AndSize call failed, this makes it so that the same error as any other parsing error is raised. --- Modules/_datetimemodule.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/Modules/_datetimemodule.c b/Modules/_datetimemodule.c index 7e9459b7eb642b..91cf203ec9d37c 100644 --- a/Modules/_datetimemodule.c +++ b/Modules/_datetimemodule.c @@ -2883,6 +2883,9 @@ date_fromisoformat(PyObject *cls, PyObject *dtstr) { Py_ssize_t len; const char * dt_ptr = PyUnicode_AsUTF8AndSize(dtstr, &len); + if (dt_ptr == NULL) { + goto invalid_string_error; + } int year = 0, month = 0, day = 0; @@ -2894,12 +2897,15 @@ date_fromisoformat(PyObject *cls, PyObject *dtstr) { } if (rv < 0) { - PyErr_Format(PyExc_ValueError, "Invalid isoformat string: %s", - dt_ptr); - return NULL; + goto invalid_string_error; } return new_date_subclass_ex(year, month, day, cls); + +invalid_string_error: + PyErr_Format(PyExc_ValueError, "Invalid isoformat string: %R", + dtstr); + return NULL; } @@ -4258,6 +4264,10 @@ time_fromisoformat(PyObject *cls, PyObject *tstr) { Py_ssize_t len; const char *p = PyUnicode_AsUTF8AndSize(tstr, &len); + if (p == NULL) { + goto invalid_string_error; + } + int hour = 0, minute = 0, second = 0, microsecond = 0; int tzoffset, tzimicrosecond = 0; int rv = parse_isoformat_time(p, len, @@ -4265,8 +4275,7 @@ time_fromisoformat(PyObject *cls, PyObject *tstr) { &tzoffset, &tzimicrosecond); if (rv < 0) { - PyErr_Format(PyExc_ValueError, "Invalid isoformat string: %s", p); - return NULL; + goto invalid_string_error; } PyObject *tzinfo = tzinfo_from_isoformat_results(rv, tzoffset, @@ -4286,6 +4295,10 @@ time_fromisoformat(PyObject *cls, PyObject *tstr) { Py_DECREF(tzinfo); return t; + +invalid_string_error: + PyErr_Format(PyExc_ValueError, "Invalid isoformat string: %R", tstr); + return NULL; } From dd82aa01bb2789595f178d6d8501557c5516502c Mon Sep 17 00:00:00 2001 From: Paul Ganssle Date: Wed, 22 Aug 2018 16:23:22 -0400 Subject: [PATCH 3/7] Refactor non-UTF-8 sanitization This increases performance for valid non-UTF-8 strings by avoiding an error condition, and minimizes the impact on the rest of the algorithm. --- Modules/_datetimemodule.c | 74 +++++++++++++++++++++++++-------------- 1 file changed, 47 insertions(+), 27 deletions(-) diff --git a/Modules/_datetimemodule.c b/Modules/_datetimemodule.c index 91cf203ec9d37c..bb86b28f10a9c3 100644 --- a/Modules/_datetimemodule.c +++ b/Modules/_datetimemodule.c @@ -4852,6 +4852,40 @@ datetime_combine(PyObject *cls, PyObject *args, PyObject *kw) return result; } +static PyObject * +_sanitize_isoformat_str(PyObject *dtstr, int *needs_decref) { + // `fromisoformat` allows surrogate characters in exactly one position, + // the separator; to allow datetime_fromisoformat to make the simplifying + // assumption that all valid strings can be encoded in UTF-8, this function + // replaces any surrogate character separators with `T`. + Py_ssize_t len = PyUnicode_GetLength(dtstr); + *needs_decref = 0; + if (len <= 10 || !Py_UNICODE_IS_SURROGATE(PyUnicode_READ_CHAR(dtstr, 10))) { + return dtstr; + } + + PyObject *left = PyUnicode_Substring(dtstr, 0, 10); + if (left == NULL) { + return NULL; + } + + PyObject *right = PyUnicode_Substring(dtstr, 11, len); + if (right == NULL) { + Py_DECREF(left); + return NULL; + } + + PyObject *str_out = PyUnicode_FromFormat("%UT%U", left, right); + Py_DECREF(left); + Py_DECREF(right); + if (str_out == NULL) { + return NULL; + } + + *needs_decref = 1; + return str_out; +} + static PyObject * datetime_fromisoformat(PyObject* cls, PyObject *dtstr) { assert(dtstr != NULL); @@ -4861,34 +4895,20 @@ datetime_fromisoformat(PyObject* cls, PyObject *dtstr) { return NULL; } - const PyObject * dtstr_bytes = NULL; - unsigned char bytes_needs_decref = 0; + int needs_decref = 0; + dtstr = _sanitize_isoformat_str(dtstr, &needs_decref); + if (dtstr == NULL) { + goto error; + } Py_ssize_t len; const char * dt_ptr = PyUnicode_AsUTF8AndSize(dtstr, &len); if (dt_ptr == NULL) { - len = PyUnicode_GET_LENGTH(dtstr); - if (len == 10) { - goto invalid_string_error; - } - PyErr_Clear(); - - // If the datetime string cannot be encoded as UTF8 because the - // separator character is an invalid character, this could still - // be a valid isoformat, so we decode it and ignore. - dtstr_bytes = PyUnicode_AsEncodedString(dtstr, "ascii", "replace"); - if (dtstr_bytes == NULL) { - goto finally; - } - bytes_needs_decref = 1; - dt_ptr = PyBytes_AS_STRING(dtstr_bytes); - if (dt_ptr == NULL) { - goto finally; - } + goto invalid_string_error; } - const char * p = dt_ptr; + const char *p = dt_ptr; int year = 0, month = 0, day = 0; int hour = 0, minute = 0, second = 0, microsecond = 0; @@ -4926,24 +4946,24 @@ datetime_fromisoformat(PyObject* cls, PyObject *dtstr) { PyObject* tzinfo = tzinfo_from_isoformat_results(rv, tzoffset, tzusec); if (tzinfo == NULL) { - goto finally; + goto error; } PyObject *dt = new_datetime_subclass_ex(year, month, day, hour, minute, second, microsecond, tzinfo, cls); Py_DECREF(tzinfo); - if (bytes_needs_decref) { - Py_DECREF(dtstr_bytes); + if (needs_decref) { + Py_DECREF(dtstr); } return dt; invalid_string_error: PyErr_Format(PyExc_ValueError, "Invalid isoformat string: %R", dtstr); -finally: - if (bytes_needs_decref) { - Py_DECREF(dtstr_bytes); +error: + if (needs_decref) { + Py_DECREF(dtstr); } return NULL; From c24388e94666ef079a4978956016512a3e9ce3a4 Mon Sep 17 00:00:00 2001 From: Tal Einat Date: Wed, 22 Aug 2018 16:27:43 -0400 Subject: [PATCH 4/7] Add tests for surrogate code points Co-authored-by: Alexey Izbyshev Co-authored-by: Paul Ganssle --- Lib/test/datetimetester.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/Lib/test/datetimetester.py b/Lib/test/datetimetester.py index f647a232f40442..9c6e71c54d79d3 100644 --- a/Lib/test/datetimetester.py +++ b/Lib/test/datetimetester.py @@ -1667,6 +1667,7 @@ def test_fromisoformat_fails(self): # Test that fromisoformat() fails on invalid values bad_strs = [ '', # Empty string + '\ud800', # bpo-34454: Surrogate code point '009-03-04', # Not 10 characters '123456789', # Not a date '200a-12-04', # Invalid character in year @@ -1675,6 +1676,7 @@ def test_fromisoformat_fails(self): '2009-01-32', # Invalid day '2009-02-29', # Invalid leap day '20090228', # Valid ISO8601 output not from isoformat() + '2009\ud80002\ud80028', # Separators are surrogate codepoints ] for bad_str in bad_strs: @@ -2587,7 +2589,8 @@ def test_fromisoformat_separators(self): ' ', 'T', '\u007f', # 1-bit widths '\u0080', 'ʁ', # 2-bit widths 'ᛇ', '時', # 3-bit widths - '🐍' # 4-bit widths + '🐍', # 4-bit widths + '\ud800', # bpo-34454: Surrogate code point ] for sep in separators: @@ -2639,6 +2642,7 @@ def test_fromisoformat_fails_datetime(self): # Test that fromisoformat() fails on invalid values bad_strs = [ '', # Empty string + '\ud800', # bpo-34454: Surrogate code point '2009.04-19T03', # Wrong first separator '2009-04.19T03', # Wrong second separator '2009-04-19T0a', # Invalid hours @@ -2652,6 +2656,8 @@ def test_fromisoformat_fails_datetime(self): '2009-04-19T03:15:45.123456+24:30', # Invalid time zone offset '2009-04-19T03:15:45.123456-24:30', # Invalid negative offset '2009-04-10ᛇᛇᛇᛇᛇ12:15', # Too many unicode separators + '2009-04\ud80010T12:15', # Surrogate char in date + '2009-04-10T12\ud80015', # Surrogate char in time '2009-04-19T1', # Incomplete hours '2009-04-19T12:3', # Incomplete minutes '2009-04-19T12:30:4', # Incomplete seconds @@ -3521,6 +3527,7 @@ def test_fromisoformat_timespecs(self): def test_fromisoformat_fails(self): bad_strs = [ '', # Empty string + '12\ud80000', # Invalid separator - surrogate char '12:', # Ends on a separator '12:30:', # Ends on a separator '12:30:15.', # Ends on a separator From a0246a0a351376fee0806d7f1e9c1716fe6b2c56 Mon Sep 17 00:00:00 2001 From: Tal Einat Date: Wed, 22 Aug 2018 16:27:14 -0400 Subject: [PATCH 5/7] Add news entry for bpo-34454 --- .../next/Library/2018-08-22-21-59-08.bpo-34454.z7uG4b.rst | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 Misc/NEWS.d/next/Library/2018-08-22-21-59-08.bpo-34454.z7uG4b.rst diff --git a/Misc/NEWS.d/next/Library/2018-08-22-21-59-08.bpo-34454.z7uG4b.rst b/Misc/NEWS.d/next/Library/2018-08-22-21-59-08.bpo-34454.z7uG4b.rst new file mode 100644 index 00000000000000..d6c5eee84c60b1 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2018-08-22-21-59-08.bpo-34454.z7uG4b.rst @@ -0,0 +1,4 @@ +Fix the .fromisoformat() methods of datetime types crashing when given +unicode with non-UTF-8-encodable code points. Specifically, +datetime.fromisoformat() now accepts surrogate unicode code points used as +the separator. Report and tests by Alexey Izbyshev. From b89d4f5627a423bf66b86ad90a436ebea6c4b8e9 Mon Sep 17 00:00:00 2001 From: Paul Ganssle Date: Wed, 22 Aug 2018 17:51:33 -0400 Subject: [PATCH 6/7] Refactor sanitize_isoformat_str Rather than splitting the string at position 10 and re-joining it with PyUnicode_Format, this copies the original unicode object and overwrites the separator character. Co-Authored-By: Alexey Izbyshev --- Modules/_datetimemodule.c | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/Modules/_datetimemodule.c b/Modules/_datetimemodule.c index bb86b28f10a9c3..2522b65f1168e7 100644 --- a/Modules/_datetimemodule.c +++ b/Modules/_datetimemodule.c @@ -4864,21 +4864,14 @@ _sanitize_isoformat_str(PyObject *dtstr, int *needs_decref) { return dtstr; } - PyObject *left = PyUnicode_Substring(dtstr, 0, 10); - if (left == NULL) { - return NULL; - } - - PyObject *right = PyUnicode_Substring(dtstr, 11, len); - if (right == NULL) { - Py_DECREF(left); + PyObject *str_out = PyUnicode_New(len, PyUnicode_MAX_CHAR_VALUE(dtstr)); + if (str_out == NULL) { return NULL; } - PyObject *str_out = PyUnicode_FromFormat("%UT%U", left, right); - Py_DECREF(left); - Py_DECREF(right); - if (str_out == NULL) { + if (PyUnicode_CopyCharacters(str_out, 0, dtstr, 0, len) == -1 || + PyUnicode_WriteChar(str_out, 10, (Py_UCS4)'T')) { + Py_DECREF(str_out); return NULL; } From 160e779654eda94a819b928ebd60901d588f2b6a Mon Sep 17 00:00:00 2001 From: Tal Einat Date: Thu, 23 Aug 2018 16:32:32 +0300 Subject: [PATCH 7/7] added mention of patch author in NEWS --- .../next/Library/2018-08-22-21-59-08.bpo-34454.z7uG4b.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Misc/NEWS.d/next/Library/2018-08-22-21-59-08.bpo-34454.z7uG4b.rst b/Misc/NEWS.d/next/Library/2018-08-22-21-59-08.bpo-34454.z7uG4b.rst index d6c5eee84c60b1..1d5c32708c1554 100644 --- a/Misc/NEWS.d/next/Library/2018-08-22-21-59-08.bpo-34454.z7uG4b.rst +++ b/Misc/NEWS.d/next/Library/2018-08-22-21-59-08.bpo-34454.z7uG4b.rst @@ -1,4 +1,4 @@ Fix the .fromisoformat() methods of datetime types crashing when given unicode with non-UTF-8-encodable code points. Specifically, datetime.fromisoformat() now accepts surrogate unicode code points used as -the separator. Report and tests by Alexey Izbyshev. +the separator. Report and tests by Alexey Izbyshev, patch by Paul Ganssle.