From 6df3ca984743cbc66e8020c883ecda1db56bc967 Mon Sep 17 00:00:00 2001 From: Tal Einat Date: Wed, 22 Aug 2018 15:06:45 +0300 Subject: [PATCH 1/2] bpo-34454: avoid internally encoding fromisoformat() input to UTF-8 This breaks if a surrogate Unicode code point is used as the separator character in the input string. --- Modules/_datetimemodule.c | 104 ++++++++++++++++++++++++-------------- 1 file changed, 67 insertions(+), 37 deletions(-) diff --git a/Modules/_datetimemodule.c b/Modules/_datetimemodule.c index 076912d58f4af8..8f2baf272f9d63 100644 --- a/Modules/_datetimemodule.c +++ b/Modules/_datetimemodule.c @@ -2880,26 +2880,34 @@ date_fromisoformat(PyObject *cls, PyObject *dtstr) { return NULL; } - Py_ssize_t len; + Py_ssize_t len = PyUnicode_GET_LENGTH(dtstr); + if (len != 10) { + goto invalid_string_error; + } - const char * dt_ptr = PyUnicode_AsUTF8AndSize(dtstr, &len); + PyObject * bytes = PyUnicode_AsASCIIString(dtstr); + if (bytes == NULL) { + return NULL; + } + const char * p = PyBytes_AS_STRING(bytes); + Py_DECREF(bytes); + if (p == NULL) { + goto invalid_string_error; + } int year = 0, month = 0, day = 0; - int rv; - if (len == 10) { - rv = parse_isoformat_date(dt_ptr, &year, &month, &day); - } else { - rv = -1; - } + int rv = parse_isoformat_date(p, &year, &month, &day); if (rv < 0) { - PyErr_Format(PyExc_ValueError, "Invalid isoformat string: %s", - dt_ptr); - return NULL; + goto invalid_string_error; } return new_date_subclass_ex(year, month, day, cls); + + invalid_string_error: + PyErr_Format(PyExc_ValueError, "Invalid isoformat string: %R", dtstr); + return NULL; } @@ -4848,43 +4856,61 @@ datetime_fromisoformat(PyObject* cls, PyObject *dtstr) { return NULL; } - Py_ssize_t len; - const char * dt_ptr = PyUnicode_AsUTF8AndSize(dtstr, &len); - const char * p = dt_ptr; + Py_ssize_t len = PyUnicode_GET_LENGTH(dtstr); + if (len < 10) { + goto invalid_string_error; + } int year = 0, month = 0, day = 0; int hour = 0, minute = 0, second = 0, microsecond = 0; 
int tzoffset = 0, tzusec = 0; + int rv; + PyObject *substr, *substr_bytes; + const char * p; // date has a fixed length of 10 - int rv = parse_isoformat_date(p, &year, &month, &day); + substr = PyUnicode_Substring(dtstr, 0, 10); + if (substr == NULL) { + return NULL; + } + substr_bytes = PyUnicode_AsASCIIString(substr); + Py_DECREF(substr); + if (substr_bytes == NULL) { + goto invalid_string_error; + } + p = PyBytes_AS_STRING(substr_bytes); + Py_DECREF(substr_bytes); + if (p == NULL) { + return NULL; + } - if (!rv && len > 10) { - // In UTF-8, the length of multi-byte characters is encoded in the MSB - if ((p[10] & 0x80) == 0) { - p += 11; - } else { - switch(p[10] & 0xf0) { - case 0xe0: - p += 13; - break; - case 0xf0: - p += 14; - break; - default: - p += 12; - break; - } + rv = parse_isoformat_date(p, &year, &month, &day); + if (rv != 0) { + goto invalid_string_error; + } + + if (len > 10) { + substr = PyUnicode_Substring(dtstr, 11, len); + if (substr == NULL) { + return NULL; + } + substr_bytes = PyUnicode_AsASCIIString(substr); + Py_DECREF(substr); + if (substr_bytes == NULL) { + goto invalid_string_error; + } + p = PyBytes_AS_STRING(substr_bytes); + Py_DECREF(substr_bytes); + if (p == NULL) { + return NULL; } - len -= (p - dt_ptr); - rv = parse_isoformat_time(p, len, + rv = parse_isoformat_time(p, len - 11, &hour, &minute, &second, &microsecond, &tzoffset, &tzusec); - } - if (rv < 0) { - PyErr_Format(PyExc_ValueError, "Invalid isoformat string: %s", dt_ptr); - return NULL; + if (rv < 0) { + goto invalid_string_error; + } } PyObject* tzinfo = tzinfo_from_isoformat_results(rv, tzoffset, tzusec); @@ -4897,6 +4923,10 @@ datetime_fromisoformat(PyObject* cls, PyObject *dtstr) { Py_DECREF(tzinfo); return dt; + + invalid_string_error: + PyErr_Format(PyExc_ValueError, "Invalid isoformat string: %R", dtstr); + return NULL; } From 3a02c2b04e27f2deef6dac3a6820dc6d7b295eb1 Mon Sep 17 00:00:00 2001 From: Tal Einat Date: Wed, 22 Aug 2018 22:07:35 +0300 Subject: [PATCH 2/2]
bpo-34454: optimize, add tests and NEWS --- Lib/test/datetimetester.py | 5 +- .../2018-08-22-21-59-08.bpo-34454.z7uG4b.rst | 4 + Modules/_datetimemodule.c | 121 ++++++++++++------ 3 files changed, 90 insertions(+), 40 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2018-08-22-21-59-08.bpo-34454.z7uG4b.rst diff --git a/Lib/test/datetimetester.py b/Lib/test/datetimetester.py index f647a232f40442..155d81429c6460 100644 --- a/Lib/test/datetimetester.py +++ b/Lib/test/datetimetester.py @@ -1667,6 +1667,7 @@ def test_fromisoformat_fails(self): # Test that fromisoformat() fails on invalid values bad_strs = [ '', # Empty string + '\ud800', # bpo-34454: Surrogate code point '009-03-04', # Not 10 characters '123456789', # Not a date '200a-12-04', # Invalid character in year @@ -2587,7 +2588,8 @@ def test_fromisoformat_separators(self): ' ', 'T', '\u007f', # 1-bit widths '\u0080', 'ʁ', # 2-bit widths 'ᛇ', '時', # 3-bit widths - '🐍' # 4-bit widths + '🐍', # 4-bit widths + '\ud800', # bpo-34454: Surrogate code point ] for sep in separators: @@ -2639,6 +2641,7 @@ def test_fromisoformat_fails_datetime(self): # Test that fromisoformat() fails on invalid values bad_strs = [ '', # Empty string + '\ud800', # bpo-34454: Surrogate code point '2009.04-19T03', # Wrong first separator '2009-04.19T03', # Wrong second separator '2009-04-19T0a', # Invalid hours diff --git a/Misc/NEWS.d/next/Library/2018-08-22-21-59-08.bpo-34454.z7uG4b.rst b/Misc/NEWS.d/next/Library/2018-08-22-21-59-08.bpo-34454.z7uG4b.rst new file mode 100644 index 00000000000000..4b6dd3dcb689a6 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2018-08-22-21-59-08.bpo-34454.z7uG4b.rst @@ -0,0 +1,4 @@ +Fix the .fromisoformat() methods of datetime types crashing when given +unicode with non-UTF-8-encodable code points. Specifically, +datetime.fromisoformat() now accepts surrogate unicode code points used as +the separator, to be consistent with the pure-python version. 
diff --git a/Modules/_datetimemodule.c b/Modules/_datetimemodule.c index 8f2baf272f9d63..4b64b895cbbae7 100644 --- a/Modules/_datetimemodule.c +++ b/Modules/_datetimemodule.c @@ -2880,25 +2880,31 @@ date_fromisoformat(PyObject *cls, PyObject *dtstr) { return NULL; } - Py_ssize_t len = PyUnicode_GET_LENGTH(dtstr); - if (len != 10) { - goto invalid_string_error; + if (PyUnicode_READY(dtstr) == -1) { + return NULL; } - PyObject * bytes = PyUnicode_AsASCIIString(dtstr); - if (bytes == NULL) { - return NULL; + const PyObject *bytes = NULL; + const char * p; + + if (PyUnicode_KIND(dtstr) == PyUnicode_1BYTE_KIND) { + p = (const char *) PyUnicode_1BYTE_DATA(dtstr); } - const char * p = PyBytes_AS_STRING(bytes); - Py_DECREF(bytes); - if (p == NULL) { - goto invalid_string_error; + else { + bytes = PyUnicode_AsASCIIString(dtstr); + if (bytes == NULL) { + goto invalid_string_error; + } + p = PyBytes_AS_STRING(bytes); } int year = 0, month = 0, day = 0; - int rv = parse_isoformat_date(p, &year, &month, &day); + if (bytes != NULL) { + Py_DECREF(bytes); + } + if (rv < 0) { goto invalid_string_error; } @@ -4263,8 +4269,24 @@ time_fromisoformat(PyObject *cls, PyObject *tstr) { return NULL; } - Py_ssize_t len; - const char *p = PyUnicode_AsUTF8AndSize(tstr, &len); + if (PyUnicode_READY(tstr) == -1) { + return NULL; + } + + Py_ssize_t len = PyUnicode_GET_LENGTH(tstr); + const PyObject * bytes = NULL; + const char *p; + + if (PyUnicode_KIND(tstr) == PyUnicode_1BYTE_KIND) { + p = (const char *) PyUnicode_1BYTE_DATA(tstr); + } + else { + bytes = PyUnicode_AsASCIIString(tstr); + if (bytes == NULL) { + goto invalid_string_error; + } + p = PyBytes_AS_STRING(bytes); + } int hour = 0, minute = 0, second = 0, microsecond = 0; int tzoffset, tzimicrosecond = 0; @@ -4272,9 +4294,12 @@ time_fromisoformat(PyObject *cls, PyObject *tstr) { &hour, &minute, &second, &microsecond, &tzoffset, &tzimicrosecond); + if (bytes != NULL) { + Py_DECREF(bytes); + } + if (rv < 0) { - PyErr_Format(PyExc_ValueError,
"Invalid isoformat string: %s", p); - return NULL; + goto invalid_string_error; } PyObject *tzinfo = tzinfo_from_isoformat_results(rv, tzoffset, @@ -4294,6 +4319,10 @@ time_fromisoformat(PyObject *cls, PyObject *tstr) { Py_DECREF(tzinfo); return t; + + invalid_string_error: + PyErr_Format(PyExc_ValueError, "Invalid isoformat string: %s", p); + return NULL; } @@ -4856,6 +4885,10 @@ datetime_fromisoformat(PyObject* cls, PyObject *dtstr) { return NULL; } + if (PyUnicode_READY(dtstr) == -1) { + return NULL; + } + Py_ssize_t len = PyUnicode_GET_LENGTH(dtstr); if (len < 10) { goto invalid_string_error; @@ -4865,32 +4898,17 @@ datetime_fromisoformat(PyObject* cls, PyObject *dtstr) { int hour = 0, minute = 0, second = 0, microsecond = 0; int tzoffset = 0, tzusec = 0; int rv; - PyObject *substr, *substr_bytes; + PyObject *substr, *substr_bytes = NULL; const char * p; - // date has a fixed length of 10 - substr = PyUnicode_Substring(dtstr, 0, 10); - if (substr == NULL) { - return NULL; - } - substr_bytes = PyUnicode_AsASCIIString(substr); - Py_DECREF(substr); - if (substr_bytes == NULL) { - goto invalid_string_error; - } - p = PyBytes_AS_STRING(substr_bytes); - Py_DECREF(substr_bytes); - if (p == NULL) { - return NULL; - } + int is_1byte = (PyUnicode_KIND(dtstr) == PyUnicode_1BYTE_KIND); - rv = parse_isoformat_date(p, &year, &month, &day); - if (rv != 0) { - goto invalid_string_error; + if (is_1byte) { + p = (const char *) PyUnicode_1BYTE_DATA(dtstr); } - - if (len > 10) { - substr = PyUnicode_Substring(dtstr, 11, len); + else { + // date has a fixed length of 10 + substr = PyUnicode_Substring(dtstr, 0, 10); if (substr == NULL) { return NULL; } @@ -4900,14 +4918,39 @@ datetime_fromisoformat(PyObject* cls, PyObject *dtstr) { goto invalid_string_error; } p = PyBytes_AS_STRING(substr_bytes); + } + + rv = parse_isoformat_date(p, &year, &month, &day); + if (substr_bytes != NULL) { Py_DECREF(substr_bytes); - if (p == NULL) { - return NULL; + } + if (rv != 0) { + goto 
invalid_string_error; + } + + if (len > 10) { + if (is_1byte) { + p += 11; + } + else { + substr = PyUnicode_Substring(dtstr, 11, len); + if (substr == NULL) { + return NULL; + } + substr_bytes = PyUnicode_AsASCIIString(substr); + Py_DECREF(substr); + if (substr_bytes == NULL) { + goto invalid_string_error; + } + p = PyBytes_AS_STRING(substr_bytes); } rv = parse_isoformat_time(p, len - 11, &hour, &minute, &second, &microsecond, &tzoffset, &tzusec); + if (substr_bytes != NULL) { + Py_DECREF(substr_bytes); + } if (rv < 0) { goto invalid_string_error; }