From 76b274d1fdf3f9bf650123db3d6a257e70ed9f4d Mon Sep 17 00:00:00 2001 From: Paul Ganssle Date: Mon, 27 Aug 2018 16:31:27 -0400 Subject: [PATCH 1/6] Use _PyUnicode_Copy in sanitize_isoformat_str --- Modules/_datetimemodule.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/Modules/_datetimemodule.c b/Modules/_datetimemodule.c index cdfa235f0917e6..add275c44621ef 100644 --- a/Modules/_datetimemodule.c +++ b/Modules/_datetimemodule.c @@ -4869,18 +4869,21 @@ _sanitize_isoformat_str(PyObject *dtstr, int *needs_decref) { // assumption that all valid strings can be encoded in UTF-8, this function // replaces any surrogate character separators with `T`. Py_ssize_t len = PyUnicode_GetLength(dtstr); + if (len < 0) { + return NULL; + } + *needs_decref = 0; if (len <= 10 || !Py_UNICODE_IS_SURROGATE(PyUnicode_READ_CHAR(dtstr, 10))) { return dtstr; } - PyObject *str_out = PyUnicode_New(len, PyUnicode_MAX_CHAR_VALUE(dtstr)); + PyObject *str_out = _PyUnicode_Copy(dtstr); if (str_out == NULL) { return NULL; } - if (PyUnicode_CopyCharacters(str_out, 0, dtstr, 0, len) == -1 || - PyUnicode_WriteChar(str_out, 10, (Py_UCS4)'T')) { + if (PyUnicode_WriteChar(str_out, 10, (Py_UCS4)'T')) { Py_DECREF(str_out); return NULL; } From 6d6f3d0a55aa5a57a8cac5b39b4409fad8a5298e Mon Sep 17 00:00:00 2001 From: Paul Ganssle Date: Mon, 27 Aug 2018 17:16:07 -0400 Subject: [PATCH 2/6] Use repr in fromisoformat error message This reverses commit 67b74a98b2 per Serhiy Storchaka's suggestion: I suggested to use %R in the error message because including the raw string can be confusing in the case of empty string, or string containing trailing whitespaces, invisible or unprintable characters. We agree that it is better to change both the C and pure Python versions to use repr. --- Lib/datetime.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Lib/datetime.py b/Lib/datetime.py index cff92033c496a7..292919fd7988e7 100644 --- a/Lib/datetime.py +++ b/Lib/datetime.py @@ -857,7 +857,7 @@ def fromisoformat(cls, date_string): assert len(date_string) == 10 return cls(*_parse_isoformat_date(date_string)) except Exception: - raise ValueError('Invalid isoformat string: %s' % date_string) + raise ValueError(f'Invalid isoformat string: {date_string!r}') # Conversions to string @@ -1369,7 +1369,7 @@ def fromisoformat(cls, time_string): try: return cls(*_parse_isoformat_time(time_string)) except Exception: - raise ValueError('Invalid isoformat string: %s' % time_string) + raise ValueError(f'Invalid isoformat string: {time_string!r}') def strftime(self, fmt): @@ -1646,13 +1646,13 @@ def fromisoformat(cls, date_string): try: date_components = _parse_isoformat_date(dstr) except ValueError: - raise ValueError('Invalid isoformat string: %s' % date_string) + raise ValueError(f'Invalid isoformat string: {date_string!r}') if tstr: try: time_components = _parse_isoformat_time(tstr) except ValueError: - raise ValueError('Invalid isoformat string: %s' % date_string) + raise ValueError(f'Invalid isoformat string: {date_string!r}') else: time_components = [0, 0, 0, 0, None] From eb8e17632353044c9c45ed4b16552b65e056960b Mon Sep 17 00:00:00 2001 From: Paul Ganssle Date: Mon, 27 Aug 2018 17:16:43 -0400 Subject: [PATCH 3/6] Retain non-sanitized dtstr for error printing This does not create an extra string, it just holds on to a reference to the original input string for purposes of creating the error message. --- Lib/test/datetimetester.py | 9 +++++++++ Modules/_datetimemodule.c | 10 +++++----- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/Lib/test/datetimetester.py b/Lib/test/datetimetester.py index 9c6e71c54d79d3..122f6b55ba33b1 100644 --- a/Lib/test/datetimetester.py +++ b/Lib/test/datetimetester.py @@ -13,6 +13,7 @@ import os import pickle import random +import re import struct import unittest @@ -2676,6 +2677,14 @@ def test_fromisoformat_fails_datetime(self): with self.assertRaises(ValueError): self.theclass.fromisoformat(bad_str) + def test_fromisoformat_fails_surrogate(self): + # Test that when fromisoformat() fails with a surrogate character as + # the separator, the error message contains the original string + dtstr = "2018-01-03\ud80001:0113" + + with self.assertRaisesRegex(ValueError, re.escape(repr(dtstr))): + self.theclass.fromisoformat(dtstr) + def test_fromisoformat_utc(self): dt_str = '2014-04-19T13:21:13+00:00' dt = self.theclass.fromisoformat(dt_str) diff --git a/Modules/_datetimemodule.c b/Modules/_datetimemodule.c index add275c44621ef..52287ace236ecc 100644 --- a/Modules/_datetimemodule.c +++ b/Modules/_datetimemodule.c @@ -4902,13 +4902,13 @@ datetime_fromisoformat(PyObject* cls, PyObject *dtstr) { } int needs_decref = 0; - dtstr = _sanitize_isoformat_str(dtstr, &needs_decref); - if (dtstr == NULL) { + PyObject *dtstr_clean = _sanitize_isoformat_str(dtstr, &needs_decref); + if (dtstr_clean == NULL) { goto error; } Py_ssize_t len; - const char * dt_ptr = PyUnicode_AsUTF8AndSize(dtstr, &len); + const char * dt_ptr = PyUnicode_AsUTF8AndSize(dtstr_clean, &len); if (dt_ptr == NULL) { goto invalid_string_error; @@ -4960,7 +4960,7 @@ datetime_fromisoformat(PyObject* cls, PyObject *dtstr) { Py_DECREF(tzinfo); if (needs_decref) { - Py_DECREF(dtstr); + Py_DECREF(dtstr_clean); } return dt; @@ -4969,7 +4969,7 @@ datetime_fromisoformat(PyObject* cls, PyObject *dtstr) { error: if (needs_decref) { - Py_DECREF(dtstr); + Py_DECREF(dtstr_clean); } return NULL; From 4e54bd90b1f587bdaccd3ed5ae3ce7915836949e Mon Sep 17 00:00:00 2001 From: Paul Ganssle Date: Tue, 28 Aug 2018 09:22:29 -0400 Subject: [PATCH 4/6] Separate handling of Unicode and other errors In the initial implementation, errors other than encoding errors would both raise an error indicating an invalid format, which would not be true for errors like MemoryError. --- Modules/_datetimemodule.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/Modules/_datetimemodule.c b/Modules/_datetimemodule.c index 52287ace236ecc..c5682df9c6d40a 100644 --- a/Modules/_datetimemodule.c +++ b/Modules/_datetimemodule.c @@ -4911,7 +4911,12 @@ datetime_fromisoformat(PyObject* cls, PyObject *dtstr) { const char * dt_ptr = PyUnicode_AsUTF8AndSize(dtstr_clean, &len); if (dt_ptr == NULL) { - goto invalid_string_error; + if (PyErr_ExceptionMatches(PyExc_UnicodeEncodeError)) { + // Encoding errors are invalid string errors at this point + goto invalid_string_error; + } else { + goto error; + } } const char *p = dt_ptr; From d809d712c3c79ccadc6061469f9751f7120edbf5 Mon Sep 17 00:00:00 2001 From: Paul Ganssle Date: Mon, 27 Aug 2018 16:27:13 -0400 Subject: [PATCH 5/6] PEP 7 fixes to from_isoformat --- Modules/_datetimemodule.c | 133 +++++++++++++++++++++----------------- 1 file changed, 75 insertions(+), 58 deletions(-) diff --git a/Modules/_datetimemodule.c b/Modules/_datetimemodule.c index c5682df9c6d40a..33444bd9c9442c 100644 --- a/Modules/_datetimemodule.c +++ b/Modules/_datetimemodule.c @@ -668,8 +668,8 @@ set_date_fields(PyDateTime_Date *self, int y, int m, int d) * String parsing utilities and helper functions */ -static const char* -parse_digits(const char* ptr, int* var, size_t num_digits) +static const char * +parse_digits(const char *ptr, int *var, size_t num_digits) { for (size_t i = 0; i < num_digits; ++i) { unsigned int tmp = (unsigned int)(*(ptr++) - '0'); @@ -683,15 +683,16 @@ parse_digits(const char* ptr, int* var, size_t num_digits) return ptr; } -static int parse_isoformat_date(const char *dtstr, - int* year, int *month, int* day) { +static int +parse_isoformat_date(const char *dtstr, int *year, int *month, int *day) +{ /* Parse the date components of the result of date.isoformat() - * - * Return codes: - * 0: Success - * -1: Failed to parse date component - * -2: Failed to parse dateseparator - */ + * + * Return codes: + * 0: Success + * -1: Failed to parse date component + * -2: Failed to parse dateseparator + */ const char *p = dtstr; p = parse_digits(p, year, 4); if (NULL == p) { @@ -720,8 +721,9 @@ static int parse_isoformat_date(const char *dtstr, } static int -parse_hh_mm_ss_ff(const char *tstr, const char *tstr_end, - int* hour, int* minute, int *second, int *microsecond) { +parse_hh_mm_ss_ff(const char *tstr, const char *tstr_end, int *hour, + int *minute, int *second, int *microsecond) +{ const char *p = tstr; const char *p_end = tstr_end; int *vals[3] = {hour, minute, second}; @@ -736,12 +738,15 @@ parse_hh_mm_ss_ff(const char *tstr, const char *tstr_end, char c = *(p++); if (p >= p_end) { return c != '\0'; - } else if (c == ':') { + } + else if (c == ':') { continue; - } else if (c == '.') { + } + else if (c == '.') { break; - } else { - return -4; // Malformed time separator + } + else { + return -4; // Malformed time separator } } @@ -765,9 +770,10 @@ parse_hh_mm_ss_ff(const char *tstr, const char *tstr_end, } static int -parse_isoformat_time(const char *dtstr, size_t dtlen, - int* hour, int *minute, int *second, int *microsecond, - int* tzoffset, int *tzmicrosecond) { +parse_isoformat_time(const char *dtstr, size_t dtlen, int *hour, int *minute, + int *second, int *microsecond, int *tzoffset, + int *tzmicrosecond) +{ // Parse the time portion of a datetime.isoformat() string // // Return codes: @@ -785,19 +791,21 @@ parse_isoformat_time(const char *dtstr, size_t dtlen, if (*tzinfo_pos == '+' || *tzinfo_pos == '-') { break; } - } while(++tzinfo_pos < p_end); + } while (++tzinfo_pos < p_end); - int rv = parse_hh_mm_ss_ff(dtstr, tzinfo_pos, - hour, minute, second, microsecond); + int rv = parse_hh_mm_ss_ff(dtstr, tzinfo_pos, hour, minute, second, + microsecond); if (rv < 0) { return rv; - } else if (tzinfo_pos == p_end) { + } + else if (tzinfo_pos == p_end) { // We know that there's no time zone, so if there's stuff at the // end of the string it's an error. if (rv == 1) { return -5; - } else { + } + else { return 0; } } @@ -812,19 +820,18 @@ parse_isoformat_time(const char *dtstr, size_t dtlen, return -5; } - int tzsign = (*tzinfo_pos == '-')?-1:1; + int tzsign = (*tzinfo_pos == '-') ? -1 : 1; tzinfo_pos++; int tzhour = 0, tzminute = 0, tzsecond = 0; - rv = parse_hh_mm_ss_ff(tzinfo_pos, p_end, - &tzhour, &tzminute, &tzsecond, tzmicrosecond); + rv = parse_hh_mm_ss_ff(tzinfo_pos, p_end, &tzhour, &tzminute, &tzsecond, + tzmicrosecond); *tzoffset = tzsign * ((tzhour * 3600) + (tzminute * 60) + tzsecond); *tzmicrosecond *= tzsign; - return rv?-5:1; + return rv ? -5 : 1; } - /* --------------------------------------------------------------------------- * Create various objects, mostly without range checking. */ @@ -839,30 +846,33 @@ new_date_ex(int year, int month, int day, PyTypeObject *type) return NULL; } - self = (PyDateTime_Date *) (type->tp_alloc(type, 0)); + self = (PyDateTime_Date *)(type->tp_alloc(type, 0)); if (self != NULL) set_date_fields(self, year, month, day); - return (PyObject *) self; + return (PyObject *)self; } #define new_date(year, month, day) \ new_date_ex(year, month, day, &PyDateTime_DateType) // Forward declaration -static PyObject * new_datetime_ex(int, int, int, int, int, int, int, - PyObject*, PyTypeObject*); +static PyObject * +new_datetime_ex(int, int, int, int, int, int, int, PyObject *, PyTypeObject *); /* Create date instance with no range checking, or call subclass constructor */ static PyObject * -new_date_subclass_ex(int year, int month, int day, PyObject *cls) { +new_date_subclass_ex(int year, int month, int day, PyObject *cls) +{ PyObject *result; // We have "fast path" constructors for two subclasses: date and datetime if ((PyTypeObject *)cls == &PyDateTime_DateType) { result = new_date_ex(year, month, day, (PyTypeObject *)cls); - } else if ((PyTypeObject *)cls == &PyDateTime_DateTimeType) { + } + else if ((PyTypeObject *)cls == &PyDateTime_DateTimeType) { result = new_datetime_ex(year, month, day, 0, 0, 0, 0, Py_None, (PyTypeObject *)cls); - } else { + } + else { result = PyObject_CallFunction(cls, "iii", year, month, day); } @@ -1281,7 +1291,8 @@ append_keyword_fold(PyObject *repr, int fold) } static inline PyObject * -tzinfo_from_isoformat_results(int rv, int tzoffset, int tz_useconds) { +tzinfo_from_isoformat_results(int rv, int tzoffset, int tz_useconds) +{ PyObject *tzinfo; if (rv == 1) { // Create a timezone from offset in seconds (0 returns UTC) @@ -1296,7 +1307,8 @@ tzinfo_from_isoformat_results(int rv, int tzoffset, int tz_useconds) { } tzinfo = new_timezone(delta, NULL); Py_DECREF(delta); - } else { + } + else { tzinfo = Py_None; Py_INCREF(Py_None); } @@ -2886,17 +2898,19 @@ date_fromordinal(PyObject *cls, PyObject *args) /* Return the new date from a string as generated by date.isoformat() */ static PyObject * -date_fromisoformat(PyObject *cls, PyObject *dtstr) { +date_fromisoformat(PyObject *cls, PyObject *dtstr) +{ assert(dtstr != NULL); if (!PyUnicode_Check(dtstr)) { - PyErr_SetString(PyExc_TypeError, "fromisoformat: argument must be str"); + PyErr_SetString(PyExc_TypeError, + "fromisoformat: argument must be str"); return NULL; } Py_ssize_t len; - const char * dt_ptr = PyUnicode_AsUTF8AndSize(dtstr, &len); + const char *dt_ptr = PyUnicode_AsUTF8AndSize(dtstr, &len); if (dt_ptr == NULL) { goto invalid_string_error; } @@ -2906,7 +2920,8 @@ date_fromisoformat(PyObject *cls, PyObject *dtstr) { int rv; if (len == 10) { rv = parse_isoformat_date(dt_ptr, &year, &month, &day); - } else { + } + else { rv = -1; } @@ -2917,12 +2932,10 @@ date_fromisoformat(PyObject *cls, PyObject *dtstr) { return new_date_subclass_ex(year, month, day, cls); invalid_string_error: - PyErr_Format(PyExc_ValueError, "Invalid isoformat string: %R", - dtstr); + PyErr_Format(PyExc_ValueError, "Invalid isoformat string: %R", dtstr); return NULL; } - /* * Date arithmetic. */ @@ -4863,7 +4876,8 @@ datetime_combine(PyObject *cls, PyObject *args, PyObject *kw) } static PyObject * -_sanitize_isoformat_str(PyObject *dtstr, int *needs_decref) { +_sanitize_isoformat_str(PyObject *dtstr, int *needs_decref) +{ // `fromisoformat` allows surrogate characters in exactly one position, // the separator; to allow datetime_fromisoformat to make the simplifying // assumption that all valid strings can be encoded in UTF-8, this function @@ -4874,7 +4888,8 @@ _sanitize_isoformat_str(PyObject *dtstr, int *needs_decref) { } *needs_decref = 0; - if (len <= 10 || !Py_UNICODE_IS_SURROGATE(PyUnicode_READ_CHAR(dtstr, 10))) { + if (len <= 10 || + !Py_UNICODE_IS_SURROGATE(PyUnicode_READ_CHAR(dtstr, 10))) { return dtstr; } @@ -4893,11 +4908,13 @@ _sanitize_isoformat_str(PyObject *dtstr, int *needs_decref) { } static PyObject * -datetime_fromisoformat(PyObject* cls, PyObject *dtstr) { +datetime_fromisoformat(PyObject *cls, PyObject *dtstr) +{ assert(dtstr != NULL); if (!PyUnicode_Check(dtstr)) { - PyErr_SetString(PyExc_TypeError, "fromisoformat: argument must be str"); + PyErr_SetString(PyExc_TypeError, + "fromisoformat: argument must be str"); return NULL; } @@ -4908,13 +4925,14 @@ datetime_fromisoformat(PyObject* cls, PyObject *dtstr) { } Py_ssize_t len; - const char * dt_ptr = PyUnicode_AsUTF8AndSize(dtstr_clean, &len); + const char *dt_ptr = PyUnicode_AsUTF8AndSize(dtstr_clean, &len); if (dt_ptr == NULL) { if (PyErr_ExceptionMatches(PyExc_UnicodeEncodeError)) { // Encoding errors are invalid string errors at this point goto invalid_string_error; - } else { + } + else { goto error; } } @@ -4932,8 +4950,9 @@ datetime_fromisoformat(PyObject* cls, PyObject *dtstr) { // In UTF-8, the length of multi-byte characters is encoded in the MSB if ((p[10] & 0x80) == 0) { p += 11; - } else { - switch(p[10] & 0xf0) { + } + else { + switch (p[10] & 0xf0) { case 0xe0: p += 13; break; @@ -4947,15 +4966,14 @@ datetime_fromisoformat(PyObject* cls, PyObject *dtstr) { } len -= (p - dt_ptr); - rv = parse_isoformat_time(p, len, - &hour, &minute, &second, µsecond, - &tzoffset, &tzusec); + rv = parse_isoformat_time(p, len, &hour, &minute, &second, + µsecond, &tzoffset, &tzusec); } if (rv < 0) { goto invalid_string_error; } - PyObject* tzinfo = tzinfo_from_isoformat_results(rv, tzoffset, tzusec); + PyObject *tzinfo = tzinfo_from_isoformat_results(rv, tzoffset, tzusec); if (tzinfo == NULL) { goto error; } @@ -4980,7 +4998,6 @@ datetime_fromisoformat(PyObject* cls, PyObject *dtstr) { return NULL; } - /* * Destructor. */ From 90a2e71e6131bb19efd09c6d5e241495a77ad3a3 Mon Sep 17 00:00:00 2001 From: Paul Ganssle Date: Tue, 28 Aug 2018 12:15:25 -0400 Subject: [PATCH 6/6] Drop needs_decref from _sanitize_isoformat_str Instead _sanitize_isoformat_str returns a new reference, even to the original string. --- Modules/_datetimemodule.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/Modules/_datetimemodule.c b/Modules/_datetimemodule.c index 33444bd9c9442c..bc4caa02d5f146 100644 --- a/Modules/_datetimemodule.c +++ b/Modules/_datetimemodule.c @@ -4876,20 +4876,22 @@ datetime_combine(PyObject *cls, PyObject *args, PyObject *kw) } static PyObject * -_sanitize_isoformat_str(PyObject *dtstr, int *needs_decref) +_sanitize_isoformat_str(PyObject *dtstr) { // `fromisoformat` allows surrogate characters in exactly one position, // the separator; to allow datetime_fromisoformat to make the simplifying // assumption that all valid strings can be encoded in UTF-8, this function // replaces any surrogate character separators with `T`. + // + // The result of this, if not NULL, returns a new reference Py_ssize_t len = PyUnicode_GetLength(dtstr); if (len < 0) { return NULL; } - *needs_decref = 0; if (len <= 10 || !Py_UNICODE_IS_SURROGATE(PyUnicode_READ_CHAR(dtstr, 10))) { + Py_INCREF(dtstr); return dtstr; } @@ -4903,7 +4905,6 @@ _sanitize_isoformat_str(PyObject *dtstr, int *needs_decref) return NULL; } - *needs_decref = 1; return str_out; } @@ -4918,8 +4919,7 @@ datetime_fromisoformat(PyObject *cls, PyObject *dtstr) return NULL; } - int needs_decref = 0; - PyObject *dtstr_clean = _sanitize_isoformat_str(dtstr, &needs_decref); + PyObject *dtstr_clean = _sanitize_isoformat_str(dtstr); if (dtstr_clean == NULL) { goto error; } @@ -4982,18 +4982,14 @@ datetime_fromisoformat(PyObject *cls, PyObject *dtstr) second, microsecond, tzinfo, cls); Py_DECREF(tzinfo); - if (needs_decref) { - Py_DECREF(dtstr_clean); - } + Py_DECREF(dtstr_clean); return dt; invalid_string_error: PyErr_Format(PyExc_ValueError, "Invalid isoformat string: %R", dtstr); error: - if (needs_decref) { - Py_DECREF(dtstr_clean); - } + Py_XDECREF(dtstr_clean); return NULL; }