diff --git a/Lib/test/datetimetester.py b/Lib/test/datetimetester.py index 9c6e71c54d79d3..de1c1d71ffa640 100644 --- a/Lib/test/datetimetester.py +++ b/Lib/test/datetimetester.py @@ -302,6 +302,8 @@ def test_tzname(self): self.assertEqual('UTC+09:30', timezone(9.5 * HOUR).tzname(None)) self.assertEqual('UTC-00:01', timezone(timedelta(minutes=-1)).tzname(None)) self.assertEqual('XYZ', timezone(-5 * HOUR, 'XYZ').tzname(None)) + # bpo-34482: Check that surrogates are handled properly. + self.assertEqual('\ud800', timezone(ZERO, '\ud800').tzname(None)) # Sub-minute offsets: self.assertEqual('UTC+01:06:40', timezone(timedelta(0, 4000)).tzname(None)) @@ -1307,6 +1309,12 @@ def test_strftime(self): except ValueError: pass + # bpo-34482: Check that surrogates don't cause a crash. + no_op_strs = ['\ud8888', '\ud800', '\\ud800\ud800'] + for tstr in no_op_strs: + with self.subTest(f'strftime: {tstr!r}'): + self.assertEqual(t.strftime(tstr), tstr) + #check that this standard extension works t.strftime("%f") @@ -1746,6 +1754,9 @@ def test_isoformat(self): self.assertEqual(t.isoformat('T'), "0001-02-03T04:05:01.000123") self.assertEqual(t.isoformat(' '), "0001-02-03 04:05:01.000123") self.assertEqual(t.isoformat('\x00'), "0001-02-03\x0004:05:01.000123") + # bpo-34482: Check that surrogates are handled properly. + self.assertEqual(t.isoformat('\ud800'), + "0001-02-03\ud80004:05:01.000123") self.assertEqual(t.isoformat(timespec='hours'), "0001-02-03T04") self.assertEqual(t.isoformat(timespec='minutes'), "0001-02-03T04:05") self.assertEqual(t.isoformat(timespec='seconds'), "0001-02-03T04:05:01") @@ -1754,6 +1765,8 @@ def test_isoformat(self): self.assertEqual(t.isoformat(timespec='auto'), "0001-02-03T04:05:01.000123") self.assertEqual(t.isoformat(sep=' ', timespec='minutes'), "0001-02-03 04:05") self.assertRaises(ValueError, t.isoformat, timespec='foo') + # bpo-34482: Check that surrogates are handled properly. + self.assertRaises(ValueError, t.isoformat, timespec='\ud800') # str is ISO format with the separator forced to a blank. self.assertEqual(str(t), "0001-02-03 04:05:01.000123") @@ -2277,13 +2290,21 @@ def test_utcnow(self): self.assertLessEqual(abs(from_timestamp - from_now), tolerance) def test_strptime(self): - string = '2004-12-01 13:02:47.197' - format = '%Y-%m-%d %H:%M:%S.%f' - expected = _strptime._strptime_datetime(self.theclass, string, format) - got = self.theclass.strptime(string, format) - self.assertEqual(expected, got) - self.assertIs(type(expected), self.theclass) - self.assertIs(type(got), self.theclass) + inputs = [ + ('2004-12-01 13:02:47.197', '%Y-%m-%d %H:%M:%S.%f'), + # bpo-34482: Check that surrogates are handled properly. + ('2004-12-01\ud80013:02:47.197', '%Y-%m-%d\ud800%H:%M:%S.%f'), + ('2004\ud80012-01 13:02:47.197', '%Y\ud800%m-%d %H:%M:%S.%f'), + ('2004-12-01 13:02\ud80047.197', '%Y-%m-%d %H:%M\ud800%S.%f'), + ] + for string, format in inputs: + with self.subTest(string=string, format=format): + expected = _strptime._strptime_datetime(self.theclass, string, + format) + got = self.theclass.strptime(string, format) + self.assertEqual(expected, got) + self.assertIs(type(expected), self.theclass) + self.assertIs(type(got), self.theclass) strptime = self.theclass.strptime self.assertEqual(strptime("+0002", "%z").utcoffset(), 2 * MINUTE) @@ -2869,6 +2890,8 @@ def test_isoformat(self): self.assertEqual(t.isoformat(timespec='microseconds'), "12:34:56.123456") self.assertEqual(t.isoformat(timespec='auto'), "12:34:56.123456") self.assertRaises(ValueError, t.isoformat, timespec='monkey') + # bpo-34482: Check that surrogates are handled properly. + self.assertRaises(ValueError, t.isoformat, timespec='\ud800') t = self.theclass(hour=12, minute=34, second=56, microsecond=999500) self.assertEqual(t.isoformat(timespec='milliseconds'), "12:34:56.999") @@ -2919,6 +2942,13 @@ def test_strftime(self): # A naive object replaces %z and %Z with empty strings. self.assertEqual(t.strftime("'%z' '%Z'"), "'' ''") + # bpo-34482: Check that surrogates don't cause a crash. + self.assertEqual(t.strftime('\ud800'), '\ud800') + + # bpo-34482: Check that surrogates in tzinfo don't crash + tzinfo = timezone(timedelta(hours=1), '\ud800') + t.replace(tzinfo=tzinfo).strftime("%Z") + def test_format(self): t = self.theclass(1, 2, 3, 4) self.assertEqual(t.__format__(''), str(t)) @@ -3324,11 +3354,6 @@ def tzname(self, dt): return self.tz self.assertEqual(t.strftime("%H:%M:%S"), "02:03:04") self.assertRaises(TypeError, t.strftime, "%Z") - # Issue #6697: - if '_Fast' in self.__class__.__name__: - Badtzname.tz = '\ud800' - self.assertRaises(ValueError, t.strftime, "%Z") - def test_hash_edge_cases(self): # Offsets that overflow a basic time. t1 = self.theclass(0, 1, 2, 3, tzinfo=FixedOffset(1439, "")) diff --git a/Misc/NEWS.d/next/Tests/2018-08-23-20-42-14.bpo-34482.BzQYUs.rst b/Misc/NEWS.d/next/Tests/2018-08-23-20-42-14.bpo-34482.BzQYUs.rst new file mode 100644 index 00000000000000..23da8366a4e354 --- /dev/null +++ b/Misc/NEWS.d/next/Tests/2018-08-23-20-42-14.bpo-34482.BzQYUs.rst @@ -0,0 +1,2 @@ +Add tests for proper handling of non-UTF-8-encodable strings in +:mod:`datetime` classes. Patch by Alexey Izbyshev. diff --git a/Modules/_datetimemodule.c b/Modules/_datetimemodule.c index 2522b65f1168e7..041dfc6e2c0714 100644 --- a/Modules/_datetimemodule.c +++ b/Modules/_datetimemodule.c @@ -1475,8 +1475,23 @@ wrap_strftime(PyObject *object, PyObject *format, PyObject *timetuple, assert(PyUnicode_Check(format)); /* Convert the input format to a C string and size */ pin = PyUnicode_AsUTF8AndSize(format, &flen); - if (!pin) - return NULL; + PyObject* format_esc = NULL; + if (pin == NULL) { + PyErr_Clear(); + + format_esc = PyUnicode_AsEncodedString(format, "utf-8", "surrogatepass"); + if (format_esc == NULL) { + return NULL; + } + char *pin_tmp; + if (PyBytes_AsStringAndSize(format_esc, &pin_tmp, &flen)) { + Py_DECREF(format_esc); + return NULL; + } + + pin = pin_tmp; + } + int contains_surrogates = (format_esc != NULL); /* Scan the input format, looking for %z/%Z/%f escapes, building * a new format. Since computing the replacements for those codes @@ -1537,15 +1552,33 @@ wrap_strftime(PyObject *object, PyObject *format, PyObject *timetuple, if (Zreplacement == NULL) { Zreplacement = make_Zreplacement(object, tzinfoarg); - if (Zreplacement == NULL) + if (Zreplacement == NULL) { goto Done; + } } assert(Zreplacement != NULL); assert(PyUnicode_Check(Zreplacement)); ptoappend = PyUnicode_AsUTF8AndSize(Zreplacement, &ntoappend); - if (ptoappend == NULL) - goto Done; + if (ptoappend == NULL) { + PyErr_Clear(); + + PyObject *Zreplacement_old = Zreplacement; + Zreplacement = PyUnicode_AsEncodedString( + Zreplacement, "utf-8", "surrogatepass" + ); + Py_DECREF(Zreplacement_old); + if (Zreplacement == NULL) { + goto Done; + } + + char *p_tmp; + if (PyBytes_AsStringAndSize(Zreplacement, &p_tmp, &ntoappend)) { + goto Done; + } + ptoappend = p_tmp; + contains_surrogates = 1; + } } else if (ch == 'f') { /* format microseconds */ @@ -1596,7 +1629,13 @@ wrap_strftime(PyObject *object, PyObject *format, PyObject *timetuple, if (time == NULL) goto Done; - format = PyUnicode_FromString(PyBytes_AS_STRING(newfmt)); + if (!contains_surrogates) { + format = PyUnicode_FromString(PyBytes_AS_STRING(newfmt)); + } else { + format = PyUnicode_Decode(PyBytes_AS_STRING(newfmt), + PyBytes_GET_SIZE(newfmt), + "utf-8", "surrogatepass"); + } if (format != NULL) { result = _PyObject_CallMethodIdObjArgs(time, &PyId_strftime, format, timetuple, NULL); @@ -1605,6 +1644,8 @@ wrap_strftime(PyObject *object, PyObject *format, PyObject *timetuple, Py_DECREF(time); } Done: + + Py_XDECREF(format_esc); Py_XDECREF(freplacement); Py_XDECREF(zreplacement); Py_XDECREF(Zreplacement); @@ -4898,7 +4939,12 @@ datetime_fromisoformat(PyObject* cls, PyObject *dtstr) { const char * dt_ptr = PyUnicode_AsUTF8AndSize(dtstr, &len); if (dt_ptr == NULL) { - goto invalid_string_error; + if (PyErr_ExceptionMatches(PyExc_UnicodeEncodeError)) { + // Encoding errors are invalid string errors at this point + goto invalid_string_error; + } else { + goto error; + } } const char *p = dt_ptr; diff --git a/Modules/timemodule.c b/Modules/timemodule.c index 998216cc8fdbdf..b4fdf4e6cb66bf 100644 --- a/Modules/timemodule.c +++ b/Modules/timemodule.c @@ -675,6 +675,81 @@ the C library strftime function.\n" #define time_strlen strlen #endif +static PyObject * +_escape_strftime_chars(PyObject* format_arg) +{ + // This (and the subsequent _from_escaped_strftime_chars is a special + // case for handling surrogate characters that are failed by + // surrogateescape. It is unlikely that this will be important in any + // real, non-adversarial situation, but there's something to be said for + // correctness on this point. + // + // Specifically, this will allow "pass through" printing for characters + // like \ud800, which would have to be encoded with a non-terminating + // null byte + PyObject * substr = NULL; + PyObject * replstr = NULL; + PyObject * escaped_unicode = NULL; + PyObject * rv = NULL; + + substr = PyUnicode_FromString("\\"); + if (substr == NULL) { + goto error; + } + + replstr = PyUnicode_FromString("\\\\"); + if (replstr == NULL) { + goto error; + } + + escaped_unicode = PyUnicode_Replace(format_arg, substr, replstr, -1); + if (escaped_unicode == NULL) { + goto error; + } + + rv = PyUnicode_AsUnicodeEscapeString(escaped_unicode); + +error: + Py_XDECREF(substr); + Py_XDECREF(replstr); + Py_XDECREF(escaped_unicode); + + return rv; +} + +static PyObject * +_from_escaped_strftime_chars(const char* buf, Py_ssize_t buflen) +{ + PyObject * tmp = NULL; + PyObject * substr = NULL; + PyObject * replstr = NULL; + PyObject * rv = NULL; + + tmp = PyUnicode_DecodeUnicodeEscape(buf, buflen, "strict"); + if (tmp == NULL) { + goto error; + } + + substr = PyUnicode_FromString("\\\\"); + if (substr == NULL) { + goto error; + } + + replstr = PyUnicode_FromString("\\"); + if (replstr == NULL) { + goto error; + } + + rv = PyUnicode_Replace(tmp, substr, replstr, -1); + +error: + Py_XDECREF(tmp); + Py_XDECREF(replstr); + Py_XDECREF(substr); + return rv; + +} + static PyObject * time_strftime(PyObject *self, PyObject *args) { @@ -736,8 +811,21 @@ time_strftime(PyObject *self, PyObject *args) #else /* Convert the unicode string to an ascii one */ format = PyUnicode_EncodeLocale(format_arg, "surrogateescape"); - if (format == NULL) - return NULL; + int decode_escaped = 0; + if (format == NULL) { + if (!PyErr_ExceptionMatches(PyExc_UnicodeEncodeError)) { + return NULL; + } + PyErr_Clear(); + + // Surrogate characters with 0x00 bytes - needs backslashreplace + decode_escaped = 1; + format = _escape_strftime_chars(format_arg); + if (format == NULL) { + return NULL; + } + } + fmt = PyBytes_AS_STRING(format); #endif @@ -809,7 +897,12 @@ time_strftime(PyObject *self, PyObject *args) #ifdef HAVE_WCSFTIME ret = PyUnicode_FromWideChar(outbuf, buflen); #else - ret = PyUnicode_DecodeLocaleAndSize(outbuf, buflen, "surrogateescape"); + if (decode_escaped) { + ret = _from_escaped_strftime_chars(outbuf, buflen); + } + else { + ret = PyUnicode_DecodeLocaleAndSize(outbuf, buflen, "surrogateescape"); + } #endif PyMem_Free(outbuf); break;