From 2f48e5381900cf37c2b98a9cab51f91d718e7256 Mon Sep 17 00:00:00 2001
From: Paul Ganssle <paul@ganssle.io>
Date: Wed, 22 Aug 2018 12:15:43 -0400
Subject: [PATCH 1/7] Fix issue with non-UTF8 separator strings

It is possible to pass a non-UTF-8 string as a separator in
datetime.isoformat, but the current implementation starts by decoding to
UTF-8, which will fail even for some valid strings.

In the special case of non-UTF-8 separators, we take a performance hit
by encoding the string as ASCII and replacing any invalid characters
with ?.
---
 Modules/_datetimemodule.c | 43 ++++++++++++++++++++++++++++++++++++---
 1 file changed, 40 insertions(+), 3 deletions(-)

diff --git a/Modules/_datetimemodule.c b/Modules/_datetimemodule.c
index 076912d58f4af8..7e9459b7eb642b 100644
--- a/Modules/_datetimemodule.c
+++ b/Modules/_datetimemodule.c
@@ -4848,8 +4848,33 @@ datetime_fromisoformat(PyObject* cls, PyObject *dtstr) {
         return NULL;
     }
 
+    const PyObject * dtstr_bytes = NULL;
+    unsigned char bytes_needs_decref = 0;
+
     Py_ssize_t len;
     const char * dt_ptr = PyUnicode_AsUTF8AndSize(dtstr, &len);
+
+    if (dt_ptr == NULL) {
+        len = PyUnicode_GET_LENGTH(dtstr);
+        if (len == 10) {
+            goto invalid_string_error;
+        }
+        PyErr_Clear();
+
+        // If the datetime string cannot be encoded as UTF8 because the
+        // separator character is an invalid character, this could still
+        // be a valid isoformat, so we decode it and ignore.
+        dtstr_bytes = PyUnicode_AsEncodedString(dtstr, "ascii", "replace");
+        if (dtstr_bytes == NULL) {
+            goto finally;
+        }
+        bytes_needs_decref = 1;
+        dt_ptr = PyBytes_AS_STRING(dtstr_bytes);
+        if (dt_ptr == NULL) {
+            goto finally;
+        }
+    }
+
     const char * p = dt_ptr;
 
     int year = 0, month = 0, day = 0;
@@ -4883,20 +4908,32 @@ datetime_fromisoformat(PyObject* cls, PyObject *dtstr) {
                                   &tzoffset, &tzusec);
     }
     if (rv < 0) {
-        PyErr_Format(PyExc_ValueError, "Invalid isoformat string: %s", dt_ptr);
-        return NULL;
+        goto invalid_string_error;
     }
 
     PyObject* tzinfo = tzinfo_from_isoformat_results(rv, tzoffset, tzusec);
     if (tzinfo == NULL) {
-        return NULL;
+        goto finally;
     }
 
     PyObject *dt = new_datetime_subclass_ex(year, month, day, hour, minute,
                                             second, microsecond, tzinfo, cls);
 
     Py_DECREF(tzinfo);
+    if (bytes_needs_decref) {
+        Py_DECREF(dtstr_bytes);
+    }
     return dt;
+
+invalid_string_error:
+    PyErr_Format(PyExc_ValueError, "Invalid isoformat string: %R", dtstr);
+
+finally:
+    if (bytes_needs_decref) {
+        Py_DECREF(dtstr_bytes);
+    }
+
+    return NULL;
 }
 
 

From 85a99ca97a90c8b91b4901a7deea7b444303aa94 Mon Sep 17 00:00:00 2001
From: Paul Ganssle <paul@ganssle.io>
Date: Wed, 22 Aug 2018 12:50:19 -0400
Subject: [PATCH 2/7] Fix non-UTF8 crash for (date|time)_fromisoformat

Previously this would end up dereferencing a NULL pointer if the
PyUnicode_AsUTF8AndSize call failed, this makes it so that the same
error as any other parsing error is raised.
---
 Modules/_datetimemodule.c | 23 ++++++++++++++++++-----
 1 file changed, 18 insertions(+), 5 deletions(-)

diff --git a/Modules/_datetimemodule.c b/Modules/_datetimemodule.c
index 7e9459b7eb642b..91cf203ec9d37c 100644
--- a/Modules/_datetimemodule.c
+++ b/Modules/_datetimemodule.c
@@ -2883,6 +2883,9 @@ date_fromisoformat(PyObject *cls, PyObject *dtstr) {
     Py_ssize_t len;
 
     const char * dt_ptr = PyUnicode_AsUTF8AndSize(dtstr, &len);
+    if (dt_ptr == NULL) {
+        goto invalid_string_error;
+    }
 
     int year = 0, month = 0, day = 0;
 
@@ -2894,12 +2897,15 @@ date_fromisoformat(PyObject *cls, PyObject *dtstr) {
     }
 
     if (rv < 0) {
-        PyErr_Format(PyExc_ValueError, "Invalid isoformat string: %s",
-                     dt_ptr);
-        return NULL;
+        goto invalid_string_error;
     }
 
     return new_date_subclass_ex(year, month, day, cls);
+
+invalid_string_error:
+    PyErr_Format(PyExc_ValueError, "Invalid isoformat string: %R",
+                 dtstr);
+    return NULL;
 }
 
 
@@ -4258,6 +4264,10 @@ time_fromisoformat(PyObject *cls, PyObject *tstr) {
     Py_ssize_t len;
     const char *p = PyUnicode_AsUTF8AndSize(tstr, &len);
 
+    if (p == NULL) {
+        goto invalid_string_error;
+    }
+
     int hour = 0, minute = 0, second = 0, microsecond = 0;
     int tzoffset, tzimicrosecond = 0;
     int rv = parse_isoformat_time(p, len,
@@ -4265,8 +4275,7 @@ time_fromisoformat(PyObject *cls, PyObject *tstr) {
                                   &tzoffset, &tzimicrosecond);
 
     if (rv < 0) {
-        PyErr_Format(PyExc_ValueError, "Invalid isoformat string: %s", p);
-        return NULL;
+        goto invalid_string_error;
     }
 
     PyObject *tzinfo = tzinfo_from_isoformat_results(rv, tzoffset,
@@ -4286,6 +4295,10 @@ time_fromisoformat(PyObject *cls, PyObject *tstr) {
 
     Py_DECREF(tzinfo);
     return t;
+
+invalid_string_error:
+    PyErr_Format(PyExc_ValueError, "Invalid isoformat string: %R", tstr);
+    return NULL;
 }
 
 

From dd82aa01bb2789595f178d6d8501557c5516502c Mon Sep 17 00:00:00 2001
From: Paul Ganssle <paul@ganssle.io>
Date: Wed, 22 Aug 2018 16:23:22 -0400
Subject: [PATCH 3/7] Refactor non-UTF-8 sanitization

This increases performance for valid non-UTF-8 strings by avoiding
an error condition, and minimizes the impact on the rest of the
algorithm.
---
 Modules/_datetimemodule.c | 74 +++++++++++++++++++++++++--------------
 1 file changed, 47 insertions(+), 27 deletions(-)

diff --git a/Modules/_datetimemodule.c b/Modules/_datetimemodule.c
index 91cf203ec9d37c..bb86b28f10a9c3 100644
--- a/Modules/_datetimemodule.c
+++ b/Modules/_datetimemodule.c
@@ -4852,6 +4852,40 @@ datetime_combine(PyObject *cls, PyObject *args, PyObject *kw)
     return result;
 }
 
+static PyObject *
+_sanitize_isoformat_str(PyObject *dtstr, int *needs_decref) {
+    // `fromisoformat` allows surrogate characters in exactly one position,
+    // the separator; to allow datetime_fromisoformat to make the simplifying
+    // assumption that all valid strings can be encoded in UTF-8, this function
+    // replaces any surrogate character separators with `T`.
+    Py_ssize_t len = PyUnicode_GetLength(dtstr);
+    *needs_decref = 0;
+    if (len <= 10 || !Py_UNICODE_IS_SURROGATE(PyUnicode_READ_CHAR(dtstr, 10))) {
+        return dtstr;
+    }
+
+    PyObject *left = PyUnicode_Substring(dtstr, 0, 10);
+    if (left == NULL) {
+        return NULL;
+    }
+
+    PyObject *right = PyUnicode_Substring(dtstr, 11, len);
+    if (right == NULL) {
+        Py_DECREF(left);
+        return NULL;
+    }
+
+    PyObject *str_out = PyUnicode_FromFormat("%UT%U", left, right);
+    Py_DECREF(left);
+    Py_DECREF(right);
+    if (str_out == NULL) {
+        return NULL;
+    }
+
+    *needs_decref = 1;
+    return str_out;
+}
+
 static PyObject *
 datetime_fromisoformat(PyObject* cls, PyObject *dtstr) {
     assert(dtstr != NULL);
@@ -4861,34 +4895,20 @@ datetime_fromisoformat(PyObject* cls, PyObject *dtstr) {
         return NULL;
     }
 
-    const PyObject * dtstr_bytes = NULL;
-    unsigned char bytes_needs_decref = 0;
+    int needs_decref = 0;
+    dtstr = _sanitize_isoformat_str(dtstr, &needs_decref);
+    if (dtstr == NULL) {
+        goto error;
+    }
 
     Py_ssize_t len;
     const char * dt_ptr = PyUnicode_AsUTF8AndSize(dtstr, &len);
 
     if (dt_ptr == NULL) {
-        len = PyUnicode_GET_LENGTH(dtstr);
-        if (len == 10) {
-            goto invalid_string_error;
-        }
-        PyErr_Clear();
-
-        // If the datetime string cannot be encoded as UTF8 because the
-        // separator character is an invalid character, this could still
-        // be a valid isoformat, so we decode it and ignore.
-        dtstr_bytes = PyUnicode_AsEncodedString(dtstr, "ascii", "replace");
-        if (dtstr_bytes == NULL) {
-            goto finally;
-        }
-        bytes_needs_decref = 1;
-        dt_ptr = PyBytes_AS_STRING(dtstr_bytes);
-        if (dt_ptr == NULL) {
-            goto finally;
-        }
+        goto invalid_string_error;
     }
 
-    const char * p = dt_ptr;
+    const char *p = dt_ptr;
 
     int year = 0, month = 0, day = 0;
     int hour = 0, minute = 0, second = 0, microsecond = 0;
@@ -4926,24 +4946,24 @@ datetime_fromisoformat(PyObject* cls, PyObject *dtstr) {
 
     PyObject* tzinfo = tzinfo_from_isoformat_results(rv, tzoffset, tzusec);
     if (tzinfo == NULL) {
-        goto finally;
+        goto error;
     }
 
     PyObject *dt = new_datetime_subclass_ex(year, month, day, hour, minute,
                                             second, microsecond, tzinfo, cls);
 
     Py_DECREF(tzinfo);
-    if (bytes_needs_decref) {
-        Py_DECREF(dtstr_bytes);
+    if (needs_decref) {
+        Py_DECREF(dtstr);
     }
     return dt;
 
 invalid_string_error:
     PyErr_Format(PyExc_ValueError, "Invalid isoformat string: %R", dtstr);
 
-finally:
-    if (bytes_needs_decref) {
-        Py_DECREF(dtstr_bytes);
+error:
+    if (needs_decref) {
+        Py_DECREF(dtstr);
     }
 
     return NULL;

From c24388e94666ef079a4978956016512a3e9ce3a4 Mon Sep 17 00:00:00 2001
From: Tal Einat <taleinat@gmail.com>
Date: Wed, 22 Aug 2018 16:27:43 -0400
Subject: [PATCH 4/7] Add tests for surrogate code points

Co-authored-by: Alexey Izbyshev <izbyshev@ispras.ru>
Co-authored-by: Paul Ganssle <paul@ganssle.io>
---
 Lib/test/datetimetester.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/Lib/test/datetimetester.py b/Lib/test/datetimetester.py
index f647a232f40442..9c6e71c54d79d3 100644
--- a/Lib/test/datetimetester.py
+++ b/Lib/test/datetimetester.py
@@ -1667,6 +1667,7 @@ def test_fromisoformat_fails(self):
         # Test that fromisoformat() fails on invalid values
         bad_strs = [
             '',                 # Empty string
+            '\ud800',           # bpo-34454: Surrogate code point
             '009-03-04',        # Not 10 characters
             '123456789',        # Not a date
             '200a-12-04',       # Invalid character in year
@@ -1675,6 +1676,7 @@ def test_fromisoformat_fails(self):
             '2009-01-32',       # Invalid day
             '2009-02-29',       # Invalid leap day
             '20090228',         # Valid ISO8601 output not from isoformat()
+            '2009\ud80002\ud80028',     # Separators are surrogate codepoints
         ]
 
         for bad_str in bad_strs:
@@ -2587,7 +2589,8 @@ def test_fromisoformat_separators(self):
             ' ', 'T', '\u007f',     # 1-bit widths
             '\u0080', 'ʁ',          # 2-bit widths
             'ᛇ', '時',               # 3-bit widths
-            '🐍'                     # 4-bit widths
+            '🐍',                    # 4-bit widths
+            '\ud800',               # bpo-34454: Surrogate code point
         ]
 
         for sep in separators:
@@ -2639,6 +2642,7 @@ def test_fromisoformat_fails_datetime(self):
         # Test that fromisoformat() fails on invalid values
         bad_strs = [
             '',                             # Empty string
+            '\ud800',                       # bpo-34454: Surrogate code point
             '2009.04-19T03',                # Wrong first separator
             '2009-04.19T03',                # Wrong second separator
             '2009-04-19T0a',                # Invalid hours
@@ -2652,6 +2656,8 @@ def test_fromisoformat_fails_datetime(self):
             '2009-04-19T03:15:45.123456+24:30',    # Invalid time zone offset
             '2009-04-19T03:15:45.123456-24:30',    # Invalid negative offset
             '2009-04-10ᛇᛇᛇᛇᛇ12:15',         # Too many unicode separators
+            '2009-04\ud80010T12:15',        # Surrogate char in date
+            '2009-04-10T12\ud80015',        # Surrogate char in time
             '2009-04-19T1',                 # Incomplete hours
             '2009-04-19T12:3',              # Incomplete minutes
             '2009-04-19T12:30:4',           # Incomplete seconds
@@ -3521,6 +3527,7 @@ def test_fromisoformat_timespecs(self):
     def test_fromisoformat_fails(self):
         bad_strs = [
             '',                         # Empty string
+            '12\ud80000',               # Invalid separator - surrogate char
             '12:',                      # Ends on a separator
             '12:30:',                   # Ends on a separator
             '12:30:15.',                # Ends on a separator

From a0246a0a351376fee0806d7f1e9c1716fe6b2c56 Mon Sep 17 00:00:00 2001
From: Tal Einat <taleinat@gmail.com>
Date: Wed, 22 Aug 2018 16:27:14 -0400
Subject: [PATCH 5/7] Add news entry for bpo-34454

---
 .../next/Library/2018-08-22-21-59-08.bpo-34454.z7uG4b.rst     | 4 ++++
 1 file changed, 4 insertions(+)
 create mode 100644 Misc/NEWS.d/next/Library/2018-08-22-21-59-08.bpo-34454.z7uG4b.rst

diff --git a/Misc/NEWS.d/next/Library/2018-08-22-21-59-08.bpo-34454.z7uG4b.rst b/Misc/NEWS.d/next/Library/2018-08-22-21-59-08.bpo-34454.z7uG4b.rst
new file mode 100644
index 00000000000000..d6c5eee84c60b1
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2018-08-22-21-59-08.bpo-34454.z7uG4b.rst
@@ -0,0 +1,4 @@
+Fix the .fromisoformat() methods of datetime types crashing when given
+unicode with non-UTF-8-encodable code points.  Specifically,
+datetime.fromisoformat() now accepts surrogate unicode code points used as
+the separator. Report and tests by Alexey Izbyshev.

From b89d4f5627a423bf66b86ad90a436ebea6c4b8e9 Mon Sep 17 00:00:00 2001
From: Paul Ganssle <paul@ganssle.io>
Date: Wed, 22 Aug 2018 17:51:33 -0400
Subject: [PATCH 6/7] Refactor sanitize_isoformat_str

Rather than splitting the string at position 10 and re-joining it with
PyUnicode_Format, this copies the original unicode object and overwrites
the separator character.

Co-Authored-By: Alexey Izbyshev <izbyshev@ispras.ru>
---
 Modules/_datetimemodule.c | 17 +++++------------
 1 file changed, 5 insertions(+), 12 deletions(-)

diff --git a/Modules/_datetimemodule.c b/Modules/_datetimemodule.c
index bb86b28f10a9c3..2522b65f1168e7 100644
--- a/Modules/_datetimemodule.c
+++ b/Modules/_datetimemodule.c
@@ -4864,21 +4864,14 @@ _sanitize_isoformat_str(PyObject *dtstr, int *needs_decref) {
         return dtstr;
     }
 
-    PyObject *left = PyUnicode_Substring(dtstr, 0, 10);
-    if (left == NULL) {
-        return NULL;
-    }
-
-    PyObject *right = PyUnicode_Substring(dtstr, 11, len);
-    if (right == NULL) {
-        Py_DECREF(left);
+    PyObject *str_out = PyUnicode_New(len, PyUnicode_MAX_CHAR_VALUE(dtstr));
+    if (str_out == NULL) {
         return NULL;
     }
 
-    PyObject *str_out = PyUnicode_FromFormat("%UT%U", left, right);
-    Py_DECREF(left);
-    Py_DECREF(right);
-    if (str_out == NULL) {
+    if (PyUnicode_CopyCharacters(str_out, 0, dtstr, 0, len) == -1 ||
+            PyUnicode_WriteChar(str_out, 10, (Py_UCS4)'T')) {
+        Py_DECREF(str_out);
         return NULL;
     }
 

From 160e779654eda94a819b928ebd60901d588f2b6a Mon Sep 17 00:00:00 2001
From: Tal Einat <taleinat+github@gmail.com>
Date: Thu, 23 Aug 2018 16:32:32 +0300
Subject: [PATCH 7/7] added mention of patch author in NEWS

---
 .../next/Library/2018-08-22-21-59-08.bpo-34454.z7uG4b.rst       | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Misc/NEWS.d/next/Library/2018-08-22-21-59-08.bpo-34454.z7uG4b.rst b/Misc/NEWS.d/next/Library/2018-08-22-21-59-08.bpo-34454.z7uG4b.rst
index d6c5eee84c60b1..1d5c32708c1554 100644
--- a/Misc/NEWS.d/next/Library/2018-08-22-21-59-08.bpo-34454.z7uG4b.rst
+++ b/Misc/NEWS.d/next/Library/2018-08-22-21-59-08.bpo-34454.z7uG4b.rst
@@ -1,4 +1,4 @@
 Fix the .fromisoformat() methods of datetime types crashing when given
 unicode with non-UTF-8-encodable code points.  Specifically,
 datetime.fromisoformat() now accepts surrogate unicode code points used as
-the separator. Report and tests by Alexey Izbyshev.
+the separator. Report and tests by Alexey Izbyshev, patch by Paul Ganssle.