From 8bb6eed67ebc0f906d5d48e612a2420d3fc7f595 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 30 Sep 2022 19:28:54 -0700 Subject: [PATCH 1/3] BUG: DataFrame from dict with non-nano Timedelta --- pandas/_libs/tslibs/timedeltas.pyx | 13 +++++++++- pandas/tests/frame/test_constructors.py | 32 +++++++++++++++---------- pandas/tests/libs/test_lib.py | 16 +++++++++++++ 3 files changed, 47 insertions(+), 14 deletions(-) diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index bf22967f615c4..d68ce56d981a7 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -1082,8 +1082,19 @@ cdef class _Timedelta(timedelta): # non-invariant behavior. # see GH#44504 return hash(self.value) - else: + elif self._reso == NPY_FR_ns: return timedelta.__hash__(self) + else: + # We want to ensure that two equivalent Timedelta objects + # have the same hash. So we try downcasting to the next-lowest + # resolution. + try: + obj = self._as_reso((self._reso + 1)) + except OverflowError: + # Doesn't fit, so we're off the hook + return hash(self.value) + else: + return hash(obj) def __richcmp__(_Timedelta self, object other, int op): cdef: diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 37e08adcfdf88..5c5a23d4872d8 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -856,16 +856,20 @@ def create_data(constructor): tm.assert_frame_equal(result_datetime, expected) tm.assert_frame_equal(result_Timestamp, expected) - def test_constructor_dict_timedelta64_index(self): + @pytest.mark.parametrize( + "klass,name", + [ + (lambda x: np.timedelta64(x, "D"), "timedelta64"), + (lambda x: timedelta(days=x), "pytimedelta"), + (lambda x: Timedelta(x, "D"), "Timedelta[ns]"), + (lambda x: Timedelta(x, "D")._as_unit("s"), "Timedelta[s]"), + ], + ) + def test_constructor_dict_timedelta64_index(self, klass, name): # GH 10160 td_as_int = [1, 2, 3, 4] - def create_data(constructor): - return {i: {constructor(s): 2 * i} for i, s in enumerate(td_as_int)} - - data_timedelta64 = create_data(lambda x: np.timedelta64(x, "D")) - data_timedelta = create_data(lambda x: timedelta(days=x)) - data_Timedelta = create_data(lambda x: Timedelta(x, "D")) + data = {i: {klass(s): 2 * i} for i, s in enumerate(td_as_int)} expected = DataFrame( [ @@ -877,12 +881,14 @@ def create_data(constructor): index=[Timedelta(td, "D") for td in td_as_int], ) - result_timedelta64 = DataFrame(data_timedelta64) - result_timedelta = DataFrame(data_timedelta) - result_Timedelta = DataFrame(data_Timedelta) - tm.assert_frame_equal(result_timedelta64, expected) - tm.assert_frame_equal(result_timedelta, expected) - tm.assert_frame_equal(result_Timedelta, expected) + if name == "Timedelta[s]": + # TODO(2.0): passing index here shouldn't be necessary, is for now + # otherwise we raise in _extract_index + result = DataFrame(data, index=expected.index) + else: + result = DataFrame(data) + + tm.assert_frame_equal(result, expected) def test_constructor_period_dict(self): # PeriodIndex diff --git a/pandas/tests/libs/test_lib.py b/pandas/tests/libs/test_lib.py index e9a26b26eefc4..052b19dbd306c 100644 --- a/pandas/tests/libs/test_lib.py +++ b/pandas/tests/libs/test_lib.py @@ -2,6 +2,7 @@ import pytest from pandas._libs import ( + Timedelta, lib, writers as libwriters, ) @@ -42,6 +43,21 @@ def test_fast_unique_multiple_list_gen_sort(self): out = lib.fast_unique_multiple_list_gen(gen, sort=False) tm.assert_numpy_array_equal(np.array(out), expected) + def test_fast_multiget_timedelta_resos(self): + # This will become relevant for test_constructor_dict_timedelta64_index + # once Timedelta constructor preserves reso when passed a + # np.timedelta64 object + td = Timedelta(days=1) + + mapping1 = {td: 1} + mapping2 = {td._as_unit("s"): 1} + + oindex = Index([td * n for n in range(3)])._values.astype(object) + + expected = lib.fast_multiget(mapping1, oindex) + result = lib.fast_multiget(mapping2, oindex) + tm.assert_numpy_array_equal(result, expected) + class TestIndexing: def test_maybe_indices_to_slice_left_edge(self): From 809064c1b9c8c03ee001ea47e08e43994bdbf49e Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 12 Oct 2022 15:46:07 -0700 Subject: [PATCH 2/3] fix __hash__ --- pandas/_libs/tslibs/timedeltas.pxd | 1 + pandas/_libs/tslibs/timedeltas.pyx | 18 ++++++++++++++++-- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/tslibs/timedeltas.pxd b/pandas/_libs/tslibs/timedeltas.pxd index 3251e10a88b73..f4e2817e414e5 100644 --- a/pandas/_libs/tslibs/timedeltas.pxd +++ b/pandas/_libs/tslibs/timedeltas.pxd @@ -22,6 +22,7 @@ cdef class _Timedelta(timedelta): cpdef timedelta to_pytimedelta(_Timedelta self) cdef bint _has_ns(self) + cdef bint _is_in_pytimedelta_bounds(self) cdef _ensure_components(_Timedelta self) cdef inline bint _compare_mismatched_resos(self, _Timedelta other, op) cdef _Timedelta _as_reso(self, NPY_DATETIMEUNIT reso, bint round_ok=*) diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index d68ce56d981a7..9ba49685c4ad8 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -963,7 +963,6 @@ cdef _timedelta_from_value_and_reso(int64_t value, NPY_DATETIMEUNIT reso): "Only resolutions 's', 'ms', 'us', 'ns' are supported." ) - td_base.value = value td_base._is_populated = 0 td_base._reso = reso @@ -1082,7 +1081,15 @@ cdef class _Timedelta(timedelta): # non-invariant behavior. # see GH#44504 return hash(self.value) - elif self._reso == NPY_FR_ns: + elif self._is_in_pytimedelta_bounds() and ( + self._reso == NPY_FR_ns or self._reso == NPY_DATETIMEUNIT.NPY_FR_us + ): + # If we can defer to timedelta.__hash__, do so, as that + # ensures the hash is invariant to our _reso. + # We can only defer for ns and us, as for these two resos we + # call _Timedelta.__new__ with the correct input in + # _timedelta_from_value_and_reso; so timedelta.__hash__ + # will be correct return timedelta.__hash__(self) else: # We want to ensure that two equivalent Timedelta objects @@ -1152,6 +1159,13 @@ cdef class _Timedelta(timedelta): else: raise NotImplementedError(self._reso) + cdef bint _is_in_pytimedelta_bounds(self): + """ + Check if we are within the bounds of datetime.timedelta. + """ + self._ensure_components() + return -999999999 <= self._d and self._d <= 999999999 + cdef _ensure_components(_Timedelta self): """ compute the components From c47d8573e76e12ea123f5fff40c6e06b091c8e26 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 18 Nov 2022 11:53:35 -0800 Subject: [PATCH 3/3] test for both hash cases --- pandas/_libs/tslibs/timedeltas.pyx | 4 ++-- pandas/tests/frame/test_constructors.py | 2 +- pandas/tests/libs/test_lib.py | 15 ++++++++++++++- 3 files changed, 17 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 83b533dfff3c3..d084b4944b6ed 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -1094,7 +1094,7 @@ cdef class _Timedelta(timedelta): # see GH#44504 return hash(self.value) elif self._is_in_pytimedelta_bounds() and ( - self._reso == NPY_FR_ns or self._reso == NPY_DATETIMEUNIT.NPY_FR_us + self._creso == NPY_FR_ns or self._creso == NPY_DATETIMEUNIT.NPY_FR_us ): # If we can defer to timedelta.__hash__, do so, as that # ensures the hash is invariant to our _reso. @@ -1108,7 +1108,7 @@ cdef class _Timedelta(timedelta): # have the same hash. So we try downcasting to the next-lowest # resolution. try: - obj = self._as_reso((self._reso + 1)) + obj = (<_Timedelta>self)._as_creso((self._creso + 1)) except OverflowError: # Doesn't fit, so we're off the hook return hash(self.value) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index fdf9ee2d217d1..b83e060a66650 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -851,7 +851,7 @@ def create_data(constructor): (lambda x: np.timedelta64(x, "D"), "timedelta64"), (lambda x: timedelta(days=x), "pytimedelta"), (lambda x: Timedelta(x, "D"), "Timedelta[ns]"), - (lambda x: Timedelta(x, "D")._as_unit("s"), "Timedelta[s]"), + (lambda x: Timedelta(x, "D").as_unit("s"), "Timedelta[s]"), ], ) def test_constructor_dict_timedelta64_index(self, klass, name): diff --git a/pandas/tests/libs/test_lib.py b/pandas/tests/libs/test_lib.py index 052b19dbd306c..fd7c47d47112f 100644 --- a/pandas/tests/libs/test_lib.py +++ b/pandas/tests/libs/test_lib.py @@ -50,7 +50,20 @@ def test_fast_multiget_timedelta_resos(self): td = Timedelta(days=1) mapping1 = {td: 1} - mapping2 = {td._as_unit("s"): 1} + mapping2 = {td.as_unit("s"): 1} + + oindex = Index([td * n for n in range(3)])._values.astype(object) + + expected = lib.fast_multiget(mapping1, oindex) + result = lib.fast_multiget(mapping2, oindex) + tm.assert_numpy_array_equal(result, expected) + + # case that can't be cast to td64ns + td = Timedelta(np.timedelta64(400, "Y")) + assert hash(td) == hash(td.as_unit("ms")) + assert hash(td) == hash(td.as_unit("us")) + mapping1 = {td: 1} + mapping2 = {td.as_unit("ms"): 1} oindex = Index([td * n for n in range(3)])._values.astype(object)