From a5d2b9a4650fe63cdbd57e34b66e90d457dcd571 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Sun, 21 May 2023 07:48:20 -0400 Subject: [PATCH 1/4] ENH/PERF: pyarrow timestamp & duration conversion consistency --- doc/source/whatsnew/v2.1.0.rst | 3 +- pandas/core/arrays/arrow/array.py | 57 +++++++++++++++++++++--- pandas/core/arrays/datetimelike.py | 14 ++++++ pandas/tests/extension/test_arrow.py | 66 ++++++++++++++++++++++++++++ 4 files changed, 133 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index db97602dcf4df..c58c38a602bb1 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -300,7 +300,7 @@ Performance improvements - Performance improvement in :meth:`Series.to_numpy` when dtype is a numpy float dtype and ``na_value`` is ``np.nan`` (:issue:`52430`) - Performance improvement in :meth:`~arrays.ArrowExtensionArray.to_numpy` (:issue:`52525`) - Performance improvement when doing various reshaping operations on :class:`arrays.IntegerArrays` & :class:`arrays.FloatingArray` by avoiding doing unnecessary validation (:issue:`53013`) -- +- Performance improvement in :meth:`~arrays.ArrowExtensionArray.astype` when converting from a pyarrow timestamp or duration dtype to numpy (:issue:`#####`) .. --------------------------------------------------------------------------- .. _whatsnew_210.bug_fixes: @@ -447,6 +447,7 @@ Sparse ExtensionArray ^^^^^^^^^^^^^^ - Bug in :class:`~arrays.ArrowExtensionArray` converting pandas non-nanosecond temporal objects from non-zero values to zero values (:issue:`53171`) +- Bug in :metho:`~arrays.ArrowExtensionArray.__iter__` and :metho:`~arrays.ArrowExtensionArray.__getitem__` returning python datetime and timedelta objects for non-nano dtypes (:issue:`#####`) - Bug in :meth:`Series.quantile` for pyarrow temporal types raising ArrowInvalid (:issue:`52678`) - Bug in :meth:`Series.rank` returning wrong order for small values with ``Float64`` dtype (:issue:`52471`) - Bug where the ``__from_arrow__`` method of masked ExtensionDtypes(e.g. :class:`Float64Dtype`, :class:`BooleanDtype`) would not accept pyarrow arrays of type ``pyarrow.null()`` (:issue:`52223`) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 3b92804f65112..1d725c1c2050b 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -533,9 +533,16 @@ def __getitem__(self, item: PositionalIndexer): if isinstance(value, pa.ChunkedArray): return type(self)(value) else: + pa_type = self._pa_array.type scalar = value.as_py() if scalar is None: return self._dtype.na_value + elif pa.types.is_timestamp(pa_type) and pa_type.unit != "ns": + # GH ##### + return Timestamp(scalar).as_unit(pa_type.unit) + elif pa.types.is_duration(pa_type) and pa_type.unit != "ns": + # GH ##### + return Timedelta(scalar).as_unit(pa_type.unit) else: return scalar @@ -544,10 +551,18 @@ def __iter__(self) -> Iterator[Any]: Iterate over elements of the array. """ na_value = self._dtype.na_value + # GH ##### + pa_type = self._pa_array.type + box_timestamp = pa.types.is_timestamp(pa_type) and pa_type.unit != "ns" + box_timedelta = pa.types.is_duration(pa_type) and pa_type.unit != "ns" for value in self._pa_array: val = value.as_py() if val is None: yield na_value + elif box_timestamp: + yield Timestamp(val).as_unit(pa_type.unit) + elif box_timedelta: + yield Timedelta(val).as_unit(pa_type.unit) else: yield val @@ -1157,16 +1172,46 @@ def to_numpy( copy: bool = False, na_value: object = lib.no_default, ) -> np.ndarray: - if dtype is None and self._hasna: - dtype = object + if dtype is not None: + dtype = np.dtype(dtype) + elif self._hasna: + dtype = np.dtype(object) + if na_value is lib.no_default: na_value = self.dtype.na_value pa_type = self._pa_array.type - if pa.types.is_temporal(pa_type) and not pa.types.is_date(pa_type): - # temporal types with units and/or timezones currently - # require pandas/python scalars to pass all tests - # TODO: improve performance (this is slow) + if pa.types.is_timestamp(pa_type): + from pandas.core.arrays.datetimes import ( + DatetimeArray, + tz_to_dtype, + ) + + np_dtype = np.dtype(f"M8[{pa_type.unit}]") + result = self._pa_array.to_numpy() + result = result.astype(np_dtype, copy=copy) + if dtype is None or dtype.kind == "O": + dta_dtype = tz_to_dtype(pa_type.tz, pa_type.unit) + result = DatetimeArray._simple_new(result, dtype=dta_dtype) + result = result.to_numpy(dtype=object, na_value=na_value) + elif result.dtype != dtype: + result = result.astype(dtype, copy=False) + return result + elif pa.types.is_duration(pa_type): + from pandas.core.arrays.timedeltas import TimedeltaArray + + np_dtype = np.dtype(f"m8[{pa_type.unit}]") + result = self._pa_array.to_numpy() + result = result.astype(np_dtype, copy=copy) + if dtype is None or dtype.kind == "O": + result = TimedeltaArray._simple_new(result, dtype=np_dtype) + result = result.to_numpy(dtype=object, na_value=na_value) + elif result.dtype != dtype: + result = result.astype(dtype, copy=False) + return result + elif pa.types.is_time(pa_type): + # convert to list of python datetime.time objects before + # wrapping in ndarray result = np.array(list(self), dtype=dtype) elif is_object_dtype(dtype) and self._hasna: result = np.empty(len(self), dtype=object) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 964e5a0944f16..fe722e2661146 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -2204,6 +2204,20 @@ def ensure_arraylike_for_datetimelike(data, copy: bool, cls_name: str): ): data = data.to_numpy("int64", na_value=iNaT) copy = False + elif isinstance(data, ArrowExtensionArray) and data.dtype.kind == "M": + from pandas.core.arrays import DatetimeArray + from pandas.core.arrays.datetimes import tz_to_dtype + + pa_type = data._pa_array.type + dtype = tz_to_dtype(tz=pa_type.tz, unit=pa_type.unit) + data = data.to_numpy(f"M8[{pa_type.unit}]", na_value=iNaT) + data = DatetimeArray._simple_new(data, dtype=dtype) + copy = False + elif isinstance(data, ArrowExtensionArray) and data.dtype.kind == "m": + pa_type = data._pa_array.type + dtype = np.dtype(f"m8[{pa_type.unit}]") + data = data.to_numpy(dtype, na_value=iNaT) + copy = False elif not isinstance(data, (np.ndarray, ExtensionArray)) or isinstance( data, ArrowExtensionArray ): diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 9e5955da76d1c..5640a5921ac16 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -3008,3 +3008,69 @@ def test_comparison_temporal(pa_type): result = arr > val expected = ArrowExtensionArray(pa.array([False, True, True], type=pa.bool_())) tm.assert_extension_array_equal(result, expected) + + +@pytest.mark.parametrize( + "pa_type", tm.DATETIME_PYARROW_DTYPES + tm.TIMEDELTA_PYARROW_DTYPES +) +def test_getitem_temporal(pa_type): + # GH ##### + arr = ArrowExtensionArray(pa.array([1, 2, 3], type=pa_type)) + result = arr[1] + if pa.types.is_duration(pa_type): + expected = pd.Timedelta(2, unit=pa_type.unit).as_unit(pa_type.unit) + assert isinstance(result, pd.Timedelta) + else: + expected = pd.Timestamp(2, unit=pa_type.unit, tz=pa_type.tz).as_unit( + pa_type.unit + ) + assert isinstance(result, pd.Timestamp) + assert result.unit == expected.unit + assert result == expected + + +@pytest.mark.parametrize( + "pa_type", tm.DATETIME_PYARROW_DTYPES + tm.TIMEDELTA_PYARROW_DTYPES +) +def test_iter_temporal(pa_type): + # GH ##### + arr = ArrowExtensionArray(pa.array([1, None], type=pa_type)) + result = list(arr) + if pa.types.is_duration(pa_type): + expected = [ + pd.Timedelta(1, unit=pa_type.unit).as_unit(pa_type.unit), + pd.NA, + ] + assert isinstance(result[0], pd.Timedelta) + else: + expected = [ + pd.Timestamp(1, unit=pa_type.unit, tz=pa_type.tz).as_unit(pa_type.unit), + pd.NA, + ] + assert isinstance(result[0], pd.Timestamp) + assert result[0].unit == expected[0].unit + assert result == expected + + +@pytest.mark.parametrize( + "pa_type", tm.DATETIME_PYARROW_DTYPES + tm.TIMEDELTA_PYARROW_DTYPES +) +def test_to_numpy_temporal(pa_type): + # GH ##### + arr = ArrowExtensionArray(pa.array([1, None], type=pa_type)) + result = arr.to_numpy() + if pa.types.is_duration(pa_type): + expected = [ + pd.Timedelta(1, unit=pa_type.unit).as_unit(pa_type.unit), + pd.NA, + ] + assert isinstance(result[0], pd.Timedelta) + else: + expected = [ + pd.Timestamp(1, unit=pa_type.unit, tz=pa_type.tz).as_unit(pa_type.unit), + pd.NA, + ] + assert isinstance(result[0], pd.Timestamp) + expected = np.array(expected, dtype=object) + assert result[0].unit == expected[0].unit + tm.assert_numpy_array_equal(result, expected) From 94e66d692409532dfceccb703c93c5aeb341fd7f Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Sun, 21 May 2023 07:56:41 -0400 Subject: [PATCH 2/4] gh refs --- doc/source/whatsnew/v2.1.0.rst | 4 ++-- pandas/core/arrays/arrow/array.py | 6 +++--- pandas/tests/extension/test_arrow.py | 6 +++--- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index c58c38a602bb1..d40a97c7076e6 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -298,9 +298,9 @@ Performance improvements - Performance improvement in :meth:`Series.corr` and :meth:`Series.cov` for extension dtypes (:issue:`52502`) - Performance improvement in :meth:`Series.str.get` for pyarrow-backed strings (:issue:`53152`) - Performance improvement in :meth:`Series.to_numpy` when dtype is a numpy float dtype and ``na_value`` is ``np.nan`` (:issue:`52430`) +- Performance improvement in :meth:`~arrays.ArrowExtensionArray.astype` when converting from a pyarrow timestamp or duration dtype to numpy (:issue:`53326`) - Performance improvement in :meth:`~arrays.ArrowExtensionArray.to_numpy` (:issue:`52525`) - Performance improvement when doing various reshaping operations on :class:`arrays.IntegerArrays` & :class:`arrays.FloatingArray` by avoiding doing unnecessary validation (:issue:`53013`) -- Performance improvement in :meth:`~arrays.ArrowExtensionArray.astype` when converting from a pyarrow timestamp or duration dtype to numpy (:issue:`#####`) .. --------------------------------------------------------------------------- .. _whatsnew_210.bug_fixes: @@ -447,9 +447,9 @@ Sparse ExtensionArray ^^^^^^^^^^^^^^ - Bug in :class:`~arrays.ArrowExtensionArray` converting pandas non-nanosecond temporal objects from non-zero values to zero values (:issue:`53171`) -- Bug in :metho:`~arrays.ArrowExtensionArray.__iter__` and :metho:`~arrays.ArrowExtensionArray.__getitem__` returning python datetime and timedelta objects for non-nano dtypes (:issue:`#####`) - Bug in :meth:`Series.quantile` for pyarrow temporal types raising ArrowInvalid (:issue:`52678`) - Bug in :meth:`Series.rank` returning wrong order for small values with ``Float64`` dtype (:issue:`52471`) +- Bug in :metho:`~arrays.ArrowExtensionArray.__iter__` and :metho:`~arrays.ArrowExtensionArray.__getitem__` returning python datetime and timedelta objects for non-nano dtypes (:issue:`53326`) - Bug where the ``__from_arrow__`` method of masked ExtensionDtypes(e.g. :class:`Float64Dtype`, :class:`BooleanDtype`) would not accept pyarrow arrays of type ``pyarrow.null()`` (:issue:`52223`) - diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 1d725c1c2050b..83cc39591c87e 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -538,10 +538,10 @@ def __getitem__(self, item: PositionalIndexer): if scalar is None: return self._dtype.na_value elif pa.types.is_timestamp(pa_type) and pa_type.unit != "ns": - # GH ##### + # GH 53326 return Timestamp(scalar).as_unit(pa_type.unit) elif pa.types.is_duration(pa_type) and pa_type.unit != "ns": - # GH ##### + # GH 53326 return Timedelta(scalar).as_unit(pa_type.unit) else: return scalar @@ -551,7 +551,7 @@ def __iter__(self) -> Iterator[Any]: Iterate over elements of the array. """ na_value = self._dtype.na_value - # GH ##### + # GH 53326 pa_type = self._pa_array.type box_timestamp = pa.types.is_timestamp(pa_type) and pa_type.unit != "ns" box_timedelta = pa.types.is_duration(pa_type) and pa_type.unit != "ns" diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 5640a5921ac16..1b0786bcd5d2e 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -3014,7 +3014,7 @@ def test_comparison_temporal(pa_type): "pa_type", tm.DATETIME_PYARROW_DTYPES + tm.TIMEDELTA_PYARROW_DTYPES ) def test_getitem_temporal(pa_type): - # GH ##### + # GH 53326 arr = ArrowExtensionArray(pa.array([1, 2, 3], type=pa_type)) result = arr[1] if pa.types.is_duration(pa_type): @@ -3033,7 +3033,7 @@ def test_getitem_temporal(pa_type): "pa_type", tm.DATETIME_PYARROW_DTYPES + tm.TIMEDELTA_PYARROW_DTYPES ) def test_iter_temporal(pa_type): - # GH ##### + # GH 53326 arr = ArrowExtensionArray(pa.array([1, None], type=pa_type)) result = list(arr) if pa.types.is_duration(pa_type): @@ -3056,7 +3056,7 @@ def test_iter_temporal(pa_type): "pa_type", tm.DATETIME_PYARROW_DTYPES + tm.TIMEDELTA_PYARROW_DTYPES ) def test_to_numpy_temporal(pa_type): - # GH ##### + # GH 53326 arr = ArrowExtensionArray(pa.array([1, None], type=pa_type)) result = arr.to_numpy() if pa.types.is_duration(pa_type): From 5593084cfb094689a555fe39372c53bdef2d5721 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Sun, 21 May 2023 09:58:53 -0400 Subject: [PATCH 3/4] typo --- doc/source/whatsnew/v2.1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index d40a97c7076e6..84b26d7f9e9a9 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -449,7 +449,7 @@ ExtensionArray - Bug in :class:`~arrays.ArrowExtensionArray` converting pandas non-nanosecond temporal objects from non-zero values to zero values (:issue:`53171`) - Bug in :meth:`Series.quantile` for pyarrow temporal types raising ArrowInvalid (:issue:`52678`) - Bug in :meth:`Series.rank` returning wrong order for small values with ``Float64`` dtype (:issue:`52471`) -- Bug in :metho:`~arrays.ArrowExtensionArray.__iter__` and :metho:`~arrays.ArrowExtensionArray.__getitem__` returning python datetime and timedelta objects for non-nano dtypes (:issue:`53326`) +- Bug in :method:`~arrays.ArrowExtensionArray.__iter__` and :method:`~arrays.ArrowExtensionArray.__getitem__` returning python datetime and timedelta objects for non-nano dtypes (:issue:`53326`) - Bug where the ``__from_arrow__`` method of masked ExtensionDtypes(e.g. :class:`Float64Dtype`, :class:`BooleanDtype`) would not accept pyarrow arrays of type ``pyarrow.null()`` (:issue:`52223`) - From 8fb518625490baa70338de72275742d479fb3b73 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Sun, 21 May 2023 15:54:58 -0400 Subject: [PATCH 4/4] whatsnew --- doc/source/whatsnew/v2.1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 84b26d7f9e9a9..c90f6f96ab743 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -449,7 +449,7 @@ ExtensionArray - Bug in :class:`~arrays.ArrowExtensionArray` converting pandas non-nanosecond temporal objects from non-zero values to zero values (:issue:`53171`) - Bug in :meth:`Series.quantile` for pyarrow temporal types raising ArrowInvalid (:issue:`52678`) - Bug in :meth:`Series.rank` returning wrong order for small values with ``Float64`` dtype (:issue:`52471`) -- Bug in :method:`~arrays.ArrowExtensionArray.__iter__` and :method:`~arrays.ArrowExtensionArray.__getitem__` returning python datetime and timedelta objects for non-nano dtypes (:issue:`53326`) +- Bug in :meth:`~arrays.ArrowExtensionArray.__iter__` and :meth:`~arrays.ArrowExtensionArray.__getitem__` returning python datetime and timedelta objects for non-nano dtypes (:issue:`53326`) - Bug where the ``__from_arrow__`` method of masked ExtensionDtypes(e.g. :class:`Float64Dtype`, :class:`BooleanDtype`) would not accept pyarrow arrays of type ``pyarrow.null()`` (:issue:`52223`) -