diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index db97602dcf4df..c90f6f96ab743 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -298,9 +298,9 @@ Performance improvements - Performance improvement in :meth:`Series.corr` and :meth:`Series.cov` for extension dtypes (:issue:`52502`) - Performance improvement in :meth:`Series.str.get` for pyarrow-backed strings (:issue:`53152`) - Performance improvement in :meth:`Series.to_numpy` when dtype is a numpy float dtype and ``na_value`` is ``np.nan`` (:issue:`52430`) +- Performance improvement in :meth:`~arrays.ArrowExtensionArray.astype` when converting from a pyarrow timestamp or duration dtype to numpy (:issue:`53326`) - Performance improvement in :meth:`~arrays.ArrowExtensionArray.to_numpy` (:issue:`52525`) - Performance improvement when doing various reshaping operations on :class:`arrays.IntegerArrays` & :class:`arrays.FloatingArray` by avoiding doing unnecessary validation (:issue:`53013`) -- .. --------------------------------------------------------------------------- .. _whatsnew_210.bug_fixes: @@ -449,6 +449,7 @@ ExtensionArray - Bug in :class:`~arrays.ArrowExtensionArray` converting pandas non-nanosecond temporal objects from non-zero values to zero values (:issue:`53171`) - Bug in :meth:`Series.quantile` for pyarrow temporal types raising ArrowInvalid (:issue:`52678`) - Bug in :meth:`Series.rank` returning wrong order for small values with ``Float64`` dtype (:issue:`52471`) +- Bug in :meth:`~arrays.ArrowExtensionArray.__iter__` and :meth:`~arrays.ArrowExtensionArray.__getitem__` returning Python ``datetime`` and ``timedelta`` objects instead of :class:`Timestamp` and :class:`Timedelta` for non-nanosecond pyarrow timestamp and duration dtypes (:issue:`53326`) - Bug where the ``__from_arrow__`` method of masked ExtensionDtypes(e.g. 
:class:`Float64Dtype`, :class:`BooleanDtype`) would not accept pyarrow arrays of type ``pyarrow.null()`` (:issue:`52223`) - diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 3b92804f65112..83cc39591c87e 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -533,9 +533,16 @@ def __getitem__(self, item: PositionalIndexer): if isinstance(value, pa.ChunkedArray): return type(self)(value) else: + pa_type = self._pa_array.type scalar = value.as_py() if scalar is None: return self._dtype.na_value + elif pa.types.is_timestamp(pa_type) and pa_type.unit != "ns": + # GH 53326 + return Timestamp(scalar).as_unit(pa_type.unit) + elif pa.types.is_duration(pa_type) and pa_type.unit != "ns": + # GH 53326 + return Timedelta(scalar).as_unit(pa_type.unit) else: return scalar @@ -544,10 +551,18 @@ def __iter__(self) -> Iterator[Any]: Iterate over elements of the array. """ na_value = self._dtype.na_value + # GH 53326 + pa_type = self._pa_array.type + box_timestamp = pa.types.is_timestamp(pa_type) and pa_type.unit != "ns" + box_timedelta = pa.types.is_duration(pa_type) and pa_type.unit != "ns" for value in self._pa_array: val = value.as_py() if val is None: yield na_value + elif box_timestamp: + yield Timestamp(val).as_unit(pa_type.unit) + elif box_timedelta: + yield Timedelta(val).as_unit(pa_type.unit) else: yield val @@ -1157,16 +1172,46 @@ def to_numpy( copy: bool = False, na_value: object = lib.no_default, ) -> np.ndarray: - if dtype is None and self._hasna: - dtype = object + if dtype is not None: + dtype = np.dtype(dtype) + elif self._hasna: + dtype = np.dtype(object) + if na_value is lib.no_default: na_value = self.dtype.na_value pa_type = self._pa_array.type - if pa.types.is_temporal(pa_type) and not pa.types.is_date(pa_type): - # temporal types with units and/or timezones currently - # require pandas/python scalars to pass all tests - # TODO: improve performance (this is slow) + if 
pa.types.is_timestamp(pa_type): + from pandas.core.arrays.datetimes import ( + DatetimeArray, + tz_to_dtype, + ) + + np_dtype = np.dtype(f"M8[{pa_type.unit}]") + result = self._pa_array.to_numpy() + result = result.astype(np_dtype, copy=copy) + if dtype is None or dtype.kind == "O": + dta_dtype = tz_to_dtype(pa_type.tz, pa_type.unit) + result = DatetimeArray._simple_new(result, dtype=dta_dtype) + result = result.to_numpy(dtype=object, na_value=na_value) + elif result.dtype != dtype: + result = result.astype(dtype, copy=False) + return result + elif pa.types.is_duration(pa_type): + from pandas.core.arrays.timedeltas import TimedeltaArray + + np_dtype = np.dtype(f"m8[{pa_type.unit}]") + result = self._pa_array.to_numpy() + result = result.astype(np_dtype, copy=copy) + if dtype is None or dtype.kind == "O": + result = TimedeltaArray._simple_new(result, dtype=np_dtype) + result = result.to_numpy(dtype=object, na_value=na_value) + elif result.dtype != dtype: + result = result.astype(dtype, copy=False) + return result + elif pa.types.is_time(pa_type): + # convert to list of python datetime.time objects before + # wrapping in ndarray result = np.array(list(self), dtype=dtype) elif is_object_dtype(dtype) and self._hasna: result = np.empty(len(self), dtype=object) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 964e5a0944f16..fe722e2661146 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -2204,6 +2204,20 @@ def ensure_arraylike_for_datetimelike(data, copy: bool, cls_name: str): ): data = data.to_numpy("int64", na_value=iNaT) copy = False + elif isinstance(data, ArrowExtensionArray) and data.dtype.kind == "M": + from pandas.core.arrays import DatetimeArray + from pandas.core.arrays.datetimes import tz_to_dtype + + pa_type = data._pa_array.type + dtype = tz_to_dtype(tz=pa_type.tz, unit=pa_type.unit) + data = data.to_numpy(f"M8[{pa_type.unit}]", na_value=iNaT) + data = 
DatetimeArray._simple_new(data, dtype=dtype) + copy = False + elif isinstance(data, ArrowExtensionArray) and data.dtype.kind == "m": + pa_type = data._pa_array.type + dtype = np.dtype(f"m8[{pa_type.unit}]") + data = data.to_numpy(dtype, na_value=iNaT) + copy = False elif not isinstance(data, (np.ndarray, ExtensionArray)) or isinstance( data, ArrowExtensionArray ): diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 9e5955da76d1c..1b0786bcd5d2e 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -3008,3 +3008,69 @@ def test_comparison_temporal(pa_type): result = arr > val expected = ArrowExtensionArray(pa.array([False, True, True], type=pa.bool_())) tm.assert_extension_array_equal(result, expected) + + +@pytest.mark.parametrize( + "pa_type", tm.DATETIME_PYARROW_DTYPES + tm.TIMEDELTA_PYARROW_DTYPES +) +def test_getitem_temporal(pa_type): + # GH 53326 + arr = ArrowExtensionArray(pa.array([1, 2, 3], type=pa_type)) + result = arr[1] + if pa.types.is_duration(pa_type): + expected = pd.Timedelta(2, unit=pa_type.unit).as_unit(pa_type.unit) + assert isinstance(result, pd.Timedelta) + else: + expected = pd.Timestamp(2, unit=pa_type.unit, tz=pa_type.tz).as_unit( + pa_type.unit + ) + assert isinstance(result, pd.Timestamp) + assert result.unit == expected.unit + assert result == expected + + +@pytest.mark.parametrize( + "pa_type", tm.DATETIME_PYARROW_DTYPES + tm.TIMEDELTA_PYARROW_DTYPES +) +def test_iter_temporal(pa_type): + # GH 53326 + arr = ArrowExtensionArray(pa.array([1, None], type=pa_type)) + result = list(arr) + if pa.types.is_duration(pa_type): + expected = [ + pd.Timedelta(1, unit=pa_type.unit).as_unit(pa_type.unit), + pd.NA, + ] + assert isinstance(result[0], pd.Timedelta) + else: + expected = [ + pd.Timestamp(1, unit=pa_type.unit, tz=pa_type.tz).as_unit(pa_type.unit), + pd.NA, + ] + assert isinstance(result[0], pd.Timestamp) + assert result[0].unit == expected[0].unit + assert 
result == expected + + +@pytest.mark.parametrize( + "pa_type", tm.DATETIME_PYARROW_DTYPES + tm.TIMEDELTA_PYARROW_DTYPES +) +def test_to_numpy_temporal(pa_type): + # GH 53326 + arr = ArrowExtensionArray(pa.array([1, None], type=pa_type)) + result = arr.to_numpy() + if pa.types.is_duration(pa_type): + expected = [ + pd.Timedelta(1, unit=pa_type.unit).as_unit(pa_type.unit), + pd.NA, + ] + assert isinstance(result[0], pd.Timedelta) + else: + expected = [ + pd.Timestamp(1, unit=pa_type.unit, tz=pa_type.tz).as_unit(pa_type.unit), + pd.NA, + ] + assert isinstance(result[0], pd.Timestamp) + expected = np.array(expected, dtype=object) + assert result[0].unit == expected[0].unit + tm.assert_numpy_array_equal(result, expected)