Skip to content

ENH/PERF: pyarrow timestamp & duration conversion consistency/performance #53326

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
May 22, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion doc/source/whatsnew/v2.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -298,9 +298,9 @@ Performance improvements
- Performance improvement in :meth:`Series.corr` and :meth:`Series.cov` for extension dtypes (:issue:`52502`)
- Performance improvement in :meth:`Series.str.get` for pyarrow-backed strings (:issue:`53152`)
- Performance improvement in :meth:`Series.to_numpy` when dtype is a numpy float dtype and ``na_value`` is ``np.nan`` (:issue:`52430`)
- Performance improvement in :meth:`~arrays.ArrowExtensionArray.astype` when converting from a pyarrow timestamp or duration dtype to numpy (:issue:`53326`)
- Performance improvement in :meth:`~arrays.ArrowExtensionArray.to_numpy` (:issue:`52525`)
- Performance improvement when doing various reshaping operations on :class:`arrays.IntegerArray` & :class:`arrays.FloatingArray` by avoiding doing unnecessary validation (:issue:`53013`)
-

.. ---------------------------------------------------------------------------
.. _whatsnew_210.bug_fixes:
Expand Down Expand Up @@ -449,6 +449,7 @@ ExtensionArray
- Bug in :class:`~arrays.ArrowExtensionArray` converting pandas non-nanosecond temporal objects from non-zero values to zero values (:issue:`53171`)
- Bug in :meth:`Series.quantile` for pyarrow temporal types raising ArrowInvalid (:issue:`52678`)
- Bug in :meth:`Series.rank` returning wrong order for small values with ``Float64`` dtype (:issue:`52471`)
- Bug in :meth:`~arrays.ArrowExtensionArray.__iter__` and :meth:`~arrays.ArrowExtensionArray.__getitem__` returning python datetime and timedelta objects for non-nano dtypes (:issue:`53326`)
- Bug where the ``__from_arrow__`` method of masked ExtensionDtypes (e.g. :class:`Float64Dtype`, :class:`BooleanDtype`) would not accept pyarrow arrays of type ``pyarrow.null()`` (:issue:`52223`)
-

Expand Down
57 changes: 51 additions & 6 deletions pandas/core/arrays/arrow/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -533,9 +533,16 @@ def __getitem__(self, item: PositionalIndexer):
if isinstance(value, pa.ChunkedArray):
return type(self)(value)
else:
pa_type = self._pa_array.type
scalar = value.as_py()
if scalar is None:
return self._dtype.na_value
elif pa.types.is_timestamp(pa_type) and pa_type.unit != "ns":
# GH 53326
return Timestamp(scalar).as_unit(pa_type.unit)
elif pa.types.is_duration(pa_type) and pa_type.unit != "ns":
# GH 53326
return Timedelta(scalar).as_unit(pa_type.unit)
else:
return scalar

Expand All @@ -544,10 +551,18 @@ def __iter__(self) -> Iterator[Any]:
Iterate over elements of the array.
"""
na_value = self._dtype.na_value
# GH 53326
pa_type = self._pa_array.type
box_timestamp = pa.types.is_timestamp(pa_type) and pa_type.unit != "ns"
box_timedelta = pa.types.is_duration(pa_type) and pa_type.unit != "ns"
for value in self._pa_array:
val = value.as_py()
if val is None:
yield na_value
elif box_timestamp:
yield Timestamp(val).as_unit(pa_type.unit)
elif box_timedelta:
yield Timedelta(val).as_unit(pa_type.unit)
else:
yield val

Expand Down Expand Up @@ -1157,16 +1172,46 @@ def to_numpy(
copy: bool = False,
na_value: object = lib.no_default,
) -> np.ndarray:
if dtype is None and self._hasna:
dtype = object
if dtype is not None:
dtype = np.dtype(dtype)
elif self._hasna:
dtype = np.dtype(object)

if na_value is lib.no_default:
na_value = self.dtype.na_value

pa_type = self._pa_array.type
if pa.types.is_temporal(pa_type) and not pa.types.is_date(pa_type):
# temporal types with units and/or timezones currently
# require pandas/python scalars to pass all tests
# TODO: improve performance (this is slow)
if pa.types.is_timestamp(pa_type):
from pandas.core.arrays.datetimes import (
DatetimeArray,
tz_to_dtype,
)

np_dtype = np.dtype(f"M8[{pa_type.unit}]")
result = self._pa_array.to_numpy()
result = result.astype(np_dtype, copy=copy)
if dtype is None or dtype.kind == "O":
dta_dtype = tz_to_dtype(pa_type.tz, pa_type.unit)
result = DatetimeArray._simple_new(result, dtype=dta_dtype)
result = result.to_numpy(dtype=object, na_value=na_value)
elif result.dtype != dtype:
result = result.astype(dtype, copy=False)
return result
elif pa.types.is_duration(pa_type):
from pandas.core.arrays.timedeltas import TimedeltaArray

np_dtype = np.dtype(f"m8[{pa_type.unit}]")
result = self._pa_array.to_numpy()
result = result.astype(np_dtype, copy=copy)
if dtype is None or dtype.kind == "O":
result = TimedeltaArray._simple_new(result, dtype=np_dtype)
result = result.to_numpy(dtype=object, na_value=na_value)
elif result.dtype != dtype:
result = result.astype(dtype, copy=False)
return result
elif pa.types.is_time(pa_type):
# convert to list of python datetime.time objects before
# wrapping in ndarray
result = np.array(list(self), dtype=dtype)
elif is_object_dtype(dtype) and self._hasna:
result = np.empty(len(self), dtype=object)
Expand Down
14 changes: 14 additions & 0 deletions pandas/core/arrays/datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -2204,6 +2204,20 @@ def ensure_arraylike_for_datetimelike(data, copy: bool, cls_name: str):
):
data = data.to_numpy("int64", na_value=iNaT)
copy = False
elif isinstance(data, ArrowExtensionArray) and data.dtype.kind == "M":
from pandas.core.arrays import DatetimeArray
from pandas.core.arrays.datetimes import tz_to_dtype

pa_type = data._pa_array.type
dtype = tz_to_dtype(tz=pa_type.tz, unit=pa_type.unit)
data = data.to_numpy(f"M8[{pa_type.unit}]", na_value=iNaT)
data = DatetimeArray._simple_new(data, dtype=dtype)
copy = False
elif isinstance(data, ArrowExtensionArray) and data.dtype.kind == "m":
pa_type = data._pa_array.type
dtype = np.dtype(f"m8[{pa_type.unit}]")
data = data.to_numpy(dtype, na_value=iNaT)
copy = False
elif not isinstance(data, (np.ndarray, ExtensionArray)) or isinstance(
data, ArrowExtensionArray
):
Expand Down
66 changes: 66 additions & 0 deletions pandas/tests/extension/test_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -3008,3 +3008,69 @@ def test_comparison_temporal(pa_type):
result = arr > val
expected = ArrowExtensionArray(pa.array([False, True, True], type=pa.bool_()))
tm.assert_extension_array_equal(result, expected)


@pytest.mark.parametrize(
    "pa_type", tm.DATETIME_PYARROW_DTYPES + tm.TIMEDELTA_PYARROW_DTYPES
)
def test_getitem_temporal(pa_type):
    """Scalar indexing a temporal array boxes to a pandas scalar in its unit."""
    # GH 53326
    unit = pa_type.unit
    values = pa.array([1, 2, 3], type=pa_type)
    result = ArrowExtensionArray(values)[1]

    # Durations box to Timedelta; every other parametrized type is a
    # timestamp and boxes to Timestamp (carrying the pyarrow tz).
    if pa.types.is_duration(pa_type):
        scalar_cls = pd.Timedelta
        expected = pd.Timedelta(2, unit=unit).as_unit(unit)
    else:
        scalar_cls = pd.Timestamp
        expected = pd.Timestamp(2, unit=unit, tz=pa_type.tz).as_unit(unit)

    assert isinstance(result, scalar_cls)
    # The resolution must match the pyarrow unit, not merely compare equal.
    assert result.unit == expected.unit
    assert result == expected


@pytest.mark.parametrize(
    "pa_type", tm.DATETIME_PYARROW_DTYPES + tm.TIMEDELTA_PYARROW_DTYPES
)
def test_iter_temporal(pa_type):
    """Iterating a temporal array yields pandas scalars in its unit, and NA."""
    # GH 53326
    unit = pa_type.unit
    values = pa.array([1, None], type=pa_type)
    result = list(ArrowExtensionArray(values))

    # Durations box to Timedelta; every other parametrized type is a
    # timestamp and boxes to Timestamp (carrying the pyarrow tz).
    if pa.types.is_duration(pa_type):
        scalar_cls = pd.Timedelta
        first = pd.Timedelta(1, unit=unit).as_unit(unit)
    else:
        scalar_cls = pd.Timestamp
        first = pd.Timestamp(1, unit=unit, tz=pa_type.tz).as_unit(unit)
    expected = [first, pd.NA]

    assert isinstance(result[0], scalar_cls)
    # The resolution must match the pyarrow unit, not merely compare equal.
    assert result[0].unit == expected[0].unit
    assert result == expected


@pytest.mark.parametrize(
    "pa_type", tm.DATETIME_PYARROW_DTYPES + tm.TIMEDELTA_PYARROW_DTYPES
)
def test_to_numpy_temporal(pa_type):
    """``to_numpy()`` on a temporal array with a null gives boxed object values."""
    # GH 53326
    unit = pa_type.unit
    values = pa.array([1, None], type=pa_type)
    result = ArrowExtensionArray(values).to_numpy()

    # Durations box to Timedelta; every other parametrized type is a
    # timestamp and boxes to Timestamp (carrying the pyarrow tz).
    if pa.types.is_duration(pa_type):
        scalar_cls = pd.Timedelta
        first = pd.Timedelta(1, unit=unit).as_unit(unit)
    else:
        scalar_cls = pd.Timestamp
        first = pd.Timestamp(1, unit=unit, tz=pa_type.tz).as_unit(unit)

    assert isinstance(result[0], scalar_cls)
    expected = np.array([first, pd.NA], dtype=object)
    # The resolution must match the pyarrow unit, not merely compare equal.
    assert result[0].unit == expected[0].unit
    tm.assert_numpy_array_equal(result, expected)