From 89de6698271591d3f970662fa3efc50a20a4e66b Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 21 Aug 2023 21:31:57 +0200 Subject: [PATCH 1/3] BUG: drop_duplicates raising for boolean arrow dtype with missing values --- doc/source/whatsnew/v2.1.0.rst | 2 ++ pandas/core/algorithms.py | 11 ++++++++--- pandas/tests/series/methods/test_drop_duplicates.py | 7 +++++++ 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index d1a689dc60830..7b9c7857ea9e8 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -630,6 +630,7 @@ Performance improvements - Performance improvement in :meth:`DataFrame.transpose` when transposing a DataFrame with a single pyarrow dtype (:issue:`54224`) - Performance improvement in :meth:`Series.add` for pyarrow string and binary dtypes (:issue:`53150`) - Performance improvement in :meth:`Series.corr` and :meth:`Series.cov` for extension dtypes (:issue:`52502`) +- Performance improvement in :meth:`Series.drop_duplicates` for ``ArrowDtype`` (:issue:`54667`)
- Performance improvement in :meth:`Series.ffill`, :meth:`Series.bfill`, :meth:`DataFrame.ffill`, :meth:`DataFrame.bfill` with pyarrow dtypes (:issue:`53950`) - Performance improvement in :meth:`Series.str.get_dummies` for pyarrow-backed strings (:issue:`53655`) - Performance improvement in :meth:`Series.str.get` for pyarrow-backed strings (:issue:`53152`) @@ -836,6 +837,7 @@ ExtensionArray - Bug in :class:`~arrays.ArrowExtensionArray` converting pandas non-nanosecond temporal objects from non-zero values to zero values (:issue:`53171`) - Bug in :meth:`Series.quantile` for pyarrow temporal types raising ArrowInvalid (:issue:`52678`) - Bug in :meth:`Series.rank` returning wrong order for small values with ``Float64`` dtype (:issue:`52471`) +- Bug in :meth:`Series.unique` for boolean ``ArrowDtype`` with ``NA`` values (:issue:`54667`) - Bug in :meth:`~arrays.ArrowExtensionArray.__iter__` and :meth:`~arrays.ArrowExtensionArray.__getitem__` returning python datetime and timedelta objects for non-nano dtypes (:issue:`53326`) - Bug where the :class:`DataFrame` repr would not work when a column would have an :class:`ArrowDtype` with an ``pyarrow.ExtensionDtype`` (:issue:`54063`) - Bug where the ``__from_arrow__`` method of masked ExtensionDtypes(e.g. 
:class:`Float64Dtype`, :class:`BooleanDtype`) would not accept pyarrow arrays of type ``pyarrow.null()`` (:issue:`52223`) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 14dee202a9d8d..dd062e935f47f 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -55,6 +55,7 @@ ) from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.dtypes import ( + ArrowDtype, BaseMaskedDtype, CategoricalDtype, ExtensionDtype, @@ -996,9 +997,13 @@ def duplicated( ------- duplicated : ndarray[bool] """ - if hasattr(values, "dtype") and isinstance(values.dtype, BaseMaskedDtype): - values = cast("BaseMaskedArray", values) - return htable.duplicated(values._data, keep=keep, mask=values._mask) + if hasattr(values, "dtype"): + if isinstance(values.dtype, ArrowDtype): + values = values._to_masked() + + if isinstance(values.dtype, BaseMaskedDtype): + values = cast("BaseMaskedArray", values) + return htable.duplicated(values._data, keep=keep, mask=values._mask) values = _ensure_data(values) return htable.duplicated(values, keep=keep) diff --git a/pandas/tests/series/methods/test_drop_duplicates.py b/pandas/tests/series/methods/test_drop_duplicates.py index 7e4503be2ec47..8c9a6f1b9cac6 100644 --- a/pandas/tests/series/methods/test_drop_duplicates.py +++ b/pandas/tests/series/methods/test_drop_duplicates.py @@ -249,3 +249,10 @@ def test_drop_duplicates_ignore_index(self): result = ser.drop_duplicates(ignore_index=True) expected = Series([1, 2, 3]) tm.assert_series_equal(result, expected) + + def test_duplicated_arrow_dtype(keep): + pytest.importorskip("pyarrow") + ser = Series([True, False, None, False], dtype="bool[pyarrow]") + result = ser.drop_duplicates() + expected = Series([True, False, None], dtype="bool[pyarrow]") + tm.assert_series_equal(result, expected) From 49d278b77f637502f78d2d5d2dc1f2b89294f003 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 21 Aug 2023 22:36:47 +0200 Subject: [PATCH 2/3] Fix typing --- 
pandas/core/algorithms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index dd062e935f47f..06da747a450ee 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -999,7 +999,7 @@ def duplicated( """ if hasattr(values, "dtype"): if isinstance(values.dtype, ArrowDtype): - values = values._to_masked() + values = values._to_masked() # type: ignore[union-attr] if isinstance(values.dtype, BaseMaskedDtype): values = cast("BaseMaskedArray", values) From 3a85d4c945cebe594de1bc813dcd8a9fc9fc05b3 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Tue, 22 Aug 2023 17:32:00 +0200 Subject: [PATCH 3/3] Update test_drop_duplicates.py --- pandas/tests/series/methods/test_drop_duplicates.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/series/methods/test_drop_duplicates.py b/pandas/tests/series/methods/test_drop_duplicates.py index 60d3207e45ac0..324ab1204e16e 100644 --- a/pandas/tests/series/methods/test_drop_duplicates.py +++ b/pandas/tests/series/methods/test_drop_duplicates.py @@ -250,7 +250,7 @@ def test_drop_duplicates_ignore_index(self): expected = Series([1, 2, 3]) tm.assert_series_equal(result, expected) - def test_duplicated_arrow_dtype(keep): + def test_duplicated_arrow_dtype(self): pytest.importorskip("pyarrow") ser = Series([True, False, None, False], dtype="bool[pyarrow]") result = ser.drop_duplicates()