From 9bcad624274d10480d30aa60c75d3d1397c1ce42 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 13 Feb 2023 21:55:34 +0100 Subject: [PATCH 1/3] ENH: Improve performance for arrow dtypes in monotonic join --- doc/source/whatsnew/v2.0.0.rst | 2 +- pandas/core/indexes/base.py | 15 +++++++++++++-- pandas/tests/indexes/test_setops.py | 8 ++++++++ 3 files changed, 22 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 29f360e050548..5c0ce5fde43d7 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -1089,7 +1089,7 @@ Performance improvements - Performance improvement in :meth:`Series.rank` for pyarrow-backed dtypes (:issue:`50264`) - Performance improvement in :meth:`Series.searchsorted` for pyarrow-backed dtypes (:issue:`50447`) - Performance improvement in :meth:`Series.fillna` for extension array dtypes (:issue:`49722`, :issue:`50078`) -- Performance improvement in :meth:`Index.join`, :meth:`Index.intersection` and :meth:`Index.union` for masked dtypes when :class:`Index` is monotonic (:issue:`50310`) +- Performance improvement in :meth:`Index.join`, :meth:`Index.intersection` and :meth:`Index.union` for masked and arrow dtypes when :class:`Index` is monotonic (:issue:`50310`) - Performance improvement for :meth:`Series.value_counts` with nullable dtype (:issue:`48338`) - Performance improvement for :class:`Series` constructor passing integer numpy array with nullable dtype (:issue:`48338`) - Performance improvement for :class:`DatetimeIndex` constructor passing a list (:issue:`48609`) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 363bfe76d40fb..4b01f1468937b 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -145,6 +145,7 @@ validate_putmask, ) from pandas.core.arrays import ( + ArrowExtensionArray, BaseMaskedArray, Categorical, ExtensionArray, @@ -4847,8 +4848,10 @@ def _can_use_libjoin(self) -> bool: if 
type(self) is Index: # excludes EAs, but include masks, we get here with monotonic # values only, meaning no NA - return isinstance(self.dtype, np.dtype) or isinstance( - self.values, BaseMaskedArray + return ( + isinstance(self.dtype, np.dtype) + or isinstance(self.values, BaseMaskedArray) + or isinstance(self._values, ArrowExtensionArray) ) return not is_interval_dtype(self.dtype) @@ -4939,6 +4942,10 @@ def _get_join_target(self) -> ArrayLike: if isinstance(self._values, BaseMaskedArray): # This is only used if our array is monotonic, so no NAs present return self._values._data + elif isinstance(self._values, ArrowExtensionArray): + # This is only used if our array is monotonic, so no missing values + # present + return self._values.to_numpy() return self._get_engine_target() def _from_join_target(self, result: np.ndarray) -> ArrayLike: @@ -4948,6 +4955,10 @@ def _from_join_target(self, result: np.ndarray) -> ArrayLike: """ if isinstance(self.values, BaseMaskedArray): return type(self.values)(result, np.zeros(result.shape, dtype=np.bool_)) + elif isinstance(self.values, ArrowExtensionArray): + import pyarrow as pa + + return type(self.values)(pa.array(result)) return result @doc(IndexOpsMixin._memory_usage) diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py index 001efe07b5d2b..28e894368fb8c 100644 --- a/pandas/tests/indexes/test_setops.py +++ b/pandas/tests/indexes/test_setops.py @@ -886,3 +886,11 @@ def test_symmetric_difference_non_index(self, sort): result = index1.symmetric_difference(index2, result_name="new_name", sort=sort) assert tm.equalContents(result, expected) assert result.name == "new_name" + + def test_union_ea_dtypes(self, any_numeric_ea_and_arrow_dtype): + # GH# + idx = Index([1, 2, 3], dtype=any_numeric_ea_and_arrow_dtype) + idx2 = Index([3, 4, 5], dtype=any_numeric_ea_and_arrow_dtype) + result = idx.union(idx2) + expected = Index([1, 2, 3, 4, 5], dtype=any_numeric_ea_and_arrow_dtype) + 
tm.assert_index_equal(result, expected) From ee3b59e77dde1e7479647ec303f67ad34dc4e90c Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 13 Feb 2023 21:57:04 +0100 Subject: [PATCH 2/3] Add gh ref --- doc/source/whatsnew/v2.0.0.rst | 2 +- pandas/tests/indexes/test_setops.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 5c0ce5fde43d7..08ba7d80f6880 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -1089,7 +1089,7 @@ Performance improvements - Performance improvement in :meth:`Series.rank` for pyarrow-backed dtypes (:issue:`50264`) - Performance improvement in :meth:`Series.searchsorted` for pyarrow-backed dtypes (:issue:`50447`) - Performance improvement in :meth:`Series.fillna` for extension array dtypes (:issue:`49722`, :issue:`50078`) -- Performance improvement in :meth:`Index.join`, :meth:`Index.intersection` and :meth:`Index.union` for masked and arrow dtypes when :class:`Index` is monotonic (:issue:`50310`) +- Performance improvement in :meth:`Index.join`, :meth:`Index.intersection` and :meth:`Index.union` for masked and arrow dtypes when :class:`Index` is monotonic (:issue:`50310`, :issue:`51365`) - Performance improvement for :meth:`Series.value_counts` with nullable dtype (:issue:`48338`) - Performance improvement for :class:`Series` constructor passing integer numpy array with nullable dtype (:issue:`48338`) - Performance improvement for :class:`DatetimeIndex` constructor passing a list (:issue:`48609`) diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py index 28e894368fb8c..708de02518b73 100644 --- a/pandas/tests/indexes/test_setops.py +++ b/pandas/tests/indexes/test_setops.py @@ -888,7 +888,7 @@ def test_symmetric_difference_non_index(self, sort): assert result.name == "new_name" def test_union_ea_dtypes(self, any_numeric_ea_and_arrow_dtype): - # GH# + # GH#51365 idx = Index([1, 2, 3], 
dtype=any_numeric_ea_and_arrow_dtype) idx2 = Index([3, 4, 5], dtype=any_numeric_ea_and_arrow_dtype) result = idx.union(idx2) From df057ad24a92b96021f0369897374890f4d62aec Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 16 Feb 2023 01:15:15 +0100 Subject: [PATCH 3/3] Use _from_sequence instead of pa.array in _from_join_target for arrow dtypes --- pandas/core/indexes/base.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 4b01f1468937b..9d4a4ca8a5140 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4956,9 +4956,7 @@ def _from_join_target(self, result: np.ndarray) -> ArrayLike: if isinstance(self.values, BaseMaskedArray): return type(self.values)(result, np.zeros(result.shape, dtype=np.bool_)) elif isinstance(self.values, ArrowExtensionArray): - import pyarrow as pa - - return type(self.values)(pa.array(result)) + return type(self.values)._from_sequence(result) return result @doc(IndexOpsMixin._memory_usage)