From 45dc2f57293d203bc8f290e9072a9bf097c04c62 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Sun, 16 Oct 2022 09:04:14 -0400 Subject: [PATCH 1/5] perf ea.tolist --- asv_bench/benchmarks/array.py | 3 ++ asv_bench/benchmarks/join_merge.py | 34 ++++++++++++++++++++++ pandas/core/arrays/base.py | 3 +- pandas/tests/arrays/string_/test_string.py | 8 +++++ 4 files changed, 47 insertions(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/array.py b/asv_bench/benchmarks/array.py index df9d171a70397..301fc9d405057 100644 --- a/asv_bench/benchmarks/array.py +++ b/asv_bench/benchmarks/array.py @@ -71,3 +71,6 @@ def time_setitem_list(self, multiple_chunks): def time_setitem_slice(self, multiple_chunks): self.array[::10] = "foo" + + def time_tolist(self, multiple_chunks): + self.array.tolist() diff --git a/asv_bench/benchmarks/join_merge.py b/asv_bench/benchmarks/join_merge.py index f6630857bee91..7a25f77da1c82 100644 --- a/asv_bench/benchmarks/join_merge.py +++ b/asv_bench/benchmarks/join_merge.py @@ -4,6 +4,7 @@ from pandas import ( DataFrame, + Index, MultiIndex, Series, array, @@ -92,6 +93,39 @@ def time_f_ordered(self, axis, ignore_index): concat(self.frame_f, axis=axis, ignore_index=ignore_index) +class ConcatIndexDtype: + + params = ( + ["datetime64[ns]", "int64", "Int64", "string[python]", "string[pyarrow]"], + [0, 1], + [True, False], + [True, False], + ) + param_names = ["dtype", "axis", "sort", "is_monotonic"] + + def setup(self, dtype, axis, sort, is_monotonic): + N = 10_000 + if dtype == "datetime64[ns]": + vals = date_range("1970-01-01", periods=N) + elif dtype in ("int64", "Int64"): + vals = np.arange(N, dtype=np.int64) + elif dtype in ("string[python]", "string[pyarrow]"): + vals = tm.makeStringIndex(N) + else: + raise NotImplementedError + + idx = Index(vals, dtype=dtype) + if is_monotonic: + idx = idx.sort_values() + else: + idx = idx[::-1] + + self.series = [Series(i, idx[i:]) for i in range(5)] + + def time_concat_series(self, dtype, axis, sort, is_monotonic): + concat(self.series, axis=axis, sort=sort) + + class Join: params = [True, False] diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 9758ca84d236b..9c260a62009a0 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1455,7 +1455,8 @@ def tolist(self) -> list: """ if self.ndim > 1: return [x.tolist() for x in self] - return list(self) + # faster than list(self) + return self.to_numpy().tolist() def delete(self: ExtensionArrayT, loc: PositionalIndexer) -> ExtensionArrayT: indexer = np.delete(np.arange(len(self)), loc) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index a7b8162eb981a..5d0eb81114a23 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -611,3 +611,11 @@ def test_setitem_scalar_with_mask_validation(dtype): msg = "Scalar must be NA or str" with pytest.raises(ValueError, match=msg): ser[mask] = 1 + + +def test_tolist(dtype): + vals = ["a", "b", "c"] + arr = pd.array(vals, dtype=dtype) + result = arr.tolist() + expected = vals + tm.assert_equal(result, expected) From 77a21f8d449953f9b4e71313009b1d9db42bd258 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Sun, 16 Oct 2022 09:05:38 -0400 Subject: [PATCH 2/5] add another test --- pandas/tests/arrays/masked/test_function.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/tests/arrays/masked/test_function.py b/pandas/tests/arrays/masked/test_function.py index 9a86ef835e5ef..4c7bd6e293ef4 100644 --- a/pandas/tests/arrays/masked/test_function.py +++ b/pandas/tests/arrays/masked/test_function.py @@ -49,3 +49,9 @@ def test_round(data, numpy_dtype): dtype=data.dtype, ) tm.assert_extension_array_equal(result, expected) + + +def test_tolist(data): + result = data.tolist() + expected = list(data) + tm.assert_equal(result, expected) From 7367b99f2ac4591df7698791861394cb9a5563db Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Sun, 16 Oct 2022 09:28:55 -0400 Subject: [PATCH 3/5] whatsnew --- doc/source/whatsnew/v2.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index fea3d70d81554..a5aa186d33f4c 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -154,6 +154,7 @@ Performance improvements - Performance improvement in :func:`merge` and :meth:`DataFrame.join` when joining on a sorted :class:`MultiIndex` (:issue:`48504`) - Performance improvement in :meth:`DataFrame.loc` and :meth:`Series.loc` for tuple-based indexing of a :class:`MultiIndex` (:issue:`48384`) - Performance improvement for :meth:`MultiIndex.unique` (:issue:`48335`) +- Performance improvement for :func:`concat` with extension array backed indexes (:issue:`49128`) - Performance improvement in :meth:`DataFrame.join` when joining on a subset of a :class:`MultiIndex` (:issue:`48611`) - Performance improvement for :meth:`MultiIndex.intersection` (:issue:`48604`) - Performance improvement in ``var`` for nullable dtypes (:issue:`48379`). From c9a014a8c64d52bbfc9b6df6799b8f9868af7590 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Sun, 16 Oct 2022 09:59:37 -0400 Subject: [PATCH 4/5] specify dtype="object" --- pandas/core/arrays/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 9c260a62009a0..5eddfa852565e 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1456,7 +1456,7 @@ def tolist(self) -> list: if self.ndim > 1: return [x.tolist() for x in self] # faster than list(self) - return self.to_numpy().tolist() + return self.to_numpy(dtype="object").tolist() def delete(self: ExtensionArrayT, loc: PositionalIndexer) -> ExtensionArrayT: indexer = np.delete(np.arange(len(self)), loc) From f66c99bd4c0b435693d9621ea727cdb53e735147 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Sun, 16 Oct 2022 21:33:13 -0400 Subject: [PATCH 5/5] fixup --- pandas/core/arrays/base.py | 3 +-- pandas/core/arrays/masked.py | 9 +++++++++ pandas/core/arrays/string_.py | 7 ++++++- 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 5eddfa852565e..9758ca84d236b 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1455,8 +1455,7 @@ def tolist(self) -> list: """ if self.ndim > 1: return [x.tolist() for x in self] - # faster than list(self) - return self.to_numpy(dtype="object").tolist() + return list(self) def delete(self: ExtensionArrayT, loc: PositionalIndexer) -> ExtensionArrayT: indexer = np.delete(np.arange(len(self)), loc) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 15e201b8279de..746175ee3374d 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -427,6 +427,15 @@ def to_numpy( data = self._data.astype(dtype, copy=copy) return data + @doc(ExtensionArray.tolist) + def tolist(self): + if self.ndim > 1: + return [x.tolist() for x in self] + if not self._hasna: + # faster than list(self) + return list(self._data) + return list(self) + @overload def astype(self, dtype: npt.DTypeLike, copy: bool = ...) -> np.ndarray: ... diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 2f9857eb43860..451399255e7f9 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -20,6 +20,7 @@ ) from pandas.compat import pa_version_under1p01 from pandas.compat.numpy import function as nv +from pandas.util._decorators import doc from pandas.core.dtypes.base import ( ExtensionDtype, @@ -214,7 +215,11 @@ class BaseStringArray(ExtensionArray): Mixin class for StringArray, ArrowStringArray. """ - pass + @doc(ExtensionArray.tolist) + def tolist(self): + if self.ndim > 1: + return [x.tolist() for x in self] + return list(self.to_numpy()) class StringArray(BaseStringArray, PandasArray):