diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 184ca581902ee..5e0abdc04b50e 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -622,6 +622,7 @@ Performance improvements - Performance improvement in :meth:`CategoricalDtype.update_dtype` when ``dtype`` is a :class:`CategoricalDtype` with non ``None`` categories and ordered (:issue:`59647`) - Performance improvement in :meth:`DataFrame.__getitem__` when ``key`` is a :class:`DataFrame` with many columns (:issue:`61010`) - Performance improvement in :meth:`DataFrame.astype` when converting to extension floating dtypes, e.g. "Float64" (:issue:`60066`) +- Performance improvement in :meth:`DataFrame.stack` when using ``future_stack=True`` and the DataFrame does not have a :class:`MultiIndex` (:issue:`58391`) - Performance improvement in :meth:`DataFrame.where` when ``cond`` is a :class:`DataFrame` with many columns (:issue:`61010`) - Performance improvement in :meth:`to_hdf` avoid unnecessary reopenings of the HDF5 file to speedup data addition to files with a very large number of groups . (:issue:`58248`) - Performance improvement in ``DataFrameGroupBy.__len__`` and ``SeriesGroupBy.__len__`` (:issue:`57595`) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index c60fe71a7ff28..d2a838b616426 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -936,7 +936,20 @@ def stack_v3(frame: DataFrame, level: list[int]) -> Series | DataFrame: [k for k in range(frame.columns.nlevels - 1, -1, -1) if k not in set_levels] ) - result = stack_reshape(frame, level, set_levels, stack_cols) + result: Series | DataFrame + if not isinstance(frame.columns, MultiIndex): + # GH#58817 Fast path when we're stacking the columns of a non-MultiIndex. + # When columns are homogeneous EAs, we pass through object + # dtype but this is still slightly faster than the normal path. 
+ if len(frame.columns) > 0 and frame._is_homogeneous_type: + dtype = frame._mgr.blocks[0].dtype + else: + dtype = None + result = frame._constructor_sliced( + frame._values.reshape(-1, order="F"), dtype=dtype + ) + else: + result = stack_reshape(frame, level, set_levels, stack_cols) # Construct the correct MultiIndex by combining the frame's index and # stacked columns. @@ -1018,6 +1031,8 @@ def stack_reshape( ------- The data of behind the stacked DataFrame. """ + # non-MultiIndex takes a fast path. + assert isinstance(frame.columns, MultiIndex) # If we need to drop `level` from columns, it needs to be in descending order drop_levnums = sorted(level, reverse=True) @@ -1027,18 +1042,14 @@ def stack_reshape( if len(frame.columns) == 1: data = frame.copy(deep=False) else: - if not isinstance(frame.columns, MultiIndex) and not isinstance(idx, tuple): - # GH#57750 - if the frame is an Index with tuples, .loc below will fail - column_indexer = idx - else: - # Take the data from frame corresponding to this idx value - if len(level) == 1: - idx = (idx,) - gen = iter(idx) - column_indexer = tuple( - next(gen) if k in set_levels else slice(None) - for k in range(frame.columns.nlevels) - ) + # Take the data from frame corresponding to this idx value + if len(level) == 1: + idx = (idx,) + gen = iter(idx) + column_indexer = tuple( + next(gen) if k in set_levels else slice(None) + for k in range(frame.columns.nlevels) + ) data = frame.loc[:, column_indexer] if len(level) < frame.columns.nlevels: diff --git a/pandas/tests/extension/base/reshaping.py b/pandas/tests/extension/base/reshaping.py index 2915c0585f373..a760cbc3995b3 100644 --- a/pandas/tests/extension/base/reshaping.py +++ b/pandas/tests/extension/base/reshaping.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas.core.dtypes.dtypes import NumpyEADtype + import pandas as pd import pandas._testing as tm from pandas.api.extensions import ExtensionArray @@ -266,7 +268,13 @@ def test_stack(self, data, columns, 
future_stack): expected = expected.astype(object) if isinstance(expected, pd.Series): - assert result.dtype == df.iloc[:, 0].dtype + if future_stack and isinstance(data.dtype, NumpyEADtype): + # GH#58817 future_stack=True constructs the result specifying the dtype + # using the dtype of the input; we thus get the underlying + # NumPy dtype as the result instead of the NumpyExtensionArray + assert result.dtype == df.iloc[:, 0].to_numpy().dtype + else: + assert result.dtype == df.iloc[:, 0].dtype else: assert all(result.dtypes == df.iloc[:, 0].dtype)