From eaf4fa9355cede31f28b1cc26a891f930fce70de Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 7 May 2021 13:24:41 -0700 Subject: [PATCH 1/9] ENH: retain masked EA dtypes in groupby with as_index=False --- pandas/core/groupby/generic.py | 4 ++-- pandas/core/groupby/grouper.py | 25 ++++++++++++++++++++--- pandas/core/groupby/ops.py | 26 +++++++++++++++++++++--- pandas/tests/extension/base/groupby.py | 8 ++++---- pandas/tests/extension/json/test_json.py | 4 ---- pandas/tests/extension/test_boolean.py | 8 ++++---- 6 files changed, 55 insertions(+), 20 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 55e8578b2cef4..c7ff439c649ea 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1046,7 +1046,7 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) self._insert_inaxis_grouper_inplace(result) result.index = np.arange(len(result)) - return result._convert(datetime=True) + return result agg = aggregate @@ -1633,7 +1633,7 @@ def _wrap_agged_manager(self, mgr: Manager2D) -> DataFrame: if self.axis == 1: result = result.T - return self._reindex_output(result)._convert(datetime=True) + return self._reindex_output(result) def _iterate_column_groupbys(self): for i, colname in enumerate(self._selected_obj.columns): diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index f1762a2535ff7..0307b008731f9 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -10,6 +10,7 @@ import numpy as np from pandas._typing import ( + ArrayLike, FrameOrSeries, final, ) @@ -462,6 +463,8 @@ def __init__( self.in_axis = in_axis self.dropna = dropna + self._group_arraylike = None + # right place for this? if isinstance(grouper, (Series, Index)) and name is None: self.name = grouper.name @@ -601,6 +604,22 @@ def codes(self) -> np.ndarray: # expected "ndarray") return self._codes # type: ignore[return-value] + @cache_readonly + def result_arraylike(self) -> ArrayLike: + """ + Analogous to result_index, but holding an ArrayLike to ensure + we can can retain ExtensionDtypes. 
+ """ + ridx = self.result_index # initialized _group_arraylike + + if self._group_arraylike is None: + # This should only occur when ridx is CategoricalIndex + self._group_arraylike = ridx._values + + # error: Incompatible return value type (got "None", expected + # "Union[ExtensionArray, ndarray]") + return self._group_arraylike # type: ignore[return-value] + @cache_readonly def result_index(self) -> Index: if self.all_grouper is not None: @@ -623,7 +642,7 @@ def _make_codes(self) -> None: # we have a list of groupers if isinstance(self.grouper, ops.BaseGrouper): codes = self.grouper.codes_info - uniques = self.grouper.result_index + uniques = self.grouper.result_arraylike else: # GH35667, replace dropna=False with na_sentinel=None if not self.dropna: @@ -633,9 +652,9 @@ def _make_codes(self) -> None: codes, uniques = algorithms.factorize( self.grouper, sort=self.sort, na_sentinel=na_sentinel ) - uniques = Index(uniques, name=self.name) self._codes = codes - self._group_index = uniques + self._group_arraylike = uniques + self._group_index = Index(uniques, name=self.name) @cache_readonly def groups(self) -> dict[Hashable, np.ndarray]: diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index b88f2b0200768..abc4f8802e110 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -908,6 +908,23 @@ def reconstructed_codes(self) -> list[np.ndarray]: ids, obs_ids, _ = self.group_info return decons_obs_group_ids(ids, obs_ids, self.shape, codes, xnull=True) + @cache_readonly + def result_arraylike(self) -> ArrayLike: + """ + Analogous to result_index, but returning an ndarray/ExtensionArray + allowing us to retain ExtensionDtypes not supported by Index. + """ + # TODO: once Index supports arbitrary EAs, this can be removed in favor + # of result_index + if len(self.groupings) == 1: + return self.groupings[0].result_arraylike + + codes = self.reconstructed_codes + levels = [ping.result_arraylike for ping in self.groupings] + return MultiIndex( + levels=levels, codes=codes, verify_integrity=False, names=self.names + )._values + @cache_readonly def result_index(self) -> Index: if len(self.groupings) == 1: @@ -924,12 +941,12 @@ def get_group_levels(self) -> list[Index]: # Note: only called from _insert_inaxis_grouper_inplace, which # is only called for BaseGrouper, never for BinGrouper if len(self.groupings) == 1: - return [self.groupings[0].result_index] + return [self.groupings[0].result_arraylike] name_list = [] for ping, codes in zip(self.groupings, self.reconstructed_codes): codes = ensure_platform_int(codes) - levels = ping.result_index.take(codes) + levels = ping.result_arraylike.take(codes) name_list.append(levels) @@ -991,7 +1008,10 @@ def agg_series(self, obj: Series, func: F) -> ArrayLike: result = self._aggregate_series_fast(obj, func) cast_back = False - npvalues = lib.maybe_convert_objects(result, try_float=False) + convert_datetime = obj.dtype.kind == "M" + npvalues = lib.maybe_convert_objects( + result, try_float=False, convert_datetime=convert_datetime + ) if cast_back: # TODO: Is there a documented reason why we dont always cast_back? 
out = maybe_cast_pointwise_result(npvalues, obj.dtype, numeric_only=True) diff --git a/pandas/tests/extension/base/groupby.py b/pandas/tests/extension/base/groupby.py index 30b115b9dba6f..23a295c291b5d 100644 --- a/pandas/tests/extension/base/groupby.py +++ b/pandas/tests/extension/base/groupby.py @@ -22,14 +22,14 @@ def test_grouping_grouper(self, data_for_grouping): def test_groupby_extension_agg(self, as_index, data_for_grouping): df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping}) result = df.groupby("B", as_index=as_index).A.mean() - _, index = pd.factorize(data_for_grouping, sort=True) + _, uniques = pd.factorize(data_for_grouping, sort=True) - index = pd.Index(index, name="B") - expected = pd.Series([3, 1, 4], index=index, name="A") if as_index: + index = pd.Index(uniques, name="B") + expected = pd.Series([3, 1, 4], index=index, name="A") self.assert_series_equal(result, expected) else: - expected = expected.reset_index() + expected = pd.DataFrame({"B": uniques, "A": [3, 1, 4]}) self.assert_frame_equal(result, expected) def test_groupby_agg_extension(self, data_for_grouping): diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index b8fa158083327..b5bb68e8a9a12 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -312,10 +312,6 @@ def test_groupby_extension_apply(self): we'll be able to dispatch unique. """ - @pytest.mark.parametrize("as_index", [True, False]) - def test_groupby_extension_agg(self, as_index, data_for_grouping): - super().test_groupby_extension_agg(as_index, data_for_grouping) - @pytest.mark.xfail(reason="GH#39098: Converts agg result to object") def test_groupby_agg_extension(self, data_for_grouping): super().test_groupby_agg_extension(data_for_grouping) diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py index 33d82a1d64fb7..287fa0f20309d 100644 --- a/pandas/tests/extension/test_boolean.py +++ b/pandas/tests/extension/test_boolean.py @@ -269,14 +269,14 @@ def test_grouping_grouper(self, data_for_grouping): def test_groupby_extension_agg(self, as_index, data_for_grouping): df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1], "B": data_for_grouping}) result = df.groupby("B", as_index=as_index).A.mean() - _, index = pd.factorize(data_for_grouping, sort=True) + _, uniques = pd.factorize(data_for_grouping, sort=True) - index = pd.Index(index, name="B") - expected = pd.Series([3, 1], index=index, name="A") if as_index: + index = pd.Index(uniques, name="B") + expected = pd.Series([3, 1], index=index, name="A") self.assert_series_equal(result, expected) else: - expected = expected.reset_index() + expected = pd.DataFrame({"B": uniques, "A": [3, 1]}) self.assert_frame_equal(result, expected) def test_groupby_agg_extension(self, data_for_grouping): From 02005f47be2b4fa6358ef1766d930af2f9ea7ca0 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 7 May 2021 15:33:02 -0700 Subject: [PATCH 2/9] 32bit compt --- pandas/tests/groupby/test_groupby.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index f716a3a44cd54..9d2c2e012555f 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -693,6 +693,8 @@ def test_ops_not_as_index(reduction_func): if reduction_func == "size": expected = expected.rename("size") expected = expected.reset_index() + # 32 bit compat -> groupby preserves dtype whereas reset_index casts to 
int64 + expected["a"] = expected["a"].astype(df["a"].dtype) g = df.groupby("a", as_index=False) From 93ed7e75179958f3688b6d2e7593af40f9ee9b8f Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 7 May 2021 16:33:33 -0700 Subject: [PATCH 3/9] 32bit compat --- pandas/tests/groupby/test_groupby.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 9d2c2e012555f..33b81dbd75ea0 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -693,8 +693,10 @@ def test_ops_not_as_index(reduction_func): if reduction_func == "size": expected = expected.rename("size") expected = expected.reset_index() - # 32 bit compat -> groupby preserves dtype whereas reset_index casts to int64 - expected["a"] = expected["a"].astype(df["a"].dtype) + + if reduction_func != "size": + # 32 bit compat -> groupby preserves dtype whereas reset_index casts to int64 + expected["a"] = expected["a"].astype(df["a"].dtype) g = df.groupby("a", as_index=False) From d4a986aabd88d4e916829107ce98093511993ded Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 10 May 2021 14:55:11 -0700 Subject: [PATCH 4/9] whatsnew --- doc/source/whatsnew/v1.3.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 5adc8540e6864..96c3f1b2c3176 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -227,7 +227,7 @@ Other enhancements - Constructing a :class:`DataFrame` or :class:`Series` with the ``data`` argument being a Python iterable that is *not* a NumPy ``ndarray`` consisting of NumPy scalars will now result in a dtype with a precision the maximum of the NumPy scalars; this was already the case when ``data`` is a NumPy ``ndarray`` (:issue:`40908`) - Add keyword ``sort`` to :func:`pivot_table` to allow non-sorting of the result (:issue:`39143`) - Add keyword ``dropna`` to :meth:`DataFrame.value_counts` to allow counting rows that include ``NA`` values (:issue:`41325`) -- +- :class:`DataFrameGroupBy` operations with ``as_index=False`` now correctly retain ``ExtensionDtype`` dtypes for columns being grouped on (:issue:`41373`) .. --------------------------------------------------------------------------- From cf6b37dc2b304b53781adb4fce9e190aa4c81225 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 17 May 2021 13:32:36 -0700 Subject: [PATCH 5/9] rename result_arraylike-> group_arraylike --- pandas/core/groupby/grouper.py | 2 +- pandas/core/groupby/ops.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index ec8daeb60de23..9cd13954e4cbc 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -602,7 +602,7 @@ def codes(self) -> np.ndarray: return self._codes # type: ignore[return-value] @cache_readonly - def result_arraylike(self) -> ArrayLike: + def group_arraylike(self) -> ArrayLike: """ Analogous to result_index, but holding an ArrayLike to ensure we can can retain ExtensionDtypes. 
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 4ffc7e1ded92e..2c25b2fbd1c31 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -916,10 +916,10 @@ def result_arraylike(self) -> ArrayLike: # TODO: once Index supports arbitrary EAs, this can be removed in favor # of result_index if len(self.groupings) == 1: - return self.groupings[0].result_arraylike + return self.groupings[0].group_arraylike codes = self.reconstructed_codes - levels = [ping.result_arraylike for ping in self.groupings] + levels = [ping.group_arraylike for ping in self.groupings] return MultiIndex( levels=levels, codes=codes, verify_integrity=False, names=self.names )._values @@ -940,12 +940,12 @@ def get_group_levels(self) -> list[Index]: # Note: only called from _insert_inaxis_grouper_inplace, which # is only called for BaseGrouper, never for BinGrouper if len(self.groupings) == 1: - return [self.groupings[0].result_arraylike] + return [self.groupings[0].group_arraylike] name_list = [] for ping, codes in zip(self.groupings, self.reconstructed_codes): codes = ensure_platform_int(codes) - levels = ping.result_arraylike.take(codes) + levels = ping.group_arraylike.take(codes) name_list.append(levels) From 446a77e5cc9eb201684d74260cbdc7fb03775dac Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 21 May 2021 14:22:49 -0700 Subject: [PATCH 6/9] fix test --- pandas/tests/extension/base/groupby.py | 4 ++-- pandas/tests/extension/test_boolean.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/tests/extension/base/groupby.py b/pandas/tests/extension/base/groupby.py index 0ca9edc63b460..4c8dc6ca1ad9c 100644 --- a/pandas/tests/extension/base/groupby.py +++ b/pandas/tests/extension/base/groupby.py @@ -26,10 +26,10 @@ def test_groupby_extension_agg(self, as_index, data_for_grouping): if as_index: index = pd.Index(uniques, name="B") - expected = pd.Series([3, 1, 4], index=index, name="A") + expected = pd.Series([3.0, 1.0, 4.0], index=index, name="A") self.assert_series_equal(result, expected) else: - expected = pd.DataFrame({"B": uniques, "A": [3, 1, 4]}) + expected = pd.DataFrame({"B": uniques, "A": [3.0, 1.0, 4.0]}) self.assert_frame_equal(result, expected) def test_groupby_agg_extension(self, data_for_grouping): diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py index 4d8527250f5c9..395540993dc15 100644 --- a/pandas/tests/extension/test_boolean.py +++ b/pandas/tests/extension/test_boolean.py @@ -273,10 +273,10 @@ def test_groupby_extension_agg(self, as_index, data_for_grouping): if as_index: index = pd.Index(uniques, name="B") - expected = pd.Series([3, 1], index=index, name="A") + expected = pd.Series([3.0, 1.0], index=index, name="A") self.assert_series_equal(result, expected) else: - expected = pd.DataFrame({"B": uniques, "A": [3, 1]}) + expected = pd.DataFrame({"B": uniques, "A": [3.0, 1.0]}) self.assert_frame_equal(result, expected) def test_groupby_agg_extension(self, data_for_grouping): From ac4402baf94e0cf632a7535dde182f5a32105b54 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 17 Jun 2021 20:48:02 -0700 Subject: [PATCH 7/9] fix broken uint64 cases --- pandas/core/groupby/generic.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index bcaafed17daff..72bf171988f9f 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1690,7 +1690,9 @@ def _wrap_agged_manager(self, mgr: Manager2D) -> DataFrame: 
if self.axis == 1: result = result.T - return self._reindex_output(result) + # Note: we only need to pass datetime=True in order to get numeric + # values converted + return self._reindex_output(result)._convert(datetime=True) def _iterate_column_groupbys(self, obj: FrameOrSeries): for i, colname in enumerate(obj.columns): From e22cf97170d2f062729534090faf8e84ec7ed5c3 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 1 Jul 2021 16:37:53 -0700 Subject: [PATCH 8/9] move whatsnew to 1.4.0 --- doc/source/whatsnew/v1.3.0.rst | 1 - doc/source/whatsnew/v1.4.0.rst | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 26516ed66496f..59ec974aab0a4 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -275,7 +275,6 @@ Other enhancements - Add keyword ``sort`` to :func:`pivot_table` to allow non-sorting of the result (:issue:`39143`) - Add keyword ``dropna`` to :meth:`DataFrame.value_counts` to allow counting rows that include ``NA`` values (:issue:`41325`) - :meth:`Series.replace` will now cast results to ``PeriodDtype`` where possible instead of ``object`` dtype (:issue:`41526`) -- :class:`DataFrameGroupBy` operations with ``as_index=False`` now correctly retain ``ExtensionDtype`` dtypes for columns being grouped on (:issue:`41373`) - Improved error message in ``corr`` and ``cov`` methods on :class:`.Rolling`, :class:`.Expanding`, and :class:`.ExponentialMovingWindow` when ``other`` is not a :class:`DataFrame` or :class:`Series` (:issue:`41741`) - :meth:`Series.between` can now accept ``left`` or ``right`` as arguments to ``inclusive`` to include only the left or right boundary (:issue:`40245`) - :meth:`DataFrame.explode` now supports exploding multiple columns. Its ``column`` argument now also accepts a list of str or tuples for exploding on multiple columns at the same time (:issue:`39240`) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index d10eac2722e15..9d93d7d56ddf3 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -29,7 +29,7 @@ enhancement2 Other enhancements ^^^^^^^^^^^^^^^^^^ -- +- :class:`DataFrameGroupBy` operations with ``as_index=False`` now correctly retain ``ExtensionDtype`` dtypes for columns being grouped on (:issue:`41373`) - .. --------------------------------------------------------------------------- From a32cb83d0277b58051d3637ce673fd1f8ba7a3d1 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 8 Jul 2021 10:40:35 -0700 Subject: [PATCH 9/9] revert unnecessary convert_datetime --- pandas/core/groupby/ops.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index f51eefedd918b..762aca9ea5d9f 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -1028,10 +1028,7 @@ def agg_series( else: result = self._aggregate_series_fast(obj, func) - convert_datetime = obj.dtype.kind == "M" - npvalues = lib.maybe_convert_objects( - result, try_float=False, convert_datetime=convert_datetime - ) + npvalues = lib.maybe_convert_objects(result, try_float=False) if preserve_dtype: out = maybe_cast_pointwise_result(npvalues, obj.dtype, numeric_only=True) else:
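
Illustrative usage (a minimal sketch, not part of the patches above): the enhancement recorded in the v1.4.0 whatsnew entry, retaining ExtensionDtype for columns being grouped on when as_index=False, can be exercised roughly as follows. The column names, the sample values, and the choice of sum() are illustrative only, and the "Int64" dtype is kept for the grouped-on column only on a pandas build that includes this change (GH#41373); older builds return a plain int64 column.

    import pandas as pd

    # "a" is backed by a masked ExtensionArray (nullable Int64), "b" is plain float64.
    df = pd.DataFrame(
        {
            "a": pd.array([1, 1, 2, 2], dtype="Int64"),
            "b": [1.0, 2.0, 3.0, 4.0],
        }
    )

    # With as_index=False the grouping column comes back as a regular column.
    result = df.groupby("a", as_index=False).sum()

    # With this change result["a"].dtype is Int64; previously it was cast to int64.
    print(result.dtypes)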