From 07fe1954fc087e503892cb449f8fb2c05f2da0d9 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 27 Apr 2025 10:15:18 -0400 Subject: [PATCH 1/3] BUG: groupby.groups with NA categories fails --- pandas/core/groupby/grouper.py | 22 +++++++++++++++++++--- pandas/tests/groupby/test_categorical.py | 17 +++++++++++++++++ 2 files changed, 36 insertions(+), 3 deletions(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index c9d874fc08dbe..f5c6a2bbacbef 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -12,11 +12,16 @@ import numpy as np +from pandas._libs import ( + algos as libalgos, +) from pandas._libs.tslibs import OutOfBoundsDatetime from pandas.errors import InvalidIndexError from pandas.util._decorators import cache_readonly from pandas.core.dtypes.common import ( + ensure_int64, + ensure_platform_int, is_list_like, is_scalar, ) @@ -38,7 +43,10 @@ ) from pandas.core.series import Series -from pandas.io.formats.printing import pprint_thing +from pandas.io.formats.printing import ( + PrettyDict, + pprint_thing, +) if TYPE_CHECKING: from collections.abc import ( @@ -668,8 +676,16 @@ def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]: def groups(self) -> dict[Hashable, Index]: codes, uniques = self._codes_and_uniques uniques = Index._with_infer(uniques, name=self.name) - cats = Categorical.from_codes(codes, uniques, validate=False) - return self._index.groupby(cats) + + r, counts = libalgos.groupsort_indexer(ensure_platform_int(codes), len(uniques)) + counts = ensure_int64(counts).cumsum() + _result = (r[start:end] for start, end in zip(counts, counts[1:])) + result = dict(zip(uniques, _result)) + + # map to the label + result = {k: self._index.take(v) for k, v in result.items()} + + return PrettyDict(result) @property def observed_grouping(self) -> Grouping: diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 
e49be8c00b426..cae3013642739 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -506,6 +506,23 @@ def test_observed_groups(observed): tm.assert_dict_equal(result, expected) +def test_groups_na_category(dropna, observed): + # https://github.com/pandas-dev/pandas/issues/61356 + df = DataFrame( + {"cat": Categorical(["a", np.nan, "a"], categories=list("adb"))}, + index=list("xyz"), + ) + g = df.groupby("cat", observed=observed, dropna=dropna) + + result = g.groups + expected = {"a": Index(["x", "z"])} + if not dropna: + expected |= {np.nan: Index(["y"])} + if not observed: + expected |= {"b": Index([]), "d": Index([])} + tm.assert_dict_equal(result, expected) + + @pytest.mark.parametrize( "keys, expected_values, expected_index_levels", [ From 6e3ecf35f4c71759a5ae2224027a51ebd277577f Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 27 Apr 2025 10:18:49 -0400 Subject: [PATCH 2/3] cleanup --- pandas/core/groupby/grouper.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index f5c6a2bbacbef..f8e92b7e2650a 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -680,10 +680,8 @@ def groups(self) -> dict[Hashable, Index]: r, counts = libalgos.groupsort_indexer(ensure_platform_int(codes), len(uniques)) counts = ensure_int64(counts).cumsum() _result = (r[start:end] for start, end in zip(counts, counts[1:])) - result = dict(zip(uniques, _result)) - # map to the label - result = {k: self._index.take(v) for k, v in result.items()} + result = {k: self._index.take(v) for k, v in zip(uniques, _result)} return PrettyDict(result) From cec8b331ed6601dc18a34acb275c602e736af09e Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 27 Apr 2025 10:51:32 -0400 Subject: [PATCH 3/3] whatsnew --- doc/source/whatsnew/v3.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v3.0.0.rst 
b/doc/source/whatsnew/v3.0.0.rst index fe2886a022ad5..5223b36006102 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -806,6 +806,7 @@ Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ - Bug in :meth:`.DataFrameGroupBy.__len__` and :meth:`.SeriesGroupBy.__len__` would raise when the grouping contained NA values and ``dropna=False`` (:issue:`58644`) - Bug in :meth:`.DataFrameGroupBy.any` that returned True for groups where all Timedelta values are NaT. (:issue:`59712`) +- Bug in :meth:`.DataFrameGroupBy.groups` and :meth:`.SeriesGroupBy.groups` that would fail when the grouping was a :class:`Categorical` containing an NA value (:issue:`61356`) - Bug in :meth:`.DataFrameGroupBy.groups` and :meth:`.SeriesGroupby.groups` that would not respect groupby argument ``dropna`` (:issue:`55919`) - Bug in :meth:`.DataFrameGroupBy.median` where nat values gave an incorrect result. (:issue:`57926`) - Bug in :meth:`.DataFrameGroupBy.quantile` when ``interpolation="nearest"`` is inconsistent with :meth:`DataFrame.quantile` (:issue:`47942`)