From a7ed22b6293af5c0d5cd998ff95d9a483f7b3c53 Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Thu, 26 Aug 2021 22:18:16 +0530 Subject: [PATCH 1/4] BUG: groupby.agg incorrectly dropping nan --- doc/source/whatsnew/v1.3.3.rst | 2 +- pandas/core/groupby/groupby.py | 6 +++++- pandas/tests/groupby/test_apply.py | 14 ++++++++++++++ 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.3.3.rst b/doc/source/whatsnew/v1.3.3.rst index 1340188c3d609..eba10598f08a0 100644 --- a/doc/source/whatsnew/v1.3.3.rst +++ b/doc/source/whatsnew/v1.3.3.rst @@ -17,7 +17,7 @@ Fixed regressions - Fixed regression in :class:`DataFrame` constructor failing to broadcast for defined :class:`Index` and len one list of :class:`Timestamp` (:issue:`42810`) - Performance regression in :meth:`core.window.ewm.ExponentialMovingWindow.mean` (:issue:`42333`) - Fixed regression in :meth:`.GroupBy.agg` incorrectly raising in some cases (:issue:`42390`) -- +- Fixed regression in :meth:`.GroupBy.agg` where ``nan`` values were dropped even with ``dropna=False`` (:issue:`43205`) .. 
--------------------------------------------------------------------------- diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index b1f3ddcfac069..01079df2590fb 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1010,7 +1010,11 @@ def reset_identity(values): if not not_indexed_same: result = concat(values, axis=self.axis) - ax = self.filter(lambda x: True).axes[self.axis] + ax = ( + self.filter(lambda x: True).axes[self.axis] + if self.dropna + else self._selected_obj._get_axis(self.axis) + ) # this is a very unfortunate situation # we can't use reindex to restore the original order diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 25529e65118c8..7afade8ee8e86 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -1150,3 +1150,17 @@ def test_doctest_example2(): {"B": [1.0, 0.0], "C": [2.0, 0.0]}, index=Index(["a", "b"], name="A") ) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("dropna", [True, False]) +def test_apply_dropna_with_indexed_same2(dropna): + # GH#43205 + df = DataFrame( + { + "a": [1, 2, 3, 4, 5, 6, 7, 8, 9], + "b": [1, np.nan, 1, np.nan, 2, 1, 2, np.nan, 1], + } + ) + result = df.groupby("b", dropna=dropna).apply(lambda x: x) + expected = df.dropna() if dropna else df + tm.assert_frame_equal(result, expected) From de9fb75cfefa7805e6304eeb179ce082950e2bc5 Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Thu, 26 Aug 2021 23:03:54 +0530 Subject: [PATCH 2/4] changed .agg to .apply --- doc/source/whatsnew/v1.3.3.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.3.3.rst b/doc/source/whatsnew/v1.3.3.rst index eba10598f08a0..f52cf061c9ea9 100644 --- a/doc/source/whatsnew/v1.3.3.rst +++ b/doc/source/whatsnew/v1.3.3.rst @@ -17,7 +17,7 @@ Fixed regressions - Fixed regression in :class:`DataFrame` constructor failing to broadcast for defined :class:`Index` 
and len one list of :class:`Timestamp` (:issue:`42810`) - Performance regression in :meth:`core.window.ewm.ExponentialMovingWindow.mean` (:issue:`42333`) - Fixed regression in :meth:`.GroupBy.agg` incorrectly raising in some cases (:issue:`42390`) -- Fixed regression in :meth:`.GroupBy.agg` where ``nan`` values were dropped even with ``dropna=False`` (:issue:`43205`) +- Fixed regression in :meth:`.GroupBy.apply` where ``nan`` values were dropped even with ``dropna=False`` (:issue:`43205`) .. --------------------------------------------------------------------------- From 1be0556942b290a3130a9d1de7882db8515adc05 Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Sun, 5 Sep 2021 13:21:24 +0530 Subject: [PATCH 3/4] added test to existing --- pandas/tests/groupby/test_apply.py | 30 +++++------------------------- 1 file changed, 5 insertions(+), 25 deletions(-) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 0a199c0b7ddec..06aa7d4bc5d62 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -1062,9 +1062,10 @@ def test_apply_by_cols_equals_apply_by_rows_transposed(): tm.assert_frame_equal(by_cols, df) -def test_apply_dropna_with_indexed_same(): +@pytest.mark.parametrize("dropna", [True, False]) +def test_apply_dropna_with_indexed_same(dropna): # GH 38227 - + # GH#43205 df = DataFrame( { "col": [1, 2, 3, 4, 5], @@ -1072,15 +1073,8 @@ def test_apply_dropna_with_indexed_same(): }, index=list("xxyxz"), ) - result = df.groupby("group").apply(lambda x: x) - expected = DataFrame( - { - "col": [1, 4, 5], - "group": ["a", "b", "b"], - }, - index=list("xxz"), - ) - + result = df.groupby("group", dropna=dropna).apply(lambda x: x) + expected = df.dropna() if dropna else df tm.assert_frame_equal(result, expected) @@ -1151,17 +1145,3 @@ def test_doctest_example2(): {"B": [1.0, 0.0], "C": [2.0, 0.0]}, index=Index(["a", "b"], name="A") ) tm.assert_frame_equal(result, expected) - - 
-@pytest.mark.parametrize("dropna", [True, False]) -def test_apply_dropna_with_indexed_same2(dropna): - # GH#43205 - df = DataFrame( - { - "a": [1, 2, 3, 4, 5, 6, 7, 8, 9], - "b": [1, np.nan, 1, np.nan, 2, 1, 2, np.nan, 1], - } - ) - result = df.groupby("b", dropna=dropna).apply(lambda x: x) - expected = df.dropna() if dropna else df - tm.assert_frame_equal(result, expected) From db077c9479682fa449afdd41f8c3f0201f36c452 Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Sun, 5 Sep 2021 13:37:48 +0530 Subject: [PATCH 4/4] reordered df with dropna=False --- pandas/tests/groupby/test_apply.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 06aa7d4bc5d62..1aa5845e20753 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -1074,7 +1074,7 @@ def test_apply_dropna_with_indexed_same(dropna): index=list("xxyxz"), ) result = df.groupby("group", dropna=dropna).apply(lambda x: x) - expected = df.dropna() if dropna else df + expected = df.dropna() if dropna else df.iloc[[0, 3, 1, 2, 4]] tm.assert_frame_equal(result, expected)