From 8ce450a6a6648cc8f7a51114f67be8b669fa24a8 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Tue, 30 Nov 2021 08:14:06 +0000 Subject: [PATCH 01/16] Implement indexing --- pandas/core/groupby/groupby.py | 14 +++++++++++++- pandas/core/groupby/indexing.py | 16 ++++++++++++++++ 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 8cd5712597fef..f0fe34044f0b9 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -100,7 +100,10 @@ class providing the base-class of operations. numba_, ops, ) -from pandas.core.groupby.indexing import GroupByIndexingMixin +from pandas.core.groupby.indexing import ( + GroupByIndexingMixin, + GroupByNthSelector, +) from pandas.core.indexes.api import ( CategoricalIndex, Index, @@ -902,6 +905,15 @@ def __getattr__(self, attr: str): f"'{type(self).__name__}' object has no attribute '{attr}'" ) + def __getattribute__(self, attr): + # Intercept nth to allow indexing + if attr == "nth": + return GroupByNthSelector(self) + elif attr == "nth_actual": + return super().__getattribute__("nth") + else: + return super().__getattribute__(attr) + @final def _make_wrapper(self, name: str) -> Callable: assert name in self._apply_allowlist diff --git a/pandas/core/groupby/indexing.py b/pandas/core/groupby/indexing.py index 4b3bb6bc0aa50..b94659135d415 100644 --- a/pandas/core/groupby/indexing.py +++ b/pandas/core/groupby/indexing.py @@ -3,6 +3,7 @@ from typing import ( TYPE_CHECKING, Iterable, + Literal, cast, ) @@ -281,3 +282,18 @@ def __getitem__(self, arg: PositionalIndexer | tuple) -> DataFrame | Series: self.groupby_object._reset_group_selection() mask = self.groupby_object._make_mask_from_positional_indexer(arg) return self.groupby_object._mask_selected_obj(mask) + + +class GroupByNthSelector: + def __init__(self, groupby_object: groupby.GroupBy): + self.groupby_object = groupby_object + + def __call__( + self, + n: PositionalIndexer | tuple, + dropna: Literal["any", "all", None] = None, + ) -> DataFrame | Series: + return self.groupby_object.nth_actual(n, dropna) + + def __getitem__(self, n: PositionalIndexer | tuple) -> DataFrame | Series: + return self.groupby_object.nth_actual(n) From 55148078ae8bcab53f5a933faa0d78e41421966f Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Tue, 30 Nov 2021 09:25:47 +0000 Subject: [PATCH 02/16] What's new and docstring --- doc/source/whatsnew/v1.4.0.rst | 8 ++++++++ pandas/core/groupby/groupby.py | 22 +++++++++++++++++++++- 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 71903d10a6983..a621bf14b6da1 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -164,6 +164,14 @@ Previously, negative arguments returned empty frames. df.groupby("A").nth(slice(1, -1)) df.groupby("A").nth([slice(None, 1), slice(-1, None)]) +:meth:`.GroupBy.nth` now accepts index notation. + +.. ipython:: python + + df.groupby("A").nth[1, -1] + df.groupby("A").nth[1:-1] + df.groupby("A").nth[:1, -1:] + .. _whatsnew_140.dict_tight: DataFrame.from_dict and DataFrame.to_dict have new ``'tight'`` option diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index f0fe34044f0b9..56737699c8c19 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -906,7 +906,7 @@ def __getattr__(self, attr: str): ) def __getattribute__(self, attr): - # Intercept nth to allow indexing + # Intercept nth to allow both call and index if attr == "nth": return GroupByNthSelector(self) elif attr == "nth_actual": @@ -2536,6 +2536,9 @@ def nth( """ Take the nth row from each group if n is an int, otherwise a subset of rows. + Can be either a call or an index. dropna is not available with index notation. + Index notation accepts a comma separated list of integers and slices. + If dropna, will take the nth non-null row, dropna is either 'all' or 'any'; this is equivalent to calling dropna(how=dropna) before the groupby. @@ -2547,6 +2550,7 @@ def nth( .. versionchanged:: 1.4.0 Added slice and lists containiing slices. + Added index notation. dropna : {'any', 'all', None}, default None Apply the specified dropna operation before counting which row is @@ -2592,6 +2596,22 @@ def nth( 1 2.0 2 3.0 + Index notation may also be used + + >>> g.nth[0, 1] + B + A + 1 NaN + 1 2.0 + 2 3.0 + 2 5.0 + >>> g.nth[:-1] + B + A + 1 NaN + 1 2.0 + 2 3.0 + Specifying `dropna` allows count ignoring ``NaN`` >>> g.nth(0, dropna='any') From 55fee64dbdac304c9a8f0967ce60fe09f74fdc08 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Tue, 30 Nov 2021 09:37:46 +0000 Subject: [PATCH 03/16] Update test_nth.py --- pandas/tests/groupby/test_nth.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pandas/tests/groupby/test_nth.py b/pandas/tests/groupby/test_nth.py index a5cb511763eee..515a815bad6a4 100644 --- a/pandas/tests/groupby/test_nth.py +++ b/pandas/tests/groupby/test_nth.py @@ -726,6 +726,15 @@ def test_slice(slice_test_df, slice_test_grouped, arg, expected_rows): tm.assert_frame_equal(result, expected) +def test_nth_indexed(slice_test_df, slice_test_grouped): + # Test index notation GH #44688 + + result = slice_test_grouped.nth[0, 1, -2:] + expected = slice_test_df.iloc[[0, 1, 2, 3, 4, 6, 7]] + + tm.assert_frame_equal(result, expected) + + def test_invalid_argument(slice_test_grouped): # Test for error on invalid argument From b5785d7e667eed2f67865d017ac08fceb7543101 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Tue, 30 Nov 2021 11:05:21 +0000 Subject: [PATCH 04/16] Trigger CI From 251dfdbe5a3b866c3a4877b21fa9dd1232253903 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Tue, 30 Nov 2021 14:03:58 +0000 Subject: [PATCH 05/16] Update indexing.py --- pandas/core/groupby/indexing.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/core/groupby/indexing.py b/pandas/core/groupby/indexing.py index b94659135d415..d103005043084 100644 --- a/pandas/core/groupby/indexing.py +++ b/pandas/core/groupby/indexing.py @@ -285,6 +285,9 @@ def __getitem__(self, arg: PositionalIndexer | tuple) -> DataFrame | Series: class GroupByNthSelector: + """ + Dynamically substituted for GroupBy.nth to enable both call and index + """ def __init__(self, groupby_object: groupby.GroupBy): self.groupby_object = groupby_object From 707e267589e23d93185dd43446704020ebfa4175 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Tue, 30 Nov 2021 14:13:37 +0000 Subject: [PATCH 06/16] Update indexing.py --- pandas/core/groupby/indexing.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/groupby/indexing.py b/pandas/core/groupby/indexing.py index d103005043084..f98bdf4b8be29 100644 --- a/pandas/core/groupby/indexing.py +++ b/pandas/core/groupby/indexing.py @@ -288,6 +288,7 @@ class GroupByNthSelector: """ Dynamically substituted for GroupBy.nth to enable both call and index """ + def __init__(self, groupby_object: groupby.GroupBy): self.groupby_object = groupby_object From e59facb087889999e0f32026d5c5e83b8c87664a Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Thu, 2 Dec 2021 13:44:15 +0000 Subject: [PATCH 07/16] Trigger CI From da69aa104ea618e8bf1f1a947b658f8b07d0ecdc Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Fri, 3 Dec 2021 09:28:03 +0000 Subject: [PATCH 08/16] Trigger CI From 8ab22c2365f9103ed7fee5479708a4b4cedb72a4 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Sat, 4 Dec 2021 08:52:44 +0000 Subject: [PATCH 09/16] arg type --- pandas/core/groupby/groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 56737699c8c19..2876ec1cb5a0d 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -905,7 +905,7 @@ def __getattr__(self, attr: str): f"'{type(self).__name__}' object has no attribute '{attr}'" ) - def __getattribute__(self, attr): + def __getattribute__(self, attr: str): # Intercept nth to allow both call and index if attr == "nth": return GroupByNthSelector(self) From 762fc73c8a9a5d1f27d36f423a6f5ce8f37e3731 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Sat, 4 Dec 2021 09:20:02 +0000 Subject: [PATCH 10/16] Add call equivalent to index --- pandas/tests/groupby/test_nth.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_nth.py b/pandas/tests/groupby/test_nth.py index 515a815bad6a4..972cbb60d8189 100644 --- a/pandas/tests/groupby/test_nth.py +++ b/pandas/tests/groupby/test_nth.py @@ -727,14 +727,17 @@ def test_slice(slice_test_df, slice_test_grouped, arg, expected_rows): def test_nth_indexed(slice_test_df, slice_test_grouped): - # Test index notation GH #44688 + # Test index notation GH #44688 result = slice_test_grouped.nth[0, 1, -2:] + equivalent = slice_test_grouped.nth([0, 1, slice(-2, None)]) expected = slice_test_df.iloc[[0, 1, 2, 3, 4, 6, 7]] + tm.assert_frame_equal(result, equivalent) tm.assert_frame_equal(result, expected) + def test_invalid_argument(slice_test_grouped): # Test for error on invalid argument From 83228d936852ed5717a5a432204823f0ae6f6673 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Sat, 4 Dec 2021 09:29:27 +0000 Subject: [PATCH 11/16] Update test_nth.py --- pandas/tests/groupby/test_nth.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/groupby/test_nth.py b/pandas/tests/groupby/test_nth.py index 972cbb60d8189..7aeb07ed2d69f 100644 --- a/pandas/tests/groupby/test_nth.py +++ b/pandas/tests/groupby/test_nth.py @@ -737,7 +737,6 @@ def test_nth_indexed(slice_test_df, slice_test_grouped): tm.assert_frame_equal(result, expected) - def test_invalid_argument(slice_test_grouped): # Test for error on invalid argument From b84a8a96c902dbb3fdca1c38ff8273c44b670a41 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Sat, 4 Dec 2021 10:11:10 +0000 Subject: [PATCH 12/16] Update test_nth.py --- pandas/tests/groupby/test_nth.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/tests/groupby/test_nth.py b/pandas/tests/groupby/test_nth.py index 7aeb07ed2d69f..751da3c0f3195 100644 --- a/pandas/tests/groupby/test_nth.py +++ b/pandas/tests/groupby/test_nth.py @@ -720,11 +720,12 @@ def test_groupby_last_first_nth_with_none(method, nulls_fixture): def test_slice(slice_test_df, slice_test_grouped, arg, expected_rows): # Test slices GH #42947 - result = slice_test_grouped.nth(arg) + result = slice_test_grouped.nth[arg] + equivalent = slice_test_grouped.nth(arg) expected = slice_test_df.iloc[expected_rows] tm.assert_frame_equal(result, expected) - + tm.assert_frame_equal(equivalent, expected) def test_nth_indexed(slice_test_df, slice_test_grouped): # Test index notation GH #44688 @@ -733,8 +734,8 @@ def test_nth_indexed(slice_test_df, slice_test_grouped): equivalent = slice_test_grouped.nth([0, 1, slice(-2, None)]) expected = slice_test_df.iloc[[0, 1, 2, 3, 4, 6, 7]] - tm.assert_frame_equal(result, equivalent) tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(equivalent, expected) def test_invalid_argument(slice_test_grouped): From 3360a53fbc2d25d73dbe863a661ab88fc90086d2 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Sat, 4 Dec 2021 10:20:45 +0000 Subject: [PATCH 13/16] Update test_nth.py --- pandas/tests/groupby/test_nth.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/groupby/test_nth.py b/pandas/tests/groupby/test_nth.py index 751da3c0f3195..50d08aa27c1d5 100644 --- a/pandas/tests/groupby/test_nth.py +++ b/pandas/tests/groupby/test_nth.py @@ -727,6 +727,7 @@ def test_slice(slice_test_df, slice_test_grouped, arg, expected_rows): tm.assert_frame_equal(result, expected) tm.assert_frame_equal(equivalent, expected) + def test_nth_indexed(slice_test_df, slice_test_grouped): # Test index notation GH #44688 From afbc458be68bb9af75b29e2ec9767d5755f62655 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Sat, 4 Dec 2021 10:54:43 +0000 Subject: [PATCH 14/16] Trigger CI From 1fa1c0f71be8d712cd0bd647e822488e7d9f3cf5 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Sat, 4 Dec 2021 15:42:51 +0000 Subject: [PATCH 15/16] Added slice axis=1 test --- pandas/tests/groupby/test_nth.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/pandas/tests/groupby/test_nth.py b/pandas/tests/groupby/test_nth.py index 50d08aa27c1d5..9fbf8f8e9adcf 100644 --- a/pandas/tests/groupby/test_nth.py +++ b/pandas/tests/groupby/test_nth.py @@ -782,3 +782,30 @@ def test_groupby_nth_with_column_axis(): ) expected.columns.name = "y" tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "start, stop, expected_values, expected_columns", + [ + (None, None, [0, 1, 2, 3, 4], [5, 5, 5, 6, 6]), + (None, 1, [0, 3], [5, 6]), + (None, 9, [0, 1, 2, 3, 4], [5, 5, 5, 6, 6]), + (None, -1, [0, 1, 3], [5, 5, 6]), + (1, None, [1, 2, 4], [5, 5, 6]), + (1, -1, [1], [5]), + (-1, None, [2, 4], [5, 6]), + (-1, 2, [4], [6]), + ], +) +@pytest.mark.parametrize("method", ["call", "index"]) +def test_nth_slices_with_column_axis( + start, stop, expected_values, expected_columns, method +): + df = pd.DataFrame([range(5)], columns=[list("ABCDE")]) + gb = df.groupby([5, 5, 5, 6, 6], axis=1) + result = { + "call": lambda start, stop: gb.nth(slice(start, stop)), + "index": lambda start, stop: gb.nth[start:stop], + }[method](start, stop) + expected = pd.DataFrame([expected_values], columns=expected_columns) + tm.assert_frame_equal(result, expected) From a1f0ca9186011c10b4d28b49d52091e6cc9ffdc2 Mon Sep 17 00:00:00 2001 From: John Zangwill Date: Sat, 4 Dec 2021 16:01:16 +0000 Subject: [PATCH 16/16] Update test_nth.py --- pandas/tests/groupby/test_nth.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/groupby/test_nth.py b/pandas/tests/groupby/test_nth.py index 9fbf8f8e9adcf..8a5f972c22640 100644 --- a/pandas/tests/groupby/test_nth.py +++ b/pandas/tests/groupby/test_nth.py @@ -801,11 +801,11 @@ def test_groupby_nth_with_column_axis(): def test_nth_slices_with_column_axis( start, stop, expected_values, expected_columns, method ): - df = pd.DataFrame([range(5)], columns=[list("ABCDE")]) + df = DataFrame([range(5)], columns=[list("ABCDE")]) gb = df.groupby([5, 5, 5, 6, 6], axis=1) result = { "call": lambda start, stop: gb.nth(slice(start, stop)), "index": lambda start, stop: gb.nth[start:stop], }[method](start, stop) - expected = pd.DataFrame([expected_values], columns=expected_columns) + expected = DataFrame([expected_values], columns=expected_columns) tm.assert_frame_equal(result, expected)