From 6a68c219dffa79521ecf9df0c985151a5f35d354 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 7 May 2021 14:53:05 -0700 Subject: [PATCH 1/2] REF: do less in Grouping.__init__ --- pandas/core/groupby/grouper.py | 93 ++++++++++++++++++++++------------ 1 file changed, 60 insertions(+), 33 deletions(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index f1762a2535ff7..8b4d3fab8f950 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -439,6 +439,9 @@ class Grouping: * groups : dict of {group -> label_list} """ + _codes: np.ndarray | None = None + _group_index: Index | None = None + def __init__( self, index: Index, @@ -462,6 +465,8 @@ def __init__( self.in_axis = in_axis self.dropna = dropna + self._passed_categorical = False + # right place for this? if isinstance(grouper, (Series, Index)) and name is None: self.name = grouper.name @@ -472,20 +477,16 @@ def __init__( # we have a single grouper which may be a myriad of things, # some of which are dependent on the passing in level - if level is not None: - if not isinstance(level, int): - if level not in index.names: - raise AssertionError(f"Level {level} not in index") - level = index.names.index(level) - + ilevel = self._ilevel + if ilevel is not None: if self.name is None: - self.name = index.names[level] + self.name = index.names[ilevel] ( - self.grouper, + self.grouper, # Index self._codes, self._group_index, - ) = index._get_grouper_for_level(self.grouper, level) + ) = index._get_grouper_for_level(self.grouper, ilevel) # a passed Grouper like, directly get the grouper in the same way # as single grouper groupby, use the group_info to get codes @@ -509,37 +510,24 @@ def __init__( if self.grouper is None and self.name is not None and self.obj is not None: self.grouper = self.obj[self.name] + if self.grouper.ndim > 1: + # i.e. 
DataFrame case reachable if columns non-unique + t = self.name or str(type(self.grouper)) + raise ValueError(f"Grouper for '{t}' not 1-dimensional") + elif isinstance(self.grouper, (list, tuple)): self.grouper = com.asarray_tuplesafe(self.grouper) # a passed Categorical elif is_categorical_dtype(self.grouper): + self._passed_categorical = True self.grouper, self.all_grouper = recode_for_groupby( self.grouper, self.sort, observed ) - categories = self.grouper.categories - - # we make a CategoricalIndex out of the cat grouper - # preserving the categories / ordered attributes - self._codes = self.grouper.codes - if observed: - codes = algorithms.unique1d(self.grouper.codes) - codes = codes[codes != -1] - if sort or self.grouper.ordered: - codes = np.sort(codes) - else: - codes = np.arange(len(categories)) - - self._group_index = CategoricalIndex( - Categorical.from_codes( - codes=codes, categories=categories, ordered=self.grouper.ordered - ), - name=self.name, - ) # we are done - if isinstance(self.grouper, Grouping): + elif isinstance(self.grouper, Grouping): self.grouper = self.grouper.grouper # no level passed @@ -577,8 +565,20 @@ def __repr__(self) -> str: def __iter__(self): return iter(self.indices) - _codes: np.ndarray | None = None - _group_index: Index | None = None + @cache_readonly + def _ilevel(self) -> int | None: + """ + If necessary, convert the index level name to an index level position. 
+ """ + level = self.level + if level is None: + return None + if not isinstance(level, int): + index = self.index + if level not in index.names: + raise AssertionError(f"Level {level} not in index") + return index.names.index(level) + return level @property def ngroups(self) -> int: @@ -595,6 +595,12 @@ def indices(self): @property def codes(self) -> np.ndarray: + if self._passed_categorical: + # the passed Categorical was recoded in __init__, so its + # codes are returned directly instead of being recomputed + cat = self.grouper + return cat.codes + if self._codes is None: self._make_codes() # error: Incompatible return value type (got "Optional[ndarray]", @@ -605,12 +611,33 @@ def codes(self) -> np.ndarray: def result_index(self) -> Index: if self.all_grouper is not None: group_idx = self.group_index - assert isinstance(group_idx, CategoricalIndex) # set in __init__ + assert isinstance(group_idx, CategoricalIndex) return recode_from_groupby(self.all_grouper, self.sort, group_idx) return self.group_index - @property + @cache_readonly def group_index(self) -> Index: + if self._passed_categorical: + # we make a CategoricalIndex out of the cat grouper + # preserving the categories / ordered attributes + cat = self.grouper + categories = cat.categories + + if self.observed: + codes = algorithms.unique1d(cat.codes) + codes = codes[codes != -1] + if self.sort or cat.ordered: + codes = np.sort(codes) + else: + codes = np.arange(len(categories)) + + return CategoricalIndex( + Categorical.from_codes( + codes=codes, categories=categories, ordered=cat.ordered + ), + name=self.name, + ) + if self._group_index is None: self._make_codes() assert self._group_index is not None From 1b04e51f21f67f2101e248355d8a71638a5532e3 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 7 May 2021 15:45:14 -0700 Subject: [PATCH 2/2] mypy fixup --- pandas/core/groupby/generic.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py 
index 55e8578b2cef4..868dffb5e1b04 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -777,11 +777,7 @@ def apply_series_value_counts(): # multi-index components codes = self.grouper.reconstructed_codes codes = [rep(level_codes) for level_codes in codes] + [llab(lab, inc)] - # error: List item 0 has incompatible type "Union[ndarray, Any]"; - # expected "Index" - levels = [ping.group_index for ping in self.grouper.groupings] + [ - lev # type: ignore[list-item] - ] + levels = [ping.group_index for ping in self.grouper.groupings] + [lev] names = self.grouper.names + [self._selection_name] if dropna: