diff --git a/doc/source/whatsnew/v0.16.0.txt b/doc/source/whatsnew/v0.16.0.txt index e8b398aec4b74..0234a0dab8e28 100644 --- a/doc/source/whatsnew/v0.16.0.txt +++ b/doc/source/whatsnew/v0.16.0.txt @@ -174,6 +174,7 @@ Performance - Performance improvement of up to 10x in ``DataFrame.count`` and ``DataFrame.dropna`` by taking advantage of homogeneous/heterogeneous dtypes appropriately (:issue:`9136`) - Performance improvement of up to 20x in ``DataFrame.count`` when using a ``MultiIndex`` and the ``level`` keyword argument (:issue:`9163`) - Performance and memory usage improvements in ``merge`` when key space exceeds ``int64`` bounds (:issue:`9151`) +- Performance improvements in multi-key ``groupby`` (:issue:`9429`) Bug Fixes ~~~~~~~~~ diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 28a1656832d56..0a12484f9ab3a 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1217,11 +1217,9 @@ class BaseGrouper(object): """ def __init__(self, axis, groupings, sort=True, group_keys=True): - self.axis = axis - self.groupings = groupings - self.sort = sort - self.group_keys = group_keys - self.compressed = True + self._filter_empty_groups = self.compressed = len(groupings) != 1 + self.axis, self.groupings, self.sort, self.group_keys = \ + axis, groupings, sort, group_keys @property def shape(self): @@ -1373,31 +1371,34 @@ def _get_compressed_labels(self): return _compress_group_index(group_index) ping = self.groupings[0] - self.compressed = False - self._filter_empty_groups = False - return ping.labels, np.arange(len(ping.group_index)) @cache_readonly def ngroups(self): return len(self.result_index) + @property + def recons_labels(self): + comp_ids, obs_ids, _ = self.group_info + labels = (ping.labels for ping in self.groupings) + return decons_obs_group_ids(comp_ids, obs_ids, self.shape, labels) + @cache_readonly def result_index(self): - recons = self.get_group_levels() - return MultiIndex.from_arrays(recons, names=self.names) + if not self.compressed and len(self.groupings) == 1: + return self.groupings[0].group_index.rename(self.names[0]) - def get_group_levels(self): - comp_ids, obs_ids, _ = self.group_info + return MultiIndex(levels=[ping.group_index for ping in self.groupings], + labels=self.recons_labels, + verify_integrity=False, + names=self.names) + def get_group_levels(self): if not self.compressed and len(self.groupings) == 1: return [self.groupings[0].group_index] - recons_labels = decons_obs_group_ids(comp_ids, obs_ids, - self.shape, (ping.labels for ping in self.groupings)) - name_list = [] - for ping, labels in zip(self.groupings, recons_labels): + for ping, labels in zip(self.groupings, self.recons_labels): labels = com._ensure_platform_int(labels) levels = ping.group_index.take(labels) @@ -1432,8 +1433,6 @@ def get_group_levels(self): _name_functions = {} - _filter_empty_groups = True - def _get_aggregate_function(self, how, values): dtype_str = values.dtype.name @@ -1797,8 +1796,6 @@ def size(self): 'ohlc': lambda *args: ['open', 'high', 'low', 'close'] } - _filter_empty_groups = True - def _aggregate(self, result, counts, values, how, is_numeric=True): agg_func, dtype = self._get_aggregate_function(how, values) diff --git a/vb_suite/groupby.py b/vb_suite/groupby.py index fd18c81a7d00d..eb690df4870e8 100644 --- a/vb_suite/groupby.py +++ b/vb_suite/groupby.py @@ -501,6 +501,25 @@ def f(g): groupby_int64_overflow = Benchmark("df.groupby(list('abcde')).max()", setup, name='groupby_int64_overflow') + +setup = common_setup + ''' +from itertools import product +from string import ascii_letters, digits + +n = 5 * 7 * 11 * (1 << 9) +alpha = list(map(''.join, product(ascii_letters + digits, repeat=4))) +f = lambda k: np.repeat(np.random.choice(alpha, n // k), k) + +df = DataFrame({'a': f(11), 'b': f(7), 'c': f(5), 'd': f(1)}) +df['joe'] = (np.random.randn(len(df)) * 10).round(3) + +i = np.random.permutation(len(df)) +df = df.iloc[i].reset_index(drop=True).copy() +''' + +groupby_multi_index = Benchmark("df.groupby(list('abcd')).max()", setup, + name='groupby_multi_index') + #---------------------------------------------------------------------- # groupby with a variable value for ngroups