From 82782234603650478e783714005e19760c3d5d1a Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 3 May 2021 15:54:50 -0700 Subject: [PATCH 1/2] REF: avoid unnecessary raising in object-dtype case --- pandas/_libs/reduction.pyx | 15 +++++++++------ pandas/core/groupby/generic.py | 2 +- pandas/core/groupby/ops.py | 2 +- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index 09999b6970bca..0eafccab34946 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -27,12 +27,15 @@ from pandas._libs.lib import ( ) -cpdef check_result_array(object obj): +cdef cnp.dtype _dtype_obj = np.dtype("object") - if (is_array(obj) or - (isinstance(obj, list) and len(obj) == 0) or - getattr(obj, 'shape', None) == (0,)): - raise ValueError('Must produce aggregated value') + +cpdef check_result_array(object obj, object dtype): + if is_array(obj): + if dtype != _dtype_obj: + # if it is object dtype, we the function can be a reduction/aggregation + # and still return an ndarray + raise ValueError("Must produce aggregated value") cdef class _BaseGrouper: @@ -89,7 +92,7 @@ cdef class _BaseGrouper: # On the first pass, we check the output shape to see # if this looks like a reduction. initialized = True - check_result_array(res) + check_result_array(res, cached_series.dtype) return res, initialized diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 4f60660dfb499..a69f7ef9dcd49 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -530,7 +530,7 @@ def _aggregate_named(self, func, *args, **kwargs): output = libreduction.extract_result(output) if not initialized: # We only do this validation on the first iteration - libreduction.check_result_array(output) + libreduction.check_result_array(output, group.dtype) initialized = True result[name] = output diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 975a902f49db9..d649240c1df88 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -1027,7 +1027,7 @@ def _aggregate_series_pure_python(self, obj: Series, func: F): if not initialized: # We only do this validation on the first iteration - libreduction.check_result_array(res) + libreduction.check_result_array(res, group.dtype) initialized = True counts[i] = group.shape[0] From f464bdcda64064ea46fedaabc7501df327f2e37a Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 3 May 2021 18:45:32 -0700 Subject: [PATCH 2/2] flesh out comment --- pandas/_libs/reduction.pyx | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index 0eafccab34946..9bef6cb428e8a 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -31,10 +31,13 @@ cdef cnp.dtype _dtype_obj = np.dtype("object") cpdef check_result_array(object obj, object dtype): + # Our operation is supposed to be an aggregation/reduction. If + # it returns an ndarray, this likely means an invalid operation has + # been passed. See test_apply_without_aggregation, test_agg_must_agg if is_array(obj): if dtype != _dtype_obj: - # if it is object dtype, we the function can be a reduction/aggregation - # and still return an ndarray + # If it is object dtype, the function can be a reduction/aggregation + # and still return an ndarray e.g. test_agg_over_numpy_arrays raise ValueError("Must produce aggregated value")