diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 19b448a1871c2..19facd1569095 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -347,6 +347,7 @@ Bug fixes - Fixed bug in :meth:`Series.diff` allowing non-integer values for the ``periods`` argument. (:issue:`56607`) - Fixed bug in :meth:`Series.rank` that doesn't preserve missing values for nullable integers when ``na_option='keep'``. (:issue:`56976`) - Fixed bug in :meth:`Series.replace` and :meth:`DataFrame.replace` inconsistently replacing matching instances when ``regex=True`` and missing values are present. (:issue:`56599`) +- Fixed bug in :meth:`Series.kurt` with low variance arrays getting zeroed out even when numerically stable. (:issue:`57972`) Categorical ^^^^^^^^^^^ diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 22092551ec882..bae353953ba14 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -1351,9 +1351,13 @@ def nankurt( # floating point error # # #18044 in _libs/windows.pyx calc_kurt follow this behavior - # to fix the fperr to treat denom <1e-14 as zero - numerator = _zero_out_fperr(numerator) - denominator = _zero_out_fperr(denominator) + # to fix the fperr to treat denom <1e-14 as zero (default cutoff) + # GH-57972 set cutoff lower for low variance arrays to prevent cutoff of otherwise + # numerically stable values. 
scipy.stats.kurtosis and this implementation start + # diverging for examples with cutoffs below e-281 + cutoff = 1e-281 + numerator = _zero_out_fperr(numerator, cutoff) + denominator = _zero_out_fperr(denominator, cutoff) if not isinstance(denominator, np.ndarray): # if ``denom`` is a scalar, check these corner cases first before @@ -1565,12 +1569,12 @@ def check_below_min_count( return False -def _zero_out_fperr(arg): +def _zero_out_fperr(arg, cutoff=1e-14): # #18044 reference this behavior to fix rolling skew/kurt issue if isinstance(arg, np.ndarray): - return np.where(np.abs(arg) < 1e-14, 0, arg) + return np.where(np.abs(arg) < cutoff, 0, arg) else: - return arg.dtype.type(0) if np.abs(arg) < 1e-14 else arg + return arg.dtype.type(0) if np.abs(arg) < cutoff else arg @disallow("M8", "m8") diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py index ce41f1e76de79..a74a97d351fda 100644 --- a/pandas/tests/test_nanops.py +++ b/pandas/tests/test_nanops.py @@ -1105,6 +1105,21 @@ def test_nans_skipna(self, samples, actual_kurt): kurt = nanops.nankurt(samples, skipna=True) tm.assert_almost_equal(kurt, actual_kurt) + def test_arrays_with_low_variance(self): + # GH-57972 + # sample arrays with low variance have a lower threshold for breakdown + # of numerical stability and should be handled accordingly + n = 10_000 + n2 = 10 + # scipy.stats.kurtosis is nan at e-81, + # both kurtosis start diverging from each other around e-76 + scale = 1e-72 + low_var = np.array([-2.3 * scale] * n2 + [-4.1 * scale] * n2 + [0.0] * n) + # calculated with scipy.stats.kurtosis(low_var, bias=False) + scipy_kurt = 632.556235239126 + kurt = nanops.nankurt(low_var) + tm.assert_almost_equal(kurt, scipy_kurt) + @property def prng(self): return np.random.default_rng(2)