From 45b3dc7b0c1bb2080002f947eb4cc52c5cb1f2d8 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Sat, 11 Sep 2021 22:59:20 -0400 Subject: [PATCH 1/6] BUG/PERF: sparse min/max don't densify --- asv_bench/benchmarks/sparse.py | 14 +++++ doc/source/whatsnew/v1.4.0.rst | 1 + pandas/core/arrays/sparse/array.py | 69 ++++++++++++++++++++---- pandas/tests/arrays/sparse/test_array.py | 56 ++++++++++++++----- 4 files changed, 115 insertions(+), 25 deletions(-) diff --git a/asv_bench/benchmarks/sparse.py b/asv_bench/benchmarks/sparse.py index bcc3edab4a349..1d684a64cad70 100644 --- a/asv_bench/benchmarks/sparse.py +++ b/asv_bench/benchmarks/sparse.py @@ -166,4 +166,18 @@ def time_division(self, fill_value): self.arr1 / self.arr2 +class MinMax: + + params = ["min", "max"] + param_names = ["func"] + + def setup(self, func): + N = 100000 + arr = make_array(N, 1e-4, np.nan, np.float64) + self.sp_arr = SparseArray(arr) + + def time_min_max(self, func): + getattr(self.sp_arr, func) + + from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 328499a4ae98e..b6aca2e8dacd4 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -300,6 +300,7 @@ Performance improvements - Performance improvement in :meth:`Series.sparse.to_coo` (:issue:`42880`) - Performance improvement in indexing with a :class:`MultiIndex` indexer on another :class:`MultiIndex` (:issue:43370`) - Performance improvement in :meth:`GroupBy.quantile` (:issue:`43469`) +- :meth:`SparseArray.min` and :meth:`SparseArray.max` no longer require converting to a dense array (:issue:`?`) - .. --------------------------------------------------------------------------- diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 77142ef450487..7d4773199bed7 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -1433,23 +1433,70 @@ def mean(self, axis=0, *args, **kwargs): nsparse = self.sp_index.ngaps return (sp_sum + self.fill_value * nsparse) / (ct + nsparse) - def max(self, axis=0, *args, **kwargs): + def max(self, axis: int = 0, *args, **kwargs) -> Scalar: + """ + Max of non-NA/null values + + Parameters + ---------- + axis : int, default 0 + Not Used. NumPy compatibility. + *args, **kwargs + Not Used. NumPy compatibility. + + Returns + ------- + scalar + """ nv.validate_max(args, kwargs) + return self._min_max("max") - # This condition returns a nan if there are no valid values in the array. - if self.size > 0 and self._valid_sp_values.size == 0: - return self.fill_value - else: - return np.nanmax(self, axis) + def min(self, axis: int = 0, *args, **kwargs) -> Scalar: + """ + Min of non-NA/null values + + Parameters + ---------- + axis : int, default 0 + Not Used. NumPy compatibility. + *args, **kwargs + Not Used. NumPy compatibility. - def min(self, axis=0, *args, **kwargs): + Returns + ------- + scalar + """ nv.validate_min(args, kwargs) + return self._min_max("min") - # This condition returns a nan if there are no valid values in the array. - if self.size > 0 and self._valid_sp_values.size == 0: - return self.fill_value + def _min_max(self, kind: Literal["min", "max"]) -> Scalar: + """ + Min/max of non-NA/null values + + Parameters + ---------- + kind : {"min", "max"} + + Returns + ------- + scalar + """ + valid_vals = self._valid_sp_values + has_nonnull_fill_vals = not self._null_fill_value and self.sp_index.ngaps > 0 + if len(valid_vals) > 0: + sp_min_max = getattr(valid_vals, kind)() + + # If a non-null fill value is currently present, it might be the max + if has_nonnull_fill_vals: + func = max if kind == "max" else min + return func(sp_min_max, self.fill_value) + else: + return sp_min_max else: - return np.nanmin(self, axis) + if has_nonnull_fill_vals: + return self.fill_value + else: + return na_value_for_dtype(self.dtype.subtype) # ------------------------------------------------------------------------ # Ufuncs diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py index fbbe521b0eaf1..7c3a3367169fb 100644 --- a/pandas/tests/arrays/sparse/test_array.py +++ b/pandas/tests/arrays/sparse/test_array.py @@ -1359,26 +1359,54 @@ def test_drop_duplicates_fill_value(): class TestMinMax: - plain_data = np.arange(5).astype(float) - data_neg = plain_data * (-1) - data_NaN = SparseArray(np.array([0, 1, 2, np.nan, 4])) - data_all_NaN = SparseArray(np.array([np.nan, np.nan, np.nan, np.nan, np.nan])) - data_NA_filled = SparseArray( - np.array([np.nan, np.nan, np.nan, np.nan, np.nan]), fill_value=5 - ) - @pytest.mark.parametrize( "raw_data,max_expected,min_expected", [ - (plain_data, [4], [0]), - (data_neg, [0], [-4]), - (data_NaN, [4], [0]), - (data_all_NaN, [np.nan], [np.nan]), - (data_NA_filled, [5], [5]), + (np.arange(5.0), [4], [0]), + (-np.arange(5.0), [0], [-4]), + (np.array([0, 1, 2, np.nan, 4]), [4], [0]), + (np.array([np.nan] * 5), [np.nan], [np.nan]), + (np.array([]), [np.nan], [np.nan]), ], ) - def test_maxmin(self, raw_data, max_expected, min_expected): + def test_nan_fill_value(self, raw_data, max_expected, min_expected): max_result = SparseArray(raw_data).max() min_result = SparseArray(raw_data).min() assert max_result in max_expected assert min_result in min_expected + + @pytest.mark.parametrize( + "fill_value,max_expected,min_expected", + [ + (100, 100, 0), + (-100, 1, -100), + ], + ) + def test_fill_value(self, fill_value, max_expected, min_expected): + arr = SparseArray( + np.array([fill_value, 0, 1]), dtype=SparseDtype("int", fill_value) + ) + max_result = arr.max() + assert max_result == max_expected + + min_result = arr.min() + assert min_result == min_expected + + @pytest.mark.parametrize("func", ["min", "max"]) + @pytest.mark.parametrize("data", [np.array([]), np.array([np.nan, np.nan])]) + @pytest.mark.parametrize( + "dtype,expected", + [ + (SparseDtype(np.float64, np.nan), np.nan), + (SparseDtype(np.float64, 5.0), np.nan), + (SparseDtype("datetime64[ns]", pd.NaT), pd.NaT), + (SparseDtype("datetime64[ns]", pd.to_datetime("2018-05-05")), pd.NaT), + ], + ) + def test_na_value_if_no_valid_values(self, func, data, dtype, expected): + arr = SparseArray(data, dtype=dtype) + result = getattr(arr, func)() + if expected == pd.NaT: + assert result == pd.NaT + else: + assert np.isnan(result) From f3eea3132a97a3db3fd34617404f0b949610b344 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Sat, 11 Sep 2021 23:08:18 -0400 Subject: [PATCH 2/6] Update whatsnew --- doc/source/whatsnew/v1.4.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index b6aca2e8dacd4..41077637e5ac1 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -436,7 +436,7 @@ Reshaping Sparse ^^^^^^ - Bug in :meth:`DataFrame.sparse.to_coo` raising ``AttributeError`` when column names are not unique (:issue:`29564`) -- +- Bug in :meth:`SparseArray.max` and :meth:`SparseArray.min` raising ``ValueError`` for arrays with 0 non-null elements (:issue:`?`) - ExtensionArray From ef4ddc8a545d5f2fdc9d7a5e16acc5eae67c48ac Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Sat, 11 Sep 2021 23:26:26 -0400 Subject: [PATCH 3/6] Update benchmark --- asv_bench/benchmarks/sparse.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/asv_bench/benchmarks/sparse.py b/asv_bench/benchmarks/sparse.py index 1d684a64cad70..32baec25bef2d 100644 --- a/asv_bench/benchmarks/sparse.py +++ b/asv_bench/benchmarks/sparse.py @@ -168,16 +168,16 @@ def time_division(self, fill_value): class MinMax: - params = ["min", "max"] - param_names = ["func"] + params = (["min", "max"], [0.0, np.nan]) + param_names = ["func", "fill_value"] - def setup(self, func): - N = 100000 - arr = make_array(N, 1e-4, np.nan, np.float64) - self.sp_arr = SparseArray(arr) + def setup(self, func, fill_value): + N = 1_000_000 + arr = make_array(N, 1e-5, fill_value, np.float64) + self.sp_arr = SparseArray(arr, fill_value=fill_value) - def time_min_max(self, func): - getattr(self.sp_arr, func) + def time_min_max(self, func, fill_value): + getattr(self.sp_arr, func)() from .pandas_vb_common import setup # noqa: F401 isort:skip From b82cde32d872c4e1164dba4afca62a2f5e480e73 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Sat, 11 Sep 2021 23:30:38 -0400 Subject: [PATCH 4/6] Update comment --- pandas/core/arrays/sparse/array.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 7d4773199bed7..852cf219b8c2c 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -1486,7 +1486,7 @@ def _min_max(self, kind: Literal["min", "max"]) -> Scalar: if len(valid_vals) > 0: sp_min_max = getattr(valid_vals, kind)() - # If a non-null fill value is currently present, it might be the max + # If a non-null fill value is currently present, it might be the min/max if has_nonnull_fill_vals: func = max if kind == "max" else min return func(sp_min_max, self.fill_value) From 2d2545b2396875e7ebd6d9440a1fcc9ba1e395f6 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Sat, 11 Sep 2021 23:34:34 -0400 Subject: [PATCH 5/6] Update pr number --- doc/source/whatsnew/v1.4.0.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 41077637e5ac1..2e0223407e084 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -300,7 +300,7 @@ Performance improvements - Performance improvement in :meth:`Series.sparse.to_coo` (:issue:`42880`) - Performance improvement in indexing with a :class:`MultiIndex` indexer on another :class:`MultiIndex` (:issue:43370`) - Performance improvement in :meth:`GroupBy.quantile` (:issue:`43469`) -- :meth:`SparseArray.min` and :meth:`SparseArray.max` no longer require converting to a dense array (:issue:`?`) +- :meth:`SparseArray.min` and :meth:`SparseArray.max` no longer require converting to a dense array (:issue:`43526`) - .. --------------------------------------------------------------------------- @@ -436,7 +436,8 @@ Reshaping Sparse ^^^^^^ - Bug in :meth:`DataFrame.sparse.to_coo` raising ``AttributeError`` when column names are not unique (:issue:`29564`) -- Bug in :meth:`SparseArray.max` and :meth:`SparseArray.min` raising ``ValueError`` for arrays with 0 non-null elements (:issue:`?`) +- Bug in :meth:`SparseArray.max` and :meth:`SparseArray.min` raising ``ValueError`` for arrays with 0 non-null elements (:issue:`43527`) +- - ExtensionArray From 0ce141bc4466ff805574a1709f8af3b0ab76e79c Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Sun, 12 Sep 2021 11:20:16 -0400 Subject: [PATCH 6/6] Structure with elif instead --- pandas/core/arrays/sparse/array.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index d0722876d81cd..f715a4d47ab30 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -1515,11 +1515,10 @@ def _min_max(self, kind: Literal["min", "max"]) -> Scalar: return func(sp_min_max, self.fill_value) else: return sp_min_max + elif has_nonnull_fill_vals: + return self.fill_value else: - if has_nonnull_fill_vals: - return self.fill_value - else: - return na_value_for_dtype(self.dtype.subtype) + return na_value_for_dtype(self.dtype.subtype) # ------------------------------------------------------------------------ # Ufuncs