diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 4c1399a0defe7..e434d1b3ac3e9 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -14,12 +14,39 @@ including other versions of pandas. Enhancements ~~~~~~~~~~~~ -.. _whatsnew_210.enhancements.enhancement1: +.. _whatsnew_210.enhancements.better_dtype_inference_for_frame_reductions: + +Better dtype inference when doing reductions on dataframes of nullable arrays +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Dtype inference when doing reductions on DataFrames with nullable arrays has been improved (:issue:`52707`). + +*Previous behavior*: + +.. code-block:: ipython + + In [1]: df = pd.DataFrame({"a": [1], "b": [pd.NA]}, dtype="Int64") + In [2]: df.sum() + a 1 + b 0 + dtype: int64 + In [3]: df.sum(min_count=1) + a 1 + b <NA> + dtype: object + +With the new behavior, we keep the original dtype: + +*New behavior*: + +.. ipython:: python + + df = pd.DataFrame({"a": [1], "b": [pd.NA]}, dtype="Int64") + df.sum() + df.sum(min_count=1) -enhancement1 -^^^^^^^^^^^^ -.. _whatsnew_210.enhancements.enhancement2: +.. 
_whatsnew_210.enhancements.map_works_for_all_array_types: ``map(func, na_action="ignore")`` now works for all array types ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 5341b87c39676..2de7eb923b841 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -49,6 +49,7 @@ from pandas._libs.hashtable import duplicated from pandas._libs.lib import ( NoDefault, + infer_dtype, is_range_indexer, no_default, ) @@ -94,6 +95,7 @@ is_dataclass, is_dict_like, is_dtype_equal, + is_extension_array_dtype, is_float, is_float_dtype, is_hashable, @@ -10899,14 +10901,29 @@ def _get_data() -> DataFrame: # simple case where we can use BlockManager.reduce res = df._mgr.reduce(blk_func) out = df._constructor(res).iloc[0] + mgr_dtypes = df._mgr.get_dtypes().tolist() + if out.dtype != object: + # e.g. if data dtype is UInt8 and out.dtype is uint64, then common is UInt64 + mgr_dtypes.append(out.dtype) + common_dtype = find_common_type(mgr_dtypes) if mgr_dtypes else None + is_ext_dtype = common_dtype is not None and is_extension_array_dtype( + common_dtype + ) + if out_dtype is not None: out = out.astype(out_dtype) + elif is_ext_dtype and out.dtype == common_dtype.type: + out = out.astype(common_dtype) elif (df._mgr.get_dtypes() == object).any(): out = out.astype(object) - elif len(self) == 0 and name in ("sum", "prod"): - # Even if we are object dtype, follow numpy and return - # float64, see test_apply_funcs_over_empty - out = out.astype(np.float64) + elif is_ext_dtype and out.dtype == object: + inferred_dtype = infer_dtype(out) + if isna(out).all(): + out = out.astype(common_dtype) + elif inferred_dtype == "integer": + out = out.astype("Int64") + elif inferred_dtype == "float": + out = out.astype("Float64") return out @@ -11157,11 +11174,6 @@ def idxmin( ) indices = res._values - # indices will always be np.ndarray since axis is not None and - # values is a 2d array for DataFrame - # error: Item "int" of 
"Union[int, Any]" has no attribute "__iter__" - assert isinstance(indices, np.ndarray) # for mypy - index = data._get_axis(axis) result = [index[i] if i >= 0 else np.nan for i in indices] final_result = data._constructor_sliced(result, index=data._get_agg_axis(axis)) @@ -11182,11 +11194,6 @@ def idxmax( ) indices = res._values - # indices will always be np.ndarray since axis is not None and - # values is a 2d array for DataFrame - # error: Item "int" of "Union[int, Any]" has no attribute "__iter__" - assert isinstance(indices, np.ndarray) # for mypy - index = data._get_axis(axis) result = [index[i] if i >= 0 else np.nan for i in indices] final_result = data._constructor_sliced(result, index=data._get_agg_axis(axis)) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 0d352b8e34f37..dee8744388678 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -6,7 +6,10 @@ import numpy as np import pytest -from pandas.compat import is_platform_windows +from pandas.compat import ( + IS64, + is_platform_windows, +) import pandas.util._test_decorators as td import pandas as pd @@ -29,6 +32,8 @@ nanops, ) +is_windows_or_is32 = is_platform_windows() or not IS64 + def assert_stat_op_calc( opname, @@ -917,7 +922,7 @@ def test_mean_extensionarray_numeric_only_true(self): arr = np.random.randint(1000, size=(10, 5)) df = DataFrame(arr, dtype="Int64") result = df.mean(numeric_only=True) - expected = DataFrame(arr).mean() + expected = DataFrame(arr, dtype="Float64").mean() tm.assert_series_equal(result, expected) def test_stats_mixed_type(self, float_string_frame): @@ -1544,6 +1549,100 @@ def test_reduction_timedelta_smallest_unit(self): tm.assert_series_equal(result, expected) +class TestEmptyDataFrameReductions: + @pytest.mark.parametrize( + "opname, dtype, exp_value, exp_dtype", + [ + ("sum", np.int8, 0, np.int64), + ("prod", np.int8, 1, np.int_), + ("sum", np.int64, 0, np.int64), + ("prod", 
np.int64, 1, np.int64), + ("sum", np.uint8, 0, np.int64), + ("prod", np.uint8, 1, np.uint), + ("sum", np.uint64, 0, np.int64), + ("prod", np.uint64, 1, np.uint64), + ("sum", np.float32, 0, np.float32), + ("prod", np.float32, 1, np.float32), + ("sum", np.float64, 0, np.float64), + ], + ) + def test_df_empty_min_count_0(self, opname, dtype, exp_value, exp_dtype): + df = DataFrame({0: [], 1: []}, dtype=dtype) + result = getattr(df, opname)(min_count=0) + + expected = Series([exp_value, exp_value], dtype=exp_dtype) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "opname, dtype, exp_dtype", + [ + ("sum", np.int8, np.float64), + ("prod", np.int8, np.float64), + ("sum", np.int64, np.float64), + ("prod", np.int64, np.float64), + ("sum", np.uint8, np.float64), + ("prod", np.uint8, np.float64), + ("sum", np.uint64, np.float64), + ("prod", np.uint64, np.float64), + ("sum", np.float32, np.float32), + ("prod", np.float32, np.float32), + ("sum", np.float64, np.float64), + ], + ) + def test_df_empty_min_count_1(self, opname, dtype, exp_dtype): + df = DataFrame({0: [], 1: []}, dtype=dtype) + result = getattr(df, opname)(min_count=1) + + expected = Series([np.nan, np.nan], dtype=exp_dtype) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "opname, dtype, exp_value, exp_dtype", + [ + ("sum", "Int8", 0, ("Int32" if is_windows_or_is32 else "Int64")), + ("prod", "Int8", 1, ("Int32" if is_windows_or_is32 else "Int64")), + ("sum", "Int64", 0, "Int64"), + ("prod", "Int64", 1, "Int64"), + ("sum", "UInt8", 0, ("UInt32" if is_windows_or_is32 else "UInt64")), + ("prod", "UInt8", 1, ("UInt32" if is_windows_or_is32 else "UInt64")), + ("sum", "UInt64", 0, "UInt64"), + ("prod", "UInt64", 1, "UInt64"), + ("sum", "Float32", 0, "Float32"), + ("prod", "Float32", 1, "Float32"), + ("sum", "Float64", 0, "Float64"), + ], + ) + def test_df_empty_nullable_min_count_0(self, opname, dtype, exp_value, exp_dtype): + df = DataFrame({0: [], 1: []}, dtype=dtype) 
+ result = getattr(df, opname)(min_count=0) + + expected = Series([exp_value, exp_value], dtype=exp_dtype) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "opname, dtype, exp_dtype", + [ + ("sum", "Int8", "Int8"), + ("prod", "Int8", "Int8"), + ("sum", "Int64", "Int64"), + ("prod", "Int64", "Int64"), + ("sum", "UInt8", "UInt8"), + ("prod", "UInt8", "UInt8"), + ("sum", "UInt64", "UInt64"), + ("prod", "UInt64", "UInt64"), + ("sum", "Float32", "Float32"), + ("prod", "Float32", "Float32"), + ("sum", "Float64", "Float64"), + ], + ) + def test_df_empty_nullable_min_count_1(self, opname, dtype, exp_dtype): + df = DataFrame({0: [], 1: []}, dtype=dtype) + result = getattr(df, opname)(min_count=1) + + expected = Series([pd.NA, pd.NA], dtype=exp_dtype) + tm.assert_series_equal(result, expected) + + class TestNuisanceColumns: @pytest.mark.parametrize("method", ["any", "all"]) def test_any_all_categorical_dtype_nuisance_column(self, method): @@ -1678,7 +1777,9 @@ def test_minmax_extensionarray(method, numeric_only): df = DataFrame({"Int64": ser}) result = getattr(df, method)(numeric_only=numeric_only) expected = Series( - [getattr(int64_info, method)], index=Index(["Int64"], dtype="object") + [getattr(int64_info, method)], + index=Index(["Int64"], dtype="object"), + dtype=pd.Int64Dtype(), ) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index e5599d60b4f0d..aaedf00932345 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -945,7 +945,7 @@ def test_apply_multi_level_name(category): b = pd.Categorical(b, categories=[1, 2, 3]) expected_index = pd.CategoricalIndex([1, 2, 3], categories=[1, 2, 3], name="B") # GH#40669 - summing an empty frame gives float dtype - expected_values = [20.0, 25.0, 0.0] + expected_values = [20, 25, 0] else: expected_index = Index([1, 2], name="B") expected_values = [20, 25]