From 6c255d0a956c4b35f208cba673b52934c78b3d4c Mon Sep 17 00:00:00 2001 From: Tobias Pitters Date: Fri, 1 May 2020 01:56:51 +0200 Subject: [PATCH 1/9] API: make min/max on empty datetime df consistent with datetime series (#33704) --- pandas/core/nanops.py | 3 +-- pandas/tests/arithmetic/test_datetime64.py | 15 +++++++++++++++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 32b05872ded3f..8a90ccf0622b7 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -384,8 +384,7 @@ def _na_for_min_count( else: assert axis is not None # assertion to make mypy happy result_shape = values.shape[:axis] + values.shape[axis + 1 :] - result = np.empty(result_shape, dtype=values.dtype) - result.fill(fill_value) + result = np.full(result_shape, fill_value) return result diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py index 8c480faa4ee81..b131a2294780f 100644 --- a/pandas/tests/arithmetic/test_datetime64.py +++ b/pandas/tests/arithmetic/test_datetime64.py @@ -2471,3 +2471,18 @@ def test_dt64arr_addsub_object_dtype_2d(): assert result2.shape == (4, 1) assert result2.freq is None assert (result2.asi8 == 0).all() + + +def test_sum_empty_df_series(): + # Calling the following defined sum function returned an error for dataframes but + # returned NaT for series. # Check that the API is consistent in this sense when + # operating on empty Series/DataFrames. 
See GH:33704 for more information + df = pd.DataFrame(dict(x=pd.to_datetime([]))) + series = pd.Series(pd.to_datetime([])) + assert (df.min().x is NaT) == (series.min() is NaT) + assert (df.max().x is NaT) == (series.max() is NaT) + + df = pd.DataFrame(dict(x=[np.nan])) + series = pd.Series([np.nan]) + assert np.isnan(df.min().x) == np.isnan(series.min()) + assert np.isnan(df.max().x) == np.isnan(series.max()) From 363898df29fb353a644ebbae0a9b856774679170 Mon Sep 17 00:00:00 2001 From: Tobias Pitters Date: Fri, 1 May 2020 17:53:46 +0200 Subject: [PATCH 2/9] update nanops and tests as result of PR discussion --- pandas/core/nanops.py | 7 +++++- pandas/tests/arithmetic/test_datetime64.py | 15 ------------- pandas/tests/reductions/test_reductions.py | 25 ++++++++++++++++++++++ private_tests/test_df_min_1.py | 8 +++++++ private_tests/test_nan.py | 8 +++++++ 5 files changed, 47 insertions(+), 16 deletions(-) create mode 100644 private_tests/test_df_min_1.py create mode 100644 private_tests/test_nan.py diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 8a90ccf0622b7..ab2fe5c667fae 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -384,7 +384,12 @@ def _na_for_min_count( else: assert axis is not None # assertion to make mypy happy result_shape = values.shape[:axis] + values.shape[axis + 1 :] - result = np.full(result_shape, fill_value) + # calling np.full with dtype parameter throws an ValueError when called + # with np.datetime64 and pd.NaT + try: + result = np.full(result_shape, fill_value, dtype=values.dtype) + except ValueError: + result = np.full(result_shape, fill_value) return result diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py index b131a2294780f..8c480faa4ee81 100644 --- a/pandas/tests/arithmetic/test_datetime64.py +++ b/pandas/tests/arithmetic/test_datetime64.py @@ -2471,18 +2471,3 @@ def test_dt64arr_addsub_object_dtype_2d(): assert result2.shape == (4, 1) assert result2.freq 
is None assert (result2.asi8 == 0).all() - - -def test_sum_empty_df_series(): - # Calling the following defined sum function returned an error for dataframes but - # returned NaT for series. # Check that the API is consistent in this sense when - # operating on empty Series/DataFrames. See GH:33704 for more information - df = pd.DataFrame(dict(x=pd.to_datetime([]))) - series = pd.Series(pd.to_datetime([])) - assert (df.min().x is NaT) == (series.min() is NaT) - assert (df.max().x is NaT) == (series.max() is NaT) - - df = pd.DataFrame(dict(x=[np.nan])) - series = pd.Series([np.nan]) - assert np.isnan(df.min().x) == np.isnan(series.min()) - assert np.isnan(df.max().x) == np.isnan(series.max()) diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index f6e0d2f0c1751..650dfa51e79ac 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -1035,6 +1035,31 @@ def test_minmax_nat_dataframe(self, nat_df): assert nat_df.min(skipna=False)[0] is pd.NaT assert nat_df.max(skipna=False)[0] is pd.NaT + def test_min_max_dt64_with_NaT(self): + # Calling the following sum functions returned an error for dataframes but + # returned NaT for series. These tests check that the API is consistent in + # min/max calls on empty Series/DataFrames. 
See GH:33704 for more + # information + + df = pd.DataFrame(dict(x=pd.to_datetime([]))) + expected_dt_series = pd.Series(pd.to_datetime([])) + # check axis 0 + assert (df.min(axis=0).x is NaT) == (expected_dt_series.min() is NaT) + assert (df.max(axis=0).x is NaT) == (expected_dt_series.max() is NaT) + + # check axis 1 + tm.assert_series_equal(df.min(axis=1), expected_dt_series) + tm.assert_series_equal(df.max(axis=1), expected_dt_series) + + df = pd.DataFrame(dict(x=[])) + expected_float_series = pd.Series([], dtype=float) + # check axis 0 + assert np.isnan(df.min(axis=0).x) == np.isnan(expected_float_series.min()) + assert np.isnan(df.max(axis=0).x) == np.isnan(expected_float_series.max()) + # check axis 1 + tm.assert_series_equal(df.min(axis=1), expected_float_series) + tm.assert_series_equal(df.min(axis=1), expected_float_series) + def test_min_max(self): rng = pd.date_range("1/1/2000", "12/31/2000") rng2 = rng.take(np.random.permutation(len(rng))) diff --git a/private_tests/test_df_min_1.py b/private_tests/test_df_min_1.py new file mode 100644 index 0000000000000..58faf8454bc37 --- /dev/null +++ b/private_tests/test_df_min_1.py @@ -0,0 +1,8 @@ +import pandas as pd + +if __name__ == '__main__': + df = pd.DataFrame(dict(x=pd.to_datetime([]))) + series = pd.Series(pd.to_datetime([])) + res = df.min(1) + print(res) + diff --git a/private_tests/test_nan.py b/private_tests/test_nan.py new file mode 100644 index 0000000000000..987b5bb8b28ff --- /dev/null +++ b/private_tests/test_nan.py @@ -0,0 +1,8 @@ +import pandas as pd +import numpy as np + +if __name__ == '__main__': + df = pd.DataFrame(dict(x=[np.nan, np.nan])) + import pdb; pdb.set_trace() + res = df.max() + print(res) From 0d9a754eef893a9cfb69f8ca127b5f766741cf45 Mon Sep 17 00:00:00 2001 From: Tobias Pitters Date: Fri, 1 May 2020 17:55:53 +0200 Subject: [PATCH 3/9] removed git tracking of test file --- private_tests/test_df_min_1.py | 8 -------- 1 file changed, 8 deletions(-) delete mode 100644 
private_tests/test_df_min_1.py diff --git a/private_tests/test_df_min_1.py b/private_tests/test_df_min_1.py deleted file mode 100644 index 58faf8454bc37..0000000000000 --- a/private_tests/test_df_min_1.py +++ /dev/null @@ -1,8 +0,0 @@ -import pandas as pd - -if __name__ == '__main__': - df = pd.DataFrame(dict(x=pd.to_datetime([]))) - series = pd.Series(pd.to_datetime([])) - res = df.min(1) - print(res) - From 6e14ff833d515d334972125ee40689827af1d814 Mon Sep 17 00:00:00 2001 From: Tobias Pitters Date: Fri, 1 May 2020 17:56:32 +0200 Subject: [PATCH 4/9] removed git tracking of test file --- private_tests/test_nan.py | 8 -------- 1 file changed, 8 deletions(-) delete mode 100644 private_tests/test_nan.py diff --git a/private_tests/test_nan.py b/private_tests/test_nan.py deleted file mode 100644 index 987b5bb8b28ff..0000000000000 --- a/private_tests/test_nan.py +++ /dev/null @@ -1,8 +0,0 @@ -import pandas as pd -import numpy as np - -if __name__ == '__main__': - df = pd.DataFrame(dict(x=[np.nan, np.nan])) - import pdb; pdb.set_trace() - res = df.max() - print(res) From 60ea00ff597aece011b76a7a0df56e8b1266b110 Mon Sep 17 00:00:00 2001 From: Tobias Pitters Date: Fri, 1 May 2020 18:24:15 +0200 Subject: [PATCH 5/9] added whatsnew entry --- doc/source/whatsnew/v1.1.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 823bfc75e4304..4551f51a3a6d3 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -545,6 +545,7 @@ Datetimelike - Bug in :meth:`DatetimeIndex.intersection` losing ``freq`` and timezone in some cases (:issue:`33604`) - Bug in :class:`DatetimeIndex` addition and subtraction with some types of :class:`DateOffset` objects incorrectly retaining an invalid ``freq`` attribute (:issue:`33779`) - Bug in :class:`DatetimeIndex` where setting the ``freq`` attribute on an index could silently change the ``freq`` attribute on another index viewing the same data 
(:issue:`33552`) +- Bug in :meth:`nanops._na_for_min_count` when called with empty :class:`DataFrame` of ``timedelta64`` dtype (:issue:`33911`) Timedelta ^^^^^^^^^ From 5e57597c9ccceaeb44c0bce83777c3998db87434 Mon Sep 17 00:00:00 2001 From: Tobias Pitters Date: Fri, 1 May 2020 18:34:16 +0200 Subject: [PATCH 6/9] move tests to test_analytics.py --- pandas/tests/frame/test_analytics.py | 24 +++++++++++++++++++++ pandas/tests/reductions/test_reductions.py | 25 ---------------------- 2 files changed, 24 insertions(+), 25 deletions(-) diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 75afc59382a75..863b9f36acf10 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -1250,3 +1250,27 @@ def test_min_max_dt64_with_NaT(self): res = df.max() exp = pd.Series([pd.NaT], index=["foo"]) tm.assert_series_equal(res, exp) + + # Calling the following sum functions returned an error for dataframes but + # returned NaT for series. These tests check that the API is consistent in + # min/max calls on empty Series/DataFrames. 
See GH:33704 for more + # information + df = pd.DataFrame(dict(x=pd.to_datetime([]))) + expected_dt_series = pd.Series(pd.to_datetime([])) + # check axis 0 + assert (df.min(axis=0).x is pd.NaT) == (expected_dt_series.min() is pd.NaT) + assert (df.max(axis=0).x is pd.NaT) == (expected_dt_series.max() is pd.NaT) + + # check axis 1 + tm.assert_series_equal(df.min(axis=1), expected_dt_series) + tm.assert_series_equal(df.max(axis=1), expected_dt_series) + + df = pd.DataFrame(dict(x=[])) + expected_float_series = pd.Series([], dtype=float) + # check axis 0 + assert np.isnan(df.min(axis=0).x) == np.isnan(expected_float_series.min()) + assert np.isnan(df.max(axis=0).x) == np.isnan(expected_float_series.max()) + # check axis 1 + tm.assert_series_equal(df.min(axis=1), expected_float_series) + tm.assert_series_equal(df.min(axis=1), expected_float_series) + diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index 650dfa51e79ac..f6e0d2f0c1751 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -1035,31 +1035,6 @@ def test_minmax_nat_dataframe(self, nat_df): assert nat_df.min(skipna=False)[0] is pd.NaT assert nat_df.max(skipna=False)[0] is pd.NaT - def test_min_max_dt64_with_NaT(self): - # Calling the following sum functions returned an error for dataframes but - # returned NaT for series. These tests check that the API is consistent in - # min/max calls on empty Series/DataFrames. 
See GH:33704 for more - # information - - df = pd.DataFrame(dict(x=pd.to_datetime([]))) - expected_dt_series = pd.Series(pd.to_datetime([])) - # check axis 0 - assert (df.min(axis=0).x is NaT) == (expected_dt_series.min() is NaT) - assert (df.max(axis=0).x is NaT) == (expected_dt_series.max() is NaT) - - # check axis 1 - tm.assert_series_equal(df.min(axis=1), expected_dt_series) - tm.assert_series_equal(df.max(axis=1), expected_dt_series) - - df = pd.DataFrame(dict(x=[])) - expected_float_series = pd.Series([], dtype=float) - # check axis 0 - assert np.isnan(df.min(axis=0).x) == np.isnan(expected_float_series.min()) - assert np.isnan(df.max(axis=0).x) == np.isnan(expected_float_series.max()) - # check axis 1 - tm.assert_series_equal(df.min(axis=1), expected_float_series) - tm.assert_series_equal(df.min(axis=1), expected_float_series) - def test_min_max(self): rng = pd.date_range("1/1/2000", "12/31/2000") rng2 = rng.take(np.random.permutation(len(rng))) From c62921bcfc1ec58bf0da60d5b6b825479d3dfe17 Mon Sep 17 00:00:00 2001 From: Tobias Pitters Date: Fri, 1 May 2020 19:35:07 +0200 Subject: [PATCH 7/9] removed blank line at-end-of-file test_analytics.py --- pandas/tests/frame/test_analytics.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 863b9f36acf10..69c59053c4c37 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -1273,4 +1273,3 @@ def test_min_max_dt64_with_NaT(self): # check axis 1 tm.assert_series_equal(df.min(axis=1), expected_float_series) tm.assert_series_equal(df.min(axis=1), expected_float_series) - From 77fd5c67c3a5f470ca2795d4eb16692741e316ac Mon Sep 17 00:00:00 2001 From: Tobias Pitters Date: Thu, 7 May 2020 20:46:59 +0200 Subject: [PATCH 8/9] splitted tests, update comment in nanops and update whatsnew entry --- doc/source/whatsnew/v1.1.0.rst | 2 +- pandas/core/nanops.py | 2 +- pandas/tests/frame/test_analytics.py | 3 +++ 3 
files changed, 5 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 783d89fc6e468..8e67c1c24fd90 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -560,7 +560,7 @@ Datetimelike - Bug in :meth:`DatetimeIndex.intersection` losing ``freq`` and timezone in some cases (:issue:`33604`) - Bug in :class:`DatetimeIndex` addition and subtraction with some types of :class:`DateOffset` objects incorrectly retaining an invalid ``freq`` attribute (:issue:`33779`) - Bug in :class:`DatetimeIndex` where setting the ``freq`` attribute on an index could silently change the ``freq`` attribute on another index viewing the same data (:issue:`33552`) -- Bug in :meth:`nanops._na_for_min_count` when called with empty :class:`DataFrame` of ``timedelta64`` dtype (:issue:`33911`) +- Bug in :meth:`DataFrame.min`/:meth:`DataFrame.max` not returning consistent results with :meth:`Series.min`/:meth:`Series.max` when called on objects initialized with empty :func:`pd.to_datetime` (:issue:`33704`) - Bug in :meth:`DatetimeIndex.intersection` and :meth:`TimedeltaIndex.intersection` with results not having the correct ``name`` attribute (:issue:`33904`) Timedelta diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 3d04cd77d1410..9c227ff460a97 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -385,7 +385,7 @@ def _na_for_min_count( assert axis is not None # assertion to make mypy happy result_shape = values.shape[:axis] + values.shape[axis + 1 :] # calling np.full with dtype parameter throws an ValueError when called - # with np.datetime64 and pd.NaT + # with dtype=np.datetime64 and fill_value=pd.NaT try: result = np.full(result_shape, fill_value, dtype=values.dtype) except ValueError: diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 02efaea0b0e08..a5a44e3ba67ac 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py 
@@ -1259,6 +1259,7 @@ def test_min_max_dt64_with_NaT(self): exp = pd.Series([pd.NaT], index=["foo"]) tm.assert_series_equal(res, exp) + def test_min_max_dt64_api_consistency_with_NaT(self): # Calling the following sum functions returned an error for dataframes but # returned NaT for series. These tests check that the API is consistent in # min/max calls on empty Series/DataFrames. See GH:33704 for more @@ -1273,6 +1274,8 @@ def test_min_max_dt64_with_NaT(self): tm.assert_series_equal(df.min(axis=1), expected_dt_series) tm.assert_series_equal(df.max(axis=1), expected_dt_series) + def test_min_max_dt64_api_consistency_empty_df(self): + # check DataFrame/Series api consistency when calling min/max on an empty DataFrame/Series. df = pd.DataFrame(dict(x=[])) expected_float_series = pd.Series([], dtype=float) # check axis 0 From 9219ec31581368b0dca968af919b1040b1ac56bd Mon Sep 17 00:00:00 2001 From: Tobias Pitters Date: Thu, 7 May 2020 21:14:41 +0200 Subject: [PATCH 9/9] update test_analytics for linting --- pandas/tests/frame/test_analytics.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index a5a44e3ba67ac..7869815c24037 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -1275,7 +1275,8 @@ def test_min_max_dt64_api_consistency_with_NaT(self): tm.assert_series_equal(df.max(axis=1), expected_dt_series) def test_min_max_dt64_api_consistency_empty_df(self): - # check DataFrame/Series api consistency when calling min/max on an empty DataFrame/Series. + # check DataFrame/Series api consistency when calling min/max on an empty + # DataFrame/Series. df = pd.DataFrame(dict(x=[])) expected_float_series = pd.Series([], dtype=float) # check axis 0