From 1fa7a2d3163d47bdbbcd717f792829c71680721d Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 20 Oct 2020 11:12:47 -0700 Subject: [PATCH 1/2] TST/REF: collect fillna tests --- pandas/tests/series/methods/test_fillna.py | 602 ++++++++++++++++++++- pandas/tests/series/test_missing.py | 591 +------------------- 2 files changed, 601 insertions(+), 592 deletions(-) diff --git a/pandas/tests/series/methods/test_fillna.py b/pandas/tests/series/methods/test_fillna.py index b6a6f4e8200d4..02214d66347e1 100644 --- a/pandas/tests/series/methods/test_fillna.py +++ b/pandas/tests/series/methods/test_fillna.py @@ -1,13 +1,504 @@ -from datetime import timedelta +from datetime import datetime, timedelta import numpy as np import pytest +import pytz -from pandas import Categorical, DataFrame, NaT, Period, Series, Timedelta, Timestamp +from pandas import ( + Categorical, + DataFrame, + DatetimeIndex, + NaT, + Period, + Series, + Timedelta, + Timestamp, + isna, +) import pandas._testing as tm class TestSeriesFillNA: + def test_fillna(self, datetime_series): + ts = Series([0.0, 1.0, 2.0, 3.0, 4.0], index=tm.makeDateIndex(5)) + + tm.assert_series_equal(ts, ts.fillna(method="ffill")) + + ts[2] = np.NaN + + exp = Series([0.0, 1.0, 1.0, 3.0, 4.0], index=ts.index) + tm.assert_series_equal(ts.fillna(method="ffill"), exp) + + exp = Series([0.0, 1.0, 3.0, 3.0, 4.0], index=ts.index) + tm.assert_series_equal(ts.fillna(method="backfill"), exp) + + exp = Series([0.0, 1.0, 5.0, 3.0, 4.0], index=ts.index) + tm.assert_series_equal(ts.fillna(value=5), exp) + + msg = "Must specify a fill 'value' or 'method'" + with pytest.raises(ValueError, match=msg): + ts.fillna() + + msg = "Cannot specify both 'value' and 'method'" + with pytest.raises(ValueError, match=msg): + datetime_series.fillna(value=0, method="ffill") + + # GH#5703 + s1 = Series([np.nan]) + s2 = Series([1]) + result = s1.fillna(s2) + expected = Series([1.0]) + tm.assert_series_equal(result, expected) + result = s1.fillna({}) + tm.assert_series_equal(result, s1) + result = s1.fillna(Series((), dtype=object)) + tm.assert_series_equal(result, s1) + result = s2.fillna(s1) + tm.assert_series_equal(result, s2) + result = s1.fillna({0: 1}) + tm.assert_series_equal(result, expected) + result = s1.fillna({1: 1}) + tm.assert_series_equal(result, Series([np.nan])) + result = s1.fillna({0: 1, 1: 1}) + tm.assert_series_equal(result, expected) + result = s1.fillna(Series({0: 1, 1: 1})) + tm.assert_series_equal(result, expected) + result = s1.fillna(Series({0: 1, 1: 1}, index=[4, 5])) + tm.assert_series_equal(result, s1) + + s1 = Series([0, 1, 2], list("abc")) + s2 = Series([0, np.nan, 2], list("bac")) + result = s2.fillna(s1) + expected = Series([0, 0, 2.0], list("bac")) + tm.assert_series_equal(result, expected) + + # limit + ser = Series(np.nan, index=[0, 1, 2]) + result = ser.fillna(999, limit=1) + expected = Series([999, np.nan, np.nan], index=[0, 1, 2]) + tm.assert_series_equal(result, expected) + + result = ser.fillna(999, limit=2) + expected = Series([999, 999, np.nan], index=[0, 1, 2]) + tm.assert_series_equal(result, expected) + + # GH#9043 + # make sure a string representation of int/float values can be filled + # correctly without raising errors or being converted + vals = ["0", "1.5", "-0.3"] + for val in vals: + ser = Series([0, 1, np.nan, np.nan, 4], dtype="float64") + result = ser.fillna(val) + expected = Series([0, 1, val, val, 4], dtype="object") + tm.assert_series_equal(result, expected) + + def test_fillna_consistency(self): + # GH#16402 + # fillna with a tz aware to a tz-naive, should result in object + + ser = Series([Timestamp("20130101"), NaT]) + + result = ser.fillna(Timestamp("20130101", tz="US/Eastern")) + expected = Series( + [Timestamp("20130101"), Timestamp("2013-01-01", tz="US/Eastern")], + dtype="object", + ) + tm.assert_series_equal(result, expected) + + # where (we ignore the errors=) + result = ser.where( + [True, False], Timestamp("20130101", tz="US/Eastern"), errors="ignore" + ) + tm.assert_series_equal(result, expected) + + result = ser.where( + [True, False], Timestamp("20130101", tz="US/Eastern"), errors="ignore" + ) + tm.assert_series_equal(result, expected) + + # with a non-datetime + result = ser.fillna("foo") + expected = Series([Timestamp("20130101"), "foo"]) + tm.assert_series_equal(result, expected) + + # assignment + ser2 = ser.copy() + ser2[1] = "foo" + tm.assert_series_equal(ser2, expected) + + def test_fillna_downcast(self): + # GH#15277 + # infer int64 from float64 + ser = Series([1.0, np.nan]) + result = ser.fillna(0, downcast="infer") + expected = Series([1, 0]) + tm.assert_series_equal(result, expected) + + # infer int64 from float64 when fillna value is a dict + ser = Series([1.0, np.nan]) + result = ser.fillna({1: 0}, downcast="infer") + expected = Series([1, 0]) + tm.assert_series_equal(result, expected) + + def test_timedelta_fillna(self): + # GH#3371 + ser = Series( + [ + Timestamp("20130101"), + Timestamp("20130101"), + Timestamp("20130102"), + Timestamp("20130103 9:01:01"), + ] + ) + td = ser.diff() + + # reg fillna + result = td.fillna(Timedelta(seconds=0)) + expected = Series( + [ + timedelta(0), + timedelta(0), + timedelta(1), + timedelta(days=1, seconds=9 * 3600 + 60 + 1), + ] + ) + tm.assert_series_equal(result, expected) + + # interpreted as seconds, deprecated + with pytest.raises(TypeError, match="Passing integers to fillna"): + td.fillna(1) + + result = td.fillna(Timedelta(seconds=1)) + expected = Series( + [ + timedelta(seconds=1), + timedelta(0), + timedelta(1), + timedelta(days=1, seconds=9 * 3600 + 60 + 1), + ] + ) + tm.assert_series_equal(result, expected) + + result = td.fillna(timedelta(days=1, seconds=1)) + expected = Series( + [ + timedelta(days=1, seconds=1), + timedelta(0), + timedelta(1), + timedelta(days=1, seconds=9 * 3600 + 60 + 1), + ] + ) + tm.assert_series_equal(result, expected) + + result = td.fillna(np.timedelta64(int(1e9))) + expected = Series( + [ + timedelta(seconds=1), + timedelta(0), + timedelta(1), + timedelta(days=1, seconds=9 * 3600 + 60 + 1), + ] + ) + tm.assert_series_equal(result, expected) + + result = td.fillna(NaT) + expected = Series( + [ + NaT, + timedelta(0), + timedelta(1), + timedelta(days=1, seconds=9 * 3600 + 60 + 1), + ], + dtype="m8[ns]", + ) + tm.assert_series_equal(result, expected) + + # ffill + td[2] = np.nan + result = td.ffill() + expected = td.fillna(Timedelta(seconds=0)) + expected[0] = np.nan + tm.assert_series_equal(result, expected) + + # bfill + td[2] = np.nan + result = td.bfill() + expected = td.fillna(Timedelta(seconds=0)) + expected[2] = timedelta(days=1, seconds=9 * 3600 + 60 + 1) + tm.assert_series_equal(result, expected) + + def test_datetime64_fillna(self): + + ser = Series( + [ + Timestamp("20130101"), + Timestamp("20130101"), + Timestamp("20130102"), + Timestamp("20130103 9:01:01"), + ] + ) + ser[2] = np.nan + + # ffill + result = ser.ffill() + expected = Series( + [ + Timestamp("20130101"), + Timestamp("20130101"), + Timestamp("20130101"), + Timestamp("20130103 9:01:01"), + ] + ) + tm.assert_series_equal(result, expected) + + # bfill + result = ser.bfill() + expected = Series( + [ + Timestamp("20130101"), + Timestamp("20130101"), + Timestamp("20130103 9:01:01"), + Timestamp("20130103 9:01:01"), + ] + ) + tm.assert_series_equal(result, expected) + + # GH#6587 + # make sure that we are treating as integer when filling + # this also tests inference of a datetime-like with NaT's + ser = Series([NaT, NaT, "2013-08-05 15:30:00.000001"]) + expected = Series( + [ + "2013-08-05 15:30:00.000001", + "2013-08-05 15:30:00.000001", + "2013-08-05 15:30:00.000001", + ], + dtype="M8[ns]", + ) + result = ser.fillna(method="backfill") + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("tz", ["US/Eastern", "Asia/Tokyo"]) + def test_datetime64_tz_fillna(self, tz): + # DatetimeBlock + ser = Series( + [ + Timestamp("2011-01-01 10:00"), + NaT, + Timestamp("2011-01-03 10:00"), + NaT, + ] + ) + null_loc = Series([False, True, False, True]) + + result = ser.fillna(Timestamp("2011-01-02 10:00")) + expected = Series( + [ + Timestamp("2011-01-01 10:00"), + Timestamp("2011-01-02 10:00"), + Timestamp("2011-01-03 10:00"), + Timestamp("2011-01-02 10:00"), + ] + ) + tm.assert_series_equal(expected, result) + # check s is not changed + tm.assert_series_equal(isna(ser), null_loc) + + result = ser.fillna(Timestamp("2011-01-02 10:00", tz=tz)) + expected = Series( + [ + Timestamp("2011-01-01 10:00"), + Timestamp("2011-01-02 10:00", tz=tz), + Timestamp("2011-01-03 10:00"), + Timestamp("2011-01-02 10:00", tz=tz), + ] + ) + tm.assert_series_equal(expected, result) + tm.assert_series_equal(isna(ser), null_loc) + + result = ser.fillna("AAA") + expected = Series( + [ + Timestamp("2011-01-01 10:00"), + "AAA", + Timestamp("2011-01-03 10:00"), + "AAA", + ], + dtype=object, + ) + tm.assert_series_equal(expected, result) + tm.assert_series_equal(isna(ser), null_loc) + + result = ser.fillna( + { + 1: Timestamp("2011-01-02 10:00", tz=tz), + 3: Timestamp("2011-01-04 10:00"), + } + ) + expected = Series( + [ + Timestamp("2011-01-01 10:00"), + Timestamp("2011-01-02 10:00", tz=tz), + Timestamp("2011-01-03 10:00"), + Timestamp("2011-01-04 10:00"), + ] + ) + tm.assert_series_equal(expected, result) + tm.assert_series_equal(isna(ser), null_loc) + + result = ser.fillna( + {1: Timestamp("2011-01-02 10:00"), 3: Timestamp("2011-01-04 10:00")} + ) + expected = Series( + [ + Timestamp("2011-01-01 10:00"), + Timestamp("2011-01-02 10:00"), + Timestamp("2011-01-03 10:00"), + Timestamp("2011-01-04 10:00"), + ] + ) + tm.assert_series_equal(expected, result) + tm.assert_series_equal(isna(ser), null_loc) + + # DatetimeBlockTZ + idx = DatetimeIndex(["2011-01-01 10:00", NaT, "2011-01-03 10:00", NaT], tz=tz) + ser = Series(idx) + assert ser.dtype == f"datetime64[ns, {tz}]" + tm.assert_series_equal(isna(ser), null_loc) + + result = ser.fillna(Timestamp("2011-01-02 10:00")) + expected = Series( + [ + Timestamp("2011-01-01 10:00", tz=tz), + Timestamp("2011-01-02 10:00"), + Timestamp("2011-01-03 10:00", tz=tz), + Timestamp("2011-01-02 10:00"), + ] + ) + tm.assert_series_equal(expected, result) + tm.assert_series_equal(isna(ser), null_loc) + + result = ser.fillna(Timestamp("2011-01-02 10:00", tz=tz)) + idx = DatetimeIndex( + [ + "2011-01-01 10:00", + "2011-01-02 10:00", + "2011-01-03 10:00", + "2011-01-02 10:00", + ], + tz=tz, + ) + expected = Series(idx) + tm.assert_series_equal(expected, result) + tm.assert_series_equal(isna(ser), null_loc) + + result = ser.fillna(Timestamp("2011-01-02 10:00", tz=tz).to_pydatetime()) + idx = DatetimeIndex( + [ + "2011-01-01 10:00", + "2011-01-02 10:00", + "2011-01-03 10:00", + "2011-01-02 10:00", + ], + tz=tz, + ) + expected = Series(idx) + tm.assert_series_equal(expected, result) + tm.assert_series_equal(isna(ser), null_loc) + + result = ser.fillna("AAA") + expected = Series( + [ + Timestamp("2011-01-01 10:00", tz=tz), + "AAA", + Timestamp("2011-01-03 10:00", tz=tz), + "AAA", + ], + dtype=object, + ) + tm.assert_series_equal(expected, result) + tm.assert_series_equal(isna(ser), null_loc) + + result = ser.fillna( + { + 1: Timestamp("2011-01-02 10:00", tz=tz), + 3: Timestamp("2011-01-04 10:00"), + } + ) + expected = Series( + [ + Timestamp("2011-01-01 10:00", tz=tz), + Timestamp("2011-01-02 10:00", tz=tz), + Timestamp("2011-01-03 10:00", tz=tz), + Timestamp("2011-01-04 10:00"), + ] + ) + tm.assert_series_equal(expected, result) + tm.assert_series_equal(isna(ser), null_loc) + + result = ser.fillna( + { + 1: Timestamp("2011-01-02 10:00", tz=tz), + 3: Timestamp("2011-01-04 10:00", tz=tz), + } + ) + expected = Series( + [ + Timestamp("2011-01-01 10:00", tz=tz), + Timestamp("2011-01-02 10:00", tz=tz), + Timestamp("2011-01-03 10:00", tz=tz), + Timestamp("2011-01-04 10:00", tz=tz), + ] + ) + tm.assert_series_equal(expected, result) + tm.assert_series_equal(isna(ser), null_loc) + + # filling with a naive/other zone, coerce to object + result = ser.fillna(Timestamp("20130101")) + expected = Series( + [ + Timestamp("2011-01-01 10:00", tz=tz), + Timestamp("2013-01-01"), + Timestamp("2011-01-03 10:00", tz=tz), + Timestamp("2013-01-01"), + ] + ) + tm.assert_series_equal(expected, result) + tm.assert_series_equal(isna(ser), null_loc) + + result = ser.fillna(Timestamp("20130101", tz="US/Pacific")) + expected = Series( + [ + Timestamp("2011-01-01 10:00", tz=tz), + Timestamp("2013-01-01", tz="US/Pacific"), + Timestamp("2011-01-03 10:00", tz=tz), + Timestamp("2013-01-01", tz="US/Pacific"), + ] + ) + tm.assert_series_equal(expected, result) + tm.assert_series_equal(isna(ser), null_loc) + + def test_fillna_dt64tz_with_method(self): + # with timezone + # GH#15855 + ser = Series([Timestamp("2012-11-11 00:00:00+01:00"), NaT]) + exp = Series( + [ + Timestamp("2012-11-11 00:00:00+01:00"), + Timestamp("2012-11-11 00:00:00+01:00"), + ] + ) + tm.assert_series_equal(ser.fillna(method="pad"), exp) + + ser = Series([NaT, Timestamp("2012-11-11 00:00:00+01:00")]) + exp = Series( + [ + Timestamp("2012-11-11 00:00:00+01:00"), + Timestamp("2012-11-11 00:00:00+01:00"), + ] + ) + tm.assert_series_equal(ser.fillna(method="bfill"), exp) + def test_fillna_pytimedelta(self): # GH#8209 ser = Series([np.nan, Timedelta("1 days")], index=["A", "B"]) @@ -153,6 +644,12 @@ def test_fillna_categorical_raises(self): # --------------------------------------------------------------- # Invalid Usages + def test_fillna_invalid_method(self, datetime_series): + try: + datetime_series.fillna(method="ffil") + except ValueError as inst: + assert "ffil" in str(inst) + def test_fillna_listlike_invalid(self): ser = Series(np.random.randint(-100, 100, 50)) msg = '"value" parameter must be a scalar or dict, but you passed a "list"' @@ -176,3 +673,104 @@ def test_fillna_method_and_limit_invalid(self): for method in ["backfill", "bfill", "pad", "ffill", None]: with pytest.raises(ValueError, match=msg): ser.fillna(1, limit=limit, method=method) + + +class TestFillnaPad: + def test_fillna_bug(self): + ser = Series([np.nan, 1.0, np.nan, 3.0, np.nan], ["z", "a", "b", "c", "d"]) + filled = ser.fillna(method="ffill") + expected = Series([np.nan, 1.0, 1.0, 3.0, 3.0], ser.index) + tm.assert_series_equal(filled, expected) + + filled = ser.fillna(method="bfill") + expected = Series([1.0, 1.0, 3.0, 3.0, np.nan], ser.index) + tm.assert_series_equal(filled, expected) + + def test_ffill(self): + ts = Series([0.0, 1.0, 2.0, 3.0, 4.0], index=tm.makeDateIndex(5)) + ts[2] = np.NaN + tm.assert_series_equal(ts.ffill(), ts.fillna(method="ffill")) + + def test_ffill_mixed_dtypes_without_missing_data(self): + # GH#14956 + series = Series([datetime(2015, 1, 1, tzinfo=pytz.utc), 1]) + result = series.ffill() + tm.assert_series_equal(series, result) + + def test_bfill(self): + ts = Series([0.0, 1.0, 2.0, 3.0, 4.0], index=tm.makeDateIndex(5)) + ts[2] = np.NaN + tm.assert_series_equal(ts.bfill(), ts.fillna(method="bfill")) + + def test_pad_nan(self): + x = Series( + [np.nan, 1.0, np.nan, 3.0, np.nan], ["z", "a", "b", "c", "d"], dtype=float + ) + + return_value = x.fillna(method="pad", inplace=True) + assert return_value is None + + expected = Series( + [np.nan, 1.0, 1.0, 3.0, 3.0], ["z", "a", "b", "c", "d"], dtype=float + ) + tm.assert_series_equal(x[1:], expected[1:]) + assert np.isnan(x[0]), np.isnan(expected[0]) + + def test_series_fillna_limit(self): + index = np.arange(10) + s = Series(np.random.randn(10), index=index) + + result = s[:2].reindex(index) + result = result.fillna(method="pad", limit=5) + + expected = s[:2].reindex(index).fillna(method="pad") + expected[-3:] = np.nan + tm.assert_series_equal(result, expected) + + result = s[-2:].reindex(index) + result = result.fillna(method="bfill", limit=5) + + expected = s[-2:].reindex(index).fillna(method="backfill") + expected[:3] = np.nan + tm.assert_series_equal(result, expected) + + def test_series_pad_backfill_limit(self): + index = np.arange(10) + s = Series(np.random.randn(10), index=index) + + result = s[:2].reindex(index, method="pad", limit=5) + + expected = s[:2].reindex(index).fillna(method="pad") + expected[-3:] = np.nan + tm.assert_series_equal(result, expected) + + result = s[-2:].reindex(index, method="backfill", limit=5) + + expected = s[-2:].reindex(index).fillna(method="backfill") + expected[:3] = np.nan + tm.assert_series_equal(result, expected) + + def test_fillna_int(self): + ser = Series(np.random.randint(-100, 100, 50)) + return_value = ser.fillna(method="ffill", inplace=True) + assert return_value is None + tm.assert_series_equal(ser.fillna(method="ffill", inplace=False), ser) + + def test_datetime64tz_fillna_round_issue(self): + # GH#14872 + + data = Series( + [NaT, NaT, datetime(2016, 12, 12, 22, 24, 6, 100001, tzinfo=pytz.utc)] + ) + + filled = data.fillna(method="bfill") + + expected = Series( + [ + datetime(2016, 12, 12, 22, 24, 6, 100001, tzinfo=pytz.utc), + datetime(2016, 12, 12, 22, 24, 6, 100001, tzinfo=pytz.utc), + datetime(2016, 12, 12, 22, 24, 6, 100001, tzinfo=pytz.utc), + ] + ) + + tm.assert_series_equal(filled, expected) diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py index 0144e4257efe0..f268e7e3556cb 100644 --- a/pandas/tests/series/test_missing.py +++ b/pandas/tests/series/test_missing.py @@ -1,8 +1,7 @@ -from datetime import datetime, timedelta +from datetime import timedelta import numpy as np import pytest -import pytz from pandas._libs import iNaT @@ -14,7 +13,6 @@ IntervalIndex, NaT, Series, - Timedelta, Timestamp, date_range, isna, @@ -23,440 +21,6 @@ class TestSeriesMissingData: - def test_timedelta_fillna(self): - # GH 3371 - s = Series( - [ - Timestamp("20130101"), - Timestamp("20130101"), - Timestamp("20130102"), - Timestamp("20130103 9:01:01"), - ] - ) - td = s.diff() - - # reg fillna - result = td.fillna(Timedelta(seconds=0)) - expected = Series( - [ - timedelta(0), - timedelta(0), - timedelta(1), - timedelta(days=1, seconds=9 * 3600 + 60 + 1), - ] - ) - tm.assert_series_equal(result, expected) - - # interpreted as seconds, deprecated - with pytest.raises(TypeError, match="Passing integers to fillna"): - td.fillna(1) - - result = td.fillna(Timedelta(seconds=1)) - expected = Series( - [ - timedelta(seconds=1), - timedelta(0), - timedelta(1), - timedelta(days=1, seconds=9 * 3600 + 60 + 1), - ] - ) - tm.assert_series_equal(result, expected) - - result = td.fillna(timedelta(days=1, seconds=1)) - expected = Series( - [ - timedelta(days=1, seconds=1), - timedelta(0), - timedelta(1), - timedelta(days=1, seconds=9 * 3600 + 60 + 1), - ] - ) - tm.assert_series_equal(result, expected) - - result = td.fillna(np.timedelta64(int(1e9))) - expected = Series( - [ - timedelta(seconds=1), - timedelta(0), - timedelta(1), - timedelta(days=1, seconds=9 * 3600 + 60 + 1), - ] - ) - tm.assert_series_equal(result, expected) - - result = td.fillna(NaT) - expected = Series( - [ - NaT, - timedelta(0), - timedelta(1), - timedelta(days=1, seconds=9 * 3600 + 60 + 1), - ], - dtype="m8[ns]", - ) - tm.assert_series_equal(result, expected) - - # ffill - td[2] = np.nan - result = td.ffill() - expected = td.fillna(Timedelta(seconds=0)) - expected[0] = np.nan - tm.assert_series_equal(result, expected) - - # bfill - td[2] = np.nan - result = td.bfill() - expected = td.fillna(Timedelta(seconds=0)) - expected[2] = timedelta(days=1, seconds=9 * 3600 + 60 + 1) - tm.assert_series_equal(result, expected) - - def test_datetime64_fillna(self): - - s = Series( - [ - Timestamp("20130101"), - Timestamp("20130101"), - Timestamp("20130102"), - Timestamp("20130103 9:01:01"), - ] - ) - s[2] = np.nan - - # ffill - result = s.ffill() - expected = Series( - [ - Timestamp("20130101"), - Timestamp("20130101"), - Timestamp("20130101"), - Timestamp("20130103 9:01:01"), - ] - ) - tm.assert_series_equal(result, expected) - - # bfill - result = s.bfill() - expected = Series( - [ - Timestamp("20130101"), - Timestamp("20130101"), - Timestamp("20130103 9:01:01"), - Timestamp("20130103 9:01:01"), - ] - ) - tm.assert_series_equal(result, expected) - - # GH 6587 - # make sure that we are treating as integer when filling - # this also tests inference of a datetime-like with NaT's - s = Series([pd.NaT, pd.NaT, "2013-08-05 15:30:00.000001"]) - expected = Series( - [ - "2013-08-05 15:30:00.000001", - "2013-08-05 15:30:00.000001", - "2013-08-05 15:30:00.000001", - ], - dtype="M8[ns]", - ) - result = s.fillna(method="backfill") - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize("tz", ["US/Eastern", "Asia/Tokyo"]) - def test_datetime64_tz_fillna(self, tz): - # DatetimeBlock - s = Series( - [ - Timestamp("2011-01-01 10:00"), - pd.NaT, - Timestamp("2011-01-03 10:00"), - pd.NaT, - ] - ) - null_loc = pd.Series([False, True, False, True]) - - result = s.fillna(pd.Timestamp("2011-01-02 10:00")) - expected = Series( - [ - Timestamp("2011-01-01 10:00"), - Timestamp("2011-01-02 10:00"), - Timestamp("2011-01-03 10:00"), - Timestamp("2011-01-02 10:00"), - ] - ) - tm.assert_series_equal(expected, result) - # check s is not changed - tm.assert_series_equal(pd.isna(s), null_loc) - - result = s.fillna(pd.Timestamp("2011-01-02 10:00", tz=tz)) - expected = Series( - [ - Timestamp("2011-01-01 10:00"), - Timestamp("2011-01-02 10:00", tz=tz), - Timestamp("2011-01-03 10:00"), - Timestamp("2011-01-02 10:00", tz=tz), - ] - ) - tm.assert_series_equal(expected, result) - tm.assert_series_equal(pd.isna(s), null_loc) - - result = s.fillna("AAA") - expected = Series( - [ - Timestamp("2011-01-01 10:00"), - "AAA", - Timestamp("2011-01-03 10:00"), - "AAA", - ], - dtype=object, - ) - tm.assert_series_equal(expected, result) - tm.assert_series_equal(pd.isna(s), null_loc) - - result = s.fillna( - { - 1: pd.Timestamp("2011-01-02 10:00", tz=tz), - 3: pd.Timestamp("2011-01-04 10:00"), - } - ) - expected = Series( - [ - Timestamp("2011-01-01 10:00"), - Timestamp("2011-01-02 10:00", tz=tz), - Timestamp("2011-01-03 10:00"), - Timestamp("2011-01-04 10:00"), - ] - ) - tm.assert_series_equal(expected, result) - tm.assert_series_equal(pd.isna(s), null_loc) - - result = s.fillna( - {1: pd.Timestamp("2011-01-02 10:00"), 3: pd.Timestamp("2011-01-04 10:00")} - ) - expected = Series( - [ - Timestamp("2011-01-01 10:00"), - Timestamp("2011-01-02 10:00"), - Timestamp("2011-01-03 10:00"), - Timestamp("2011-01-04 10:00"), - ] - ) - tm.assert_series_equal(expected, result) - tm.assert_series_equal(pd.isna(s), null_loc) - - # DatetimeBlockTZ - idx = pd.DatetimeIndex( - ["2011-01-01 10:00", pd.NaT, "2011-01-03 10:00", pd.NaT], tz=tz - ) - s = pd.Series(idx) - assert s.dtype == f"datetime64[ns, {tz}]" - tm.assert_series_equal(pd.isna(s), null_loc) - - result = s.fillna(pd.Timestamp("2011-01-02 10:00")) - expected = Series( - [ - Timestamp("2011-01-01 10:00", tz=tz), - Timestamp("2011-01-02 10:00"), - Timestamp("2011-01-03 10:00", tz=tz), - Timestamp("2011-01-02 10:00"), - ] - ) - tm.assert_series_equal(expected, result) - tm.assert_series_equal(pd.isna(s), null_loc) - - result = s.fillna(pd.Timestamp("2011-01-02 10:00", tz=tz)) - idx = pd.DatetimeIndex( - [ - "2011-01-01 10:00", - "2011-01-02 10:00", - "2011-01-03 10:00", - "2011-01-02 10:00", - ], - tz=tz, - ) - expected = Series(idx) - tm.assert_series_equal(expected, result) - tm.assert_series_equal(pd.isna(s), null_loc) - - result = s.fillna(pd.Timestamp("2011-01-02 10:00", tz=tz).to_pydatetime()) - idx = pd.DatetimeIndex( - [ - "2011-01-01 10:00", - "2011-01-02 10:00", - "2011-01-03 10:00", - "2011-01-02 10:00", - ], - tz=tz, - ) - expected = Series(idx) - tm.assert_series_equal(expected, result) - tm.assert_series_equal(pd.isna(s), null_loc) - - result = s.fillna("AAA") - expected = Series( - [ - Timestamp("2011-01-01 10:00", tz=tz), - "AAA", - Timestamp("2011-01-03 10:00", tz=tz), - "AAA", - ], - dtype=object, - ) - tm.assert_series_equal(expected, result) - tm.assert_series_equal(pd.isna(s), null_loc) - - result = s.fillna( - { - 1: pd.Timestamp("2011-01-02 10:00", tz=tz), - 3: pd.Timestamp("2011-01-04 10:00"), - } - ) - expected = Series( - [ - Timestamp("2011-01-01 10:00", tz=tz), - Timestamp("2011-01-02 10:00", tz=tz), - Timestamp("2011-01-03 10:00", tz=tz), - Timestamp("2011-01-04 10:00"), - ] - ) - tm.assert_series_equal(expected, result) - tm.assert_series_equal(pd.isna(s), null_loc) - - result = s.fillna( - { - 1: pd.Timestamp("2011-01-02 10:00", tz=tz), - 3: pd.Timestamp("2011-01-04 10:00", tz=tz), - } - ) - expected = Series( - [ - Timestamp("2011-01-01 10:00", tz=tz), - Timestamp("2011-01-02 10:00", tz=tz), - Timestamp("2011-01-03 10:00", tz=tz), - Timestamp("2011-01-04 10:00", tz=tz), - ] - ) - tm.assert_series_equal(expected, result) - tm.assert_series_equal(pd.isna(s), null_loc) - - # filling with a naive/other zone, coerce to object - result = s.fillna(Timestamp("20130101")) - expected = Series( - [ - Timestamp("2011-01-01 10:00", tz=tz), - Timestamp("2013-01-01"), - Timestamp("2011-01-03 10:00", tz=tz), - Timestamp("2013-01-01"), - ] - ) - tm.assert_series_equal(expected, result) - tm.assert_series_equal(pd.isna(s), null_loc) - - result = s.fillna(Timestamp("20130101", tz="US/Pacific")) - expected = Series( - [ - Timestamp("2011-01-01 10:00", tz=tz), - Timestamp("2013-01-01", tz="US/Pacific"), - Timestamp("2011-01-03 10:00", tz=tz), - Timestamp("2013-01-01", tz="US/Pacific"), - ] - ) - tm.assert_series_equal(expected, result) - tm.assert_series_equal(pd.isna(s), null_loc) - - def test_fillna_dt64tz_with_method(self): - # with timezone - # GH 15855 - ser = pd.Series([pd.Timestamp("2012-11-11 00:00:00+01:00"), pd.NaT]) - exp = pd.Series( - [ - pd.Timestamp("2012-11-11 00:00:00+01:00"), - pd.Timestamp("2012-11-11 00:00:00+01:00"), - ] - ) - tm.assert_series_equal(ser.fillna(method="pad"), exp) - - ser = pd.Series([pd.NaT, pd.Timestamp("2012-11-11 00:00:00+01:00")]) - exp = pd.Series( - [ - pd.Timestamp("2012-11-11 00:00:00+01:00"), - pd.Timestamp("2012-11-11 00:00:00+01:00"), - ] - ) - tm.assert_series_equal(ser.fillna(method="bfill"), exp) - - def test_fillna_consistency(self): - # GH 16402 - # fillna with a tz aware to a tz-naive, should result in object - - s = Series([Timestamp("20130101"), pd.NaT]) - - result = s.fillna(Timestamp("20130101", tz="US/Eastern")) - expected = Series( - [Timestamp("20130101"), Timestamp("2013-01-01", tz="US/Eastern")], - dtype="object", - ) - tm.assert_series_equal(result, expected) - - # where (we ignore the errors=) - result = s.where( - [True, False], Timestamp("20130101", tz="US/Eastern"), errors="ignore" - ) - tm.assert_series_equal(result, expected) - - result = s.where( - [True, False], Timestamp("20130101", tz="US/Eastern"), errors="ignore" - ) - tm.assert_series_equal(result, expected) - - # with a non-datetime - result = s.fillna("foo") - expected = Series([Timestamp("20130101"), "foo"]) - tm.assert_series_equal(result, expected) - - # assignment - s2 = s.copy() - s2[1] = "foo" - tm.assert_series_equal(s2, expected) - - def test_datetime64tz_fillna_round_issue(self): - # GH 14872 - - data = pd.Series( - [pd.NaT, pd.NaT, datetime(2016, 12, 12, 22, 24, 6, 100001, tzinfo=pytz.utc)] - ) - - filled = data.fillna(method="bfill") - - expected = pd.Series( - [ - datetime(2016, 12, 12, 22, 24, 6, 100001, tzinfo=pytz.utc), - datetime(2016, 12, 12, 22, 24, 6, 100001, tzinfo=pytz.utc), - datetime(2016, 12, 12, 22, 24, 6, 100001, tzinfo=pytz.utc), - ] - ) - - tm.assert_series_equal(filled, expected) - - def test_fillna_downcast(self): - # GH 15277 - # infer int64 from float64 - s = pd.Series([1.0, np.nan]) - result = s.fillna(0, downcast="infer") - expected = pd.Series([1, 0]) - tm.assert_series_equal(result, expected) - - # infer int64 from float64 when fillna value is a dict - s = pd.Series([1.0, np.nan]) - result = s.fillna({1: 0}, downcast="infer") - expected = pd.Series([1, 0]) - tm.assert_series_equal(result, expected) - - def test_fillna_int(self): - s = Series(np.random.randint(-100, 100, 50)) - return_value = s.fillna(method="ffill", inplace=True) - assert return_value is None - tm.assert_series_equal(s.fillna(method="ffill", inplace=False), s) - def test_categorical_nan_equality(self): cat = Series(Categorical(["a", "b", "c", np.nan])) exp = Series([True, True, True, False]) @@ -531,111 +95,6 @@ def test_isnull_for_inf_deprecated(self): tm.assert_series_equal(r, e) tm.assert_series_equal(dr, de) - def test_fillna(self, datetime_series): - ts = Series([0.0, 1.0, 2.0, 3.0, 4.0], index=tm.makeDateIndex(5)) - - tm.assert_series_equal(ts, ts.fillna(method="ffill")) - - ts[2] = np.NaN - - exp = Series([0.0, 1.0, 1.0, 3.0, 4.0], index=ts.index) - tm.assert_series_equal(ts.fillna(method="ffill"), exp) - - exp = Series([0.0, 1.0, 3.0, 3.0, 4.0], index=ts.index) - tm.assert_series_equal(ts.fillna(method="backfill"), exp) - - exp = Series([0.0, 1.0, 5.0, 3.0, 4.0], index=ts.index) - tm.assert_series_equal(ts.fillna(value=5), exp) - - msg = "Must specify a fill 'value' or 'method'" - with pytest.raises(ValueError, match=msg): - ts.fillna() - - msg = "Cannot specify both 'value' and 'method'" - with pytest.raises(ValueError, match=msg): - datetime_series.fillna(value=0, method="ffill") - - # GH 5703 - s1 = Series([np.nan]) - s2 = Series([1]) - result = s1.fillna(s2) - expected = Series([1.0]) - tm.assert_series_equal(result, expected) - result = s1.fillna({}) - tm.assert_series_equal(result, s1) - result = s1.fillna(Series((), dtype=object)) - tm.assert_series_equal(result, s1) - result = s2.fillna(s1) - tm.assert_series_equal(result, s2) - result = s1.fillna({0: 1}) - tm.assert_series_equal(result, expected) - result = s1.fillna({1: 1}) - tm.assert_series_equal(result, Series([np.nan])) - result = s1.fillna({0: 1, 1: 1}) - tm.assert_series_equal(result, expected) - result = s1.fillna(Series({0: 1, 1: 1})) - tm.assert_series_equal(result, expected) - result = s1.fillna(Series({0: 1, 1: 1}, index=[4, 5])) - tm.assert_series_equal(result, s1) - - s1 = Series([0, 1, 2], list("abc")) - s2 = Series([0, np.nan, 2], list("bac")) - result = s2.fillna(s1) - expected = Series([0, 0, 2.0], list("bac")) - tm.assert_series_equal(result, expected) - - # limit - s = Series(np.nan, index=[0, 1, 2]) - result = s.fillna(999, limit=1) - expected = Series([999, np.nan, np.nan], index=[0, 1, 2]) - tm.assert_series_equal(result, expected) - - result = s.fillna(999, limit=2) - expected = Series([999, 999, np.nan], index=[0, 1, 2]) - tm.assert_series_equal(result, expected) - - # GH 9043 - # make sure a string representation of int/float values can be filled - # correctly without raising errors or being converted - vals = ["0", "1.5", "-0.3"] - for val in vals: - s = Series([0, 1, np.nan, np.nan, 4], dtype="float64") - result = s.fillna(val) - expected = Series([0, 1, val, val, 4], dtype="object") - tm.assert_series_equal(result, expected) - - def test_fillna_bug(self): - x = Series([np.nan, 1.0, np.nan, 3.0, np.nan], ["z", "a", "b", "c", "d"]) - filled = x.fillna(method="ffill") - expected = Series([np.nan, 1.0, 1.0, 3.0, 3.0], x.index) - tm.assert_series_equal(filled, expected) - - filled = x.fillna(method="bfill") - expected = Series([1.0, 1.0, 3.0, 3.0, np.nan], x.index) - tm.assert_series_equal(filled, expected) - - def test_fillna_invalid_method(self, datetime_series): - try: - datetime_series.fillna(method="ffil") - except ValueError as inst: - assert "ffil" in str(inst) - - def test_ffill(self): - ts = Series([0.0, 1.0, 2.0, 3.0, 4.0], index=tm.makeDateIndex(5)) - ts[2] = np.NaN - tm.assert_series_equal(ts.ffill(), ts.fillna(method="ffill")) - - def test_ffill_mixed_dtypes_without_missing_data(self): - # GH14956 - series = pd.Series([datetime(2015, 1, 1, tzinfo=pytz.utc), 1]) - result = series.ffill() - tm.assert_series_equal(series, result) - - def test_bfill(self): - ts = Series([0.0, 1.0, 2.0, 3.0, 4.0], index=tm.makeDateIndex(5)) - ts[2] = np.NaN - tm.assert_series_equal(ts.bfill(), ts.fillna(method="bfill")) - def test_timedelta64_nan(self): td = Series([timedelta(days=i) for i in range(10)]) @@ -773,20 +232,6 @@ def test_notna(self): expected = Series([True, True, False]) tm.assert_series_equal(ser.notna(), expected) - def test_pad_nan(self): - x = Series( - [np.nan, 1.0, np.nan, 3.0, np.nan], ["z", "a", "b", "c", "d"], dtype=float - ) - - return_value = x.fillna(method="pad", inplace=True) - assert return_value is None - - expected = Series( - [np.nan, 1.0, 1.0, 3.0, 3.0], ["z", "a", "b", "c", "d"], dtype=float - ) - tm.assert_series_equal(x[1:], expected[1:]) - assert np.isnan(x[0]), np.isnan(expected[0]) - def test_pad_require_monotonicity(self): rng = date_range("1/1/2000", "3/1/2000", freq="B") @@ -806,37 +251,3 @@ def test_dropna_preserve_name(self, datetime_series): return_value = ts.dropna(inplace=True) assert return_value is None assert ts.name == name - - def test_series_fillna_limit(self): - index = np.arange(10) - s = Series(np.random.randn(10), index=index) - - result = s[:2].reindex(index) - result = result.fillna(method="pad", limit=5) - - expected = s[:2].reindex(index).fillna(method="pad") - expected[-3:] = np.nan - tm.assert_series_equal(result, expected) - - result = s[-2:].reindex(index) - result = result.fillna(method="bfill", limit=5) - - expected = s[-2:].reindex(index).fillna(method="backfill") - expected[:3] = np.nan - tm.assert_series_equal(result, expected) - - def test_series_pad_backfill_limit(self): - index = np.arange(10) - s = Series(np.random.randn(10), index=index) - - result = s[:2].reindex(index, method="pad", limit=5) - - expected = s[:2].reindex(index).fillna(method="pad") - expected[-3:] = np.nan - tm.assert_series_equal(result, expected) - - result = s[-2:].reindex(index, method="backfill", limit=5) - - expected = s[-2:].reindex(index).fillna(method="backfill") - expected[:3] = np.nan - tm.assert_series_equal(result, expected) From 8a85d5dc1567a38b383ffdf4d6cfc9435a8b9244 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 20 Oct 2020 11:20:27 -0700 Subject: [PATCH 2/2] REF/TST: collect fillna tests --- pandas/tests/frame/methods/test_fillna.py | 526 ++++++++++++++++++++++ pandas/tests/frame/test_missing.py | 513 +-------------------- 2 files changed, 527 insertions(+), 512 deletions(-) create mode 100644 pandas/tests/frame/methods/test_fillna.py diff --git a/pandas/tests/frame/methods/test_fillna.py b/pandas/tests/frame/methods/test_fillna.py new file mode 100644 index 0000000000000..9fa1aa65379c5 --- /dev/null +++ b/pandas/tests/frame/methods/test_fillna.py @@ -0,0 +1,526 @@ +import numpy as np +import pytest + +from pandas import ( + Categorical, + DataFrame, + DatetimeIndex, + NaT, + PeriodIndex, + Series, + TimedeltaIndex, + Timestamp, + date_range, +) +import pandas._testing as tm +from pandas.tests.frame.common import _check_mixed_float + + +class TestFillNA: + def test_fillna_datetime(self, datetime_frame): + tf = datetime_frame + tf.loc[tf.index[:5], "A"] = np.nan + tf.loc[tf.index[-5:], "A"] = np.nan + + zero_filled = datetime_frame.fillna(0) + assert (zero_filled.loc[zero_filled.index[:5], "A"] == 0).all() + + padded = datetime_frame.fillna(method="pad") + assert np.isnan(padded.loc[padded.index[:5], "A"]).all() + assert ( + padded.loc[padded.index[-5:], "A"] == padded.loc[padded.index[-5], "A"] + ).all() + + msg = "Must specify a fill 'value' or 'method'" + with pytest.raises(ValueError, match=msg): + datetime_frame.fillna() + msg = "Cannot specify both 'value' and 'method'" + with pytest.raises(ValueError, match=msg): + datetime_frame.fillna(5, method="ffill") + + def test_fillna_mixed_type(self, float_string_frame): + + mf = float_string_frame + mf.loc[mf.index[5:20], "foo"] = np.nan + mf.loc[mf.index[-10:], "A"] = np.nan + # TODO: make stronger assertion here, GH 25640 + mf.fillna(value=0) + mf.fillna(method="pad") + + def test_fillna_mixed_float(self, mixed_float_frame): + + # mixed numeric (but no float16) + mf = mixed_float_frame.reindex(columns=["A", "B", "D"]) + mf.loc[mf.index[-10:], "A"] = np.nan + result = mf.fillna(value=0) + _check_mixed_float(result, dtype=dict(C=None)) + + result = mf.fillna(method="pad") + _check_mixed_float(result, dtype=dict(C=None)) + + def test_fillna_empty(self): + # empty frame (GH#2778) + df = DataFrame(columns=["x"]) + for m in ["pad", "backfill"]: + df.x.fillna(method=m, inplace=True) + df.x.fillna(method=m) + + def test_fillna_different_dtype(self): + # with different dtype (GH#3386) + df = DataFrame( + [["a", "a", np.nan, "a"], ["b", "b", np.nan, "b"], ["c", "c", np.nan, "c"]] + ) + + result = df.fillna({2: "foo"}) + expected = DataFrame( + [["a", "a", "foo", "a"], ["b", "b", "foo", "b"], ["c", "c", "foo", "c"]] + ) + tm.assert_frame_equal(result, expected) + + return_value = df.fillna({2: "foo"}, inplace=True) + tm.assert_frame_equal(df, expected) + assert return_value is None + + def test_fillna_limit_and_value(self): + # limit and value + df = DataFrame(np.random.randn(10, 3)) + df.iloc[2:7, 0] = np.nan + df.iloc[3:5, 2] = np.nan + + expected = df.copy() + expected.iloc[2, 0] = 999 + expected.iloc[3, 2] = 999 + result = df.fillna(999, limit=1) + tm.assert_frame_equal(result, expected) + + def test_fillna_datelike(self): + # with datelike + # GH#6344 + df = DataFrame( + { + "Date": [NaT, Timestamp("2014-1-1")], + "Date2": [Timestamp("2013-1-1"), NaT], + } + ) + + expected = df.copy() + expected["Date"] = expected["Date"].fillna(df.loc[df.index[0], "Date2"]) + result = df.fillna(value={"Date": df["Date2"]}) + tm.assert_frame_equal(result, expected) + + def test_fillna_tzaware(self): + # with timezone + # GH#15855 + df = DataFrame({"A": [Timestamp("2012-11-11 00:00:00+01:00"), NaT]}) + exp = DataFrame( + { + "A": [ + Timestamp("2012-11-11 00:00:00+01:00"), + Timestamp("2012-11-11 00:00:00+01:00"), + ] + } + ) + tm.assert_frame_equal(df.fillna(method="pad"), exp) + + df = DataFrame({"A": [NaT, Timestamp("2012-11-11 00:00:00+01:00")]}) + exp = DataFrame( + { + "A": [ + Timestamp("2012-11-11 00:00:00+01:00"), + Timestamp("2012-11-11 00:00:00+01:00"), + ] + } + ) + tm.assert_frame_equal(df.fillna(method="bfill"), exp) + + def test_fillna_tzaware_different_column(self): + # with timezone in another column + # GH#15522 + df = DataFrame( + { + "A": date_range("20130101", periods=4, tz="US/Eastern"), + "B": [1, 2, np.nan, np.nan], + } + ) + result = df.fillna(method="pad") + expected = DataFrame( + { + "A": date_range("20130101", periods=4, tz="US/Eastern"), + "B": [1.0, 2.0, 2.0, 2.0], + } + ) + tm.assert_frame_equal(result, expected) + + def test_na_actions_categorical(self): + + cat = Categorical([1, 2, 3, np.nan], categories=[1, 2, 3]) + vals = ["a", "b", np.nan, "d"] + df = DataFrame({"cats": cat, "vals": vals}) + cat2 = Categorical([1, 2, 3, 3], categories=[1, 2, 3]) + vals2 = ["a", "b", "b", "d"] + df_exp_fill = DataFrame({"cats": cat2, "vals": vals2}) + cat3 = Categorical([1, 2, 3], categories=[1, 2, 3]) + vals3 = ["a", "b", np.nan] + df_exp_drop_cats = DataFrame({"cats": cat3, "vals": vals3}) + cat4 = Categorical([1, 2], categories=[1, 2, 3]) + vals4 = ["a", "b"] + df_exp_drop_all = DataFrame({"cats": cat4, "vals": vals4}) + + # fillna + res = df.fillna(value={"cats": 3, "vals": "b"}) + tm.assert_frame_equal(res, df_exp_fill) + + msg = "'fill_value=4' is not present in this Categorical's categories" + with pytest.raises(ValueError, match=msg): + df.fillna(value={"cats": 4, "vals": "c"}) + + res = df.fillna(method="pad") + tm.assert_frame_equal(res, df_exp_fill) + + # dropna + res = df.dropna(subset=["cats"]) + tm.assert_frame_equal(res, df_exp_drop_cats) + + res = df.dropna() + tm.assert_frame_equal(res, df_exp_drop_all) + + # make sure that fillna takes missing values into account + c = Categorical([np.nan, "b", np.nan], categories=["a", "b"]) + df = DataFrame({"cats": c, "vals": [1, 2, 3]}) + + cat_exp = Categorical(["a", "b", "a"], categories=["a", "b"]) + df_exp = DataFrame({"cats": cat_exp, "vals": [1, 2, 3]}) + + res = df.fillna("a") + tm.assert_frame_equal(res, df_exp) + + def test_fillna_categorical_nan(self): + # GH#14021 + # np.nan should always be a valid filler + cat = Categorical([np.nan, 2, np.nan]) + val = Categorical([np.nan, np.nan, np.nan]) + df = DataFrame({"cats": cat, "vals": val}) + + # GH#32950 df.median() is poorly behaved because there is no + # Categorical.median + median = Series({"cats": 2.0, "vals": np.nan}) + + res = df.fillna(median) + v_exp = [np.nan, np.nan, np.nan] + df_exp = DataFrame({"cats": [2, 2, 2], "vals": v_exp}, dtype="category") + tm.assert_frame_equal(res, df_exp) + + result = df.cats.fillna(np.nan) + tm.assert_series_equal(result, df.cats) + + result = df.vals.fillna(np.nan) + tm.assert_series_equal(result, df.vals) + + idx = DatetimeIndex( + ["2011-01-01 09:00", "2016-01-01 23:45", "2011-01-01 09:00", NaT, NaT] + ) + df = DataFrame({"a": Categorical(idx)}) + tm.assert_frame_equal(df.fillna(value=NaT), df) + + idx = PeriodIndex(["2011-01", "2011-01", "2011-01", NaT, NaT], freq="M") + df = DataFrame({"a": Categorical(idx)}) + tm.assert_frame_equal(df.fillna(value=NaT), df) + + idx = TimedeltaIndex(["1 days", "2 days", "1 days", NaT, NaT]) + df = DataFrame({"a": Categorical(idx)}) + tm.assert_frame_equal(df.fillna(value=NaT), df) + + def test_fillna_downcast(self): + # GH#15277 + # infer int64 from float64 + df = DataFrame({"a": [1.0, np.nan]}) + result = df.fillna(0, downcast="infer") + expected = DataFrame({"a": [1, 0]}) + tm.assert_frame_equal(result, expected) + + # infer int64 from float64 when fillna value is a dict + df = DataFrame({"a": [1.0, np.nan]}) + result = df.fillna({"a": 0}, downcast="infer") + expected = DataFrame({"a": [1, 0]}) + tm.assert_frame_equal(result, expected) + + def test_fillna_dtype_conversion(self): + # make sure that fillna on an empty frame works + df = DataFrame(index=["A", "B", "C"], columns=[1, 2, 3, 4, 5]) + result = df.dtypes + expected = Series([np.dtype("object")] * 5, index=[1, 2, 3, 4, 5]) + tm.assert_series_equal(result, expected) + + result = df.fillna(1) + expected = DataFrame(1, index=["A", "B", "C"], columns=[1, 2, 3, 4, 5]) + tm.assert_frame_equal(result, expected) + + # empty block + df = DataFrame(index=range(3), columns=["A", "B"], dtype="float64") + result = df.fillna("nan") + expected = DataFrame("nan", index=range(3), columns=["A", "B"]) + tm.assert_frame_equal(result, expected) + + # equiv of replace + df = DataFrame(dict(A=[1, np.nan], B=[1.0, 2.0])) + for v in ["", 1, np.nan, 1.0]: + expected = df.replace(np.nan, v) + result = df.fillna(v) + tm.assert_frame_equal(result, expected) + + def test_fillna_datetime_columns(self): + # GH#7095 + df = DataFrame( + { + "A": [-1, -2, np.nan], + "B": date_range("20130101", periods=3), + "C": ["foo", "bar", None], + "D": ["foo2", "bar2", None], + }, + index=date_range("20130110", periods=3), + ) + result = df.fillna("?") + expected = DataFrame( + { + "A": [-1, -2, "?"], + "B": date_range("20130101", periods=3), + "C": ["foo", "bar", "?"], + "D": ["foo2", "bar2", "?"], + }, + index=date_range("20130110", periods=3), + ) + tm.assert_frame_equal(result, expected) + + df = DataFrame( + { + "A": [-1, -2, np.nan], + "B": [Timestamp("2013-01-01"), Timestamp("2013-01-02"), NaT], + "C": ["foo", "bar", None], + "D": ["foo2", "bar2", None], + }, + index=date_range("20130110", periods=3), + ) + result = df.fillna("?") + expected = DataFrame( + { + "A": [-1, -2, "?"], + "B": [Timestamp("2013-01-01"), Timestamp("2013-01-02"), "?"], + "C": ["foo", "bar", "?"], + "D": ["foo2", "bar2", "?"], + }, + index=date_range("20130110", periods=3), + ) + tm.assert_frame_equal(result, expected) + + def test_ffill(self, datetime_frame): + datetime_frame["A"][:5] = np.nan + datetime_frame["A"][-5:] = np.nan + + tm.assert_frame_equal( + datetime_frame.ffill(), datetime_frame.fillna(method="ffill") + ) + + def test_bfill(self, datetime_frame): + datetime_frame["A"][:5] = np.nan + datetime_frame["A"][-5:] = np.nan + + tm.assert_frame_equal( + datetime_frame.bfill(), datetime_frame.fillna(method="bfill") + ) + + def test_frame_pad_backfill_limit(self): + index = np.arange(10) + df = DataFrame(np.random.randn(10, 4), index=index) + + result = df[:2].reindex(index, method="pad", limit=5) + + expected = df[:2].reindex(index).fillna(method="pad") + expected.values[-3:] = np.nan + tm.assert_frame_equal(result, expected) + + result = df[-2:].reindex(index, method="backfill", limit=5) + + expected = df[-2:].reindex(index).fillna(method="backfill") + expected.values[:3] = np.nan + tm.assert_frame_equal(result, expected) + + def test_frame_fillna_limit(self): + index = np.arange(10) + df = DataFrame(np.random.randn(10, 4), index=index) + + result = df[:2].reindex(index) + result = result.fillna(method="pad", limit=5) + + expected = df[:2].reindex(index).fillna(method="pad") + expected.values[-3:] = np.nan + tm.assert_frame_equal(result, expected) + + result = df[-2:].reindex(index) + result = result.fillna(method="backfill", limit=5) + + expected = df[-2:].reindex(index).fillna(method="backfill") + expected.values[:3] = np.nan + tm.assert_frame_equal(result, expected) + + def test_fillna_skip_certain_blocks(self): + # don't try to fill boolean, int blocks + + df = DataFrame(np.random.randn(10, 4).astype(int)) + + # it works! + df.fillna(np.nan) + + @pytest.mark.parametrize("type", [int, float]) + def test_fillna_positive_limit(self, type): + df = DataFrame(np.random.randn(10, 4)).astype(type) + + msg = "Limit must be greater than 0" + with pytest.raises(ValueError, match=msg): + df.fillna(0, limit=-5) + + @pytest.mark.parametrize("type", [int, float]) + def test_fillna_integer_limit(self, type): + df = DataFrame(np.random.randn(10, 4)).astype(type) + + msg = "Limit must be an integer" + with pytest.raises(ValueError, match=msg): + df.fillna(0, limit=0.5) + + def test_fillna_inplace(self): + df = DataFrame(np.random.randn(10, 4)) + df[1][:4] = np.nan + df[3][-4:] = np.nan + + expected = df.fillna(value=0) + assert expected is not df + + df.fillna(value=0, inplace=True) + tm.assert_frame_equal(df, expected) + + expected = df.fillna(value={0: 0}, inplace=True) + assert expected is None + + df[1][:4] = np.nan + df[3][-4:] = np.nan + expected = df.fillna(method="ffill") + assert expected is not df + + df.fillna(method="ffill", inplace=True) + tm.assert_frame_equal(df, expected) + + def test_fillna_dict_series(self): + df = DataFrame( + { + "a": [np.nan, 1, 2, np.nan, np.nan], + "b": [1, 2, 3, np.nan, np.nan], + "c": [np.nan, 1, 2, 3, 4], + } + ) + + result = df.fillna({"a": 0, "b": 5}) + + expected = df.copy() + expected["a"] = expected["a"].fillna(0) + expected["b"] = expected["b"].fillna(5) + tm.assert_frame_equal(result, expected) + + # it works + result = df.fillna({"a": 0, "b": 5, "d": 7}) + + # Series treated same as dict + result = df.fillna(df.max()) + expected = df.fillna(df.max().to_dict()) + tm.assert_frame_equal(result, expected) + + # disable this for now + with pytest.raises(NotImplementedError, match="column by column"): + df.fillna(df.max(1), axis=1) + + def test_fillna_dataframe(self): + # GH#8377 + df = DataFrame( + { + "a": [np.nan, 1, 2, np.nan, np.nan], + "b": [1, 2, 3, np.nan, np.nan], + "c": [np.nan, 1, 2, 3, 4], + }, + index=list("VWXYZ"), + ) + + # df2 may have different index and columns + df2 = DataFrame( + { + "a": [np.nan, 10, 20, 30, 40], + "b": [50, 60, 70, 80, 90], + "foo": ["bar"] * 5, + }, + index=list("VWXuZ"), + ) + + result = df.fillna(df2) + + # only those columns and indices which are shared get filled + expected = DataFrame( + { + "a": [np.nan, 1, 2, np.nan, 40], + "b": [1, 2, 3, np.nan, 90], + "c": [np.nan, 1, 2, 3, 4], + }, + index=list("VWXYZ"), + ) + + tm.assert_frame_equal(result, expected) + + def test_fillna_columns(self): + df = DataFrame(np.random.randn(10, 10)) + df.values[:, ::2] = np.nan + + result = df.fillna(method="ffill", axis=1) + expected = df.T.fillna(method="pad").T + tm.assert_frame_equal(result, expected) + + df.insert(6, "foo", 5) + result = df.fillna(method="ffill", axis=1) + expected = df.astype(float).fillna(method="ffill", axis=1) + tm.assert_frame_equal(result, expected) + + def test_fillna_invalid_method(self, float_frame): + with pytest.raises(ValueError, match="ffil"): + float_frame.fillna(method="ffil") + + def test_fillna_invalid_value(self, float_frame): + # list + msg = '"value" parameter must be a scalar or dict, but you passed a "{}"' + with pytest.raises(TypeError, match=msg.format("list")): + float_frame.fillna([1, 2]) + # tuple + with pytest.raises(TypeError, match=msg.format("tuple")): + float_frame.fillna((1, 2)) + # frame with series + msg = ( + '"value" parameter must be a scalar, dict or Series, but you ' + 'passed a "DataFrame"' + ) + with pytest.raises(TypeError, match=msg): + float_frame.iloc[:, 0].fillna(float_frame) + + def test_fillna_col_reordering(self): + cols = ["COL." + str(i) for i in range(5, 0, -1)] + data = np.random.rand(20, 5) + df = DataFrame(index=range(20), columns=cols, data=data) + filled = df.fillna(method="ffill") + assert df.columns.tolist() == filled.columns.tolist() + + def test_fill_corner(self, float_frame, float_string_frame): + mf = float_string_frame + mf.loc[mf.index[5:20], "foo"] = np.nan + mf.loc[mf.index[-10:], "A"] = np.nan + + filled = float_string_frame.fillna(value=0) + assert (filled.loc[filled.index[5:20], "foo"] == 0).all() + del float_string_frame["foo"] + + empty_float = float_frame.reindex(columns=[]) + + # TODO(wesm): unused? + result = empty_float.fillna(value=0) # noqa diff --git a/pandas/tests/frame/test_missing.py b/pandas/tests/frame/test_missing.py index 5d3f8e3a2f7c1..2338602a208e7 100644 --- a/pandas/tests/frame/test_missing.py +++ b/pandas/tests/frame/test_missing.py @@ -5,9 +5,8 @@ import pytest import pandas as pd -from pandas import Categorical, DataFrame, Series, Timestamp, date_range +from pandas import DataFrame, Series import pandas._testing as tm -from pandas.tests.frame.common import _check_mixed_float class TestDataFrameMissingData: @@ -208,513 +207,3 @@ def test_dropna_categorical_interval_index(self): expected = df result = df.dropna() tm.assert_frame_equal(result, expected) - - def test_fillna_datetime(self, datetime_frame): - tf = datetime_frame - tf.loc[tf.index[:5], "A"] = np.nan - tf.loc[tf.index[-5:], "A"] = np.nan - - zero_filled = datetime_frame.fillna(0) - assert (zero_filled.loc[zero_filled.index[:5], "A"] == 0).all() - - padded = datetime_frame.fillna(method="pad") - assert np.isnan(padded.loc[padded.index[:5], "A"]).all() - assert ( - padded.loc[padded.index[-5:], "A"] == padded.loc[padded.index[-5], "A"] - ).all() - - msg = "Must specify a fill 'value' or 'method'" - with pytest.raises(ValueError, match=msg): - datetime_frame.fillna() - msg = "Cannot specify both 'value' and 'method'" - with pytest.raises(ValueError, match=msg): - datetime_frame.fillna(5, method="ffill") - - def test_fillna_mixed_type(self, float_string_frame): - - mf = float_string_frame - mf.loc[mf.index[5:20], "foo"] = np.nan - mf.loc[mf.index[-10:], "A"] = np.nan - # TODO: make stronger assertion here, GH 25640 - mf.fillna(value=0) - mf.fillna(method="pad") - - def test_fillna_mixed_float(self, mixed_float_frame): - - # mixed numeric (but no float16) - mf = mixed_float_frame.reindex(columns=["A", "B", "D"]) - mf.loc[mf.index[-10:], "A"] = np.nan - result = mf.fillna(value=0) - _check_mixed_float(result, dtype=dict(C=None)) - - result = mf.fillna(method="pad") - _check_mixed_float(result, dtype=dict(C=None)) - - def test_fillna_empty(self): - # empty frame (GH #2778) - df = DataFrame(columns=["x"]) - for m in ["pad", "backfill"]: - df.x.fillna(method=m, inplace=True) - df.x.fillna(method=m) - - def test_fillna_different_dtype(self): - # with different dtype (GH#3386) - df = DataFrame( - [["a", "a", np.nan, "a"], ["b", "b", np.nan, "b"], ["c", "c", np.nan, "c"]] - ) - - result = df.fillna({2: "foo"}) - expected = DataFrame( - [["a", "a", "foo", "a"], ["b", "b", "foo", "b"], ["c", "c", "foo", "c"]] - ) - tm.assert_frame_equal(result, expected) - - return_value = df.fillna({2: "foo"}, inplace=True) - tm.assert_frame_equal(df, expected) - assert return_value is None - - def test_fillna_limit_and_value(self): - # limit and value - df = DataFrame(np.random.randn(10, 3)) - df.iloc[2:7, 0] = np.nan - df.iloc[3:5, 2] = np.nan - - expected = df.copy() - expected.iloc[2, 0] = 999 - expected.iloc[3, 2] = 999 - result = df.fillna(999, limit=1) - tm.assert_frame_equal(result, expected) - - def test_fillna_datelike(self): - # with datelike - # GH#6344 - df = DataFrame( - { - "Date": [pd.NaT, Timestamp("2014-1-1")], - "Date2": [Timestamp("2013-1-1"), pd.NaT], - } - ) - - expected = df.copy() - expected["Date"] = expected["Date"].fillna(df.loc[df.index[0], "Date2"]) - result = df.fillna(value={"Date": df["Date2"]}) - tm.assert_frame_equal(result, expected) - - def test_fillna_tzaware(self): - # with timezone - # GH#15855 - df = pd.DataFrame({"A": [pd.Timestamp("2012-11-11 00:00:00+01:00"), pd.NaT]}) - exp = pd.DataFrame( - { - "A": [ - pd.Timestamp("2012-11-11 00:00:00+01:00"), - pd.Timestamp("2012-11-11 00:00:00+01:00"), - ] - } - ) - tm.assert_frame_equal(df.fillna(method="pad"), exp) - - df = pd.DataFrame({"A": [pd.NaT, pd.Timestamp("2012-11-11 00:00:00+01:00")]}) - exp = pd.DataFrame( - { - "A": [ - pd.Timestamp("2012-11-11 00:00:00+01:00"), - pd.Timestamp("2012-11-11 00:00:00+01:00"), - ] - } - ) - tm.assert_frame_equal(df.fillna(method="bfill"), exp) - - def test_fillna_tzaware_different_column(self): - # with timezone in another column - # GH#15522 - df = pd.DataFrame( - { - "A": pd.date_range("20130101", periods=4, tz="US/Eastern"), - "B": [1, 2, np.nan, np.nan], - } - ) - result = df.fillna(method="pad") - expected = pd.DataFrame( - { - "A": pd.date_range("20130101", periods=4, tz="US/Eastern"), - "B": [1.0, 2.0, 2.0, 2.0], - } - ) - tm.assert_frame_equal(result, expected) - - def test_na_actions_categorical(self): - - cat = Categorical([1, 2, 3, np.nan], categories=[1, 2, 3]) - vals = ["a", "b", np.nan, "d"] - df = DataFrame({"cats": cat, "vals": vals}) - cat2 = Categorical([1, 2, 3, 3], categories=[1, 2, 3]) - vals2 = ["a", "b", "b", "d"] - df_exp_fill = DataFrame({"cats": cat2, "vals": vals2}) - cat3 = Categorical([1, 2, 3], categories=[1, 2, 3]) - vals3 = ["a", "b", np.nan] - df_exp_drop_cats = DataFrame({"cats": cat3, "vals": vals3}) - cat4 = Categorical([1, 2], categories=[1, 2, 3]) - vals4 = ["a", "b"] - df_exp_drop_all = DataFrame({"cats": cat4, "vals": vals4}) - - # fillna - res = df.fillna(value={"cats": 3, "vals": "b"}) - tm.assert_frame_equal(res, df_exp_fill) - - msg = "'fill_value=4' is not present in this Categorical's categories" - with pytest.raises(ValueError, match=msg): - df.fillna(value={"cats": 4, "vals": "c"}) - - res = df.fillna(method="pad") - tm.assert_frame_equal(res, df_exp_fill) - - # dropna - res = df.dropna(subset=["cats"]) - tm.assert_frame_equal(res, df_exp_drop_cats) - - res = df.dropna() - tm.assert_frame_equal(res, df_exp_drop_all) - - # make sure that fillna takes missing values into account - c = Categorical([np.nan, "b", np.nan], categories=["a", "b"]) - df = pd.DataFrame({"cats": c, "vals": [1, 2, 3]}) - - cat_exp = Categorical(["a", "b", "a"], categories=["a", "b"]) - df_exp = DataFrame({"cats": cat_exp, "vals": [1, 2, 3]}) - - res = df.fillna("a") - tm.assert_frame_equal(res, df_exp) - - def test_fillna_categorical_nan(self): - # GH 14021 - # np.nan should always be a valid filler - cat = Categorical([np.nan, 2, np.nan]) - val = Categorical([np.nan, np.nan, np.nan]) - df = DataFrame({"cats": cat, "vals": val}) - - # GH#32950 df.median() is poorly behaved because there is no - # Categorical.median - median = Series({"cats": 2.0, "vals": np.nan}) - - res = df.fillna(median) - v_exp = [np.nan, np.nan, np.nan] - df_exp = DataFrame({"cats": [2, 2, 2], "vals": v_exp}, dtype="category") - tm.assert_frame_equal(res, df_exp) - - result = df.cats.fillna(np.nan) - tm.assert_series_equal(result, df.cats) - - result = df.vals.fillna(np.nan) - tm.assert_series_equal(result, df.vals) - - idx = pd.DatetimeIndex( - ["2011-01-01 09:00", "2016-01-01 23:45", "2011-01-01 09:00", pd.NaT, pd.NaT] - ) - df = DataFrame({"a": Categorical(idx)}) - tm.assert_frame_equal(df.fillna(value=pd.NaT), df) - - idx = pd.PeriodIndex( - ["2011-01", "2011-01", "2011-01", pd.NaT, pd.NaT], freq="M" - ) - df = DataFrame({"a": Categorical(idx)}) - tm.assert_frame_equal(df.fillna(value=pd.NaT), df) - - idx = pd.TimedeltaIndex(["1 days", "2 days", "1 days", pd.NaT, pd.NaT]) - df = DataFrame({"a": Categorical(idx)}) - tm.assert_frame_equal(df.fillna(value=pd.NaT), df) - - def test_fillna_downcast(self): - # GH 15277 - # infer int64 from float64 - df = pd.DataFrame({"a": [1.0, np.nan]}) - result = df.fillna(0, downcast="infer") - expected = pd.DataFrame({"a": [1, 0]}) - tm.assert_frame_equal(result, expected) - - # infer int64 from float64 when fillna value is a dict - df = pd.DataFrame({"a": [1.0, np.nan]}) - result = df.fillna({"a": 0}, downcast="infer") - expected = pd.DataFrame({"a": [1, 0]}) - tm.assert_frame_equal(result, expected) - - def test_fillna_dtype_conversion(self): - # make sure that fillna on an empty frame works - df = DataFrame(index=["A", "B", "C"], columns=[1, 2, 3, 4, 5]) - result = df.dtypes - expected = Series([np.dtype("object")] * 5, index=[1, 2, 3, 4, 5]) - tm.assert_series_equal(result, expected) - - result = df.fillna(1) - expected = DataFrame(1, index=["A", "B", "C"], columns=[1, 2, 3, 4, 5]) - tm.assert_frame_equal(result, expected) - - # empty block - df = DataFrame(index=range(3), columns=["A", "B"], dtype="float64") - result = df.fillna("nan") - expected = DataFrame("nan", index=range(3), columns=["A", "B"]) - tm.assert_frame_equal(result, expected) - - # equiv of replace - df = DataFrame(dict(A=[1, np.nan], B=[1.0, 2.0])) - for v in ["", 1, np.nan, 1.0]: - expected = df.replace(np.nan, v) - result = df.fillna(v) - tm.assert_frame_equal(result, expected) - - def test_fillna_datetime_columns(self): - # GH 7095 - df = pd.DataFrame( - { - "A": [-1, -2, np.nan], - "B": date_range("20130101", periods=3), - "C": ["foo", "bar", None], - "D": ["foo2", "bar2", None], - }, - index=date_range("20130110", periods=3), - ) - result = df.fillna("?") - expected = pd.DataFrame( - { - "A": [-1, -2, "?"], - "B": date_range("20130101", periods=3), - "C": ["foo", "bar", "?"], - "D": ["foo2", "bar2", "?"], - }, - index=date_range("20130110", periods=3), - ) - tm.assert_frame_equal(result, expected) - - df = pd.DataFrame( - { - "A": [-1, -2, np.nan], - "B": [pd.Timestamp("2013-01-01"), pd.Timestamp("2013-01-02"), pd.NaT], - "C": ["foo", "bar", None], - "D": ["foo2", "bar2", None], - }, - index=date_range("20130110", periods=3), - ) - result = df.fillna("?") - expected = pd.DataFrame( - { - "A": [-1, -2, "?"], - "B": [pd.Timestamp("2013-01-01"), pd.Timestamp("2013-01-02"), "?"], - "C": ["foo", "bar", "?"], - "D": ["foo2", "bar2", "?"], - }, - index=pd.date_range("20130110", periods=3), - ) - tm.assert_frame_equal(result, expected) - - def test_ffill(self, datetime_frame): - datetime_frame["A"][:5] = np.nan - datetime_frame["A"][-5:] = np.nan - - tm.assert_frame_equal( - datetime_frame.ffill(), datetime_frame.fillna(method="ffill") - ) - - def test_bfill(self, datetime_frame): - datetime_frame["A"][:5] = np.nan - datetime_frame["A"][-5:] = np.nan - - tm.assert_frame_equal( - datetime_frame.bfill(), datetime_frame.fillna(method="bfill") - ) - - def test_frame_pad_backfill_limit(self): - index = np.arange(10) - df = DataFrame(np.random.randn(10, 4), index=index) - - result = df[:2].reindex(index, method="pad", limit=5) - - expected = df[:2].reindex(index).fillna(method="pad") - expected.values[-3:] = np.nan - tm.assert_frame_equal(result, expected) - - result = df[-2:].reindex(index, method="backfill", limit=5) - - expected = df[-2:].reindex(index).fillna(method="backfill") - expected.values[:3] = np.nan - tm.assert_frame_equal(result, expected) - - def test_frame_fillna_limit(self): - index = np.arange(10) - df = DataFrame(np.random.randn(10, 4), index=index) - - result = df[:2].reindex(index) - result = result.fillna(method="pad", limit=5) - - expected = df[:2].reindex(index).fillna(method="pad") - expected.values[-3:] = np.nan - tm.assert_frame_equal(result, expected) - - result = df[-2:].reindex(index) - result = result.fillna(method="backfill", limit=5) - - expected = df[-2:].reindex(index).fillna(method="backfill") - expected.values[:3] = np.nan - tm.assert_frame_equal(result, expected) - - def test_fillna_skip_certain_blocks(self): - # don't try to fill boolean, int blocks - - df = DataFrame(np.random.randn(10, 4).astype(int)) - - # it works! - df.fillna(np.nan) - - @pytest.mark.parametrize("type", [int, float]) - def test_fillna_positive_limit(self, type): - df = DataFrame(np.random.randn(10, 4)).astype(type) - - msg = "Limit must be greater than 0" - with pytest.raises(ValueError, match=msg): - df.fillna(0, limit=-5) - - @pytest.mark.parametrize("type", [int, float]) - def test_fillna_integer_limit(self, type): - df = DataFrame(np.random.randn(10, 4)).astype(type) - - msg = "Limit must be an integer" - with pytest.raises(ValueError, match=msg): - df.fillna(0, limit=0.5) - - def test_fillna_inplace(self): - df = DataFrame(np.random.randn(10, 4)) - df[1][:4] = np.nan - df[3][-4:] = np.nan - - expected = df.fillna(value=0) - assert expected is not df - - df.fillna(value=0, inplace=True) - tm.assert_frame_equal(df, expected) - - expected = df.fillna(value={0: 0}, inplace=True) - assert expected is None - - df[1][:4] = np.nan - df[3][-4:] = np.nan - expected = df.fillna(method="ffill") - assert expected is not df - - df.fillna(method="ffill", inplace=True) - tm.assert_frame_equal(df, expected) - - def test_fillna_dict_series(self): - df = DataFrame( - { - "a": [np.nan, 1, 2, np.nan, np.nan], - "b": [1, 2, 3, np.nan, np.nan], - "c": [np.nan, 1, 2, 3, 4], - } - ) - - result = df.fillna({"a": 0, "b": 5}) - - expected = df.copy() - expected["a"] = expected["a"].fillna(0) - expected["b"] = expected["b"].fillna(5) - tm.assert_frame_equal(result, expected) - - # it works - result = df.fillna({"a": 0, "b": 5, "d": 7}) - - # Series treated same as dict - result = df.fillna(df.max()) - expected = df.fillna(df.max().to_dict()) - tm.assert_frame_equal(result, expected) - - # disable this for now - with pytest.raises(NotImplementedError, match="column by column"): - df.fillna(df.max(1), axis=1) - - def test_fillna_dataframe(self): - # GH 8377 - df = DataFrame( - { - "a": [np.nan, 1, 2, np.nan, np.nan], - "b": [1, 2, 3, np.nan, np.nan], - "c": [np.nan, 1, 2, 3, 4], - }, - index=list("VWXYZ"), - ) - - # df2 may have different index and columns - df2 = DataFrame( - { - "a": [np.nan, 10, 20, 30, 40], - "b": [50, 60, 70, 80, 90], - "foo": ["bar"] * 5, - }, - index=list("VWXuZ"), - ) - - result = df.fillna(df2) - - # only those columns and indices which are shared get filled - expected = DataFrame( - { - "a": [np.nan, 1, 2, np.nan, 40], - "b": [1, 2, 3, np.nan, 90], - "c": [np.nan, 1, 2, 3, 4], - }, - index=list("VWXYZ"), - ) - - tm.assert_frame_equal(result, expected) - - def test_fillna_columns(self): - df = DataFrame(np.random.randn(10, 10)) - df.values[:, ::2] = np.nan - - result = df.fillna(method="ffill", axis=1) - expected = df.T.fillna(method="pad").T - tm.assert_frame_equal(result, expected) - - df.insert(6, "foo", 5) - result = df.fillna(method="ffill", axis=1) - expected = df.astype(float).fillna(method="ffill", axis=1) - tm.assert_frame_equal(result, expected) - - def test_fillna_invalid_method(self, float_frame): - with pytest.raises(ValueError, match="ffil"): - float_frame.fillna(method="ffil") - - def test_fillna_invalid_value(self, float_frame): - # list - msg = '"value" parameter must be a scalar or dict, but you passed a "{}"' - with pytest.raises(TypeError, match=msg.format("list")): - float_frame.fillna([1, 2]) - # tuple - with pytest.raises(TypeError, match=msg.format("tuple")): - float_frame.fillna((1, 2)) - # frame with series - msg = ( - '"value" parameter must be a scalar, dict or Series, but you ' - 'passed a "DataFrame"' - ) - with pytest.raises(TypeError, match=msg): - float_frame.iloc[:, 0].fillna(float_frame) - - def test_fillna_col_reordering(self): - cols = ["COL." + str(i) for i in range(5, 0, -1)] - data = np.random.rand(20, 5) - df = DataFrame(index=range(20), columns=cols, data=data) - filled = df.fillna(method="ffill") - assert df.columns.tolist() == filled.columns.tolist() - - def test_fill_corner(self, float_frame, float_string_frame): - mf = float_string_frame - mf.loc[mf.index[5:20], "foo"] = np.nan - mf.loc[mf.index[-10:], "A"] = np.nan - - filled = float_string_frame.fillna(value=0) - assert (filled.loc[filled.index[5:20], "foo"] == 0).all() - del float_string_frame["foo"] - - empty_float = float_frame.reindex(columns=[]) - - # TODO(wesm): unused? - result = empty_float.fillna(value=0) # noqa