From 24cd07d97cd2a507084bafa38f9c5c4fa6d0b538 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 21 Oct 2020 10:50:34 -0700 Subject: [PATCH 1/5] TST/REF: collect indexing tests by method --- .../tests/frame/indexing/test_categorical.py | 11 ----- pandas/tests/frame/indexing/test_datetime.py | 48 ------------------- pandas/tests/frame/indexing/test_setitem.py | 33 +++++++++++++ pandas/tests/frame/indexing/test_sparse.py | 17 +------ pandas/tests/frame/indexing/test_where.py | 11 +++++ pandas/tests/frame/methods/test_reindex.py | 15 ++++++ .../tests/frame/methods/test_reset_index.py | 12 +++++ 7 files changed, 72 insertions(+), 75 deletions(-) delete mode 100644 pandas/tests/frame/indexing/test_datetime.py diff --git a/pandas/tests/frame/indexing/test_categorical.py b/pandas/tests/frame/indexing/test_categorical.py index c876f78176e2e..cfc22b9b18729 100644 --- a/pandas/tests/frame/indexing/test_categorical.py +++ b/pandas/tests/frame/indexing/test_categorical.py @@ -394,14 +394,3 @@ def test_loc_indexing_preserves_index_category_dtype(self): result = df.loc[["a"]].index.levels[0] tm.assert_index_equal(result, expected) - - def test_categorical_filtering(self): - # GH22609 Verify filtering operations on DataFrames with categorical Series - df = DataFrame(data=[[0, 0], [1, 1]], columns=["a", "b"]) - df["b"] = df.b.astype("category") - - result = df.where(df.a > 0) - expected = df.copy() - expected.loc[0, :] = np.nan - - tm.assert_equal(result, expected) diff --git a/pandas/tests/frame/indexing/test_datetime.py b/pandas/tests/frame/indexing/test_datetime.py deleted file mode 100644 index 1866ac341def6..0000000000000 --- a/pandas/tests/frame/indexing/test_datetime.py +++ /dev/null @@ -1,48 +0,0 @@ -import pandas as pd -from pandas import DataFrame, Index, Series, date_range, notna -import pandas._testing as tm - - -class TestDataFrameIndexingDatetimeWithTZ: - def test_setitem(self, timezone_frame): - - df = timezone_frame - idx = df["B"].rename("foo") - - # setitem - df["C"] = idx - tm.assert_series_equal(df["C"], Series(idx, name="C")) - - df["D"] = "foo" - df["D"] = idx - tm.assert_series_equal(df["D"], Series(idx, name="D")) - del df["D"] - - # assert that A & C are not sharing the same base (e.g. they - # are copies) - b1 = df._mgr.blocks[1] - b2 = df._mgr.blocks[2] - tm.assert_extension_array_equal(b1.values, b2.values) - b1base = b1.values._data.base - b2base = b2.values._data.base - assert b1base is None or (id(b1base) != id(b2base)) - - # with nan - df2 = df.copy() - df2.iloc[1, 1] = pd.NaT - df2.iloc[1, 2] = pd.NaT - result = df2["B"] - tm.assert_series_equal(notna(result), Series([True, False, True], name="B")) - tm.assert_series_equal(df2.dtypes, df.dtypes) - - def test_set_reset(self): - - idx = Index(date_range("20130101", periods=3, tz="US/Eastern"), name="foo") - - # set/reset - df = DataFrame({"A": [0, 1, 2]}, index=idx) - result = df.reset_index() - assert result["foo"].dtype == "datetime64[ns, US/Eastern]" - - df = result.set_index("foo") - tm.assert_index_equal(df.index, idx) diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index 8313ab0b99bac..87c6ae09aac11 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -8,10 +8,12 @@ DataFrame, Index, Interval, + NaT, Period, Series, Timestamp, date_range, + notna, ) import pandas._testing as tm from pandas.core.arrays import SparseArray @@ -180,3 +182,34 @@ def test_setitem_extension_types(self, obj, dtype): df["obj"] = obj tm.assert_frame_equal(df, expected) + + def test_setitem_dt64tz(self, timezone_frame): + + df = timezone_frame + idx = df["B"].rename("foo") + + # setitem + df["C"] = idx + tm.assert_series_equal(df["C"], Series(idx, name="C")) + + df["D"] = "foo" + df["D"] = idx + tm.assert_series_equal(df["D"], Series(idx, name="D")) + del df["D"] + + # assert that A & C are not sharing the same base (e.g. they + # are copies) + b1 = df._mgr.blocks[1] + b2 = df._mgr.blocks[2] + tm.assert_extension_array_equal(b1.values, b2.values) + b1base = b1.values._data.base + b2base = b2.values._data.base + assert b1base is None or (id(b1base) != id(b2base)) + + # with nan + df2 = df.copy() + df2.iloc[1, 1] = NaT + df2.iloc[1, 2] = NaT + result = df2["B"] + tm.assert_series_equal(notna(result), Series([True, False, True], name="B")) + tm.assert_series_equal(df2.dtypes, df.dtypes) diff --git a/pandas/tests/frame/indexing/test_sparse.py b/pandas/tests/frame/indexing/test_sparse.py index 04e1c8b94c4d9..c0cd7faafb4db 100644 --- a/pandas/tests/frame/indexing/test_sparse.py +++ b/pandas/tests/frame/indexing/test_sparse.py @@ -27,7 +27,7 @@ def test_getitem_sparse_column(self): @pytest.mark.parametrize("spmatrix_t", ["coo_matrix", "csc_matrix", "csr_matrix"]) @pytest.mark.parametrize("dtype", [np.int64, np.float64, complex]) @td.skip_if_no_scipy - def test_locindexer_from_spmatrix(self, spmatrix_t, dtype): + def test_loc_getitem_from_spmatrix(self, spmatrix_t, dtype): import scipy.sparse spmatrix_t = getattr(scipy.sparse, spmatrix_t) @@ -50,21 +50,6 @@ def test_locindexer_from_spmatrix(self, spmatrix_t, dtype): expected = np.full(cols, SparseDtype(dtype, fill_value=0)) tm.assert_numpy_array_equal(result, expected) - def test_reindex(self): - # https://github.com/pandas-dev/pandas/issues/35286 - df = pd.DataFrame( - {"A": [0, 1], "B": pd.array([0, 1], dtype=pd.SparseDtype("int64", 0))} - ) - result = df.reindex([0, 2]) - expected = pd.DataFrame( - { - "A": [0.0, np.nan], - "B": pd.array([0.0, np.nan], dtype=pd.SparseDtype("float64", 0.0)), - }, - index=[0, 2], - ) - tm.assert_frame_equal(result, expected) - def test_all_sparse(self): df = pd.DataFrame({"A": pd.array([0, 0], dtype=pd.SparseDtype("int64"))}) result = df.loc[[0, 1]] diff --git a/pandas/tests/frame/indexing/test_where.py b/pandas/tests/frame/indexing/test_where.py index 95209c0c35195..3495247585236 100644 --- a/pandas/tests/frame/indexing/test_where.py +++ b/pandas/tests/frame/indexing/test_where.py @@ -642,3 +642,14 @@ def test_df_where_with_category(self, kwargs): expected = Series(A, name="A") tm.assert_series_equal(result, expected) + + def test_where_categorical_filtering(self): + # GH#22609 Verify filtering operations on DataFrames with categorical Series + df = DataFrame(data=[[0, 0], [1, 1]], columns=["a", "b"]) + df["b"] = df["b"].astype("category") + + result = df.where(df["a"] > 0) + expected = df.copy() + expected.loc[0, :] = np.nan + + tm.assert_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_reindex.py b/pandas/tests/frame/methods/test_reindex.py index 99494191c043a..bd6eddc7bbaa5 100644 --- a/pandas/tests/frame/methods/test_reindex.py +++ b/pandas/tests/frame/methods/test_reindex.py @@ -12,6 +12,21 @@ class TestDataFrameSelectReindex: # These are specific reindex-based tests; other indexing tests should go in # test_indexing + def test_reindex_sparse(self): + # https://github.com/pandas-dev/pandas/issues/35286 + df = DataFrame( + {"A": [0, 1], "B": pd.array([0, 1], dtype=pd.SparseDtype("int64", 0))} + ) + result = df.reindex([0, 2]) + expected = DataFrame( + { + "A": [0.0, np.nan], + "B": pd.array([0.0, np.nan], dtype=pd.SparseDtype("float64", 0.0)), + }, + index=[0, 2], + ) + tm.assert_frame_equal(result, expected) + def test_reindex(self, float_frame): datetime_series = tm.makeTimeSeries(nper=30) diff --git a/pandas/tests/frame/methods/test_reset_index.py b/pandas/tests/frame/methods/test_reset_index.py index b88ef0e6691cb..9454080363b3e 100644 --- a/pandas/tests/frame/methods/test_reset_index.py +++ b/pandas/tests/frame/methods/test_reset_index.py @@ -18,6 +18,18 @@ class TestResetIndex: + def test_set_index_reset_index_dt64tz(self): + + idx = Index(date_range("20130101", periods=3, tz="US/Eastern"), name="foo") + + # set/reset + df = DataFrame({"A": [0, 1, 2]}, index=idx) + result = df.reset_index() + assert result["foo"].dtype == "datetime64[ns, US/Eastern]" + + df = result.set_index("foo") + tm.assert_index_equal(df.index, idx) + def test_reset_index_tz(self, tz_aware_fixture): # GH 3950 # reset_index with single level From ef081bf1273102e1b5628fd77bf30f525e8fa809 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 21 Oct 2020 10:57:08 -0700 Subject: [PATCH 2/5] TST/REF: misplaced reindex/reset_index tests --- pandas/tests/frame/indexing/test_indexing.py | 310 +----------------- pandas/tests/frame/methods/test_reindex.py | 282 +++++++++++++++- .../tests/frame/methods/test_reset_index.py | 12 + 3 files changed, 302 insertions(+), 302 deletions(-) diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 0dee818613edb..36d2a40ef8d00 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -1441,81 +1441,6 @@ def test_set_value_resize(self, float_frame): with pytest.raises(ValueError, match=msg): res._set_value("foobar", "baz", "sam") - def test_reindex_with_multi_index(self): - # https://github.com/pandas-dev/pandas/issues/29896 - # tests for reindexing a multi-indexed DataFrame with a new MultiIndex - # - # confirms that we can reindex a multi-indexed DataFrame with a new - # MultiIndex object correctly when using no filling, backfilling, and - # padding - # - # The DataFrame, `df`, used in this test is: - # c - # a b - # -1 0 A - # 1 B - # 2 C - # 3 D - # 4 E - # 5 F - # 6 G - # 0 0 A - # 1 B - # 2 C - # 3 D - # 4 E - # 5 F - # 6 G - # 1 0 A - # 1 B - # 2 C - # 3 D - # 4 E - # 5 F - # 6 G - # - # and the other MultiIndex, `new_multi_index`, is: - # 0: 0 0.5 - # 1: 2.0 - # 2: 5.0 - # 3: 5.8 - df = DataFrame( - { - "a": [-1] * 7 + [0] * 7 + [1] * 7, - "b": list(range(7)) * 3, - "c": ["A", "B", "C", "D", "E", "F", "G"] * 3, - } - ).set_index(["a", "b"]) - new_index = [0.5, 2.0, 5.0, 5.8] - new_multi_index = MultiIndex.from_product([[0], new_index], names=["a", "b"]) - - # reindexing w/o a `method` value - reindexed = df.reindex(new_multi_index) - expected = DataFrame( - {"a": [0] * 4, "b": new_index, "c": [np.nan, "C", "F", np.nan]} - ).set_index(["a", "b"]) - tm.assert_frame_equal(expected, reindexed) - - # reindexing with backfilling - expected = DataFrame( - {"a": [0] * 4, "b": new_index, "c": ["B", "C", "F", "G"]} - ).set_index(["a", "b"]) - reindexed_with_backfilling = df.reindex(new_multi_index, method="bfill") - tm.assert_frame_equal(expected, reindexed_with_backfilling) - - reindexed_with_backfilling = df.reindex(new_multi_index, method="backfill") - tm.assert_frame_equal(expected, reindexed_with_backfilling) - - # reindexing with padding - expected = DataFrame( - {"a": [0] * 4, "b": new_index, "c": ["A", "C", "F", "F"]} - ).set_index(["a", "b"]) - reindexed_with_padding = df.reindex(new_multi_index, method="pad") - tm.assert_frame_equal(expected, reindexed_with_padding) - - reindexed_with_padding = df.reindex(new_multi_index, method="ffill") - tm.assert_frame_equal(expected, reindexed_with_padding) - def test_set_value_with_index_dtype_change(self): df_orig = DataFrame(np.random.randn(3, 3), index=range(3), columns=list("ABC")) @@ -1674,216 +1599,11 @@ def test_loc_duplicates(self): df.loc[trange[bool_idx], "A"] += 6 tm.assert_frame_equal(df, expected) - @pytest.mark.parametrize( - "method,expected_values", - [ - ("nearest", [0, 1, 1, 2]), - ("pad", [np.nan, 0, 1, 1]), - ("backfill", [0, 1, 2, 2]), - ], - ) - def test_reindex_methods(self, method, expected_values): - df = DataFrame({"x": list(range(5))}) - target = np.array([-0.1, 0.9, 1.1, 1.5]) - - expected = DataFrame({"x": expected_values}, index=target) - actual = df.reindex(target, method=method) - tm.assert_frame_equal(expected, actual) - - actual = df.reindex(target, method=method, tolerance=1) - tm.assert_frame_equal(expected, actual) - actual = df.reindex(target, method=method, tolerance=[1, 1, 1, 1]) - tm.assert_frame_equal(expected, actual) - - e2 = expected[::-1] - actual = df.reindex(target[::-1], method=method) - tm.assert_frame_equal(e2, actual) - - new_order = [3, 0, 2, 1] - e2 = expected.iloc[new_order] - actual = df.reindex(target[new_order], method=method) - tm.assert_frame_equal(e2, actual) - - switched_method = ( - "pad" if method == "backfill" else "backfill" if method == "pad" else method - ) - actual = df[::-1].reindex(target, method=switched_method) - tm.assert_frame_equal(expected, actual) - - def test_reindex_methods_nearest_special(self): - df = DataFrame({"x": list(range(5))}) - target = np.array([-0.1, 0.9, 1.1, 1.5]) - - expected = DataFrame({"x": [0, 1, 1, np.nan]}, index=target) - actual = df.reindex(target, method="nearest", tolerance=0.2) - tm.assert_frame_equal(expected, actual) - - expected = DataFrame({"x": [0, np.nan, 1, np.nan]}, index=target) - actual = df.reindex(target, method="nearest", tolerance=[0.5, 0.01, 0.4, 0.1]) - tm.assert_frame_equal(expected, actual) - - def test_reindex_nearest_tz(self, tz_aware_fixture): - # GH26683 - tz = tz_aware_fixture - idx = pd.date_range("2019-01-01", periods=5, tz=tz) - df = DataFrame({"x": list(range(5))}, index=idx) - - expected = df.head(3) - actual = df.reindex(idx[:3], method="nearest") - tm.assert_frame_equal(expected, actual) - - def test_reindex_nearest_tz_empty_frame(self): - # https://github.com/pandas-dev/pandas/issues/31964 - dti = pd.DatetimeIndex(["2016-06-26 14:27:26+00:00"]) - df = DataFrame(index=pd.DatetimeIndex(["2016-07-04 14:00:59+00:00"])) - expected = DataFrame(index=dti) - result = df.reindex(dti, method="nearest") - tm.assert_frame_equal(result, expected) - - def test_reindex_frame_add_nat(self): - rng = date_range("1/1/2000 00:00:00", periods=10, freq="10s") - df = DataFrame({"A": np.random.randn(len(rng)), "B": rng}) - - result = df.reindex(range(15)) - assert np.issubdtype(result["B"].dtype, np.dtype("M8[ns]")) - - mask = com.isna(result)["B"] - assert mask[-5:].all() - assert not mask[:-5].any() - - def test_reindex_limit(self): - # GH 28631 - data = [["A", "A", "A"], ["B", "B", "B"], ["C", "C", "C"], ["D", "D", "D"]] - exp_data = [ - ["A", "A", "A"], - ["B", "B", "B"], - ["C", "C", "C"], - ["D", "D", "D"], - ["D", "D", "D"], - [np.nan, np.nan, np.nan], - ] - df = DataFrame(data) - result = df.reindex([0, 1, 2, 3, 4, 5], method="ffill", limit=1) - expected = DataFrame(exp_data) - tm.assert_frame_equal(result, expected) - def test_set_dataframe_column_ns_dtype(self): x = DataFrame([datetime.now(), datetime.now()]) assert x[0].dtype == np.dtype("M8[ns]") - def test_non_monotonic_reindex_methods(self): - dr = pd.date_range("2013-08-01", periods=6, freq="B") - data = np.random.randn(6, 1) - df = DataFrame(data, index=dr, columns=list("A")) - df_rev = DataFrame(data, index=dr[[3, 4, 5] + [0, 1, 2]], columns=list("A")) - # index is not monotonic increasing or decreasing - msg = "index must be monotonic increasing or decreasing" - with pytest.raises(ValueError, match=msg): - df_rev.reindex(df.index, method="pad") - with pytest.raises(ValueError, match=msg): - df_rev.reindex(df.index, method="ffill") - with pytest.raises(ValueError, match=msg): - df_rev.reindex(df.index, method="bfill") - with pytest.raises(ValueError, match=msg): - df_rev.reindex(df.index, method="nearest") - - def test_reindex_level(self): - from itertools import permutations - - icol = ["jim", "joe", "jolie"] - - def verify_first_level(df, level, idx, check_index_type=True): - def f(val): - return np.nonzero((df[level] == val).to_numpy())[0] - - i = np.concatenate(list(map(f, idx))) - left = df.set_index(icol).reindex(idx, level=level) - right = df.iloc[i].set_index(icol) - tm.assert_frame_equal(left, right, check_index_type=check_index_type) - - def verify(df, level, idx, indexer, check_index_type=True): - left = df.set_index(icol).reindex(idx, level=level) - right = df.iloc[indexer].set_index(icol) - tm.assert_frame_equal(left, right, check_index_type=check_index_type) - - df = DataFrame( - { - "jim": list("B" * 4 + "A" * 2 + "C" * 3), - "joe": list("abcdeabcd")[::-1], - "jolie": [10, 20, 30] * 3, - "joline": np.random.randint(0, 1000, 9), - } - ) - - target = [ - ["C", "B", "A"], - ["F", "C", "A", "D"], - ["A"], - ["A", "B", "C"], - ["C", "A", "B"], - ["C", "B"], - ["C", "A"], - ["A", "B"], - ["B", "A", "C"], - ] - - for idx in target: - verify_first_level(df, "jim", idx) - - # reindex by these causes different MultiIndex levels - for idx in [["D", "F"], ["A", "C", "B"]]: - verify_first_level(df, "jim", idx, check_index_type=False) - - verify(df, "joe", list("abcde"), [3, 2, 1, 0, 5, 4, 8, 7, 6]) - verify(df, "joe", list("abcd"), [3, 2, 1, 0, 5, 8, 7, 6]) - verify(df, "joe", list("abc"), [3, 2, 1, 8, 7, 6]) - verify(df, "joe", list("eca"), [1, 3, 4, 6, 8]) - verify(df, "joe", list("edc"), [0, 1, 4, 5, 6]) - verify(df, "joe", list("eadbc"), [3, 0, 2, 1, 4, 5, 8, 7, 6]) - verify(df, "joe", list("edwq"), [0, 4, 5]) - verify(df, "joe", list("wq"), [], check_index_type=False) - - df = DataFrame( - { - "jim": ["mid"] * 5 + ["btm"] * 8 + ["top"] * 7, - "joe": ["3rd"] * 2 - + ["1st"] * 3 - + ["2nd"] * 3 - + ["1st"] * 2 - + ["3rd"] * 3 - + ["1st"] * 2 - + ["3rd"] * 3 - + ["2nd"] * 2, - # this needs to be jointly unique with jim and joe or - # reindexing will fail ~1.5% of the time, this works - # out to needing unique groups of same size as joe - "jolie": np.concatenate( - [ - np.random.choice(1000, x, replace=False) - for x in [2, 3, 3, 2, 3, 2, 3, 2] - ] - ), - "joline": np.random.randn(20).round(3) * 10, - } - ) - - for idx in permutations(df["jim"].unique()): - for i in range(3): - verify_first_level(df, "jim", idx[: i + 1]) - - i = [2, 3, 4, 0, 1, 8, 9, 5, 6, 7, 10, 11, 12, 13, 14, 18, 19, 15, 16, 17] - verify(df, "joe", ["1st", "2nd", "3rd"], i) - - i = [0, 1, 2, 3, 4, 10, 11, 12, 5, 6, 7, 8, 9, 15, 16, 17, 18, 19, 13, 14] - verify(df, "joe", ["3rd", "2nd", "1st"], i) - - i = [0, 1, 5, 6, 7, 10, 11, 12, 18, 19, 15, 16, 17] - verify(df, "joe", ["2nd", "3rd"], i) - - i = [0, 1, 2, 3, 4, 10, 11, 12, 8, 9, 15, 16, 17, 13, 14] - verify(df, "joe", ["3rd", "1st"], i) - - def test_getitem_ix_float_duplicates(self): + def test_iloc_getitem_float_duplicates(self): df = DataFrame( np.random.randn(3, 3), index=[0.1, 0.2, 0.2], columns=list("abc") ) @@ -1929,7 +1649,7 @@ def test_setitem_with_unaligned_tz_aware_datetime_column(self): df.loc[[0, 1, 2], "dates"] = column[[1, 0, 2]] tm.assert_series_equal(df["dates"], column) - def test_setitem_datetime_coercion(self): + def test_loc_setitem_datetime_coercion(self): # gh-1048 df = DataFrame({"c": [pd.Timestamp("2010-10-01")] * 3}) df.loc[0:1, "c"] = np.datetime64("2008-08-08") @@ -1938,7 +1658,7 @@ def test_setitem_datetime_coercion(self): df.loc[2, "c"] = date(2005, 5, 5) assert pd.Timestamp("2005-05-05") == df.loc[2, "c"] - def test_setitem_datetimelike_with_inference(self): + def test_loc_setitem_datetimelike_with_inference(self): # GH 7592 # assignment of timedeltas with NaT @@ -1961,7 +1681,7 @@ def test_setitem_datetimelike_with_inference(self): tm.assert_series_equal(result, expected) @pytest.mark.parametrize("idxer", ["var", ["var"]]) - def test_setitem_datetimeindex_tz(self, idxer, tz_naive_fixture): + def test_loc_setitem_datetimeindex_tz(self, idxer, tz_naive_fixture): # GH 11365 tz = tz_naive_fixture idx = date_range(start="2015-07-12", periods=3, freq="H", tz=tz) @@ -2016,7 +1736,7 @@ def test_at_time_between_time_datetimeindex(self): result.loc[bkey] = df.iloc[binds] tm.assert_frame_equal(result, df) - def test_index_namedtuple(self): + def test_loc_getitem_index_namedtuple(self): from collections import namedtuple IndexType = namedtuple("IndexType", ["a", "b"]) @@ -2029,7 +1749,7 @@ def test_index_namedtuple(self): assert result == 1 @pytest.mark.parametrize("tpl", [tuple([1]), tuple([1, 2])]) - def test_index_single_double_tuples(self, tpl): + def test_loc_getitem_index_single_double_tuples(self, tpl): # GH 20991 idx = pd.Index([tuple([1]), tuple([1, 2])], name="A", tupleize_cols=False) df = DataFrame(index=idx) @@ -2039,7 +1759,7 @@ def test_index_single_double_tuples(self, tpl): expected = DataFrame(index=idx) tm.assert_frame_equal(result, expected) - def test_boolean_indexing(self): + def test_setitem_boolean_indexing(self): idx = list(range(3)) cols = ["A", "B", "C"] df1 = DataFrame( @@ -2062,7 +1782,7 @@ def test_boolean_indexing(self): with pytest.raises(ValueError, match="Item wrong length"): df1[df1.index[:-1] > 2] = -1 - def test_boolean_indexing_mixed(self): + def test_getitem_boolean_indexing_mixed(self): df = DataFrame( { 0: {35: np.nan, 40: np.nan, 43: np.nan, 49: np.nan, 50: np.nan}, @@ -2135,7 +1855,7 @@ def test_type_error_multiindex(self): result = dg["x", 0] tm.assert_series_equal(result, expected) - def test_interval_index(self): + def test_loc_getitem_interval_index(self): # GH 19977 index = pd.interval_range(start=0, periods=3) df = DataFrame( @@ -2201,18 +1921,6 @@ def test_setitem(self, uint64_frame): ), ) - def test_set_reset(self): - - idx = Index([2 ** 63, 2 ** 63 + 5, 2 ** 63 + 10], name="foo") - - # set/reset - df = DataFrame({"A": [0, 1, 2]}, index=idx) - result = df.reset_index() - assert result["foo"].dtype == np.dtype("uint64") - - df = result.set_index("foo") - tm.assert_index_equal(df.index, idx) - def test_object_casting_indexing_wraps_datetimelike(): # GH#31649, check the indexing methods all the way down the stack diff --git a/pandas/tests/frame/methods/test_reindex.py b/pandas/tests/frame/methods/test_reindex.py index bd6eddc7bbaa5..5a5aac87b057d 100644 --- a/pandas/tests/frame/methods/test_reindex.py +++ b/pandas/tests/frame/methods/test_reindex.py @@ -1,17 +1,297 @@ from datetime import datetime +from itertools import permutations import numpy as np import pytest import pandas as pd -from pandas import Categorical, DataFrame, Index, Series, date_range, isna +from pandas import Categorical, DataFrame, Index, MultiIndex, Series, date_range, isna import pandas._testing as tm +import pandas.core.common as com class TestDataFrameSelectReindex: # These are specific reindex-based tests; other indexing tests should go in # test_indexing + def test_reindex_with_multi_index(self): + # https://github.com/pandas-dev/pandas/issues/29896 + # tests for reindexing a multi-indexed DataFrame with a new MultiIndex + # + # confirms that we can reindex a multi-indexed DataFrame with a new + # MultiIndex object correctly when using no filling, backfilling, and + # padding + # + # The DataFrame, `df`, used in this test is: + # c + # a b + # -1 0 A + # 1 B + # 2 C + # 3 D + # 4 E + # 5 F + # 6 G + # 0 0 A + # 1 B + # 2 C + # 3 D + # 4 E + # 5 F + # 6 G + # 1 0 A + # 1 B + # 2 C + # 3 D + # 4 E + # 5 F + # 6 G + # + # and the other MultiIndex, `new_multi_index`, is: + # 0: 0 0.5 + # 1: 2.0 + # 2: 5.0 + # 3: 5.8 + df = DataFrame( + { + "a": [-1] * 7 + [0] * 7 + [1] * 7, + "b": list(range(7)) * 3, + "c": ["A", "B", "C", "D", "E", "F", "G"] * 3, + } + ).set_index(["a", "b"]) + new_index = [0.5, 2.0, 5.0, 5.8] + new_multi_index = MultiIndex.from_product([[0], new_index], names=["a", "b"]) + + # reindexing w/o a `method` value + reindexed = df.reindex(new_multi_index) + expected = DataFrame( + {"a": [0] * 4, "b": new_index, "c": [np.nan, "C", "F", np.nan]} + ).set_index(["a", "b"]) + tm.assert_frame_equal(expected, reindexed) + + # reindexing with backfilling + expected = DataFrame( + {"a": [0] * 4, "b": new_index, "c": ["B", "C", "F", "G"]} + ).set_index(["a", "b"]) + reindexed_with_backfilling = df.reindex(new_multi_index, method="bfill") + tm.assert_frame_equal(expected, reindexed_with_backfilling) + + reindexed_with_backfilling = df.reindex(new_multi_index, method="backfill") + tm.assert_frame_equal(expected, reindexed_with_backfilling) + + # reindexing with padding + expected = DataFrame( + {"a": [0] * 4, "b": new_index, "c": ["A", "C", "F", "F"]} + ).set_index(["a", "b"]) + reindexed_with_padding = df.reindex(new_multi_index, method="pad") + tm.assert_frame_equal(expected, reindexed_with_padding) + + reindexed_with_padding = df.reindex(new_multi_index, method="ffill") + tm.assert_frame_equal(expected, reindexed_with_padding) + + @pytest.mark.parametrize( + "method,expected_values", + [ + ("nearest", [0, 1, 1, 2]), + ("pad", [np.nan, 0, 1, 1]), + ("backfill", [0, 1, 2, 2]), + ], + ) + def test_reindex_methods(self, method, expected_values): + df = DataFrame({"x": list(range(5))}) + target = np.array([-0.1, 0.9, 1.1, 1.5]) + + expected = DataFrame({"x": expected_values}, index=target) + actual = df.reindex(target, method=method) + tm.assert_frame_equal(expected, actual) + + actual = df.reindex(target, method=method, tolerance=1) + tm.assert_frame_equal(expected, actual) + actual = df.reindex(target, method=method, tolerance=[1, 1, 1, 1]) + tm.assert_frame_equal(expected, actual) + + e2 = expected[::-1] + actual = df.reindex(target[::-1], method=method) + tm.assert_frame_equal(e2, actual) + + new_order = [3, 0, 2, 1] + e2 = expected.iloc[new_order] + actual = df.reindex(target[new_order], method=method) + tm.assert_frame_equal(e2, actual) + + switched_method = ( + "pad" if method == "backfill" else "backfill" if method == "pad" else method + ) + actual = df[::-1].reindex(target, method=switched_method) + tm.assert_frame_equal(expected, actual) + + def test_reindex_methods_nearest_special(self): + df = DataFrame({"x": list(range(5))}) + target = np.array([-0.1, 0.9, 1.1, 1.5]) + + expected = DataFrame({"x": [0, 1, 1, np.nan]}, index=target) + actual = df.reindex(target, method="nearest", tolerance=0.2) + tm.assert_frame_equal(expected, actual) + + expected = DataFrame({"x": [0, np.nan, 1, np.nan]}, index=target) + actual = df.reindex(target, method="nearest", tolerance=[0.5, 0.01, 0.4, 0.1]) + tm.assert_frame_equal(expected, actual) + + def test_reindex_nearest_tz(self, tz_aware_fixture): + # GH26683 + tz = tz_aware_fixture + idx = pd.date_range("2019-01-01", periods=5, tz=tz) + df = DataFrame({"x": list(range(5))}, index=idx) + + expected = df.head(3) + actual = df.reindex(idx[:3], method="nearest") + tm.assert_frame_equal(expected, actual) + + def test_reindex_nearest_tz_empty_frame(self): + # https://github.com/pandas-dev/pandas/issues/31964 + dti = pd.DatetimeIndex(["2016-06-26 14:27:26+00:00"]) + df = DataFrame(index=pd.DatetimeIndex(["2016-07-04 14:00:59+00:00"])) + expected = DataFrame(index=dti) + result = df.reindex(dti, method="nearest") + tm.assert_frame_equal(result, expected) + + def test_reindex_frame_add_nat(self): + rng = date_range("1/1/2000 00:00:00", periods=10, freq="10s") + df = DataFrame({"A": np.random.randn(len(rng)), "B": rng}) + + result = df.reindex(range(15)) + assert np.issubdtype(result["B"].dtype, np.dtype("M8[ns]")) + + mask = com.isna(result)["B"] + assert mask[-5:].all() + assert not mask[:-5].any() + + def test_reindex_limit(self): + # GH 28631 + data = [["A", "A", "A"], ["B", "B", "B"], ["C", "C", "C"], ["D", "D", "D"]] + exp_data = [ + ["A", "A", "A"], + ["B", "B", "B"], + ["C", "C", "C"], + ["D", "D", "D"], + ["D", "D", "D"], + [np.nan, np.nan, np.nan], + ] + df = DataFrame(data) + result = df.reindex([0, 1, 2, 3, 4, 5], method="ffill", limit=1) + expected = DataFrame(exp_data) + tm.assert_frame_equal(result, expected) + + def test_reindex_level(self): + icol = ["jim", "joe", "jolie"] + + def verify_first_level(df, level, idx, check_index_type=True): + def f(val): + return np.nonzero((df[level] == val).to_numpy())[0] + + i = np.concatenate(list(map(f, idx))) + left = df.set_index(icol).reindex(idx, level=level) + right = df.iloc[i].set_index(icol) + tm.assert_frame_equal(left, right, check_index_type=check_index_type) + + def verify(df, level, idx, indexer, check_index_type=True): + left = df.set_index(icol).reindex(idx, level=level) + right = df.iloc[indexer].set_index(icol) + tm.assert_frame_equal(left, right, check_index_type=check_index_type) + + df = DataFrame( + { + "jim": list("B" * 4 + "A" * 2 + "C" * 3), + "joe": list("abcdeabcd")[::-1], + "jolie": [10, 20, 30] * 3, + "joline": np.random.randint(0, 1000, 9), + } + ) + + target = [ + ["C", "B", "A"], + ["F", "C", "A", "D"], + ["A"], + ["A", "B", "C"], + ["C", "A", "B"], + ["C", "B"], + ["C", "A"], + ["A", "B"], + ["B", "A", "C"], + ] + + for idx in target: + verify_first_level(df, "jim", idx) + + # reindex by these causes different MultiIndex levels + for idx in [["D", "F"], ["A", "C", "B"]]: + verify_first_level(df, "jim", idx, check_index_type=False) + + verify(df, "joe", list("abcde"), [3, 2, 1, 0, 5, 4, 8, 7, 6]) + verify(df, "joe", list("abcd"), [3, 2, 1, 0, 5, 8, 7, 6]) + verify(df, "joe", list("abc"), [3, 2, 1, 8, 7, 6]) + verify(df, "joe", list("eca"), [1, 3, 4, 6, 8]) + verify(df, "joe", list("edc"), [0, 1, 4, 5, 6]) + verify(df, "joe", list("eadbc"), [3, 0, 2, 1, 4, 5, 8, 7, 6]) + verify(df, "joe", list("edwq"), [0, 4, 5]) + verify(df, "joe", list("wq"), [], check_index_type=False) + + df = DataFrame( + { + "jim": ["mid"] * 5 + ["btm"] * 8 + ["top"] * 7, + "joe": ["3rd"] * 2 + + ["1st"] * 3 + + ["2nd"] * 3 + + ["1st"] * 2 + + ["3rd"] * 3 + + ["1st"] * 2 + + ["3rd"] * 3 + + ["2nd"] * 2, + # this needs to be jointly unique with jim and joe or + # reindexing will fail ~1.5% of the time, this works + # out to needing unique groups of same size as joe + "jolie": np.concatenate( + [ + np.random.choice(1000, x, replace=False) + for x in [2, 3, 3, 2, 3, 2, 3, 2] + ] + ), + "joline": np.random.randn(20).round(3) * 10, + } + ) + + for idx in permutations(df["jim"].unique()): + for i in range(3): + verify_first_level(df, "jim", idx[: i + 1]) + + i = [2, 3, 4, 0, 1, 8, 9, 5, 6, 7, 10, 11, 12, 13, 14, 18, 19, 15, 16, 17] + verify(df, "joe", ["1st", "2nd", "3rd"], i) + + i = [0, 1, 2, 3, 4, 10, 11, 12, 5, 6, 7, 8, 9, 15, 16, 17, 18, 19, 13, 14] + verify(df, "joe", ["3rd", "2nd", "1st"], i) + + i = [0, 1, 5, 6, 7, 10, 11, 12, 18, 19, 15, 16, 17] + verify(df, "joe", ["2nd", "3rd"], i) + + i = [0, 1, 2, 3, 4, 10, 11, 12, 8, 9, 15, 16, 17, 13, 14] + verify(df, "joe", ["3rd", "1st"], i) + + def test_non_monotonic_reindex_methods(self): + dr = date_range("2013-08-01", periods=6, freq="B") + data = np.random.randn(6, 1) + df = DataFrame(data, index=dr, columns=list("A")) + df_rev = DataFrame(data, index=dr[[3, 4, 5] + [0, 1, 2]], columns=list("A")) + # index is not monotonic increasing or decreasing + msg = "index must be monotonic increasing or decreasing" + with pytest.raises(ValueError, match=msg): + df_rev.reindex(df.index, method="pad") + with pytest.raises(ValueError, match=msg): + df_rev.reindex(df.index, method="ffill") + with pytest.raises(ValueError, match=msg): + df_rev.reindex(df.index, method="bfill") + with pytest.raises(ValueError, match=msg): + df_rev.reindex(df.index, method="nearest") + def test_reindex_sparse(self): # https://github.com/pandas-dev/pandas/issues/35286 df = DataFrame( diff --git a/pandas/tests/frame/methods/test_reset_index.py b/pandas/tests/frame/methods/test_reset_index.py index 9454080363b3e..3c14192f02cf6 100644 --- a/pandas/tests/frame/methods/test_reset_index.py +++ b/pandas/tests/frame/methods/test_reset_index.py @@ -18,6 +18,18 @@ class TestResetIndex: + def test_set_reset(self): + + idx = Index([2 ** 63, 2 ** 63 + 5, 2 ** 63 + 10], name="foo") + + # set/reset + df = DataFrame({"A": [0, 1, 2]}, index=idx) + result = df.reset_index() + assert result["foo"].dtype == np.dtype("uint64") + + df = result.set_index("foo") + tm.assert_index_equal(df.index, idx) + def test_set_index_reset_index_dt64tz(self): idx = Index(date_range("20130101", periods=3, tz="US/Eastern"), name="foo") From bd5abb81246c0787658c563230460f9977f428b7 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 21 Oct 2020 10:50:34 -0700 Subject: [PATCH 3/5] TST/REF: collect indexing tests by method --- .../tests/frame/indexing/test_categorical.py | 11 ----- pandas/tests/frame/indexing/test_datetime.py | 48 ------------------- pandas/tests/frame/indexing/test_setitem.py | 33 +++++++++++++ pandas/tests/frame/indexing/test_sparse.py | 17 +------ pandas/tests/frame/indexing/test_where.py | 11 +++++ pandas/tests/frame/methods/test_reindex.py | 15 ++++++ .../tests/frame/methods/test_reset_index.py | 12 +++++ 7 files changed, 72 insertions(+), 75 deletions(-) delete mode 100644 pandas/tests/frame/indexing/test_datetime.py diff --git a/pandas/tests/frame/indexing/test_categorical.py b/pandas/tests/frame/indexing/test_categorical.py index c876f78176e2e..cfc22b9b18729 100644 --- a/pandas/tests/frame/indexing/test_categorical.py +++ b/pandas/tests/frame/indexing/test_categorical.py @@ -394,14 +394,3 @@ def test_loc_indexing_preserves_index_category_dtype(self): result = df.loc[["a"]].index.levels[0] tm.assert_index_equal(result, expected) - - def test_categorical_filtering(self): - # GH22609 Verify filtering operations on DataFrames with categorical Series - df = DataFrame(data=[[0, 0], [1, 1]], columns=["a", "b"]) - df["b"] = df.b.astype("category") - - result = df.where(df.a > 0) - expected = df.copy() - expected.loc[0, :] = np.nan - - tm.assert_equal(result, expected) diff --git a/pandas/tests/frame/indexing/test_datetime.py b/pandas/tests/frame/indexing/test_datetime.py deleted file mode 100644 index 1866ac341def6..0000000000000 --- a/pandas/tests/frame/indexing/test_datetime.py +++ /dev/null @@ -1,48 +0,0 @@ -import pandas as pd -from pandas import DataFrame, Index, Series, date_range, notna -import pandas._testing as tm - - -class TestDataFrameIndexingDatetimeWithTZ: - def test_setitem(self, timezone_frame): - - df = timezone_frame - idx = df["B"].rename("foo") - - # setitem - df["C"] = idx - tm.assert_series_equal(df["C"], Series(idx, name="C")) - - df["D"] = "foo" - df["D"] = idx - tm.assert_series_equal(df["D"], Series(idx, name="D")) - del df["D"] - - # assert that A & C are not sharing the same base (e.g. they - # are copies) - b1 = df._mgr.blocks[1] - b2 = df._mgr.blocks[2] - tm.assert_extension_array_equal(b1.values, b2.values) - b1base = b1.values._data.base - b2base = b2.values._data.base - assert b1base is None or (id(b1base) != id(b2base)) - - # with nan - df2 = df.copy() - df2.iloc[1, 1] = pd.NaT - df2.iloc[1, 2] = pd.NaT - result = df2["B"] - tm.assert_series_equal(notna(result), Series([True, False, True], name="B")) - tm.assert_series_equal(df2.dtypes, df.dtypes) - - def test_set_reset(self): - - idx = Index(date_range("20130101", periods=3, tz="US/Eastern"), name="foo") - - # set/reset - df = DataFrame({"A": [0, 1, 2]}, index=idx) - result = df.reset_index() - assert result["foo"].dtype == "datetime64[ns, US/Eastern]" - - df = result.set_index("foo") - tm.assert_index_equal(df.index, idx) diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index 8313ab0b99bac..87c6ae09aac11 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -8,10 +8,12 @@ DataFrame, Index, Interval, + NaT, Period, Series, Timestamp, date_range, + notna, ) import pandas._testing as tm from pandas.core.arrays import SparseArray @@ -180,3 +182,34 @@ def test_setitem_extension_types(self, obj, dtype): df["obj"] = obj tm.assert_frame_equal(df, expected) + + def test_setitem_dt64tz(self, timezone_frame): + + df = timezone_frame + idx = df["B"].rename("foo") + + # setitem + df["C"] = idx + tm.assert_series_equal(df["C"], Series(idx, name="C")) + + df["D"] = "foo" + df["D"] = idx + tm.assert_series_equal(df["D"], Series(idx, name="D")) + del df["D"] + + # assert that A & C are not sharing the same base (e.g. they + # are copies) + b1 = df._mgr.blocks[1] + b2 = df._mgr.blocks[2] + tm.assert_extension_array_equal(b1.values, b2.values) + b1base = b1.values._data.base + b2base = b2.values._data.base + assert b1base is None or (id(b1base) != id(b2base)) + + # with nan + df2 = df.copy() + df2.iloc[1, 1] = NaT + df2.iloc[1, 2] = NaT + result = df2["B"] + tm.assert_series_equal(notna(result), Series([True, False, True], name="B")) + tm.assert_series_equal(df2.dtypes, df.dtypes) diff --git a/pandas/tests/frame/indexing/test_sparse.py b/pandas/tests/frame/indexing/test_sparse.py index 04e1c8b94c4d9..c0cd7faafb4db 100644 --- a/pandas/tests/frame/indexing/test_sparse.py +++ b/pandas/tests/frame/indexing/test_sparse.py @@ -27,7 +27,7 @@ def test_getitem_sparse_column(self): @pytest.mark.parametrize("spmatrix_t", ["coo_matrix", "csc_matrix", "csr_matrix"]) @pytest.mark.parametrize("dtype", [np.int64, np.float64, complex]) @td.skip_if_no_scipy - def test_locindexer_from_spmatrix(self, spmatrix_t, dtype): + def test_loc_getitem_from_spmatrix(self, spmatrix_t, dtype): import scipy.sparse spmatrix_t = getattr(scipy.sparse, spmatrix_t) @@ -50,21 +50,6 @@ def test_locindexer_from_spmatrix(self, spmatrix_t, dtype): expected = np.full(cols, SparseDtype(dtype, fill_value=0)) tm.assert_numpy_array_equal(result, expected) - def test_reindex(self): - # https://github.com/pandas-dev/pandas/issues/35286 - df = pd.DataFrame( - {"A": [0, 1], "B": pd.array([0, 1], dtype=pd.SparseDtype("int64", 0))} - ) - result = df.reindex([0, 2]) - expected = pd.DataFrame( - { - "A": [0.0, np.nan], - "B": pd.array([0.0, np.nan], dtype=pd.SparseDtype("float64", 0.0)), - }, - index=[0, 2], - ) - tm.assert_frame_equal(result, expected) - def test_all_sparse(self): df = pd.DataFrame({"A": pd.array([0, 0], dtype=pd.SparseDtype("int64"))}) result = df.loc[[0, 1]] diff --git a/pandas/tests/frame/indexing/test_where.py b/pandas/tests/frame/indexing/test_where.py index 95209c0c35195..3495247585236 100644 --- a/pandas/tests/frame/indexing/test_where.py +++ b/pandas/tests/frame/indexing/test_where.py @@ -642,3 +642,14 @@ def test_df_where_with_category(self, kwargs): expected = Series(A, name="A") tm.assert_series_equal(result, expected) + + def test_where_categorical_filtering(self): + # GH#22609 Verify filtering operations on DataFrames with categorical Series + df = DataFrame(data=[[0, 0], [1, 1]], columns=["a", "b"]) + df["b"] = df["b"].astype("category") + + result = df.where(df["a"] > 0) + expected = df.copy() + expected.loc[0, :] = np.nan + + tm.assert_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_reindex.py b/pandas/tests/frame/methods/test_reindex.py index 99494191c043a..bd6eddc7bbaa5 100644 --- a/pandas/tests/frame/methods/test_reindex.py +++ b/pandas/tests/frame/methods/test_reindex.py @@ -12,6 +12,21 @@ class TestDataFrameSelectReindex: # These are specific reindex-based tests; other indexing tests should go in # test_indexing + def test_reindex_sparse(self): + # https://github.com/pandas-dev/pandas/issues/35286 + df = DataFrame( + {"A": [0, 1], "B": pd.array([0, 1], dtype=pd.SparseDtype("int64", 0))} + ) + result = df.reindex([0, 2]) + expected = DataFrame( + { + "A": [0.0, np.nan], + "B": pd.array([0.0, np.nan], dtype=pd.SparseDtype("float64", 0.0)), + }, + index=[0, 2], + ) + tm.assert_frame_equal(result, expected) + def test_reindex(self, float_frame): datetime_series = tm.makeTimeSeries(nper=30) diff --git a/pandas/tests/frame/methods/test_reset_index.py b/pandas/tests/frame/methods/test_reset_index.py index b88ef0e6691cb..9454080363b3e 100644 --- a/pandas/tests/frame/methods/test_reset_index.py +++ b/pandas/tests/frame/methods/test_reset_index.py @@ -18,6 +18,18 @@ class TestResetIndex: + def test_set_index_reset_index_dt64tz(self): + + idx = Index(date_range("20130101", periods=3, tz="US/Eastern"), name="foo") + + # set/reset + df = DataFrame({"A": [0, 1, 2]}, index=idx) + result = df.reset_index() + assert result["foo"].dtype == "datetime64[ns, US/Eastern]" + + df = result.set_index("foo") + tm.assert_index_equal(df.index, idx) + def test_reset_index_tz(self, tz_aware_fixture): # GH 3950 # reset_index with single level From 2620aca843a997d368368de878990ca3b03b5259 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 21 Oct 2020 10:57:08 -0700 Subject: [PATCH 4/5] TST/REF: misplaced reindex/reset_index tests --- pandas/tests/frame/indexing/test_indexing.py | 310 +----------------- pandas/tests/frame/methods/test_reindex.py | 282 +++++++++++++++- .../tests/frame/methods/test_reset_index.py | 12 + 3 files changed, 302 insertions(+), 302 deletions(-) diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 0dee818613edb..36d2a40ef8d00 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -1441,81 +1441,6 @@ def test_set_value_resize(self, float_frame): with pytest.raises(ValueError, match=msg): res._set_value("foobar", "baz", "sam") - def test_reindex_with_multi_index(self): - # https://github.com/pandas-dev/pandas/issues/29896 - # tests for reindexing a multi-indexed DataFrame with a new MultiIndex - # - # confirms that we can reindex a multi-indexed DataFrame with a new - # MultiIndex object correctly when using no filling, backfilling, and - # padding - # - # The DataFrame, `df`, used in this test is: - # c - # a b - # -1 0 A - # 1 B - # 2 C - # 3 D - # 4 E - # 5 F - # 6 G - # 0 0 A - # 1 B - # 2 C - # 3 D - # 4 E - # 5 F - # 6 G - # 1 0 A - # 1 B - # 2 C - # 3 D - # 4 E - # 5 F - # 6 G - # - # and the other MultiIndex, `new_multi_index`, is: - # 0: 0 0.5 - # 1: 2.0 - # 2: 5.0 - # 3: 5.8 - df = DataFrame( - { - "a": [-1] * 7 + [0] * 7 + [1] * 7, - "b": list(range(7)) * 3, - "c": ["A", "B", "C", "D", "E", "F", "G"] * 3, - } - ).set_index(["a", "b"]) - new_index = [0.5, 2.0, 5.0, 5.8] - new_multi_index = MultiIndex.from_product([[0], new_index], names=["a", "b"]) - - # reindexing w/o a `method` value - reindexed = df.reindex(new_multi_index) - expected = DataFrame( - {"a": [0] * 4, "b": new_index, "c": [np.nan, "C", "F", np.nan]} - ).set_index(["a", "b"]) - tm.assert_frame_equal(expected, reindexed) - - # reindexing with backfilling - expected = DataFrame( - {"a": [0] * 4, "b": new_index, "c": ["B", "C", "F", "G"]} - ).set_index(["a", "b"]) - reindexed_with_backfilling = df.reindex(new_multi_index, method="bfill") - tm.assert_frame_equal(expected, reindexed_with_backfilling) - - reindexed_with_backfilling = df.reindex(new_multi_index, method="backfill") - tm.assert_frame_equal(expected, reindexed_with_backfilling) - - # reindexing with padding - expected = DataFrame( - {"a": [0] * 4, "b": new_index, "c": ["A", "C", "F", "F"]} - ).set_index(["a", "b"]) - reindexed_with_padding = df.reindex(new_multi_index, method="pad") - tm.assert_frame_equal(expected, reindexed_with_padding) - - reindexed_with_padding = df.reindex(new_multi_index, method="ffill") - tm.assert_frame_equal(expected, reindexed_with_padding) - def test_set_value_with_index_dtype_change(self): df_orig = DataFrame(np.random.randn(3, 3), index=range(3), columns=list("ABC")) @@ -1674,216 +1599,11 @@ def test_loc_duplicates(self): df.loc[trange[bool_idx], "A"] += 6 tm.assert_frame_equal(df, expected) - @pytest.mark.parametrize( - "method,expected_values", - [ - ("nearest", [0, 1, 1, 2]), - ("pad", [np.nan, 0, 1, 1]), - ("backfill", [0, 1, 2, 2]), - ], - ) - def test_reindex_methods(self, method, expected_values): - df = DataFrame({"x": list(range(5))}) - target = np.array([-0.1, 0.9, 1.1, 1.5]) - - expected = DataFrame({"x": expected_values}, index=target) - actual = df.reindex(target, method=method) - tm.assert_frame_equal(expected, actual) - - actual = df.reindex(target, method=method, tolerance=1) - tm.assert_frame_equal(expected, actual) - actual = df.reindex(target, method=method, tolerance=[1, 1, 1, 1]) - tm.assert_frame_equal(expected, actual) - - e2 = expected[::-1] - actual = df.reindex(target[::-1], method=method) - tm.assert_frame_equal(e2, actual) - - new_order = [3, 0, 2, 1] - e2 = expected.iloc[new_order] - actual = df.reindex(target[new_order], method=method) - tm.assert_frame_equal(e2, actual) - - switched_method = ( - "pad" if method == "backfill" else "backfill" if method == "pad" else method - ) - actual = df[::-1].reindex(target, method=switched_method) - tm.assert_frame_equal(expected, actual) - - def test_reindex_methods_nearest_special(self): - df = DataFrame({"x": list(range(5))}) - target = np.array([-0.1, 0.9, 1.1, 1.5]) - - expected = DataFrame({"x": [0, 1, 1, np.nan]}, index=target) - actual = df.reindex(target, method="nearest", tolerance=0.2) - tm.assert_frame_equal(expected, actual) - - expected = DataFrame({"x": [0, np.nan, 1, np.nan]}, index=target) - actual = df.reindex(target, method="nearest", tolerance=[0.5, 0.01, 0.4, 0.1]) - tm.assert_frame_equal(expected, actual) - - def test_reindex_nearest_tz(self, tz_aware_fixture): - # GH26683 - tz = tz_aware_fixture - idx = pd.date_range("2019-01-01", periods=5, tz=tz) - df = DataFrame({"x": list(range(5))}, index=idx) - - expected = df.head(3) - actual = df.reindex(idx[:3], method="nearest") - tm.assert_frame_equal(expected, actual) - - def test_reindex_nearest_tz_empty_frame(self): - # https://github.com/pandas-dev/pandas/issues/31964 - dti = pd.DatetimeIndex(["2016-06-26 14:27:26+00:00"]) - df = DataFrame(index=pd.DatetimeIndex(["2016-07-04 14:00:59+00:00"])) - expected = DataFrame(index=dti) - result = df.reindex(dti, method="nearest") - tm.assert_frame_equal(result, expected) - - def test_reindex_frame_add_nat(self): - rng = date_range("1/1/2000 00:00:00", periods=10, freq="10s") - df = DataFrame({"A": np.random.randn(len(rng)), "B": rng}) - - result = df.reindex(range(15)) - assert np.issubdtype(result["B"].dtype, np.dtype("M8[ns]")) - - mask = com.isna(result)["B"] - assert mask[-5:].all() - assert not mask[:-5].any() - - def test_reindex_limit(self): - # GH 28631 - data = [["A", "A", "A"], ["B", "B", "B"], ["C", "C", "C"], ["D", "D", "D"]] - exp_data = [ - ["A", "A", "A"], - ["B", "B", "B"], - ["C", "C", "C"], - ["D", "D", "D"], - ["D", "D", "D"], - [np.nan, np.nan, np.nan], - ] - df = DataFrame(data) - result = df.reindex([0, 1, 2, 3, 4, 5], method="ffill", limit=1) - expected = DataFrame(exp_data) - tm.assert_frame_equal(result, expected) - def test_set_dataframe_column_ns_dtype(self): x = DataFrame([datetime.now(), datetime.now()]) assert x[0].dtype == np.dtype("M8[ns]") - def test_non_monotonic_reindex_methods(self): - dr = pd.date_range("2013-08-01", periods=6, freq="B") - data = np.random.randn(6, 1) - df = DataFrame(data, index=dr, columns=list("A")) - df_rev = DataFrame(data, index=dr[[3, 4, 5] + [0, 1, 2]], columns=list("A")) - # index is not monotonic increasing or decreasing - msg = "index must be monotonic increasing or decreasing" - with pytest.raises(ValueError, match=msg): - df_rev.reindex(df.index, method="pad") - with pytest.raises(ValueError, match=msg): - df_rev.reindex(df.index, method="ffill") - with pytest.raises(ValueError, match=msg): - df_rev.reindex(df.index, method="bfill") - with pytest.raises(ValueError, match=msg): - df_rev.reindex(df.index, method="nearest") - - def test_reindex_level(self): - from itertools import permutations - - icol = ["jim", "joe", "jolie"] - - def verify_first_level(df, level, idx, check_index_type=True): - def f(val): - return np.nonzero((df[level] == val).to_numpy())[0] - - i = np.concatenate(list(map(f, idx))) - left = df.set_index(icol).reindex(idx, level=level) - right = df.iloc[i].set_index(icol) - tm.assert_frame_equal(left, right, check_index_type=check_index_type) - - def verify(df, level, idx, indexer, check_index_type=True): - left = df.set_index(icol).reindex(idx, level=level) - right = df.iloc[indexer].set_index(icol) - tm.assert_frame_equal(left, right, check_index_type=check_index_type) - - df = DataFrame( - { - "jim": list("B" * 4 + "A" * 2 + "C" * 3), - "joe": list("abcdeabcd")[::-1], - "jolie": [10, 20, 30] * 3, - "joline": np.random.randint(0, 1000, 9), - } - ) - - target = [ - ["C", "B", "A"], - ["F", "C", "A", "D"], - ["A"], - ["A", "B", "C"], - ["C", "A", "B"], - ["C", "B"], - ["C", "A"], - ["A", "B"], - ["B", "A", "C"], - ] - - for idx in target: - verify_first_level(df, "jim", idx) - - # reindex by these causes different MultiIndex levels - for idx in [["D", "F"], ["A", "C", "B"]]: - verify_first_level(df, "jim", idx, check_index_type=False) - - verify(df, "joe", list("abcde"), [3, 2, 1, 0, 5, 4, 8, 7, 6]) - verify(df, "joe", list("abcd"), [3, 2, 1, 0, 5, 8, 7, 6]) - verify(df, "joe", list("abc"), [3, 2, 1, 8, 7, 6]) - verify(df, "joe", list("eca"), [1, 3, 4, 6, 8]) - verify(df, "joe", list("edc"), [0, 1, 4, 5, 6]) - verify(df, "joe", list("eadbc"), [3, 0, 2, 1, 4, 5, 8, 7, 6]) - verify(df, "joe", list("edwq"), [0, 4, 5]) - verify(df, "joe", list("wq"), [], check_index_type=False) - - df = DataFrame( - { - "jim": ["mid"] * 5 + ["btm"] * 8 + ["top"] * 7, - "joe": ["3rd"] * 2 - + ["1st"] * 3 - + ["2nd"] * 3 - + ["1st"] * 2 - + ["3rd"] * 3 - + ["1st"] * 2 - + ["3rd"] * 3 - + ["2nd"] * 2, - # this needs to be jointly unique with jim and joe or - # reindexing will fail ~1.5% of the time, this works - # out to needing unique groups of same size as joe - "jolie": np.concatenate( - [ - np.random.choice(1000, x, replace=False) - for x in [2, 3, 3, 2, 3, 2, 3, 2] - ] - ), - "joline": np.random.randn(20).round(3) * 10, - } - ) - - for idx in permutations(df["jim"].unique()): - for i in range(3): - verify_first_level(df, "jim", idx[: i + 1]) - - i = [2, 3, 4, 0, 1, 8, 9, 5, 6, 7, 10, 11, 12, 13, 14, 18, 19, 15, 16, 17] - verify(df, "joe", ["1st", "2nd", "3rd"], i) - - i = [0, 1, 2, 3, 4, 10, 11, 12, 5, 6, 7, 8, 9, 15, 16, 17, 18, 19, 13, 14] - verify(df, "joe", ["3rd", "2nd", "1st"], i) - - i = [0, 1, 5, 6, 7, 10, 11, 12, 18, 19, 15, 16, 17] - verify(df, "joe", ["2nd", "3rd"], i) - - i = [0, 1, 2, 3, 4, 10, 11, 12, 8, 9, 15, 16, 17, 13, 14] - verify(df, "joe", ["3rd", "1st"], i) - - def test_getitem_ix_float_duplicates(self): + def test_iloc_getitem_float_duplicates(self): df = DataFrame( np.random.randn(3, 3), index=[0.1, 0.2, 0.2], columns=list("abc") ) @@ -1929,7 +1649,7 @@ def test_setitem_with_unaligned_tz_aware_datetime_column(self): df.loc[[0, 1, 2], "dates"] = column[[1, 0, 2]] tm.assert_series_equal(df["dates"], column) - def test_setitem_datetime_coercion(self): + def test_loc_setitem_datetime_coercion(self): # gh-1048 df = DataFrame({"c": [pd.Timestamp("2010-10-01")] * 3}) df.loc[0:1, "c"] = np.datetime64("2008-08-08") @@ -1938,7 +1658,7 @@ def test_setitem_datetime_coercion(self): df.loc[2, "c"] = date(2005, 5, 5) assert pd.Timestamp("2005-05-05") == df.loc[2, "c"] - def test_setitem_datetimelike_with_inference(self): + def test_loc_setitem_datetimelike_with_inference(self): # GH 7592 # assignment of timedeltas with NaT @@ -1961,7 +1681,7 @@ def test_setitem_datetimelike_with_inference(self): tm.assert_series_equal(result, expected) @pytest.mark.parametrize("idxer", ["var", ["var"]]) - def test_setitem_datetimeindex_tz(self, idxer, tz_naive_fixture): + def test_loc_setitem_datetimeindex_tz(self, idxer, tz_naive_fixture): # GH 11365 tz = tz_naive_fixture idx = date_range(start="2015-07-12", periods=3, freq="H", tz=tz) @@ -2016,7 +1736,7 @@ def test_at_time_between_time_datetimeindex(self): result.loc[bkey] = df.iloc[binds] tm.assert_frame_equal(result, df) - def test_index_namedtuple(self): + def test_loc_getitem_index_namedtuple(self): from collections import namedtuple IndexType = namedtuple("IndexType", ["a", "b"]) @@ -2029,7 +1749,7 @@ def test_index_namedtuple(self): assert result == 1 @pytest.mark.parametrize("tpl", [tuple([1]), tuple([1, 2])]) - def test_index_single_double_tuples(self, tpl): + def test_loc_getitem_index_single_double_tuples(self, tpl): # GH 20991 idx = pd.Index([tuple([1]), tuple([1, 2])], name="A", tupleize_cols=False) df = DataFrame(index=idx) @@ -2039,7 +1759,7 @@ def test_index_single_double_tuples(self, tpl): expected = DataFrame(index=idx) tm.assert_frame_equal(result, expected) - def test_boolean_indexing(self): + def test_setitem_boolean_indexing(self): idx = list(range(3)) cols = ["A", "B", "C"] df1 = DataFrame( @@ -2062,7 +1782,7 @@ def test_boolean_indexing(self): with pytest.raises(ValueError, match="Item wrong length"): df1[df1.index[:-1] > 2] = -1 - def test_boolean_indexing_mixed(self): + def test_getitem_boolean_indexing_mixed(self): df = DataFrame( { 0: {35: np.nan, 40: np.nan, 43: np.nan, 49: np.nan, 50: np.nan}, @@ -2135,7 +1855,7 @@ def test_type_error_multiindex(self): result = dg["x", 0] tm.assert_series_equal(result, expected) - def test_interval_index(self): + def test_loc_getitem_interval_index(self): # GH 19977 index = pd.interval_range(start=0, periods=3) df = DataFrame( @@ -2201,18 +1921,6 @@ def test_setitem(self, uint64_frame): ), ) - def test_set_reset(self): - - idx = Index([2 ** 63, 2 ** 63 + 5, 2 ** 63 + 10], name="foo") - - # set/reset - df = DataFrame({"A": [0, 1, 2]}, index=idx) - result = df.reset_index() - assert result["foo"].dtype == np.dtype("uint64") - - df = result.set_index("foo") - tm.assert_index_equal(df.index, idx) - def test_object_casting_indexing_wraps_datetimelike(): # GH#31649, check the indexing methods all the way down the stack diff --git a/pandas/tests/frame/methods/test_reindex.py b/pandas/tests/frame/methods/test_reindex.py index bd6eddc7bbaa5..5a5aac87b057d 100644 --- a/pandas/tests/frame/methods/test_reindex.py +++ b/pandas/tests/frame/methods/test_reindex.py @@ -1,17 +1,297 @@ from datetime import datetime +from itertools import permutations import numpy as np import pytest import pandas as pd -from pandas import Categorical, DataFrame, Index, Series, date_range, isna +from pandas import Categorical, DataFrame, Index, MultiIndex, Series, date_range, isna import pandas._testing as tm +import pandas.core.common as com class TestDataFrameSelectReindex: # These are specific reindex-based tests; other indexing tests should go in # test_indexing + def test_reindex_with_multi_index(self): + # https://github.com/pandas-dev/pandas/issues/29896 + # tests for reindexing a multi-indexed DataFrame with a new MultiIndex + # + # confirms that we can reindex a multi-indexed DataFrame with a new + # MultiIndex object correctly when using no filling, backfilling, and + # padding + # + # The DataFrame, `df`, used in this test is: + # c + # a b + # -1 0 A + # 1 B + # 2 C + # 3 D + # 4 E + # 5 F + # 6 G + # 0 0 A + # 1 B + # 2 C + # 3 D + # 4 E + # 5 F + # 6 G + # 1 0 A + # 1 B + # 2 C + # 3 D + # 4 E + # 5 F + # 6 G + # + # and the other MultiIndex, `new_multi_index`, is: + # 0: 0 0.5 + # 1: 2.0 + # 2: 5.0 + # 3: 5.8 + df = DataFrame( + { + "a": [-1] * 7 + [0] * 7 + [1] * 7, + "b": list(range(7)) * 3, + "c": ["A", "B", "C", "D", "E", "F", "G"] * 3, + } + ).set_index(["a", "b"]) + new_index = [0.5, 2.0, 5.0, 5.8] + new_multi_index = MultiIndex.from_product([[0], new_index], names=["a", "b"]) + + # reindexing w/o a `method` value + reindexed = df.reindex(new_multi_index) + expected = DataFrame( + {"a": [0] * 4, "b": new_index, "c": [np.nan, "C", "F", np.nan]} + ).set_index(["a", "b"]) + tm.assert_frame_equal(expected, reindexed) + + # reindexing with backfilling + expected = DataFrame( + {"a": [0] * 4, "b": new_index, "c": ["B", "C", "F", "G"]} + ).set_index(["a", "b"]) + reindexed_with_backfilling = df.reindex(new_multi_index, method="bfill") + tm.assert_frame_equal(expected, reindexed_with_backfilling) + + reindexed_with_backfilling = df.reindex(new_multi_index, method="backfill") + tm.assert_frame_equal(expected, reindexed_with_backfilling) + + # reindexing with padding + expected = DataFrame( + {"a": [0] * 4, "b": new_index, "c": ["A", "C", "F", "F"]} + ).set_index(["a", "b"]) + reindexed_with_padding = df.reindex(new_multi_index, method="pad") + tm.assert_frame_equal(expected, reindexed_with_padding) + + reindexed_with_padding = df.reindex(new_multi_index, method="ffill") + tm.assert_frame_equal(expected, reindexed_with_padding) + + @pytest.mark.parametrize( + "method,expected_values", + [ + ("nearest", [0, 1, 1, 2]), + ("pad", [np.nan, 0, 1, 1]), + ("backfill", [0, 1, 2, 2]), + ], + ) + def test_reindex_methods(self, method, expected_values): + df = DataFrame({"x": list(range(5))}) + target = np.array([-0.1, 0.9, 1.1, 1.5]) + + expected = DataFrame({"x": expected_values}, index=target) + actual = df.reindex(target, method=method) + tm.assert_frame_equal(expected, actual) + + actual = df.reindex(target, method=method, tolerance=1) + tm.assert_frame_equal(expected, actual) + actual = df.reindex(target, method=method, tolerance=[1, 1, 1, 1]) + tm.assert_frame_equal(expected, actual) + + e2 = expected[::-1] + actual = df.reindex(target[::-1], method=method) + tm.assert_frame_equal(e2, actual) + + new_order = [3, 0, 2, 1] + e2 = expected.iloc[new_order] + actual = df.reindex(target[new_order], method=method) + tm.assert_frame_equal(e2, actual) + + switched_method = ( + "pad" if method == "backfill" else "backfill" if method == "pad" else method + ) + actual = df[::-1].reindex(target, method=switched_method) + tm.assert_frame_equal(expected, actual) + + def test_reindex_methods_nearest_special(self): + df = DataFrame({"x": list(range(5))}) + target = np.array([-0.1, 0.9, 1.1, 1.5]) + + expected = DataFrame({"x": [0, 1, 1, np.nan]}, index=target) + actual = df.reindex(target, method="nearest", tolerance=0.2) + tm.assert_frame_equal(expected, actual) + + expected = DataFrame({"x": [0, np.nan, 1, np.nan]}, index=target) + actual = df.reindex(target, method="nearest", tolerance=[0.5, 0.01, 0.4, 0.1]) + tm.assert_frame_equal(expected, actual) + + def test_reindex_nearest_tz(self, tz_aware_fixture): + # GH26683 + tz = tz_aware_fixture + idx = pd.date_range("2019-01-01", periods=5, tz=tz) + df = DataFrame({"x": list(range(5))}, index=idx) + + expected = df.head(3) + actual = df.reindex(idx[:3], method="nearest") + tm.assert_frame_equal(expected, actual) + + def test_reindex_nearest_tz_empty_frame(self): + # https://github.com/pandas-dev/pandas/issues/31964 + dti = pd.DatetimeIndex(["2016-06-26 14:27:26+00:00"]) + df = DataFrame(index=pd.DatetimeIndex(["2016-07-04 14:00:59+00:00"])) + expected = DataFrame(index=dti) + result = df.reindex(dti, method="nearest") + tm.assert_frame_equal(result, expected) + + def test_reindex_frame_add_nat(self): + rng = date_range("1/1/2000 00:00:00", periods=10, freq="10s") + df = DataFrame({"A": np.random.randn(len(rng)), "B": rng}) + + result = df.reindex(range(15)) + assert np.issubdtype(result["B"].dtype, np.dtype("M8[ns]")) + + mask = com.isna(result)["B"] + assert mask[-5:].all() + assert not mask[:-5].any() + + def test_reindex_limit(self): + # GH 28631 + data = [["A", "A", "A"], ["B", "B", "B"], ["C", "C", "C"], ["D", "D", "D"]] + exp_data = [ + ["A", "A", "A"], + ["B", "B", "B"], + ["C", "C", "C"], + ["D", "D", "D"], + ["D", "D", "D"], + [np.nan, np.nan, np.nan], + ] + df = DataFrame(data) + result = df.reindex([0, 1, 2, 3, 4, 5], method="ffill", limit=1) + expected = DataFrame(exp_data) + tm.assert_frame_equal(result, expected) + + def test_reindex_level(self): + icol = ["jim", "joe", "jolie"] + + def verify_first_level(df, level, idx, check_index_type=True): + def f(val): + return np.nonzero((df[level] == val).to_numpy())[0] + + i = np.concatenate(list(map(f, idx))) + left = df.set_index(icol).reindex(idx, level=level) + right = df.iloc[i].set_index(icol) + tm.assert_frame_equal(left, right, check_index_type=check_index_type) + + def verify(df, level, idx, indexer, check_index_type=True): + left = df.set_index(icol).reindex(idx, level=level) + right = df.iloc[indexer].set_index(icol) + tm.assert_frame_equal(left, right, check_index_type=check_index_type) + + df = DataFrame( + { + "jim": list("B" * 4 + "A" * 2 + "C" * 3), + "joe": list("abcdeabcd")[::-1], + "jolie": [10, 20, 30] * 3, + "joline": np.random.randint(0, 1000, 9), + } + ) + + target = [ + ["C", "B", "A"], + ["F", "C", "A", "D"], + ["A"], + ["A", "B", "C"], + ["C", "A", "B"], + ["C", "B"], + ["C", "A"], + ["A", "B"], + ["B", "A", "C"], + ] + + for idx in target: + verify_first_level(df, "jim", idx) + + # reindex by these causes different MultiIndex levels + for idx in [["D", "F"], ["A", "C", "B"]]: + verify_first_level(df, "jim", idx, check_index_type=False) + + verify(df, "joe", list("abcde"), [3, 2, 1, 0, 5, 4, 8, 7, 6]) + verify(df, "joe", list("abcd"), [3, 2, 1, 0, 5, 8, 7, 6]) + verify(df, "joe", list("abc"), [3, 2, 1, 8, 7, 6]) + verify(df, "joe", list("eca"), [1, 3, 4, 6, 8]) + verify(df, "joe", list("edc"), [0, 1, 4, 5, 6]) + verify(df, "joe", list("eadbc"), [3, 0, 2, 1, 4, 5, 8, 7, 6]) + verify(df, "joe", list("edwq"), [0, 4, 5]) + verify(df, "joe", list("wq"), [], check_index_type=False) + + df = DataFrame( + { + "jim": ["mid"] * 5 + ["btm"] * 8 + ["top"] * 7, + "joe": ["3rd"] * 2 + + ["1st"] * 3 + + ["2nd"] * 3 + + ["1st"] * 2 + + ["3rd"] * 3 + + ["1st"] * 2 + + ["3rd"] * 3 + + ["2nd"] * 2, + # this needs to be jointly unique with jim and joe or + # reindexing will fail ~1.5% of the time, this works + # out to needing unique groups of same size as joe + "jolie": np.concatenate( + [ + np.random.choice(1000, x, replace=False) + for x in [2, 3, 3, 2, 3, 2, 3, 2] + ] + ), + "joline": np.random.randn(20).round(3) * 10, + } + ) + + for idx in permutations(df["jim"].unique()): + for i in range(3): + verify_first_level(df, "jim", idx[: i + 1]) + + i = [2, 3, 4, 0, 1, 8, 9, 5, 6, 7, 10, 11, 12, 13, 14, 18, 19, 15, 16, 17] + verify(df, "joe", ["1st", "2nd", "3rd"], i) + + i = [0, 1, 2, 3, 4, 10, 11, 12, 5, 6, 7, 8, 9, 15, 16, 17, 18, 19, 13, 14] + verify(df, "joe", ["3rd", "2nd", "1st"], i) + + i = [0, 1, 5, 6, 7, 10, 11, 12, 18, 19, 15, 16, 17] + verify(df, "joe", ["2nd", "3rd"], i) + + i = [0, 1, 2, 3, 4, 10, 11, 12, 8, 9, 15, 16, 17, 13, 14] + verify(df, "joe", ["3rd", "1st"], i) + + def test_non_monotonic_reindex_methods(self): + dr = date_range("2013-08-01", periods=6, freq="B") + data = np.random.randn(6, 1) + df = DataFrame(data, index=dr, columns=list("A")) + df_rev = DataFrame(data, index=dr[[3, 4, 5] + [0, 1, 2]], columns=list("A")) + # index is not monotonic increasing or decreasing + msg = "index must be monotonic increasing or decreasing" + with pytest.raises(ValueError, match=msg): + df_rev.reindex(df.index, method="pad") + with pytest.raises(ValueError, match=msg): + df_rev.reindex(df.index, method="ffill") + with pytest.raises(ValueError, match=msg): + df_rev.reindex(df.index, method="bfill") + with pytest.raises(ValueError, match=msg): + df_rev.reindex(df.index, method="nearest") + def test_reindex_sparse(self): # https://github.com/pandas-dev/pandas/issues/35286 df = DataFrame( diff --git a/pandas/tests/frame/methods/test_reset_index.py b/pandas/tests/frame/methods/test_reset_index.py index 9454080363b3e..3c14192f02cf6 100644 --- a/pandas/tests/frame/methods/test_reset_index.py +++ b/pandas/tests/frame/methods/test_reset_index.py @@ -18,6 +18,18 @@ class TestResetIndex: + def test_set_reset(self): + + idx = Index([2 ** 63, 2 ** 63 + 5, 2 ** 63 + 10], name="foo") + + # set/reset + df = DataFrame({"A": [0, 1, 2]}, index=idx) + result = df.reset_index() + assert result["foo"].dtype == np.dtype("uint64") + + df = result.set_index("foo") + tm.assert_index_equal(df.index, idx) + def test_set_index_reset_index_dt64tz(self): idx = Index(date_range("20130101", periods=3, tz="US/Eastern"), name="foo") From f6e908d5e14d3a093bd2f05b9639ea34ee35a4ba Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 21 Oct 2020 16:20:45 -0700 Subject: [PATCH 5/5] REF: misplaced CategoricalIndex tests --- .../tests/frame/indexing/test_categorical.py | 8 ------ .../indexes/categorical/test_indexing.py | 28 ++++++++++++++++++- pandas/tests/indexing/test_categorical.py | 26 ----------------- pandas/tests/reshape/test_cut.py | 7 +++++ 4 files changed, 34 insertions(+), 35 deletions(-) diff --git a/pandas/tests/frame/indexing/test_categorical.py b/pandas/tests/frame/indexing/test_categorical.py index cfc22b9b18729..e37d00c540974 100644 --- a/pandas/tests/frame/indexing/test_categorical.py +++ b/pandas/tests/frame/indexing/test_categorical.py @@ -352,14 +352,6 @@ def test_assigning_ops(self): df.loc[2:3, "b"] = Categorical(["b", "b"], categories=["a", "b"]) tm.assert_frame_equal(df, exp) - def test_functions_no_warnings(self): - df = DataFrame({"value": np.random.randint(0, 100, 20)}) - labels = [f"{i} - {i + 9}" for i in range(0, 100, 10)] - with tm.assert_produces_warning(False): - df["group"] = pd.cut( - df.value, range(0, 105, 10), right=False, labels=labels - ) - def test_setitem_single_row_categorical(self): # GH 25495 df = DataFrame({"Alpha": ["a"], "Numeric": [0]}) diff --git a/pandas/tests/indexes/categorical/test_indexing.py b/pandas/tests/indexes/categorical/test_indexing.py index 9cf901c0797d8..c720547aab3f8 100644 --- a/pandas/tests/indexes/categorical/test_indexing.py +++ b/pandas/tests/indexes/categorical/test_indexing.py @@ -2,7 +2,7 @@ import pytest import pandas as pd -from pandas import CategoricalIndex, Index, IntervalIndex +from pandas import CategoricalIndex, Index, IntervalIndex, Timestamp import pandas._testing as tm @@ -251,6 +251,32 @@ def test_get_indexer(self): with pytest.raises(NotImplementedError, match=msg): idx2.get_indexer(idx1, method="nearest") + def test_get_indexer_array(self): + arr = np.array( + [Timestamp("1999-12-31 00:00:00"), Timestamp("2000-12-31 00:00:00")], + dtype=object, + ) + cats = [Timestamp("1999-12-31 00:00:00"), Timestamp("2000-12-31 00:00:00")] + ci = CategoricalIndex(cats, categories=cats, ordered=False, dtype="category") + result = ci.get_indexer(arr) + expected = np.array([0, 1], dtype="intp") + tm.assert_numpy_array_equal(result, expected) + + def test_get_indexer_same_categories_same_order(self): + ci = CategoricalIndex(["a", "b"], categories=["a", "b"]) + + result = ci.get_indexer(CategoricalIndex(["b", "b"], categories=["a", "b"])) + expected = np.array([1, 1], dtype="intp") + tm.assert_numpy_array_equal(result, expected) + + def test_get_indexer_same_categories_different_order(self): + # https://github.com/pandas-dev/pandas/issues/19551 + ci = CategoricalIndex(["a", "b"], categories=["a", "b"]) + + result = ci.get_indexer(CategoricalIndex(["b", "b"], categories=["b", "a"])) + expected = np.array([1, 1], dtype="intp") + tm.assert_numpy_array_equal(result, expected) + class TestWhere: @pytest.mark.parametrize("klass", [list, tuple, np.array, pd.Series]) diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py index 347ce2262a261..9b52c297ec688 100644 --- a/pandas/tests/indexing/test_categorical.py +++ b/pandas/tests/indexing/test_categorical.py @@ -435,32 +435,6 @@ def test_loc_listlike_dtypes(self): with pytest.raises(KeyError, match=msg): df.loc[["a", "x"]] - def test_get_indexer_array(self): - arr = np.array( - [Timestamp("1999-12-31 00:00:00"), Timestamp("2000-12-31 00:00:00")], - dtype=object, - ) - cats = [Timestamp("1999-12-31 00:00:00"), Timestamp("2000-12-31 00:00:00")] - ci = CategoricalIndex(cats, categories=cats, ordered=False, dtype="category") - result = ci.get_indexer(arr) - expected = np.array([0, 1], dtype="intp") - tm.assert_numpy_array_equal(result, expected) - - def test_get_indexer_same_categories_same_order(self): - ci = CategoricalIndex(["a", "b"], categories=["a", "b"]) - - result = ci.get_indexer(CategoricalIndex(["b", "b"], categories=["a", "b"])) - expected = np.array([1, 1], dtype="intp") - tm.assert_numpy_array_equal(result, expected) - - def test_get_indexer_same_categories_different_order(self): - # https://github.com/pandas-dev/pandas/issues/19551 - ci = CategoricalIndex(["a", "b"], categories=["a", "b"]) - - result = ci.get_indexer(CategoricalIndex(["b", "b"], categories=["b", "a"])) - expected = np.array([1, 1], dtype="intp") - tm.assert_numpy_array_equal(result, expected) - def test_getitem_with_listlike(self): # GH 16115 cats = Categorical([Timestamp("12-31-1999"), Timestamp("12-31-2000")]) diff --git a/pandas/tests/reshape/test_cut.py b/pandas/tests/reshape/test_cut.py index e6091a63b3e97..8aa4012b3e77c 100644 --- a/pandas/tests/reshape/test_cut.py +++ b/pandas/tests/reshape/test_cut.py @@ -674,3 +674,10 @@ def test_cut_unordered_with_series_labels(): result = pd.cut(s, bins=bins, labels=labels, ordered=False) expected = Series(["a", "a", "b", "b", "c"], dtype="category") tm.assert_series_equal(result, expected) + + +def test_cut_no_warnings(): + df = DataFrame({"value": np.random.randint(0, 100, 20)}) + labels = [f"{i} - {i + 9}" for i in range(0, 100, 10)] + with tm.assert_produces_warning(False): + df["group"] = pd.cut(df.value, range(0, 105, 10), right=False, labels=labels)