From 42fb0265118c165f8e0e9fa6a6c485455537b265 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 6 May 2024 13:44:04 -0700 Subject: [PATCH 1/3] Reduce data sizes --- pandas/tests/frame/conftest.py | 4 ++-- pandas/tests/frame/indexing/test_indexing.py | 2 +- pandas/tests/frame/methods/test_at_time.py | 2 +- pandas/tests/frame/methods/test_cov_corr.py | 4 ++-- pandas/tests/frame/methods/test_fillna.py | 3 --- pandas/tests/frame/methods/test_to_csv.py | 18 ++++++++++++++---- pandas/tests/frame/methods/test_truncate.py | 2 +- 7 files changed, 21 insertions(+), 14 deletions(-) diff --git a/pandas/tests/frame/conftest.py b/pandas/tests/frame/conftest.py index e07024b2e2a09..8da7ac635f293 100644 --- a/pandas/tests/frame/conftest.py +++ b/pandas/tests/frame/conftest.py @@ -17,9 +17,9 @@ def datetime_frame() -> DataFrame: Columns are ['A', 'B', 'C', 'D'] """ return DataFrame( - np.random.default_rng(2).standard_normal((100, 4)), + np.random.default_rng(2).standard_normal((10, 4)), columns=Index(list("ABCD"), dtype=object), - index=date_range("2000-01-01", periods=100, freq="B"), + index=date_range("2000-01-01", periods=10, freq="B"), ) diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 69e6228d6efde..ee08c10f96ae7 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -118,7 +118,7 @@ def test_setitem_list2(self): def test_getitem_boolean(self, mixed_float_frame, mixed_int_frame, datetime_frame): # boolean indexing - d = datetime_frame.index[10] + d = datetime_frame.index[len(datetime_frame) // 2] indexer = datetime_frame.index > d indexer_obj = indexer.astype(object) diff --git a/pandas/tests/frame/methods/test_at_time.py b/pandas/tests/frame/methods/test_at_time.py index 1ebe9920933d1..126899826fac3 100644 --- a/pandas/tests/frame/methods/test_at_time.py +++ b/pandas/tests/frame/methods/test_at_time.py @@ -97,7 +97,7 @@ def test_at_time_raises(self, frame_or_series): def test_at_time_axis(self, axis): # issue 8839 - rng = date_range("1/1/2000", "1/5/2000", freq="5min") + rng = date_range("1/1/2000", "1/2/2000", freq="5min") ts = DataFrame(np.random.default_rng(2).standard_normal((len(rng), len(rng)))) ts.index, ts.columns = rng, rng diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py index 53aa44f264c7a..4151a1d27d06a 100644 --- a/pandas/tests/frame/methods/test_cov_corr.py +++ b/pandas/tests/frame/methods/test_cov_corr.py @@ -285,7 +285,7 @@ def test_corrwith(self, datetime_frame, dtype): b = datetime_frame.add(noise, axis=0) # make sure order does not matter - b = b.reindex(columns=b.columns[::-1], index=b.index[::-1][10:]) + b = b.reindex(columns=b.columns[::-1], index=b.index[::-1][len(a) // 2 :]) del b["B"] colcorr = a.corrwith(b, axis=0) @@ -301,7 +301,7 @@ def test_corrwith(self, datetime_frame, dtype): dropped = a.corrwith(b, axis=1, drop=True) assert a.index[-1] not in dropped.index - # non time-series data + def test_corrwith_non_timeseries_data(self): index = ["a", "b", "c", "d", "e"] columns = ["one", "two", "three", "four"] df1 = DataFrame( diff --git a/pandas/tests/frame/methods/test_fillna.py b/pandas/tests/frame/methods/test_fillna.py index e858c123e4dae..2ef7780e9a6d5 100644 --- a/pandas/tests/frame/methods/test_fillna.py +++ b/pandas/tests/frame/methods/test_fillna.py @@ -60,9 +60,6 @@ def test_fillna_datetime(self, datetime_frame): padded = datetime_frame.ffill() assert np.isnan(padded.loc[padded.index[:5], "A"]).all() - assert ( - padded.loc[padded.index[-5:], "A"] == padded.loc[padded.index[-5], "A"] - ).all() msg = r"missing 1 required positional argument: 'value'" with pytest.raises(TypeError, match=msg): diff --git a/pandas/tests/frame/methods/test_to_csv.py b/pandas/tests/frame/methods/test_to_csv.py index 66a35c6f486a4..3a87f7ded1759 100644 --- a/pandas/tests/frame/methods/test_to_csv.py +++ b/pandas/tests/frame/methods/test_to_csv.py @@ -33,7 +33,7 @@ def read_csv(self, path, **kwargs): return read_csv(path, **params) - def test_to_csv_from_csv1(self, temp_file, float_frame, datetime_frame): + def test_to_csv_from_csv1(self, temp_file, float_frame): path = str(temp_file) float_frame.iloc[:5, float_frame.columns.get_loc("A")] = np.nan @@ -42,6 +42,8 @@ def test_to_csv_from_csv1(self, temp_file, float_frame, datetime_frame): float_frame.to_csv(path, header=False) float_frame.to_csv(path, index=False) + def test_to_csv_from_csv1_datetime(self, temp_file, datetime_frame): + path = str(temp_file) # test roundtrip # freq does not roundtrip datetime_frame.index = datetime_frame.index._with_freq(None) @@ -59,7 +61,8 @@ def test_to_csv_from_csv1(self, temp_file, float_frame, datetime_frame): recons = self.read_csv(path, index_col=None, parse_dates=True) tm.assert_almost_equal(datetime_frame.values, recons.values) - # corner case + def test_to_csv_from_csv1_corner_case(self, temp_file): + path = str(temp_file) dm = DataFrame( { "s1": Series(range(3), index=np.arange(3, dtype=np.int64)), @@ -1167,9 +1170,16 @@ def test_to_csv_with_dst_transitions(self, td, temp_file): result.index = to_datetime(result.index, utc=True).tz_convert("Europe/London") tm.assert_frame_equal(result, df) - def test_to_csv_with_dst_transitions_with_pickle(self, temp_file): + @pytest.mark.parametrize( + "start,end", + [ + ["2015-03-29", "2015-03-30"], + ["2015-10-25", "2015-10-26"], + ], + ) + def test_to_csv_with_dst_transitions_with_pickle(self, start, end, temp_file): # GH11619 - idx = date_range("2015-01-01", "2015-12-31", freq="h", tz="Europe/Paris") + idx = date_range(start, end, freq="h", tz="Europe/Paris") idx = idx._with_freq(None) # freq does not round-trip idx._data._freq = None # otherwise there is trouble on unpickle df = DataFrame({"values": 1, "idx": idx}, index=idx) diff --git a/pandas/tests/frame/methods/test_truncate.py b/pandas/tests/frame/methods/test_truncate.py index 12077952c2e03..f28f811148c5d 100644 --- a/pandas/tests/frame/methods/test_truncate.py +++ b/pandas/tests/frame/methods/test_truncate.py @@ -60,7 +60,7 @@ def test_truncate(self, datetime_frame, frame_or_series): truncated = ts.truncate(before=ts.index[-1] + ts.index.freq) assert len(truncated) == 0 - msg = "Truncate: 2000-01-06 00:00:00 must be after 2000-05-16 00:00:00" + msg = "Truncate: 2000-01-06 00:00:00 must be after 2000-01-11 00:00:00" with pytest.raises(ValueError, match=msg): ts.truncate( before=ts.index[-1] - ts.index.freq, after=ts.index[0] + ts.index.freq From 648cde0eacb0bda61c4762512b05e1174001efa2 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 6 May 2024 13:54:48 -0700 Subject: [PATCH 2/3] Split test, remove request --- pandas/tests/frame/test_block_internals.py | 9 ++++----- pandas/tests/frame/test_constructors.py | 2 +- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index efbcf8a5cf9dc..3f0e829f66361 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -249,20 +249,19 @@ def f(dtype): with pytest.raises(ValueError, match=msg): f("M8[ns]") - def test_pickle(self, float_string_frame, timezone_frame): - empty_frame = DataFrame() - + def test_pickle_float_string_frame(self, float_string_frame): unpickled = tm.round_trip_pickle(float_string_frame) tm.assert_frame_equal(float_string_frame, unpickled) # buglet float_string_frame._mgr.ndim - # empty + def test_pickle_empty(self): + empty_frame = DataFrame() unpickled = tm.round_trip_pickle(empty_frame) repr(unpickled) - # tz frame + def test_pickle_empty_tz_frame(self, timezone_frame): unpickled = tm.round_trip_pickle(timezone_frame) tm.assert_frame_equal(timezone_frame, unpickled) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 53476c2f7ce38..e2f12e6e459cb 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -2280,7 +2280,7 @@ def test_check_dtype_empty_numeric_column(self, dtype): @pytest.mark.parametrize( "dtype", tm.STRING_DTYPES + tm.BYTES_DTYPES + tm.OBJECT_DTYPES ) - def test_check_dtype_empty_string_column(self, request, dtype): + def test_check_dtype_empty_string_column(self, dtype): # GH24386: Ensure dtypes are set correctly for an empty DataFrame. # Empty DataFrame is generated via dictionary data with non-overlapping columns. data = DataFrame({"a": [1, 2]}, columns=["b"], dtype=dtype) From f5985a7f37ac12f928ca44f3f88c3653f3c27f4e Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 6 May 2024 14:16:18 -0700 Subject: [PATCH 3/3] Reduce data of indices_dict --- pandas/conftest.py | 52 +++++++++++++++++++++++----------------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 21100178262c8..e1225f031b568 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -672,47 +672,47 @@ def _create_mi_with_dt64tz_level(): indices_dict = { - "string": Index([f"pandas_{i}" for i in range(100)]), - "datetime": date_range("2020-01-01", periods=100), - "datetime-tz": date_range("2020-01-01", periods=100, tz="US/Pacific"), - "period": period_range("2020-01-01", periods=100, freq="D"), - "timedelta": timedelta_range(start="1 day", periods=100, freq="D"), - "range": RangeIndex(100), - "int8": Index(np.arange(100), dtype="int8"), - "int16": Index(np.arange(100), dtype="int16"), - "int32": Index(np.arange(100), dtype="int32"), - "int64": Index(np.arange(100), dtype="int64"), - "uint8": Index(np.arange(100), dtype="uint8"), - "uint16": Index(np.arange(100), dtype="uint16"), - "uint32": Index(np.arange(100), dtype="uint32"), - "uint64": Index(np.arange(100), dtype="uint64"), - "float32": Index(np.arange(100), dtype="float32"), - "float64": Index(np.arange(100), dtype="float64"), + "string": Index([f"pandas_{i}" for i in range(10)]), + "datetime": date_range("2020-01-01", periods=10), + "datetime-tz": date_range("2020-01-01", periods=10, tz="US/Pacific"), + "period": period_range("2020-01-01", periods=10, freq="D"), + "timedelta": timedelta_range(start="1 day", periods=10, freq="D"), + "range": RangeIndex(10), + "int8": Index(np.arange(10), dtype="int8"), + "int16": Index(np.arange(10), dtype="int16"), + "int32": Index(np.arange(10), dtype="int32"), + "int64": Index(np.arange(10), dtype="int64"), + "uint8": Index(np.arange(10), dtype="uint8"), + "uint16": Index(np.arange(10), dtype="uint16"), + "uint32": Index(np.arange(10), dtype="uint32"), + "uint64": Index(np.arange(10), dtype="uint64"), + "float32": Index(np.arange(10), dtype="float32"), + "float64": Index(np.arange(10), dtype="float64"), "bool-object": Index([True, False] * 5, dtype=object), "bool-dtype": Index([True, False] * 5, dtype=bool), "complex64": Index( - np.arange(100, dtype="complex64") + 1.0j * np.arange(100, dtype="complex64") + np.arange(10, dtype="complex64") + 1.0j * np.arange(10, dtype="complex64") ), "complex128": Index( - np.arange(100, dtype="complex128") + 1.0j * np.arange(100, dtype="complex128") + np.arange(10, dtype="complex128") + 1.0j * np.arange(10, dtype="complex128") ), - "categorical": CategoricalIndex(list("abcd") * 25), - "interval": IntervalIndex.from_breaks(np.linspace(0, 100, num=101)), + "categorical": CategoricalIndex(list("abcd") * 2), + "interval": IntervalIndex.from_breaks(np.linspace(0, 100, num=11)), "empty": Index([]), "tuples": MultiIndex.from_tuples(zip(["foo", "bar", "baz"], [1, 2, 3])), "mi-with-dt64tz-level": _create_mi_with_dt64tz_level(), "multi": _create_multiindex(), "repeats": Index([0, 0, 1, 1, 2, 2]), - "nullable_int": Index(np.arange(100), dtype="Int64"), - "nullable_uint": Index(np.arange(100), dtype="UInt16"), - "nullable_float": Index(np.arange(100), dtype="Float32"), - "nullable_bool": Index(np.arange(100).astype(bool), dtype="boolean"), + "nullable_int": Index(np.arange(10), dtype="Int64"), + "nullable_uint": Index(np.arange(10), dtype="UInt16"), + "nullable_float": Index(np.arange(10), dtype="Float32"), + "nullable_bool": Index(np.arange(10).astype(bool), dtype="boolean"), "string-python": Index( - pd.array([f"pandas_{i}" for i in range(100)], dtype="string[python]") + pd.array([f"pandas_{i}" for i in range(10)], dtype="string[python]") ), } if has_pyarrow: - idx = Index(pd.array([f"pandas_{i}" for i in range(100)], dtype="string[pyarrow]")) + idx = Index(pd.array([f"pandas_{i}" for i in range(10)], dtype="string[pyarrow]")) indices_dict["string-pyarrow"] = idx