TST: Refactor more slow tests (#53820)

mroeschke · web-flow · commit df860581b835 · 2023-06-26T10:39:39.000-07:00
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -450,6 +450,9 @@ def unique_with_mask(values, mask: npt.NDArray[np.bool_] | None = None):
 unique1d = unique
 
 
+_MINIMUM_COMP_ARR_LEN = 1_000_000
+
+
 def isin(comps: ListLike, values: ListLike) -> npt.NDArray[np.bool_]:
     """
     Compute the isin boolean array.
@@ -518,7 +521,7 @@ def isin(comps: ListLike, values: ListLike) -> npt.NDArray[np.bool_]:
     # Albeit hashmap has O(1) look-up (vs. O(logn) in sorted array),
     # in1d is faster for small sizes
     if (
-        len(comps_array) > 1_000_000
+        len(comps_array) > _MINIMUM_COMP_ARR_LEN
         and len(values) <= 26
         and comps_array.dtype != object
     ):
diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py
@@ -45,6 +45,9 @@
     from pandas.io.formats.format import DataFrameFormatter
 
 
+_DEFAULT_CHUNKSIZE_CELLS = 100_000
+
+
 class CSVFormatter:
     cols: np.ndarray
 
@@ -163,7 +166,7 @@ def _initialize_columns(self, cols: Sequence[Hashable] | None) -> np.ndarray:
 
     def _initialize_chunksize(self, chunksize: int | None) -> int:
         if chunksize is None:
-            return (100000 // (len(self.cols) or 1)) or 1
+            return (_DEFAULT_CHUNKSIZE_CELLS // (len(self.cols) or 1)) or 1
         return int(chunksize)
 
     @property
diff --git a/pandas/tests/frame/methods/test_to_csv.py b/pandas/tests/frame/methods/test_to_csv.py
@@ -752,13 +752,16 @@ def test_to_csv_chunking(self, chunksize):
             tm.assert_frame_equal(rs, aa)
 
     @pytest.mark.slow
-    def test_to_csv_wide_frame_formatting(self):
+    def test_to_csv_wide_frame_formatting(self, monkeypatch):
         # Issue #8621
-        df = DataFrame(np.random.randn(1, 100010), columns=None, index=None)
+        chunksize = 100
+        df = DataFrame(np.random.randn(1, chunksize + 10), columns=None, index=None)
         with tm.ensure_clean() as filename:
-            df.to_csv(filename, header=False, index=False)
+            with monkeypatch.context() as m:
+                m.setattr("pandas.io.formats.csvs._DEFAULT_CHUNKSIZE_CELLS", chunksize)
+                df.to_csv(filename, header=False, index=False)
             rs = read_csv(filename, header=None)
-            tm.assert_frame_equal(rs, df)
+        tm.assert_frame_equal(rs, df)
 
     def test_to_csv_bug(self):
         f1 = StringIO("a,1.0\nb,2.0")
diff --git a/pandas/tests/indexing/multiindex/test_chaining_and_caching.py b/pandas/tests/indexing/multiindex/test_chaining_and_caching.py
@@ -70,13 +70,11 @@ def test_indexer_caching():
     # GH5727
     # make sure that indexers are in the _internal_names_set
     n = 1000001
-    arrays = (range(n), range(n))
-    index = MultiIndex.from_tuples(zip(*arrays))
+    index = MultiIndex.from_arrays([np.arange(n), np.arange(n)])
     s = Series(np.zeros(n), index=index)
     str(s)
 
     # setitem
     expected = Series(np.ones(n), index=index)
-    s = Series(np.zeros(n), index=index)
     s[s == 0] = 1
     tm.assert_series_equal(s, expected)
diff --git a/pandas/tests/io/parser/test_index_col.py b/pandas/tests/io/parser/test_index_col.py
@@ -228,16 +228,18 @@ def test_header_with_index_col(all_parsers):
 
 
 @pytest.mark.slow
-def test_index_col_large_csv(all_parsers):
+def test_index_col_large_csv(all_parsers, monkeypatch):
     # https://github.com/pandas-dev/pandas/issues/37094
     parser = all_parsers
 
-    N = 1_000_001
-    df = DataFrame({"a": range(N), "b": np.random.randn(N)})
+    ARR_LEN = 100
+    df = DataFrame({"a": range(ARR_LEN + 1), "b": np.random.randn(ARR_LEN + 1)})
 
     with tm.ensure_clean() as path:
         df.to_csv(path, index=False)
-        result = parser.read_csv(path, index_col=[0])
+        with monkeypatch.context() as m:
+            m.setattr("pandas.core.algorithms._MINIMUM_COMP_ARR_LEN", ARR_LEN)
+            result = parser.read_csv(path, index_col=[0])
 
     tm.assert_frame_equal(result, df.set_index("a"))
 
diff --git a/pandas/tests/plotting/frame/test_frame.py b/pandas/tests/plotting/frame/test_frame.py
@@ -423,44 +423,60 @@ def test_line_area_stacked(self, kind):
         df2 = df.set_index(df.index + 1)
         _check_plot_works(df2.plot, kind=kind, logx=True, stacked=True)
 
-    def test_line_area_nan_df(self):
+    @pytest.mark.parametrize(
+        "idx", [range(4), date_range("2023-01-1", freq="D", periods=4)]
+    )
+    def test_line_area_nan_df(self, idx):
         values1 = [1, 2, np.nan, 3]
         values2 = [3, np.nan, 2, 1]
-        df = DataFrame({"a": values1, "b": values2})
-        tdf = DataFrame({"a": values1, "b": values2}, index=tm.makeDateIndex(k=4))
-
-        for d in [df, tdf]:
-            ax = _check_plot_works(d.plot)
-            masked1 = ax.lines[0].get_ydata()
-            masked2 = ax.lines[1].get_ydata()
-            # remove nan for comparison purpose
-
-            exp = np.array([1, 2, 3], dtype=np.float64)
-            tm.assert_numpy_array_equal(np.delete(masked1.data, 2), exp)
-
-            exp = np.array([3, 2, 1], dtype=np.float64)
-            tm.assert_numpy_array_equal(np.delete(masked2.data, 1), exp)
-            tm.assert_numpy_array_equal(
-                masked1.mask, np.array([False, False, True, False])
-            )
-            tm.assert_numpy_array_equal(
-                masked2.mask, np.array([False, True, False, False])
-            )
+        df = DataFrame({"a": values1, "b": values2}, index=idx)
 
-            expected1 = np.array([1, 2, 0, 3], dtype=np.float64)
-            expected2 = np.array([3, 0, 2, 1], dtype=np.float64)
+        ax = _check_plot_works(df.plot)
+        masked1 = ax.lines[0].get_ydata()
+        masked2 = ax.lines[1].get_ydata()
+        # remove nan for comparison purpose
 
-            ax = _check_plot_works(d.plot, stacked=True)
-            tm.assert_numpy_array_equal(ax.lines[0].get_ydata(), expected1)
-            tm.assert_numpy_array_equal(ax.lines[1].get_ydata(), expected1 + expected2)
+        exp = np.array([1, 2, 3], dtype=np.float64)
+        tm.assert_numpy_array_equal(np.delete(masked1.data, 2), exp)
 
-            ax = _check_plot_works(d.plot.area)
-            tm.assert_numpy_array_equal(ax.lines[0].get_ydata(), expected1)
-            tm.assert_numpy_array_equal(ax.lines[1].get_ydata(), expected1 + expected2)
+        exp = np.array([3, 2, 1], dtype=np.float64)
+        tm.assert_numpy_array_equal(np.delete(masked2.data, 1), exp)
+        tm.assert_numpy_array_equal(masked1.mask, np.array([False, False, True, False]))
+        tm.assert_numpy_array_equal(masked2.mask, np.array([False, True, False, False]))
+
+    @pytest.mark.parametrize(
+        "idx", [range(4), date_range("2023-01-1", freq="D", periods=4)]
+    )
+    def test_line_area_nan_df_stacked(self, idx):
+        values1 = [1, 2, np.nan, 3]
+        values2 = [3, np.nan, 2, 1]
+        df = DataFrame({"a": values1, "b": values2}, index=idx)
+
+        expected1 = np.array([1, 2, 0, 3], dtype=np.float64)
+        expected2 = np.array([3, 0, 2, 1], dtype=np.float64)
+
+        ax = _check_plot_works(df.plot, stacked=True)
+        tm.assert_numpy_array_equal(ax.lines[0].get_ydata(), expected1)
+        tm.assert_numpy_array_equal(ax.lines[1].get_ydata(), expected1 + expected2)
+
+    @pytest.mark.parametrize(
+        "idx", [range(4), date_range("2023-01-1", freq="D", periods=4)]
+    )
+    @pytest.mark.parametrize("kwargs", [{}, {"stacked": False}])
+    def test_line_area_nan_df_stacked_area(self, idx, kwargs):
+        values1 = [1, 2, np.nan, 3]
+        values2 = [3, np.nan, 2, 1]
+        df = DataFrame({"a": values1, "b": values2}, index=idx)
+
+        expected1 = np.array([1, 2, 0, 3], dtype=np.float64)
+        expected2 = np.array([3, 0, 2, 1], dtype=np.float64)
 
-            ax = _check_plot_works(d.plot.area, stacked=False)
-            tm.assert_numpy_array_equal(ax.lines[0].get_ydata(), expected1)
+        ax = _check_plot_works(df.plot.area, **kwargs)
+        tm.assert_numpy_array_equal(ax.lines[0].get_ydata(), expected1)
+        if kwargs:
             tm.assert_numpy_array_equal(ax.lines[1].get_ydata(), expected2)
+        else:
+            tm.assert_numpy_array_equal(ax.lines[1].get_ydata(), expected1 + expected2)
 
     def test_line_lim(self):
         df = DataFrame(np.random.rand(6, 3), columns=["x", "y", "z"])
@@ -1537,27 +1553,31 @@ def test_errorbar_with_integer_column_names(self):
         _check_has_errorbars(ax, xerr=0, yerr=1)
 
     @pytest.mark.slow
-    def test_errorbar_with_partial_columns(self):
+    @pytest.mark.parametrize("kind", ["line", "bar"])
+    def test_errorbar_with_partial_columns_kind(self, kind):
         df = DataFrame(np.abs(np.random.randn(10, 3)))
         df_err = DataFrame(np.abs(np.random.randn(10, 2)), columns=[0, 2])
-        kinds = ["line", "bar"]
-        for kind in kinds:
-            ax = _check_plot_works(df.plot, yerr=df_err, kind=kind)
-            _check_has_errorbars(ax, xerr=0, yerr=2)
+        ax = _check_plot_works(df.plot, yerr=df_err, kind=kind)
+        _check_has_errorbars(ax, xerr=0, yerr=2)
 
+    @pytest.mark.slow
+    def test_errorbar_with_partial_columns_dti(self):
+        df = DataFrame(np.abs(np.random.randn(10, 3)))
+        df_err = DataFrame(np.abs(np.random.randn(10, 2)), columns=[0, 2])
         ix = date_range("1/1/2000", periods=10, freq="M")
         df.set_index(ix, inplace=True)
         df_err.set_index(ix, inplace=True)
         ax = _check_plot_works(df.plot, yerr=df_err, kind="line")
         _check_has_errorbars(ax, xerr=0, yerr=2)
 
+    @pytest.mark.slow
+    @pytest.mark.parametrize("err_box", [lambda x: x, DataFrame])
+    def test_errorbar_with_partial_columns_box(self, err_box):
         d = {"x": np.arange(12), "y": np.arange(12, 0, -1)}
         df = DataFrame(d)
-        d_err = {"x": np.ones(12) * 0.2, "z": np.ones(12) * 0.4}
-        df_err = DataFrame(d_err)
-        for err in [d_err, df_err]:
-            ax = _check_plot_works(df.plot, yerr=err)
-            _check_has_errorbars(ax, xerr=0, yerr=1)
+        err = err_box({"x": np.ones(12) * 0.2, "z": np.ones(12) * 0.4})
+        ax = _check_plot_works(df.plot, yerr=err)
+        _check_has_errorbars(ax, xerr=0, yerr=1)
 
     @pytest.mark.parametrize("kind", ["line", "bar", "barh"])
     def test_errorbar_timeseries(self, kind):
diff --git a/pandas/tests/plotting/frame/test_frame_subplots.py b/pandas/tests/plotting/frame/test_frame_subplots.py
@@ -245,15 +245,11 @@ def test_subplots_layout_single_column(
         assert axes.shape == expected_shape
 
     @pytest.mark.slow
-    def test_subplots_warnings(self):
+    @pytest.mark.parametrize("idx", [range(5), date_range("1/1/2000", periods=5)])
+    def test_subplots_warnings(self, idx):
         # GH 9464
         with tm.assert_produces_warning(None):
-            df = DataFrame(np.random.randn(100, 4))
-            df.plot(subplots=True, layout=(3, 2))
-
-            df = DataFrame(
-                np.random.randn(100, 4), index=date_range("1/1/2000", periods=100)
-            )
+            df = DataFrame(np.random.randn(5, 4), index=idx)
             df.plot(subplots=True, layout=(3, 2))
 
     def test_subplots_multiple_axes(self):
diff --git a/pandas/tests/plotting/test_misc.py b/pandas/tests/plotting/test_misc.py
@@ -218,44 +218,58 @@ def test_andrews_curves_handle(self):
         _check_colors(handles, linecolors=colors)
 
     @pytest.mark.slow
-    def test_parallel_coordinates(self, iris):
-        from matplotlib import cm
-
+    @pytest.mark.parametrize(
+        "color",
+        [("#556270", "#4ECDC4", "#C7F464"), ["dodgerblue", "aquamarine", "seagreen"]],
+    )
+    def test_parallel_coordinates_colors(self, iris, color):
         from pandas.plotting import parallel_coordinates
 
         df = iris
 
-        ax = _check_plot_works(parallel_coordinates, frame=df, class_column="Name")
-        nlines = len(ax.get_lines())
-        nxticks = len(ax.xaxis.get_ticklabels())
-
-        rgba = ("#556270", "#4ECDC4", "#C7F464")
         ax = _check_plot_works(
-            parallel_coordinates, frame=df, class_column="Name", color=rgba
+            parallel_coordinates, frame=df, class_column="Name", color=color
         )
-        _check_colors(ax.get_lines()[:10], linecolors=rgba, mapping=df["Name"][:10])
+        _check_colors(ax.get_lines()[:10], linecolors=color, mapping=df["Name"][:10])
 
-        cnames = ["dodgerblue", "aquamarine", "seagreen"]
-        ax = _check_plot_works(
-            parallel_coordinates, frame=df, class_column="Name", color=cnames
-        )
-        _check_colors(ax.get_lines()[:10], linecolors=cnames, mapping=df["Name"][:10])
+    @pytest.mark.slow
+    def test_parallel_coordinates_cmap(self, iris):
+        from matplotlib import cm
+
+        from pandas.plotting import parallel_coordinates
+
+        df = iris
 
         ax = _check_plot_works(
             parallel_coordinates, frame=df, class_column="Name", colormap=cm.jet
         )
         cmaps = [cm.jet(n) for n in np.linspace(0, 1, df["Name"].nunique())]
         _check_colors(ax.get_lines()[:10], linecolors=cmaps, mapping=df["Name"][:10])
 
+    @pytest.mark.slow
+    def test_parallel_coordinates_line_diff(self, iris):
+        from pandas.plotting import parallel_coordinates
+
+        df = iris
+
+        ax = _check_plot_works(parallel_coordinates, frame=df, class_column="Name")
+        nlines = len(ax.get_lines())
+        nxticks = len(ax.xaxis.get_ticklabels())
+
         ax = _check_plot_works(
             parallel_coordinates, frame=df, class_column="Name", axvlines=False
         )
         assert len(ax.get_lines()) == (nlines - nxticks)
 
+    @pytest.mark.slow
+    def test_parallel_coordinates_handles(self, iris):
+        from pandas.plotting import parallel_coordinates
+
+        # small hand-built frame (one row per "Name"); the iris data is not used here
         colors = ["b", "g", "r"]
         df = DataFrame({"A": [1, 2, 3], "B": [1, 2, 3], "C": [1, 2, 3], "Name": colors})
         ax = parallel_coordinates(df, "Name", color=colors)
-        handles, labels = ax.get_legend_handles_labels()
+        handles, _ = ax.get_legend_handles_labels()
         _check_colors(handles, linecolors=colors)
 
     # not sure if this is indicative of a problem