From 9fd459c39ee89465053b40cecb53457015d986a6 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 31 Jan 2023 10:21:51 +0100 Subject: [PATCH 1/7] API / CoW: return read-only numpy arrays in .values/to_numpy() --- pandas/_testing/__init__.py | 2 +- pandas/core/arrays/string_.py | 5 ++++- pandas/core/base.py | 12 ++++++++-- pandas/core/internals/blocks.py | 14 +++++++++--- pandas/core/internals/managers.py | 13 ++++++----- pandas/io/parsers/base_parser.py | 2 +- pandas/tests/frame/indexing/test_indexing.py | 22 ++++++++++++------- pandas/tests/frame/indexing/test_insert.py | 13 +++++++---- pandas/tests/frame/indexing/test_setitem.py | 5 +++-- pandas/tests/frame/methods/test_copy.py | 1 + pandas/tests/frame/methods/test_cov_corr.py | 5 ++--- pandas/tests/frame/methods/test_fillna.py | 5 +++-- pandas/tests/frame/methods/test_join.py | 2 +- pandas/tests/frame/methods/test_quantile.py | 14 +++++++++--- .../tests/frame/methods/test_sort_values.py | 12 +++++++--- pandas/tests/frame/methods/test_transpose.py | 15 +++++++++---- pandas/tests/frame/methods/test_values.py | 11 +++++++--- pandas/tests/frame/test_block_internals.py | 8 ++++++- pandas/tests/frame/test_constructors.py | 9 ++++---- pandas/tests/groupby/test_function.py | 5 +++-- .../tests/indexing/multiindex/test_setitem.py | 15 +++++++++---- .../tests/indexing/multiindex/test_slice.py | 2 +- .../indexing/test_chaining_and_caching.py | 9 +++++++- pandas/tests/indexing/test_iloc.py | 8 +++++-- pandas/tests/indexing/test_loc.py | 4 +++- pandas/tests/io/pytables/test_round_trip.py | 4 ++-- pandas/tests/series/indexing/test_setitem.py | 2 +- pandas/tests/series/methods/test_fillna.py | 2 +- pandas/tests/window/test_pairwise.py | 4 +++- pandas/util/_test_decorators.py | 5 +++++ 30 files changed, 163 insertions(+), 67 deletions(-) diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index 045254d2041fc..3ab313fd2dfa6 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -792,7 +792,7 @@ def _gen_unique_rand(rng, _extra_size): def makeMissingDataframe(density: float = 0.9, random_state=None) -> DataFrame: df = makeDataFrame() i, j = _create_missing_idx(*df.shape, density=density, random_state=random_state) - df.values[i, j] = np.nan + df.iloc[i, j] = np.nan return df diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 9b26db07fc28f..9aa02108fc7bd 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -423,7 +423,10 @@ def __setitem__(self, key, value): if len(value) and not lib.is_string_array(value, skipna=True): raise ValueError("Must provide strings.") - value[isna(value)] = libmissing.NA + mask = isna(value) + if mask.any(): + value = value.copy() + value[isna(value)] = libmissing.NA super().__setitem__(key, value) diff --git a/pandas/core/base.py b/pandas/core/base.py index 9cf93ea8eee2f..c3df997adeddf 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -20,6 +20,8 @@ import numpy as np +from pandas._config import using_copy_on_write + from pandas._libs import lib from pandas._typing import ( Axis, @@ -545,10 +547,16 @@ def to_numpy( result = np.asarray(values, dtype=dtype) - if copy and na_value is lib.no_default: + if (copy and na_value is lib.no_default) or ( + not copy and using_copy_on_write() + ): if np.shares_memory(self._values[:2], result[:2]): # Take slices to improve performance of check - result = result.copy() + if using_copy_on_write() and not copy: + result = result.view() + result.flags.writeable = False + else: + result = result.copy() return result diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 8fb6a18ca137a..2e5a92025354f 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -14,6 +14,8 @@ import numpy as np +from pandas._config import using_copy_on_write + from pandas._libs import ( Timestamp, internals as libinternals, @@ -2326,6 +2328,12 @@ def external_values(values: ArrayLike) -> ArrayLike: # NB: for datetime64tz this is different from np.asarray(values), since # that returns an object-dtype ndarray of Timestamps. # Avoid raising in .astype in casting from dt64tz to dt64 - return values._ndarray - else: - return values + values = values._ndarray + + if isinstance(values, np.ndarray) and using_copy_on_write(): + values = values.view() + values.flags.writeable = False + + # TODO(CoW) we should also mark our ExtensionArrays as read-only + + return values diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 71d8b20f18457..369254f124cbe 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1745,13 +1745,16 @@ def as_array( arr = np.asarray(blk.get_values()) if dtype: arr = arr.astype(dtype, copy=False) + + if copy: + arr = arr.copy() + elif using_copy_on_write(): + arr = arr.view() + arr.flags.writeable = False else: arr = self._interleave(dtype=dtype, na_value=na_value) - # The underlying data was copied within _interleave - copy = False - - if copy: - arr = arr.copy() + # The underlying data was copied within _interleave, so no need + # to further copy if copy=True or setting na_value if na_value is not lib.no_default: arr[isna(arr)] = na_value diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 6272f213ccef1..34414e20b892a 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -1103,7 +1103,7 @@ def converter(*date_cols): dayfirst=dayfirst, errors="ignore", cache=cache_dates, - ).to_numpy() + )._values else: try: result = tools.to_datetime( diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 8bb5948029ca1..3f24bff7f303d 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -332,7 +332,7 @@ def test_setitem2(self): def test_setitem_boolean(self, float_frame): df = float_frame.copy() - values = float_frame.values + values = float_frame.values.copy() df[df["A"] > 0] = 4 values[values[:, 0] > 0] = 4 @@ -368,16 +368,18 @@ def test_setitem_boolean(self, float_frame): df[df * 0] = 2 # index with DataFrame + df_orig = df.copy() mask = df > np.abs(df) - expected = df.copy() df[df > np.abs(df)] = np.nan - expected.values[mask.values] = np.nan + values = df_orig.values.copy() + values[mask.values] = np.nan + expected = DataFrame(values, index=df_orig.index, columns=df_orig.columns) tm.assert_frame_equal(df, expected) # set from DataFrame - expected = df.copy() df[df > np.abs(df)] = df * 2 - np.putmask(expected.values, mask.values, df.values * 2) + np.putmask(values, mask.values, df.values * 2) + expected = DataFrame(values, index=df_orig.index, columns=df_orig.columns) tm.assert_frame_equal(df, expected) def test_setitem_cast(self, float_frame): @@ -620,7 +622,7 @@ def test_setitem_fancy_scalar(self, float_frame): for idx in f.index[::5]: i = f.index.get_loc(idx) val = np.random.randn() - expected.values[i, j] = val + expected.iloc[i, j] = val ix[idx, col] = val tm.assert_frame_equal(f, expected) @@ -653,16 +655,20 @@ def test_setitem_fancy_boolean(self, float_frame): # from 2d, set with booleans frame = float_frame.copy() expected = float_frame.copy() + values = expected.values.copy() mask = frame["A"] > 0 frame.loc[mask] = 0.0 - expected.values[mask.values] = 0.0 + values[mask.values] = 0.0 + expected = DataFrame(values, index=expected.index, columns=expected.columns) tm.assert_frame_equal(frame, expected) frame = float_frame.copy() expected = float_frame.copy() + values = expected.values.copy() frame.loc[mask, ["A", "B"]] = 0.0 - expected.values[mask.values, :2] = 0.0 + values[mask.values, :2] = 0.0 + expected = DataFrame(values, index=expected.index, columns=expected.columns) tm.assert_frame_equal(frame, expected) def test_getitem_fancy_ints(self, float_frame): diff --git a/pandas/tests/frame/indexing/test_insert.py b/pandas/tests/frame/indexing/test_insert.py index f67ecf601f838..4a92f93b75974 100644 --- a/pandas/tests/frame/indexing/test_insert.py +++ b/pandas/tests/frame/indexing/test_insert.py @@ -72,7 +72,7 @@ def test_insert_with_columns_dups(self): ) tm.assert_frame_equal(df, exp) - def test_insert_item_cache(self, using_array_manager): + def test_insert_item_cache(self, using_array_manager, using_copy_on_write): df = DataFrame(np.random.randn(4, 3)) ser = df[0] @@ -86,9 +86,14 @@ def test_insert_item_cache(self, using_array_manager): for n in range(100): df[n + 3] = df[1] * n - ser.values[0] = 99 - - assert df.iloc[0, 0] == df[0][0] + if using_copy_on_write: + ser.iloc[0] = 99 + assert df.iloc[0, 0] == df[0][0] + assert df.iloc[0, 0] != 99 + else: + ser.values[0] = 99 + assert df.iloc[0, 0] == df[0][0] + assert df.iloc[0, 0] == 99 def test_insert_EA_no_warning(self): # PerformanceWarning about fragmented frame should not be raised when diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index 62f05cb523b1b..ec3285a9fde34 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -1006,8 +1006,9 @@ def test_setitem_boolean_mask(self, mask_type, float_frame): result = df.copy() result[mask] = np.nan - expected = df.copy() - expected.values[np.array(mask)] = np.nan + expected = df.values.copy() + expected[np.array(mask)] = np.nan + expected = DataFrame(expected, index=df.index, columns=df.columns) tm.assert_frame_equal(result, expected) @pytest.mark.xfail(reason="Currently empty indexers are treated as all False") diff --git a/pandas/tests/frame/methods/test_copy.py b/pandas/tests/frame/methods/test_copy.py index 1c0b0755e7d94..1e685fcce9f05 100644 --- a/pandas/tests/frame/methods/test_copy.py +++ b/pandas/tests/frame/methods/test_copy.py @@ -18,6 +18,7 @@ def test_copy_index_name_checking(self, float_frame, attr): getattr(cp, attr).name = "foo" assert getattr(float_frame, attr).name is None + @td.skip_copy_on_write_invalid_test def test_copy_cache(self): # GH#31784 _item_cache not cleared on copy causes incorrect reads after updates df = DataFrame({"a": [1]}) diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py index 5082aea354d6d..c4f5b60918e84 100644 --- a/pandas/tests/frame/methods/test_cov_corr.py +++ b/pandas/tests/frame/methods/test_cov_corr.py @@ -218,9 +218,8 @@ def test_corr_item_cache(self, using_copy_on_write): _ = df.corr(numeric_only=True) if using_copy_on_write: - # TODO(CoW) we should disallow this, so `df` doesn't get updated - ser.values[0] = 99 - assert df.loc[0, "A"] == 99 + ser.iloc[0] = 99 + assert df.loc[0, "A"] == 0 else: # Check that the corr didn't break link between ser and df ser.values[0] = 99 diff --git a/pandas/tests/frame/methods/test_fillna.py b/pandas/tests/frame/methods/test_fillna.py index 0645afd861029..59d170d692620 100644 --- a/pandas/tests/frame/methods/test_fillna.py +++ b/pandas/tests/frame/methods/test_fillna.py @@ -550,8 +550,9 @@ def test_fillna_dataframe(self): tm.assert_frame_equal(result, expected) def test_fillna_columns(self): - df = DataFrame(np.random.randn(10, 10)) - df.values[:, ::2] = np.nan + arr = np.random.randn(10, 10) + arr[:, ::2] = np.nan + df = DataFrame(arr) result = df.fillna(method="ffill", axis=1) expected = df.T.fillna(method="pad").T diff --git a/pandas/tests/frame/methods/test_join.py b/pandas/tests/frame/methods/test_join.py index 9a4837939aceb..437a2908792ee 100644 --- a/pandas/tests/frame/methods/test_join.py +++ b/pandas/tests/frame/methods/test_join.py @@ -418,7 +418,7 @@ def test_join(self, multiindex_dataframe_random_data): b = frame.loc[frame.index[2:], ["B", "C"]] joined = a.join(b, how="outer").reindex(frame.index) - expected = frame.copy().values + expected = frame.copy().values.copy() expected[np.isnan(joined.values)] = np.nan expected = DataFrame(expected, index=frame.index, columns=frame.columns) diff --git a/pandas/tests/frame/methods/test_quantile.py b/pandas/tests/frame/methods/test_quantile.py index 0a4d1dedfae9d..d001a3586b98a 100644 --- a/pandas/tests/frame/methods/test_quantile.py +++ b/pandas/tests/frame/methods/test_quantile.py @@ -767,7 +767,9 @@ def test_quantile_empty_no_columns(self, interp_method): expected.columns.name = "captain tightpants" tm.assert_frame_equal(result, expected) - def test_quantile_item_cache(self, using_array_manager, interp_method): + def test_quantile_item_cache( + self, using_array_manager, interp_method, using_copy_on_write + ): # previous behavior incorrect retained an invalid _item_cache entry interpolation, method = interp_method df = DataFrame(np.random.randn(4, 3), columns=["A", "B", "C"]) @@ -777,9 +779,15 @@ def test_quantile_item_cache(self, using_array_manager, interp_method): assert len(df._mgr.blocks) == 2 df.quantile(numeric_only=False, interpolation=interpolation, method=method) - ser.values[0] = 99 - assert df.iloc[0, 0] == df["A"][0] + if using_copy_on_write: + ser.iloc[0] = 99 + assert df.iloc[0, 0] == df["A"][0] + assert df.iloc[0, 0] != 99 + else: + ser.values[0] = 99 + assert df.iloc[0, 0] == df["A"][0] + assert df.iloc[0, 0] == 99 def test_invalid_method(self): with pytest.raises(ValueError, match="Invalid method: foo"): diff --git a/pandas/tests/frame/methods/test_sort_values.py b/pandas/tests/frame/methods/test_sort_values.py index 43fcb25d122fb..8b6f0c6292a41 100644 --- a/pandas/tests/frame/methods/test_sort_values.py +++ b/pandas/tests/frame/methods/test_sort_values.py @@ -598,7 +598,7 @@ def test_sort_values_nat_na_position_default(self): result = expected.sort_values(["A", "date"]) tm.assert_frame_equal(result, expected) - def test_sort_values_item_cache(self, using_array_manager): + def test_sort_values_item_cache(self, using_array_manager, using_copy_on_write): # previous behavior incorrect retained an invalid _item_cache entry df = DataFrame(np.random.randn(4, 3), columns=["A", "B", "C"]) df["D"] = df["A"] * 2 @@ -607,9 +607,15 @@ def test_sort_values_item_cache(self, using_array_manager): assert len(df._mgr.blocks) == 2 df.sort_values(by="A") - ser.values[0] = 99 - assert df.iloc[0, 0] == df["A"][0] + if using_copy_on_write: + ser.iloc[0] = 99 + assert df.iloc[0, 0] == df["A"][0] + assert df.iloc[0, 0] != 99 + else: + ser.values[0] = 99 + assert df.iloc[0, 0] == df["A"][0] + assert df.iloc[0, 0] == 99 def test_sort_values_reshaping(self): # GH 39426 diff --git a/pandas/tests/frame/methods/test_transpose.py b/pandas/tests/frame/methods/test_transpose.py index 4f4477ad993ca..df3072e63d462 100644 --- a/pandas/tests/frame/methods/test_transpose.py +++ b/pandas/tests/frame/methods/test_transpose.py @@ -111,11 +111,18 @@ def test_transpose_float(self, float_frame): assert s.dtype == np.object_ @td.skip_array_manager_invalid_test - def test_transpose_get_view(self, float_frame): + def test_transpose_get_view(self, request, float_frame, using_copy_on_write): dft = float_frame.T - dft.values[:, 5:10] = 5 - - assert (float_frame.values[5:10] == 5).all() + dft.iloc[:, 5:10] = 5 + + if using_copy_on_write: + # TODO(CoW) transpose + request.node.add_marker( + pytest.mark.xfail(reason="transpose doesn't yet do CoW") + ) + assert (float_frame.values[5:10] != 5).all() + else: + assert (float_frame.values[5:10] == 5).all() @td.skip_array_manager_invalid_test def test_transpose_get_view_dt64tzget_view(self): diff --git a/pandas/tests/frame/methods/test_values.py b/pandas/tests/frame/methods/test_values.py index 1f134af68be6b..0be4ad2de7f9c 100644 --- a/pandas/tests/frame/methods/test_values.py +++ b/pandas/tests/frame/methods/test_values.py @@ -16,9 +16,14 @@ class TestDataFrameValues: @td.skip_array_manager_invalid_test - def test_values(self, float_frame): - float_frame.values[:, 0] = 5.0 - assert (float_frame.values[:, 0] == 5).all() + def test_values(self, float_frame, using_copy_on_write): + if using_copy_on_write: + with pytest.raises(ValueError, match="read-only"): + float_frame.values[:, 0] = 5.0 + assert (float_frame.values[:, 0] != 5).all() + else: + float_frame.values[:, 0] = 5.0 + assert (float_frame.values[:, 0] == 5).all() def test_more_values(self, float_string_frame): values = float_string_frame.values diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index 04f4766e49227..44e78d8368c74 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -85,7 +85,13 @@ def test_consolidate_inplace(self, float_frame): for letter in range(ord("A"), ord("Z")): float_frame[chr(letter)] = chr(letter) - def test_modify_values(self, float_frame): + def test_modify_values(self, float_frame, using_copy_on_write): + if using_copy_on_write: + with pytest.raises(ValueError, match="read-only"): + float_frame.values[5] = 5 + assert (float_frame.values[5] != 5).all() + return + float_frame.values[5] = 5 assert (float_frame.values[5] == 5).all() diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index e009ba45514a2..4bb340fc94bfb 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -2102,13 +2102,14 @@ def test_constructor_frame_copy(self, float_frame): def test_constructor_ndarray_copy(self, float_frame, using_array_manager): if not using_array_manager: - df = DataFrame(float_frame.values) + arr = float_frame.values.copy() + df = DataFrame(arr) - float_frame.values[5] = 5 + arr[5] = 5 assert (df.values[5] == 5).all() - df = DataFrame(float_frame.values, copy=True) - float_frame.values[6] = 6 + df = DataFrame(arr, copy=True) + arr[6] = 6 assert not (df.values[6] == 6).all() else: arr = float_frame.values.copy() diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 1e16e353cc1a4..d95f50fc043f3 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -354,8 +354,9 @@ def test_cython_api2(): def test_cython_median(): - df = DataFrame(np.random.randn(1000)) - df.values[::2] = np.nan + arr = np.random.randn(1000) + arr[::2] = np.nan + df = DataFrame(arr) labels = np.random.randint(0, 50, size=1000).astype(float) labels[::17] = np.nan diff --git a/pandas/tests/indexing/multiindex/test_setitem.py b/pandas/tests/indexing/multiindex/test_setitem.py index 3ca057b80e578..a0affa836c679 100644 --- a/pandas/tests/indexing/multiindex/test_setitem.py +++ b/pandas/tests/indexing/multiindex/test_setitem.py @@ -284,7 +284,7 @@ def test_series_setitem(self, multiindex_year_month_day_dataframe_random_data): def test_frame_getitem_setitem_boolean(self, multiindex_dataframe_random_data): frame = multiindex_dataframe_random_data df = frame.T.copy() - values = df.values + values = df.values.copy() result = df[df > 0] expected = df.where(df > 0) @@ -487,12 +487,19 @@ def test_setitem_new_column_all_na(self): @td.skip_array_manager_invalid_test # df["foo"] select multiple columns -> .values # is not a view -def test_frame_setitem_view_direct(multiindex_dataframe_random_data): +def test_frame_setitem_view_direct( + multiindex_dataframe_random_data, using_copy_on_write +): # this works because we are modifying the underlying array # really a no-no df = multiindex_dataframe_random_data.T - df["foo"].values[:] = 0 - assert (df["foo"].values == 0).all() + if using_copy_on_write: + with pytest.raises(ValueError, match="read-only"): + df["foo"].values[:] = 0 + assert (df["foo"].values != 0).all() + else: + df["foo"].values[:] = 0 + assert (df["foo"].values == 0).all() def test_frame_setitem_copy_raises( diff --git a/pandas/tests/indexing/multiindex/test_slice.py b/pandas/tests/indexing/multiindex/test_slice.py index dd16cca28677c..96d631964ab65 100644 --- a/pandas/tests/indexing/multiindex/test_slice.py +++ b/pandas/tests/indexing/multiindex/test_slice.py @@ -750,7 +750,7 @@ def test_int_series_slicing(self, multiindex_year_month_day_dataframe_random_dat exp = ymd["A"].copy() s[5:] = 0 - exp.values[5:] = 0 + exp.iloc[5:] = 0 tm.assert_numpy_array_equal(s.values, exp.values) result = ymd[5:] diff --git a/pandas/tests/indexing/test_chaining_and_caching.py b/pandas/tests/indexing/test_chaining_and_caching.py index 5e7abeb86705b..82c07f95f9898 100644 --- a/pandas/tests/indexing/test_chaining_and_caching.py +++ b/pandas/tests/indexing/test_chaining_and_caching.py @@ -582,7 +582,7 @@ def test_cache_updating(self): assert "Hello Friend" in df["A"].index assert "Hello Friend" in df["B"].index - def test_cache_updating2(self): + def test_cache_updating2(self, using_copy_on_write): # 10264 df = DataFrame( np.zeros((5, 5), dtype="int64"), @@ -590,7 +590,14 @@ def test_cache_updating2(self): index=range(5), ) df["f"] = 0 + df_orig = df.copy() # TODO(CoW) protect underlying values of being written to? + if using_copy_on_write: + with pytest.raises(ValueError, match="read-only"): + df.f.values[3] = 1 + tm.assert_frame_equal(df, df_orig) + return + df.f.values[3] = 1 df.f.values[3] = 2 diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index 27dd16172b992..2ea8daf1d9b9b 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -110,7 +110,7 @@ def test_iloc_setitem_fullcol_categorical(self, indexer, key, using_array_manage tm.assert_frame_equal(df, expected) @pytest.mark.parametrize("box", [array, Series]) - def test_iloc_setitem_ea_inplace(self, frame_or_series, box): + def test_iloc_setitem_ea_inplace(self, frame_or_series, box, using_copy_on_write): # GH#38952 Case with not setting a full column # IntegerArray without NAs arr = array([1, 2, 3, 4]) @@ -131,7 +131,11 @@ def test_iloc_setitem_ea_inplace(self, frame_or_series, box): # Check that we are actually in-place if frame_or_series is Series: - assert obj.values is values + if using_copy_on_write: + assert obj.values is not values + assert np.shares_memory(obj.values, values) + else: + assert obj.values is values else: assert np.shares_memory(obj[0].values, values) diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 5445053027940..fd8f01383963d 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -2576,8 +2576,10 @@ def test_loc_setitem_boolean_and_column(self, float_frame): mask = float_frame["A"] > 0 float_frame.loc[mask, "B"] = 0 - expected.values[mask.values, 1] = 0 + values = expected.values.copy() + values[mask.values, 1] = 0 + expected = DataFrame(values, index=expected.index, columns=expected.columns) tm.assert_frame_equal(float_frame, expected) def test_loc_setitem_ndframe_values_alignment(self, using_copy_on_write): diff --git a/pandas/tests/io/pytables/test_round_trip.py b/pandas/tests/io/pytables/test_round_trip.py index 5c7c4f9ce0b75..99534dceaf7be 100644 --- a/pandas/tests/io/pytables/test_round_trip.py +++ b/pandas/tests/io/pytables/test_round_trip.py @@ -371,8 +371,8 @@ def test_frame(compression, setup_path): df = tm.makeDataFrame() # put in some random NAs - df.values[0, 0] = np.nan - df.values[5, 3] = np.nan + df.iloc[0, 0] = np.nan + df.iloc[5, 3] = np.nan _check_roundtrip_table( df, tm.assert_frame_equal, path=setup_path, compression=compression diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index 1a52e02ee314f..ecd2a78862b00 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -899,7 +899,7 @@ def expected(self, dtype): arr = np.arange(5).astype(dtype) ser = Series(arr) ser = ser.astype(object) - ser.values[0] = np.timedelta64(4, "ns") + ser.iloc[0] = np.timedelta64(4, "ns") return ser @pytest.fixture diff --git a/pandas/tests/series/methods/test_fillna.py b/pandas/tests/series/methods/test_fillna.py index 9fded26c37caf..df37fa031e31c 100644 --- a/pandas/tests/series/methods/test_fillna.py +++ b/pandas/tests/series/methods/test_fillna.py @@ -32,7 +32,7 @@ def test_fillna_nat(self): filled2 = series.fillna(value=series.values[2]) expected = series.copy() - expected.values[3] = expected.values[2] + expected.iloc[3] = expected.iloc[2] tm.assert_series_equal(filled, expected) tm.assert_series_equal(filled2, expected) diff --git a/pandas/tests/window/test_pairwise.py b/pandas/tests/window/test_pairwise.py index 2b656a5d4e7b9..bcec0613b38b0 100644 --- a/pandas/tests/window/test_pairwise.py +++ b/pandas/tests/window/test_pairwise.py @@ -93,7 +93,9 @@ def test_flex_binary_frame(method, frame): tm.assert_frame_equal(res2, exp) frame2 = frame.copy() - frame2.values[:] = np.random.randn(*frame2.shape) + frame2 = DataFrame( + np.random.randn(*frame2.shape), index=frame2.index, columns=frame2.columns + ) res3 = getattr(frame.rolling(window=10), method)(frame2) exp = DataFrame( diff --git a/pandas/util/_test_decorators.py b/pandas/util/_test_decorators.py index de100dba8144d..44409b0051e5c 100644 --- a/pandas/util/_test_decorators.py +++ b/pandas/util/_test_decorators.py @@ -299,3 +299,8 @@ def mark_array_manager_not_yet_implemented(request) -> None: get_option("mode.copy_on_write"), reason="Not yet implemented/adapted for Copy-on-Write mode", ) + +skip_copy_on_write_invalid_test = pytest.mark.skipif( + get_option("mode.copy_on_write"), + reason="Test not valid for Copy-on-Write mode", +) From 5ec2763f8d030d7f5ba41261a418db40e54555d3 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 14 Feb 2023 22:00:04 +0100 Subject: [PATCH 2/7] actually commit tests that I wrote --- pandas/tests/copy_view/test_array.py | 102 +++++++++++++++++++++++++++ 1 file changed, 102 insertions(+) create mode 100644 pandas/tests/copy_view/test_array.py diff --git a/pandas/tests/copy_view/test_array.py b/pandas/tests/copy_view/test_array.py new file mode 100644 index 0000000000000..326182c520b9e --- /dev/null +++ b/pandas/tests/copy_view/test_array.py @@ -0,0 +1,102 @@ +import numpy as np +import pytest + +from pandas import ( + DataFrame, + Series, +) +import pandas._testing as tm +from pandas.tests.copy_view.util import get_array + +# ----------------------------------------------------------------------------- +# Copy/view behaviour for accessing underlying array of Series/DataFrame + + +def test_series_values(using_copy_on_write): + ser = Series([1, 2, 3], name="name") + ser_orig = ser.copy() + + arr = ser.values + + if using_copy_on_write: + # .values still gives a view but is read-only + assert np.shares_memory(arr, get_array(ser, "name")) + assert arr.flags.writeable is False + + # mutating series through arr therefore doesn't work + with pytest.raises(ValueError, match="read-only"): + arr[0] = 0 + tm.assert_series_equal(ser, ser_orig) + + # mutating the series itself still works + ser.iloc[0] = 0 + assert ser.values[0] == 0 + else: + assert arr.flags.writeable is True + arr[0] = 0 + assert ser.iloc[0] == 0 + + +def test_dataframe_values(using_copy_on_write, using_array_manager): + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + df_orig = df.copy() + + arr = df.values + + if using_copy_on_write: + # .values still gives a view but is read-only + assert np.shares_memory(arr, get_array(df, "a")) + assert arr.flags.writeable is False + + # mutating series through arr therefore doesn't work + with pytest.raises(ValueError, match="read-only"): + arr[0, 0] = 0 + tm.assert_frame_equal(df, df_orig) + + # mutating the series itself still works + df.iloc[0, 0] = 0 + assert df.values[0, 0] == 0 + else: + assert arr.flags.writeable is True + arr[0, 0] = 0 + if not using_array_manager: + assert df.iloc[0, 0] == 0 + else: + tm.assert_frame_equal(df, df_orig) + + +def test_series_to_numpy(using_copy_on_write): + ser = Series([1, 2, 3], name="name") + ser_orig = ser.copy() + + # default: copy=False, no dtype or NAs + arr = ser.to_numpy() + if using_copy_on_write: + # to_numpy still gives a view but is read-only + assert np.shares_memory(arr, get_array(ser, "name")) + assert arr.flags.writeable is False + + # mutating series through arr therefore doesn't work + with pytest.raises(ValueError, match="read-only"): + arr[0] = 0 + tm.assert_series_equal(ser, ser_orig) + + # mutating the series itself still works + ser.iloc[0] = 0 + assert ser.values[0] == 0 + else: + assert arr.flags.writeable is True + arr[0] = 0 + assert ser.iloc[0] == 0 + + # specify copy=False gives a writeable array + ser = Series([1, 2, 3], name="name") + arr = ser.to_numpy(copy=True) + assert not np.shares_memory(arr, get_array(ser, "name")) + assert arr.flags.writeable is True + + # specifying a dtype that already causes a copy also gives a writeable array + ser = Series([1, 2, 3], name="name") + arr = ser.to_numpy(dtype="float64") + assert not np.shares_memory(arr, get_array(ser, "name")) + assert arr.flags.writeable is True From 3ade20ddf455b5ab0d025f7fcb34a1cd2f6865a2 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 14 Feb 2023 22:10:29 +0100 Subject: [PATCH 3/7] also handle Series/DataFrame __array__ --- pandas/core/generic.py | 8 +++++++- pandas/core/series.py | 8 +++++++- pandas/tests/copy_view/test_array.py | 18 ++++++++++++++---- 3 files changed, 28 insertions(+), 6 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index cb321f0584294..52c0f3d641d0f 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1989,7 +1989,13 @@ def empty(self) -> bool_t: __array_priority__: int = 1000 def __array__(self, dtype: npt.DTypeLike | None = None) -> np.ndarray: - return np.asarray(self._values, dtype=dtype) + values = self._values + arr = np.asarray(values, dtype=dtype) + if arr is values and using_copy_on_write(): + # TODO(CoW) also properly handle extension dtypes + arr = arr.view() + arr.flags.writeable = False + return arr @final def __array_ufunc__( diff --git a/pandas/core/series.py b/pandas/core/series.py index c8b431abd1958..03fc09ca9e40b 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -874,7 +874,13 @@ def __array__(self, dtype: npt.DTypeLike | None = None) -> np.ndarray: array(['1999-12-31T23:00:00.000000000', ...], dtype='datetime64[ns]') """ - return np.asarray(self._values, dtype) + values = self._values + arr = np.asarray(values, dtype=dtype) + if arr is values and using_copy_on_write(): + # TODO(CoW) also properly handle extension dtypes + arr = arr.view() + arr.flags.writeable = False + return arr # ---------------------------------------------------------------------- # Unary Methods diff --git a/pandas/tests/copy_view/test_array.py b/pandas/tests/copy_view/test_array.py index 326182c520b9e..501ef27bc291e 100644 --- a/pandas/tests/copy_view/test_array.py +++ b/pandas/tests/copy_view/test_array.py @@ -12,11 +12,16 @@ # Copy/view behaviour for accessing underlying array of Series/DataFrame -def test_series_values(using_copy_on_write): +@pytest.mark.parametrize( + "method", + [lambda ser: ser.values, lambda ser: np.asarray(ser)], + ids=["values", "asarray"], +) +def test_series_values(using_copy_on_write, method): ser = Series([1, 2, 3], name="name") ser_orig = ser.copy() - arr = ser.values + arr = method(ser) if using_copy_on_write: # .values still gives a view but is read-only @@ -37,11 +42,16 @@ def test_series_values(using_copy_on_write): assert ser.iloc[0] == 0 -def test_dataframe_values(using_copy_on_write, using_array_manager): +@pytest.mark.parametrize( + "method", + [lambda df: df.values, lambda df: np.asarray(df)], + ids=["values", "asarray"], +) +def test_dataframe_values(using_copy_on_write, using_array_manager, method): df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) df_orig = df.copy() - arr = df.values + arr = method(df) if using_copy_on_write: # .values still gives a view but is read-only From 41239189af4a4d02340b565c2f2e0e286b3b8ce8 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 15 Feb 2023 22:37:36 +0100 Subject: [PATCH 4/7] fix tests --- pandas/tests/frame/indexing/test_where.py | 2 +- pandas/tests/series/indexing/test_indexing.py | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/tests/frame/indexing/test_where.py b/pandas/tests/frame/indexing/test_where.py index f0fb0a0595cbd..f9c6079d9e68c 100644 --- a/pandas/tests/frame/indexing/test_where.py +++ b/pandas/tests/frame/indexing/test_where.py @@ -982,7 +982,7 @@ def test_where_dt64_2d(): df = DataFrame(dta, columns=["A", "B"]) - mask = np.asarray(df.isna()) + mask = np.asarray(df.isna()).copy() mask[:, 1] = True # setting all of one column, none of the other diff --git a/pandas/tests/series/indexing/test_indexing.py b/pandas/tests/series/indexing/test_indexing.py index f12d752a1a764..b347691c3c101 100644 --- a/pandas/tests/series/indexing/test_indexing.py +++ b/pandas/tests/series/indexing/test_indexing.py @@ -95,8 +95,6 @@ def test_basic_getitem_dt64tz_values(): def test_getitem_setitem_ellipsis(): s = Series(np.random.randn(10)) - np.fix(s) - result = s[...] tm.assert_series_equal(result, s) From 11b8ecda7d417eb74ed1043478f3b5d2ae490eb0 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 6 Mar 2023 11:15:57 +0100 Subject: [PATCH 5/7] update tests --- pandas/tests/frame/methods/test_transpose.py | 6 +----- pandas/tests/indexing/test_chaining_and_caching.py | 1 - 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/pandas/tests/frame/methods/test_transpose.py b/pandas/tests/frame/methods/test_transpose.py index aa2ac07e26c0c..755465bfdd991 100644 --- a/pandas/tests/frame/methods/test_transpose.py +++ b/pandas/tests/frame/methods/test_transpose.py @@ -110,15 +110,11 @@ def test_transpose_float(self, float_frame): assert s.dtype == np.object_ @td.skip_array_manager_invalid_test - def test_transpose_get_view(self, request, float_frame, using_copy_on_write): + def test_transpose_get_view(self, float_frame, using_copy_on_write): dft = float_frame.T dft.iloc[:, 5:10] = 5 if using_copy_on_write: - # TODO(CoW) transpose - request.node.add_marker( - pytest.mark.xfail(reason="transpose doesn't yet do CoW") - ) assert (float_frame.values[5:10] != 5).all() else: assert (float_frame.values[5:10] == 5).all() diff --git a/pandas/tests/indexing/test_chaining_and_caching.py b/pandas/tests/indexing/test_chaining_and_caching.py index c01d099c2fa0a..d2224988b70fc 100644 --- a/pandas/tests/indexing/test_chaining_and_caching.py +++ b/pandas/tests/indexing/test_chaining_and_caching.py @@ -572,7 +572,6 @@ def test_cache_updating2(self, using_copy_on_write): ) df["f"] = 0 df_orig = df.copy() - # TODO(CoW) protect underlying values of being written to? if using_copy_on_write: with pytest.raises(ValueError, match="read-only"): df.f.values[3] = 1 From fbbd9aa631977347f432046818e3bcfc48dbdac0 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 6 Mar 2023 11:30:04 +0100 Subject: [PATCH 6/7] add failing test for np.fix --- pandas/tests/series/test_ufunc.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/pandas/tests/series/test_ufunc.py b/pandas/tests/series/test_ufunc.py index f143155bc07da..888991b8c98a9 100644 --- a/pandas/tests/series/test_ufunc.py +++ b/pandas/tests/series/test_ufunc.py @@ -5,6 +5,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd import pandas._testing as tm from pandas.arrays import SparseArray @@ -451,3 +453,16 @@ def add3(x, y, z): ) with pytest.raises(NotImplementedError, match=re.escape(msg)): ufunc(ser, ser, df) + + +# TODO(CoW) see https://github.com/pandas-dev/pandas/pull/51082 +td.skip_copy_on_write_not_yet_implemented + + +def test_np_fix(): + # np.fix is not a ufunc but is composed of several ufunc calls under the hood + # with `out` and `where` keywords + ser = pd.Series([-1.5, -0.5, 0.5, 1.5]) + result = np.fix(ser) + expected = pd.Series([-1.0, -0.0, 0.0, 1.0]) + tm.assert_series_equal(result, expected) From 1bc1fed5de6fe11b5df2c92596eeafdfaa5429c5 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 13 Mar 2023 11:43:40 +0100 Subject: [PATCH 7/7] proper skip --- pandas/tests/series/test_ufunc.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/tests/series/test_ufunc.py b/pandas/tests/series/test_ufunc.py index 888991b8c98a9..ac36103edcdcc 100644 --- a/pandas/tests/series/test_ufunc.py +++ b/pandas/tests/series/test_ufunc.py @@ -456,9 +456,7 @@ def add3(x, y, z): # TODO(CoW) see https://github.com/pandas-dev/pandas/pull/51082 -td.skip_copy_on_write_not_yet_implemented - - +@td.skip_copy_on_write_not_yet_implemented def test_np_fix(): # np.fix is not a ufunc but is composed of several ufunc calls under the hood # with `out` and `where` keywords