From e823c97a39615193265ac8d96718c0a47edcd11b Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 2 Oct 2023 16:58:28 +0100 Subject: [PATCH 1/7] BUG: Inserting ndim=0 array does not infer string dtype --- doc/source/whatsnew/v2.1.2.rst | 2 +- pandas/core/construction.py | 9 ++++++++- pandas/tests/frame/indexing/test_indexing.py | 12 ++++++++++++ 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst index 1a25b848e0f84..b6c86621537f8 100644 --- a/doc/source/whatsnew/v2.1.2.rst +++ b/doc/source/whatsnew/v2.1.2.rst @@ -24,7 +24,7 @@ Bug fixes ~~~~~~~~~ - Fixed bug in :meth:`DataFrame.resample` not respecting ``closed`` and ``label`` arguments for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55282`) - Fixed bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55281`) -- +- Fixed bug in :meth:`DataFrame.__setitem__` not inferring string dtype for zero-dimensional array with ``infer_string=True`` (:issue:`TODO`) .. --------------------------------------------------------------------------- .. _whatsnew_212.other: diff --git a/pandas/core/construction.py b/pandas/core/construction.py index aaac0dc73486f..bb1b4978d2367 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -562,7 +562,14 @@ def sanitize_array( if not is_list_like(data): if index is None: raise ValueError("index must be specified when data is not list-like") - data = construct_1d_arraylike_from_scalar(data, len(index), dtype) + if isinstance(data, str) and using_pyarrow_string_dtype(): + from pandas.core.arrays.string_ import StringDtype + + dtype = StringDtype("pyarrow_numpy") + data = dtype.construct_array_type()._from_sequence_of_strings([data]) + else: + data = construct_1d_arraylike_from_scalar(data, len(index), dtype) + return data elif isinstance(data, ABCExtensionArray): diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 370cbf0f33174..ac4a820b1b319 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -1905,6 +1905,18 @@ def test_adding_new_conditional_column() -> None: tm.assert_frame_equal(df, expected) +def test_add_new_column_infer_string(): + # GH# + df = DataFrame({"x": [1]}) + with pd.option_context("future.infer_string", True): + df.loc[df["x"] == 1, "y"] = "1" + expected = DataFrame( + {"x": [1], "y": Series(["1"], dtype="string[pyarrow_numpy]")}, + columns=Index(["x", "y"], dtype="string[pyarrow_numpy]"), + ) + tm.assert_frame_equal(df, expected) + + class TestSetitemValidation: # This is adapted from pandas/tests/arrays/masked/test_indexing.py # but checks for warnings instead of errors. From 18b040eccf4ccd59a33ef528620f86fc0c791693 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 2 Oct 2023 17:06:49 +0100 Subject: [PATCH 2/7] Fix --- pandas/core/construction.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index bb1b4978d2367..1b0150dbab602 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -566,7 +566,9 @@ def sanitize_array( from pandas.core.arrays.string_ import StringDtype dtype = StringDtype("pyarrow_numpy") - data = dtype.construct_array_type()._from_sequence_of_strings([data]) + data = dtype.construct_array_type()._from_sequence_of_strings( + [data] * len(index) + ) else: data = construct_1d_arraylike_from_scalar(data, len(index), dtype) From b3ac326df06ad26e6f82114ea8357df8610e33b8 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Tue, 3 Oct 2023 11:44:12 +0200 Subject: [PATCH 3/7] Update v2.1.2.rst --- doc/source/whatsnew/v2.1.2.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst index b6c86621537f8..cf724c400231b 100644 --- a/doc/source/whatsnew/v2.1.2.rst +++ b/doc/source/whatsnew/v2.1.2.rst @@ -24,7 +24,7 @@ Bug fixes ~~~~~~~~~ - Fixed bug in :meth:`DataFrame.resample` not respecting ``closed`` and ``label`` arguments for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55282`) - Fixed bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55281`) -- Fixed bug in :meth:`DataFrame.__setitem__` not inferring string dtype for zero-dimensional array with ``infer_string=True`` (:issue:`TODO`) +- Fixed bug in :meth:`DataFrame.__setitem__` not inferring string dtype for zero-dimensional array with ``infer_string=True`` (:issue:`55366`) .. --------------------------------------------------------------------------- .. _whatsnew_212.other: From c5c800977d75f9833e8f8dde74e8cd2bb81c24eb Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Tue, 3 Oct 2023 11:44:32 +0200 Subject: [PATCH 4/7] Update test_indexing.py --- pandas/tests/frame/indexing/test_indexing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index ac4a820b1b319..5902411aca2f9 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -1906,7 +1906,7 @@ def test_adding_new_conditional_column() -> None: def test_add_new_column_infer_string(): - # GH# + # GH#55366 df = DataFrame({"x": [1]}) with pd.option_context("future.infer_string", True): df.loc[df["x"] == 1, "y"] = "1" From 2f2a99b3c26e0cca4a35057c8ff42052a3b0a9fa Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Tue, 3 Oct 2023 18:02:51 +0200 Subject: [PATCH 5/7] Update test_indexing.py --- pandas/tests/frame/indexing/test_indexing.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 5902411aca2f9..6d4eedb49ff83 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -1907,6 +1907,7 @@ def test_adding_new_conditional_column() -> None: def test_add_new_column_infer_string(): # GH#55366 + pytest.importorskip("pyarrow") df = DataFrame({"x": [1]}) with pd.option_context("future.infer_string", True): df.loc[df["x"] == 1, "y"] = "1" From edc9b7167269c370df8630ff49ba1bcdb5b27c70 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Tue, 3 Oct 2023 18:22:00 +0200 Subject: [PATCH 6/7] Update v2.1.2.rst --- doc/source/whatsnew/v2.1.2.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst index f94dceafb4a3d..98e8d8fe9a4a9 100644 --- a/doc/source/whatsnew/v2.1.2.rst +++ b/doc/source/whatsnew/v2.1.2.rst @@ -22,9 +22,9 @@ Fixed regressions Bug fixes ~~~~~~~~~ +- Fixed bug in :meth:`DataFrame.__setitem__` not inferring string dtype for zero-dimensional array with ``infer_string=True`` (:issue:`55366`) - Fixed bug in :meth:`DataFrame.resample` not respecting ``closed`` and ``label`` arguments for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55282`) - Fixed bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55281`) -- Fixed bug in :meth:`DataFrame.__setitem__` not inferring string dtype for zero-dimensional array with ``infer_string=True`` (:issue:`55366`) - Silence ``Period[B]`` warnings introduced by :issue:`53446` during normal plotting activity (:issue:`55138`) - From 75cf7df7ec1966c4eef0e8b25a0ecfc3d752d577 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 3 Oct 2023 23:44:51 +0200 Subject: [PATCH 7/7] Fix --- pandas/core/construction.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 1b0150dbab602..e661d590ab330 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -566,11 +566,7 @@ def sanitize_array( from pandas.core.arrays.string_ import StringDtype dtype = StringDtype("pyarrow_numpy") - data = dtype.construct_array_type()._from_sequence_of_strings( - [data] * len(index) - ) - else: - data = construct_1d_arraylike_from_scalar(data, len(index), dtype) + data = construct_1d_arraylike_from_scalar(data, len(index), dtype) return data