diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 85d9acff353be..81f5ad34c53a0 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -700,7 +700,7 @@ Conversion Strings ^^^^^^^ -- +- Bug in the conversion from ``pyarrow.ChunkedArray`` to :class:`~arrays.StringArray` when the original had zero chunks (:issue:`41040`) - Interval diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 0a0bfccc0ea15..6271a13875371 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -126,7 +126,12 @@ def __from_arrow__( bool_arr = BooleanArray._from_sequence(np.array(arr)) results.append(bool_arr) - return BooleanArray._concat_same_type(results) + if not results: + return BooleanArray( + np.array([], dtype=np.bool_), np.array([], dtype=np.bool_) + ) + else: + return BooleanArray._concat_same_type(results) def coerce_to_array( diff --git a/pandas/core/arrays/numeric.py b/pandas/core/arrays/numeric.py index 4908000a68810..bc467e93c2c2c 100644 --- a/pandas/core/arrays/numeric.py +++ b/pandas/core/arrays/numeric.py @@ -66,7 +66,11 @@ def __from_arrow__( num_arr = array_class(data.copy(), ~mask, copy=False) results.append(num_arr) - if len(results) == 1: + if not results: + return array_class( + np.array([], dtype=self.numpy_dtype), np.array([], dtype=np.bool_) + ) + elif len(results) == 1: # avoid additional copy in _concat_same_type return results[0] else: diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 600aacec9c87a..6954b512c7ad0 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -118,7 +118,10 @@ def __from_arrow__( str_arr = StringArray._from_sequence(np.array(arr)) results.append(str_arr) - return StringArray._concat_same_type(results) + if results: + return StringArray._concat_same_type(results) + else: + return StringArray(np.array([], dtype="object")) class StringArray(PandasArray): diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 84eede019251b..e09c24c94992d 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -1005,6 +1005,8 @@ def __from_arrow__( parr[~mask] = NaT results.append(parr) + if not results: + return PeriodArray(np.array([], dtype="int64"), freq=self.freq, copy=False) return PeriodArray._concat_same_type(results) @@ -1238,6 +1240,12 @@ def __from_arrow__( iarr = IntervalArray.from_arrays(left, right, closed=array.type.closed) results.append(iarr) + if not results: + return IntervalArray.from_arrays( + np.array([], dtype=self.subtype), + np.array([], dtype=self.subtype), + closed=array.type.closed, + ) return IntervalArray._concat_same_type(results) def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None: diff --git a/pandas/tests/arrays/interval/test_interval.py b/pandas/tests/arrays/interval/test_interval.py index d8fca91c5516a..fde45a1e39bb2 100644 --- a/pandas/tests/arrays/interval/test_interval.py +++ b/pandas/tests/arrays/interval/test_interval.py @@ -271,6 +271,13 @@ def test_arrow_table_roundtrip(breaks): expected = pd.concat([df, df], ignore_index=True) tm.assert_frame_equal(result, expected) + # GH-41040 + table = pa.table( + [pa.chunked_array([], type=table.column(0).type)], schema=table.schema + ) + result = table.to_pandas() + tm.assert_frame_equal(result, expected[0:0]) + @pyarrow_skip @pytest.mark.parametrize( diff --git a/pandas/tests/arrays/masked/test_arrow_compat.py b/pandas/tests/arrays/masked/test_arrow_compat.py index 8bb32dec2cc0e..ec5794a34ac45 100644 --- a/pandas/tests/arrays/masked/test_arrow_compat.py +++ b/pandas/tests/arrays/masked/test_arrow_compat.py @@ -41,6 +41,22 @@ def test_arrow_roundtrip(data): tm.assert_frame_equal(result, df) +@td.skip_if_no("pyarrow", min_version="0.15.1.dev") +def test_arrow_load_from_zero_chunks(data): + # GH-41040 + import pyarrow as pa + + df = pd.DataFrame({"a": data[0:0]}) + table = pa.table(df) + assert table.field("a").type == str(data.dtype.numpy_dtype) + table = pa.table( + [pa.chunked_array([], type=table.field("a").type)], schema=table.schema + ) + result = table.to_pandas() + assert result["a"].dtype == data.dtype + tm.assert_frame_equal(result, df) + + @td.skip_if_no("pyarrow", min_version="0.16.0") def test_arrow_from_arrow_uint(): # https://github.com/pandas-dev/pandas/issues/31896 diff --git a/pandas/tests/arrays/period/test_arrow_compat.py b/pandas/tests/arrays/period/test_arrow_compat.py index f4e803cf4405f..398972a682504 100644 --- a/pandas/tests/arrays/period/test_arrow_compat.py +++ b/pandas/tests/arrays/period/test_arrow_compat.py @@ -100,6 +100,26 @@ def test_arrow_table_roundtrip(): tm.assert_frame_equal(result, expected) +@pyarrow_skip +def test_arrow_load_from_zero_chunks(): + # GH-41040 + import pyarrow as pa + + from pandas.core.arrays._arrow_utils import ArrowPeriodType + + arr = PeriodArray([], freq="D") + df = pd.DataFrame({"a": arr}) + + table = pa.table(df) + assert isinstance(table.field("a").type, ArrowPeriodType) + table = pa.table( + [pa.chunked_array([], type=table.column(0).type)], schema=table.schema + ) + result = table.to_pandas() + assert isinstance(result["a"].dtype, PeriodDtype) + tm.assert_frame_equal(result, df) + + @pyarrow_skip def test_arrow_table_roundtrip_without_metadata(): import pyarrow as pa diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 2b2db49c62ba2..e2d8e522abb35 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -476,6 +476,22 @@ def test_arrow_roundtrip(dtype, dtype_object): assert result.loc[2, "a"] is pd.NA +@td.skip_if_no("pyarrow", min_version="0.15.1.dev") +def test_arrow_load_from_zero_chunks(dtype, dtype_object): + # GH-41040 + import pyarrow as pa + + data = pd.array([], dtype=dtype) + df = pd.DataFrame({"a": data}) + table = pa.table(df) + assert table.field("a").type == "string" + # Instantiate the same table with no chunks at all + table = pa.table([pa.chunked_array([], type=pa.string())], schema=table.schema) + result = table.to_pandas() + assert isinstance(result["a"].dtype, dtype_object) + tm.assert_frame_equal(result, df) + + def test_value_counts_na(dtype): arr = pd.array(["a", "b", "a", pd.NA], dtype=dtype) result = arr.value_counts(dropna=False)