From 723b8daf160f0fc82be61aaed79298647baacf21 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 1 Mar 2023 23:30:00 +0100 Subject: [PATCH 1/4] BUG: read_parquet does not respect index for arrow dtype backend --- pandas/io/parquet.py | 32 ++++++++++++++++++++++++-------- pandas/tests/io/test_parquet.py | 29 +++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 8 deletions(-) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index aec31f40f8570..bec951bddc7df 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -24,7 +24,9 @@ from pandas import ( DataFrame, + Index, MultiIndex, + RangeIndex, arrays, get_option, ) @@ -250,14 +252,28 @@ def read( if dtype_backend == "pandas": result = pa_table.to_pandas(**to_pandas_kwargs) elif dtype_backend == "pyarrow": - result = DataFrame( - { - col_name: arrays.ArrowExtensionArray(pa_col) - for col_name, pa_col in zip( - pa_table.column_names, pa_table.itercolumns() - ) - } - ) + index_columns = pa_table.schema.pandas_metadata.get("index_columns", []) + result_dc = { + col_name: arrays.ArrowExtensionArray(pa_col) + for col_name, pa_col in zip( + pa_table.column_names, pa_table.itercolumns() + ) + } + if len(index_columns) == 1 and isinstance(index_columns[0], dict): + params = index_columns[0] + idx = RangeIndex( + params.get("start"), params.get("stop"), params.get("step") + ) + + else: + index_data = [ + result_dc.pop(index_col) for index_col in index_columns + ] + if len(index_data) == 1: + idx = Index(index_data[0], name=index_columns[0]) + else: + idx = MultiIndex.from_arrays(index_data, names=index_columns) + result = DataFrame(result_dc, index=idx) if manager == "array": result = result._as_manager("array", copy=False) return result diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 353dc4f1cbd8a..f200a8ae66138 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -18,6 +18,7 @@ import pandas.util._test_decorators as td import pandas as pd +from pandas import RangeIndex import pandas._testing as tm from pandas.util.version import Version @@ -1225,3 +1226,31 @@ def test_bytes_file_name(self, engine): result = read_parquet(path, engine=engine) tm.assert_frame_equal(result, df) + + @pytest.mark.parametrize("index", ["A", ["A", "B"]]) + def test_pyarrow_backed_df_index(self, index, pa): + # GH#48944 + obj = pd.DataFrame(data={"A": [0, 1], "B": [1, 0], "C": 1}) + df = obj.set_index(index) + with tm.ensure_clean("test.parquet") as path: + with open(path.encode(), "wb") as f: + df.to_parquet(f) + + with pd.option_context("mode.dtype_backend", "pyarrow"): + result = read_parquet(path, engine="pyarrow") + expected = obj.astype("int64[pyarrow]").set_index(index) + tm.assert_frame_equal(result, expected) + + def test_pyarrow_backed_df_range_index(self, pa): + # GH#48944 + df = pd.DataFrame( + data={"A": [0, 1], "B": [1, 0]}, index=RangeIndex(start=100, stop=102) + ) + with tm.ensure_clean("test.parquet") as path: + with open(path.encode(), "wb") as f: + df.to_parquet(f) + + with pd.option_context("mode.dtype_backend", "pyarrow"): + result = read_parquet(path, engine="pyarrow") + expected = df.astype("int64[pyarrow]") + tm.assert_frame_equal(result, expected) From ca3dca78eaee1fbccc4beed5a351e81715ebf0a0 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 1 Mar 2023 23:35:27 +0100 Subject: [PATCH 2/4] BUG: read_parquet does not respect index for arrow dtype backend --- pandas/io/parquet.py | 9 +++++++-- pandas/tests/io/test_parquet.py | 9 +++++++-- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index bec951bddc7df..7d3ac277369d5 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -259,7 +259,9 @@ def read( pa_table.column_names, pa_table.itercolumns() ) } - if len(index_columns) == 1 and isinstance(index_columns[0], dict): + if len(index_columns) == 0: + idx = None + elif len(index_columns) == 1 and isinstance(index_columns[0], dict): params = index_columns[0] idx = RangeIndex( params.get("start"), params.get("stop"), params.get("step") @@ -270,7 +272,10 @@ def read( result_dc.pop(index_col) for index_col in index_columns ] if len(index_data) == 1: - idx = Index(index_data[0], name=index_columns[0]) + name = index_columns[0] + if isinstance(name, str) and name.startswith("__index_level_"): + name = None + idx = Index(index_data[0], name=name) else: idx = MultiIndex.from_arrays(index_data, names=index_columns) result = DataFrame(result_dc, index=idx) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index f200a8ae66138..5b7883530eca2 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -1241,16 +1241,21 @@ def test_pyarrow_backed_df_index(self, index, pa): expected = obj.astype("int64[pyarrow]").set_index(index) tm.assert_frame_equal(result, expected) - def test_pyarrow_backed_df_range_index(self, pa): + @pytest.mark.parametrize("index", [True, False, None]) + def test_pyarrow_backed_df_range_index(self, pa, index): # GH#48944 df = pd.DataFrame( data={"A": [0, 1], "B": [1, 0]}, index=RangeIndex(start=100, stop=102) ) with tm.ensure_clean("test.parquet") as path: with open(path.encode(), "wb") as f: - df.to_parquet(f) + df.to_parquet(f, index=index) with pd.option_context("mode.dtype_backend", "pyarrow"): result = read_parquet(path, engine="pyarrow") expected = df.astype("int64[pyarrow]") + if index is False: + expected = expected.reset_index(drop=True) + elif index: + expected.index = pd.Index([100, 101], dtype="int64[pyarrow]") tm.assert_frame_equal(result, expected) From 28502ff273e93e783cb245ec46d92c756bc09e94 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 2 Mar 2023 00:18:24 +0100 Subject: [PATCH 3/4] Fix mypy --- pandas/io/parquet.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 7d3ac277369d5..90d0ab6872e5c 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -259,6 +259,7 @@ def read( pa_table.column_names, pa_table.itercolumns() ) } + idx: Index | None if len(index_columns) == 0: idx = None elif len(index_columns) == 1 and isinstance(index_columns[0], dict): From 53f3700d1d5a433a2cb0610e82db110ea2333fb0 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 2 Mar 2023 02:30:32 +0100 Subject: [PATCH 4/4] Account for index name --- pandas/io/parquet.py | 5 ++++- pandas/tests/io/test_parquet.py | 8 +++++--- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 90d0ab6872e5c..3d8346aeb97a2 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -265,7 +265,10 @@ def read( elif len(index_columns) == 1 and isinstance(index_columns[0], dict): params = index_columns[0] idx = RangeIndex( - params.get("start"), params.get("stop"), params.get("step") + params.get("start"), + params.get("stop"), + params.get("step"), + name=params.get("name"), ) else: diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 5b7883530eca2..b8d02fc04f90e 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -1241,11 +1241,13 @@ def test_pyarrow_backed_df_index(self, index, pa): expected = obj.astype("int64[pyarrow]").set_index(index) tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("name", [None, "test"]) @pytest.mark.parametrize("index", [True, False, None]) - def test_pyarrow_backed_df_range_index(self, pa, index): + def test_pyarrow_backed_df_range_index(self, pa, index, name): # GH#48944 df = pd.DataFrame( - data={"A": [0, 1], "B": [1, 0]}, index=RangeIndex(start=100, stop=102) + data={"A": [0, 1], "B": [1, 0]}, + index=RangeIndex(start=100, stop=102, name=name), ) with tm.ensure_clean("test.parquet") as path: with open(path.encode(), "wb") as f: @@ -1257,5 +1259,5 @@ def test_pyarrow_backed_df_range_index(self, pa, index): if index is False: expected = expected.reset_index(drop=True) elif index: - expected.index = pd.Index([100, 101], dtype="int64[pyarrow]") + expected.index = pd.Index([100, 101], dtype="int64[pyarrow]", name=name) tm.assert_frame_equal(result, expected)