From 45bd6ba6a4675ad8d6a7323bcb5e91a76184fe35 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 8 Mar 2023 16:42:59 -0800 Subject: [PATCH 1/3] ENH: Add misc pyarrow types to ArrowDtype.type --- pandas/core/arrays/arrow/dtype.py | 10 +++++++++- pandas/tests/extension/test_arrow.py | 21 ++++++++++++++++++--- 2 files changed, 27 insertions(+), 4 deletions(-) diff --git a/pandas/core/arrays/arrow/dtype.py b/pandas/core/arrays/arrow/dtype.py index a262a45feb9c5..a2e1cf32254fc 100644 --- a/pandas/core/arrays/arrow/dtype.py +++ b/pandas/core/arrays/arrow/dtype.py @@ -106,7 +106,7 @@ def type(self): return int elif pa.types.is_floating(pa_type): return float - elif pa.types.is_string(pa_type): + elif pa.types.is_string(pa_type) or pa.types.is_large_string(pa_type): return str elif ( pa.types.is_binary(pa_type) @@ -132,6 +132,14 @@ def type(self): return time elif pa.types.is_decimal(pa_type): return Decimal + elif pa.types.is_dictionary(pa_type): + from pandas.core.arrays import Categorical + + return Categorical + elif pa.types.is_list(pa_type) or pa.types.is_large_list(pa_type): + return list + elif pa.types.is_map(pa_type): + return dict elif pa.types.is_null(pa_type): # TODO: None? pd.NA? pa.null? return type(pa_type) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 38a3c2c85e829..239e28931d9ed 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -51,6 +51,7 @@ is_string_dtype, is_unsigned_integer_dtype, ) +from pandas.core.arrays.categorical import Categorical from pandas.tests.extension import base pa = pytest.importorskip("pyarrow", minversion="7.0.0") @@ -1544,9 +1545,23 @@ def test_mode_dropna_false_mode_na(data): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize("arrow_dtype", [pa.binary(), pa.binary(16), pa.large_binary()]) -def test_arrow_dtype_type(arrow_dtype): - assert ArrowDtype(arrow_dtype).type == bytes +@pytest.mark.parametrize( + "arrow_dtype, expected_type", + [ + [pa.binary(), bytes], + [pa.binary(16), bytes], + [pa.large_binary(), bytes], + [pa.large_string(), str], + [pa.list_(pa.int64()), list], + [pa.large_list(pa.int64()), list], + [pa.map_(pa.string(), pa.int64()), dict], + [pa.dictionary(pa.int64(), pa.int64()), Categorical], + ], +) +def test_arrow_dtype_type(arrow_dtype, expected_type): + # GH 51845 + # TODO: Redundant with test_getitem_scalar once arrow_dtype exists in data fixture + assert ArrowDtype(arrow_dtype).type == expected_type def test_is_bool_dtype(): From 290fa852309111565589e496876da979bdcd332a Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 9 Mar 2023 10:28:40 -0800 Subject: [PATCH 2/3] change exception --- pandas/tests/extension/test_arrow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index a39cd7a34c9f1..918117355424f 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -1940,7 +1940,7 @@ def test_str_get(i, exp): @pytest.mark.xfail( reason="TODO: StringMethods._validate should support Arrow list types", - raises=NotImplementedError, + raises=AttributeError, ) def test_str_join(): ser = pd.Series(ArrowExtensionArray(pa.array([list("abc"), list("123"), None]))) From c467c918cd8aa60ff2d51fdcee4a7576487d428b Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 9 Mar 2023 11:43:02 -0800 Subject: [PATCH 3/3] Change to CategoricalDtypeType --- pandas/core/arrays/arrow/dtype.py | 7 ++++--- pandas/tests/extension/test_arrow.py | 4 ++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/pandas/core/arrays/arrow/dtype.py b/pandas/core/arrays/arrow/dtype.py index a2e1cf32254fc..2ba0711de98f9 100644 --- a/pandas/core/arrays/arrow/dtype.py +++ b/pandas/core/arrays/arrow/dtype.py @@ -27,6 +27,7 @@ StorageExtensionDtype, register_extension_dtype, ) +from pandas.core.dtypes.dtypes import CategoricalDtypeType if not pa_version_under7p0: import pyarrow as pa @@ -133,9 +134,9 @@ def type(self): elif pa.types.is_decimal(pa_type): return Decimal elif pa.types.is_dictionary(pa_type): - from pandas.core.arrays import Categorical - - return Categorical + # TODO: Potentially change this & CategoricalDtype.type to + # something more representative of the scalar + return CategoricalDtypeType elif pa.types.is_list(pa_type) or pa.types.is_large_list(pa_type): return list elif pa.types.is_map(pa_type): diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 918117355424f..22920e077123e 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -39,6 +39,7 @@ from pandas.errors import PerformanceWarning from pandas.core.dtypes.common import is_any_int_dtype +from pandas.core.dtypes.dtypes import CategoricalDtypeType import pandas as pd import pandas._testing as tm @@ -51,7 +52,6 @@ is_string_dtype, is_unsigned_integer_dtype, ) -from pandas.core.arrays.categorical import Categorical from pandas.tests.extension import base pa = pytest.importorskip("pyarrow", minversion="7.0.0") @@ -1541,7 +1541,7 @@ def test_mode_dropna_false_mode_na(data): [pa.list_(pa.int64()), list], [pa.large_list(pa.int64()), list], [pa.map_(pa.string(), pa.int64()), dict], - [pa.dictionary(pa.int64(), pa.int64()), Categorical], + [pa.dictionary(pa.int64(), pa.int64()), CategoricalDtypeType], ], ) def test_arrow_dtype_type(arrow_dtype, expected_type):