From 3fad33f9f29c83b3ace5c6ef57fca76a091c646e Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 28 Jun 2025 10:07:44 -0700 Subject: [PATCH 1/2] POC: consistent NaN treatment for pyarrow dtypes --- pandas/_libs/parsers.pyx | 2 +- pandas/core/arrays/arrow/array.py | 54 ++++++++++++++++++------ pandas/core/arrays/string_.py | 8 +++- pandas/core/generic.py | 19 ++++++++- pandas/tests/extension/test_arrow.py | 2 +- pandas/tests/groupby/test_reductions.py | 6 ++- pandas/tests/series/methods/test_rank.py | 9 ++++ 7 files changed, 81 insertions(+), 19 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index c29cdbcf5975e..34e7ee9f15120 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1456,7 +1456,7 @@ def _maybe_upcast( if isinstance(arr, IntegerArray) and arr.isna().all(): # use null instead of int64 in pyarrow arr = arr.to_numpy(na_value=None) - arr = ArrowExtensionArray(pa.array(arr, from_pandas=True)) + arr = ArrowExtensionArray(pa.array(arr)) return arr diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index c18f06c3a126d..e1384754d0a27 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -16,6 +16,7 @@ import numpy as np from pandas._libs import lib +from pandas._libs.missing import NA from pandas._libs.tslibs import ( Timedelta, Timestamp, @@ -360,7 +361,7 @@ def _from_sequence_of_strings( # duration to string casting behavior mask = isna(scalars) if not isinstance(strings, (pa.Array, pa.ChunkedArray)): - strings = pa.array(strings, type=pa.string(), from_pandas=True) + strings = pa.array(strings, type=pa.string()) strings = pc.if_else(mask, None, strings) try: scalars = strings.cast(pa.int64()) @@ -381,7 +382,7 @@ def _from_sequence_of_strings( if isinstance(strings, (pa.Array, pa.ChunkedArray)): scalars = strings else: - scalars = pa.array(strings, type=pa.string(), from_pandas=True) + scalars = pa.array(strings, type=pa.string()) scalars = pc.if_else(pc.equal(scalars, "1.0"), "1", scalars) scalars = pc.if_else(pc.equal(scalars, "0.0"), "0", scalars) scalars = scalars.cast(pa.bool_()) @@ -393,6 +394,13 @@ def _from_sequence_of_strings( from pandas.core.tools.numeric import to_numeric scalars = to_numeric(strings, errors="raise") + if not pa.types.is_decimal(pa_type): + # TODO: figure out why doing this cast breaks with decimal dtype + # in test_from_sequence_of_strings_pa_array + mask = strings.is_null() + scalars = pa.array(scalars, mask=np.array(mask), type=pa_type) + # TODO: could we just do strings.cast(pa_type)? + else: raise NotImplementedError( f"Converting strings to {pa_type} is not implemented." @@ -435,7 +443,7 @@ def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar: """ if isinstance(value, pa.Scalar): pa_scalar = value - elif isna(value): + elif isna(value) and not lib.is_float(value): pa_scalar = pa.scalar(None, type=pa_type) else: # Workaround https://github.com/apache/arrow/issues/37291 @@ -452,7 +460,7 @@ def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar: value = value.as_unit(pa_type.unit) value = value._value - pa_scalar = pa.scalar(value, type=pa_type, from_pandas=True) + pa_scalar = pa.scalar(value, type=pa_type) if pa_type is not None and pa_scalar.type != pa_type: pa_scalar = pa_scalar.cast(pa_type) @@ -484,6 +492,13 @@ def _box_pa_array( if copy: value = value.copy() pa_array = value.__arrow_array__() + + elif hasattr(value, "__arrow_array__"): + # e.g. StringArray + if copy: + value = value.copy() + pa_array = value.__arrow_array__() + else: if ( isinstance(value, np.ndarray) @@ -510,11 +525,24 @@ def _box_pa_array( value = to_timedelta(value, unit=pa_type.unit).as_unit(pa_type.unit) value = value.to_numpy() + mask = None + if getattr(value, "dtype", None) is None or value.dtype.kind not in "mfM": + # similar to isna(value) but exclude NaN + # TODO: cythonize! + mask = np.array([x is NA or x is None for x in value], dtype=bool) + + from_pandas = False + if pa.types.is_integer(pa_type): + # If user specifically asks to cast a numpy float array with NaNs + # to pyarrow integer, we'll treat those NaNs as NA + from_pandas = True try: - pa_array = pa.array(value, type=pa_type, from_pandas=True) + pa_array = pa.array( + value, type=pa_type, mask=mask, from_pandas=from_pandas + ) except (pa.ArrowInvalid, pa.ArrowTypeError): # GH50430: let pyarrow infer type, then cast - pa_array = pa.array(value, from_pandas=True) + pa_array = pa.array(value, mask=mask, from_pandas=from_pandas) if pa_type is None and pa.types.is_duration(pa_array.type): # Workaround https://github.com/apache/arrow/issues/37291 @@ -522,7 +550,7 @@ def _box_pa_array( value = to_timedelta(value) value = value.to_numpy() - pa_array = pa.array(value, type=pa_type, from_pandas=True) + pa_array = pa.array(value, type=pa_type) if pa.types.is_duration(pa_array.type) and pa_array.null_count > 0: # GH52843: upstream bug for duration types when originally @@ -1169,7 +1197,7 @@ def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]: if not len(values): return np.zeros(len(self), dtype=bool) - result = pc.is_in(self._pa_array, value_set=pa.array(values, from_pandas=True)) + result = pc.is_in(self._pa_array, value_set=pa.array(values)) # pyarrow 2.0.0 returned nulls, so we explicitly specify dtype to convert nulls # to False return np.array(result, dtype=np.bool_) @@ -2002,7 +2030,7 @@ def __setitem__(self, key, value) -> None: raise ValueError("Length of indexer and values mismatch") chunks = [ *self._pa_array[:key].chunks, - pa.array([value], type=self._pa_array.type, from_pandas=True), + pa.array([value], type=self._pa_array.type), *self._pa_array[key + 1 :].chunks, ] data = pa.chunked_array(chunks).combine_chunks() @@ -2056,7 +2084,7 @@ def _rank_calc( pa_type = pa.float64() else: pa_type = pa.uint64() - result = pa.array(ranked, type=pa_type, from_pandas=True) + result = pa.array(ranked, type=pa_type) return result data = self._pa_array.combine_chunks() @@ -2308,7 +2336,7 @@ def _to_numpy_and_type(value) -> tuple[np.ndarray, pa.DataType | None]: right, right_type = _to_numpy_and_type(right) pa_type = left_type or right_type result = np.where(cond, left, right) - return pa.array(result, type=pa_type, from_pandas=True) + return pa.array(result, type=pa_type) @classmethod def _replace_with_mask( @@ -2351,7 +2379,7 @@ def _replace_with_mask( replacements = replacements.as_py() result = np.array(values, dtype=object) result[mask] = replacements - return pa.array(result, type=values.type, from_pandas=True) + return pa.array(result, type=values.type) # ------------------------------------------------------------------ # GroupBy Methods @@ -2430,7 +2458,7 @@ def _groupby_op( return type(self)(pa_result) else: # DatetimeArray, TimedeltaArray - pa_result = pa.array(result, from_pandas=True) + pa_result = pa.array(result) return type(self)(pa_result) def _apply_elementwise(self, func: Callable) -> list[list[Any]]: diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 8048306df91a2..ad3fd9969f5e8 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -474,6 +474,12 @@ def _str_map_str_or_object( if self.dtype.storage == "pyarrow": import pyarrow as pa + # TODO: shouldn't this already be caught my passed mask? + # it isn't in test_extract_expand_capture_groups_index + # mask = mask | np.array( + # [x is libmissing.NA for x in result], dtype=bool + # ) + result = pa.array( result, mask=mask, type=pa.large_string(), from_pandas=True ) @@ -726,7 +732,7 @@ def __arrow_array__(self, type=None): values = self._ndarray.copy() values[self.isna()] = None - return pa.array(values, type=type, from_pandas=True) + return pa.array(values, type=type) def _values_for_factorize(self) -> tuple[np.ndarray, libmissing.NAType | float]: # type: ignore[override] arr = self._ndarray diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 8aae4609b1833..8908b21193207 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -9874,7 +9874,7 @@ def where( def where( self, cond, - other=np.nan, + other=lib.no_default, *, inplace: bool = False, axis: Axis | None = None, @@ -10032,6 +10032,23 @@ def where( stacklevel=2, ) + if other is lib.no_default: + if self.ndim == 1: + if isinstance(self.dtype, ExtensionDtype): + other = self.dtype.na_value + else: + other = np.nan + else: + if self._mgr.nblocks == 1 and isinstance( + self._mgr.blocks[0].values.dtype, ExtensionDtype + ): + # FIXME: checking this is kludgy! + other = self._mgr.blocks[0].values.dtype.na_value + else: + # FIXME: the same problem we had with Series will now + # show up column-by-column! + other = np.nan + other = common.apply_if_callable(other, self) return self._where(cond, other, inplace=inplace, axis=axis, level=level) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index fc5930ebcd8ac..c6035d2495cfa 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -731,7 +731,7 @@ def test_EA_types(self, engine, data, dtype_backend, request): pytest.mark.xfail(reason="CSV parsers don't correctly handle binary") ) df = pd.DataFrame({"with_dtype": pd.Series(data, dtype=str(data.dtype))}) - csv_output = df.to_csv(index=False, na_rep=np.nan) + csv_output = df.to_csv(index=False, na_rep=np.nan) # should be NA? if pa.types.is_binary(pa_dtype): csv_output = BytesIO(csv_output) else: diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py index 014558bbf4bba..08cf1047f316c 100644 --- a/pandas/tests/groupby/test_reductions.py +++ b/pandas/tests/groupby/test_reductions.py @@ -381,8 +381,10 @@ def test_first_last_skipna(any_real_nullable_dtype, sort, skipna, how): df = DataFrame( { "a": [2, 1, 1, 2, 3, 3], - "b": [na_value, 3.0, na_value, 4.0, np.nan, np.nan], - "c": [na_value, 3.0, na_value, 4.0, np.nan, np.nan], + # TODO: test that has mixed na_value and NaN either working for + # float or raising for int? + "b": [na_value, 3.0, na_value, 4.0, na_value, na_value], + "c": [na_value, 3.0, na_value, 4.0, na_value, na_value], }, dtype=any_real_nullable_dtype, ) diff --git a/pandas/tests/series/methods/test_rank.py b/pandas/tests/series/methods/test_rank.py index 7c6a7893ba3a0..8363ba118d4d3 100644 --- a/pandas/tests/series/methods/test_rank.py +++ b/pandas/tests/series/methods/test_rank.py @@ -276,6 +276,13 @@ def test_rank_tie_methods(self, ser, results, dtype, using_infer_string): ser = ser if dtype is None else ser.astype(dtype) result = ser.rank(method=method) + if dtype == "float64[pyarrow]": + # the NaNs are not treated as NA + exp = exp.copy() + if method == "average": + exp[np.isnan(ser)] = 9.5 + elif method == "dense": + exp[np.isnan(ser)] = 6 tm.assert_series_equal(result, Series(exp, dtype=expected_dtype(dtype, method))) @pytest.mark.parametrize("na_option", ["top", "bottom", "keep"]) @@ -321,6 +328,8 @@ def test_rank_tie_methods_on_infs_nans( order = [ranks[1], ranks[0], ranks[2]] elif na_option == "bottom": order = [ranks[0], ranks[2], ranks[1]] + elif dtype == "float64[pyarrow]": + order = [ranks[0], [NA] * chunk, ranks[1]] else: order = [ranks[0], [np.nan] * chunk, ranks[1]] expected = order if ascending else order[::-1] From f1e8ba0a03a7fbd1e63b4e6b300b9ae4899deddf Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 28 Jun 2025 10:23:00 -0700 Subject: [PATCH 2/2] comment --- pandas/tests/extension/base/setitem.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/extension/base/setitem.py b/pandas/tests/extension/base/setitem.py index 185d6d750cace..99ab5d2f7e86f 100644 --- a/pandas/tests/extension/base/setitem.py +++ b/pandas/tests/extension/base/setitem.py @@ -422,6 +422,7 @@ def test_setitem_frame_2d_values(self, data): df.iloc[:-1] = df.iloc[:-1].copy() tm.assert_frame_equal(df, orig) + # FIXME: Breaks for pyarrow float dtype bc df.values changes NAs to NaN df.iloc[:] = df.values tm.assert_frame_equal(df, orig)