Skip to content

POC: NA-only behavior for numpy-nullable dtypes #61708

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 2 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions pandas/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -2116,3 +2116,10 @@ def temp_file(tmp_path):
file_path = tmp_path / str(uuid.uuid4())
file_path.touch()
return file_path


@pytest.fixture(params=[True, False])
def pdep16_nan_behavior(request):
    """
    Fixture that runs a test once with "mode.pdep16_nan_behavior" set to
    True and once with it set to False.

    The option is set through ``pd.option_context`` around the ``yield``,
    so it only applies while the requesting test runs.

    Yields
    ------
    bool
        The option value that is active for the current parametrization.
    """
    enabled = request.param
    with pd.option_context("mode.pdep16_nan_behavior", enabled):
        yield enabled
11 changes: 10 additions & 1 deletion pandas/core/arrays/masked.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@

import numpy as np

from pandas._config import get_option

from pandas._libs import (
lib,
missing as libmissing,
Expand Down Expand Up @@ -308,7 +310,9 @@ def __setitem__(self, key, value) -> None:
def __contains__(self, key) -> bool:
if isna(key) and key is not self.dtype.na_value:
# GH#52840
if self._data.dtype.kind == "f" and lib.is_float(key):
if lib.is_float(key) and get_option("mode.PDEP16_nan_behavior"):
key = self.dtype.na_value
elif self._data.dtype.kind == "f" and lib.is_float(key):
return bool((np.isnan(self._data) & ~self._mask).any())

return bool(super().__contains__(key))
Expand Down Expand Up @@ -655,6 +659,8 @@ def reconstruct(x: np.ndarray):
# reached in e.g. np.sqrt on BooleanArray
# we don't support float16
x = x.astype(np.float32)
if get_option("mode.PDEP16_nan_behavior"):
m[np.isnan(x)] = True
return FloatingArray(x, m)
else:
x[mask] = np.nan
Expand Down Expand Up @@ -860,6 +866,9 @@ def _maybe_mask_result(
if result.dtype.kind == "f":
from pandas.core.arrays import FloatingArray

if get_option("mode.PDEP16_nan_behavior"):
mask[np.isnan(result)] = True

return FloatingArray(result, mask, copy=False)

elif result.dtype.kind == "b":
Expand Down
13 changes: 13 additions & 0 deletions pandas/core/arrays/numeric.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@

import numpy as np

from pandas._config import get_option

from pandas._libs import (
lib,
missing as libmissing,
Expand Down Expand Up @@ -101,6 +103,8 @@ def __from_arrow__(
array = array.combine_chunks()

data, mask = pyarrow_array_to_numpy_and_mask(array, dtype=self.numpy_dtype)
if data.dtype.kind == "f" and get_option("mode.PDEP16_nan_behavior"):
mask[np.isnan(data)] = False
return array_class(data.copy(), ~mask, copy=False)

@classmethod
Expand Down Expand Up @@ -261,10 +265,19 @@ def __init__(
f"values should be {descr} numpy array. Use "
"the 'pd.array' function instead"
)
if not (isinstance(mask, np.ndarray) and mask.dtype == np.bool_):
raise TypeError(
"mask should be bool numpy array. Use the 'pd.array' function instead"
)

if values.dtype == np.float16:
# If we don't raise here, then accessing self.dtype would raise
raise TypeError("FloatingArray does not support np.float16 dtype.")

# NB: if get_option("mode.PDEP16_nan_behavior") is True, the caller
# is responsible for ensuring that every NaN in `values` is masked,
# i.e. that mask[np.isnan(values)].all() holds.

super().__init__(values, mask, copy=copy)

@cache_readonly
Expand Down
11 changes: 11 additions & 0 deletions pandas/core/config_init.py
Original file line number Diff line number Diff line change
Expand Up @@ -427,6 +427,17 @@ def is_terminal() -> bool:
validator=is_one_of_factory([True, False, "warn"]),
)

# Registers the boolean option "PDEP16_nan_behavior" under the "mode" prefix.
# TODO: set the default to False before merging;
#  True is just to find the tests that break with it enabled.
with cf.config_prefix("mode"):
    cf.register_option(
        "PDEP16_nan_behavior",
        True,
        doc=(
            "Whether to enable the PDEP-16 behavior *consistently* treating NaN "
            "and NA as interchangeable for the purposes of numpy-nullable dtypes."
        ),
        # Only the two boolean values are accepted.
        validator=is_one_of_factory([True, False]),
    )


# user warnings
chained_assignment = """
Expand Down
9 changes: 9 additions & 0 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -6584,6 +6584,15 @@ def _maybe_cast_indexer(self, key):
If we have a float key and are not a floating index, then try to cast
to an int if equivalent.
"""
if (
is_float(key)
and np.isnan(key)
and isinstance(self.dtype, ExtensionDtype)
and self.dtype.kind == "f"
and get_option("mode.pdep16_nan_behavior")
):
# TODO: better place to do this?
key = self.dtype.na_value
return key

def _maybe_cast_listlike_indexer(self, target) -> Index:
Expand Down
30 changes: 19 additions & 11 deletions pandas/tests/arrays/floating/test_arithmetic.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,21 +35,24 @@ def test_array_op(dtype, opname, exp):


@pytest.mark.parametrize("zero, negative", [(0, False), (0.0, False), (-0.0, True)])
def test_divide_by_zero(dtype, zero, negative):
def test_divide_by_zero(dtype, zero, negative, pdep16_nan_behavior):
# TODO pending NA/NaN discussion
# https://github.com/pandas-dev/pandas/issues/32265/
a = pd.array([0, 1, -1, None], dtype=dtype)
result = a / zero
exp_mask = np.array([False, False, False, True])
if pdep16_nan_behavior:
exp_mask[[0, -1]] = True
expected = FloatingArray(
np.array([np.nan, np.inf, -np.inf, np.nan], dtype=dtype.numpy_dtype),
np.array([False, False, False, True]),
exp_mask,
)
if negative:
expected *= -1
tm.assert_extension_array_equal(result, expected)


def test_pow_scalar(dtype):
def test_pow_scalar(dtype, pdep16_nan_behavior):
a = pd.array([-1, 0, 1, None, 2], dtype=dtype)
result = a**0
expected = pd.array([1, 1, 1, 1, 1], dtype=dtype)
Expand All @@ -64,11 +67,14 @@ def test_pow_scalar(dtype):
tm.assert_extension_array_equal(result, expected)

result = a**np.nan
# TODO np.nan should be converted to pd.NA / missing before operation?
expected = FloatingArray(
np.array([np.nan, np.nan, 1, np.nan, np.nan], dtype=dtype.numpy_dtype),
mask=a._mask,
)
if pdep16_nan_behavior:
expected = pd.array([None, None, 1, None, None], dtype=dtype)
else:
# TODO np.nan should be converted to pd.NA / missing before operation?
expected = FloatingArray(
np.array([np.nan, np.nan, 1, np.nan, np.nan], dtype=dtype.numpy_dtype),
mask=a._mask,
)
tm.assert_extension_array_equal(result, expected)

# reversed
Expand All @@ -87,9 +93,11 @@ def test_pow_scalar(dtype):
tm.assert_extension_array_equal(result, expected)

result = np.nan**a
expected = FloatingArray(
np.array([1, np.nan, np.nan, np.nan], dtype=dtype.numpy_dtype), mask=a._mask
)
if not pdep16_nan_behavior:
# Otherwise the previous `expected` can be reused
expected = FloatingArray(
np.array([1, np.nan, np.nan, np.nan], dtype=dtype.numpy_dtype), mask=a._mask
)
tm.assert_extension_array_equal(result, expected)


Expand Down
12 changes: 10 additions & 2 deletions pandas/tests/arrays/floating/test_comparison.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,11 +38,15 @@ def test_equals():
assert a1.equals(a2) is False


def test_equals_nan_vs_na():
def test_equals_nan_vs_na(pdep16_nan_behavior):
# GH#44382

mask = np.zeros(3, dtype=bool)
data = np.array([1.0, np.nan, 3.0], dtype=np.float64)
if pdep16_nan_behavior:
# Under PDEP16, all callers of the FloatingArray constructor should
# ensure that mask[np.isnan(data)] = True
mask[1] = True

left = FloatingArray(data, mask)
assert left.equals(left)
Expand All @@ -57,7 +61,11 @@ def test_equals_nan_vs_na():
assert right.equals(right)
tm.assert_extension_array_equal(right, right)

assert not left.equals(right)
if not pdep16_nan_behavior:
assert not left.equals(right)
else:
# mask[1] was set above, so the NaN location in `left` is already NA
assert left.equals(right)

# with mask[1] = True, the only difference is data[1], which should
# not matter for equals
Expand Down
7 changes: 5 additions & 2 deletions pandas/tests/arrays/floating/test_contains.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,13 @@
import pandas as pd


def test_contains_nan():
def test_contains_nan(pdep16_nan_behavior):
# GH#52840
arr = pd.array(range(5)) / 0

assert np.isnan(arr._data[0])
assert not arr.isna()[0]
if pdep16_nan_behavior:
assert arr.isna()[0]
else:
assert not arr.isna()[0]
assert np.nan in arr
13 changes: 10 additions & 3 deletions pandas/tests/arrays/floating/test_to_numpy.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,11 +81,18 @@ def test_to_numpy_na_value(box):
tm.assert_numpy_array_equal(result, expected)


def test_to_numpy_na_value_with_nan():
def test_to_numpy_na_value_with_nan(pdep16_nan_behavior):
# array with both NaN and NA -> only fill NA with `na_value`
arr = FloatingArray(np.array([0.0, np.nan, 0.0]), np.array([False, False, True]))
mask = np.array([False, False, True])
if pdep16_nan_behavior:
mask[1] = True
arr = FloatingArray(np.array([0.0, np.nan, 0.0]), mask)
result = arr.to_numpy(dtype="float64", na_value=-1)
expected = np.array([0.0, np.nan, -1.0], dtype="float64")
if pdep16_nan_behavior:
# the NaN passed to the constructor is considered as NA
expected = np.array([0.0, -1.0, -1.0], dtype="float64")
else:
expected = np.array([0.0, np.nan, -1.0], dtype="float64")
tm.assert_numpy_array_equal(result, expected)


Expand Down
35 changes: 22 additions & 13 deletions pandas/tests/arrays/integer/test_arithmetic.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,13 +52,16 @@ def test_div(dtype):


@pytest.mark.parametrize("zero, negative", [(0, False), (0.0, False), (-0.0, True)])
def test_divide_by_zero(zero, negative):
def test_divide_by_zero(zero, negative, pdep16_nan_behavior):
# https://github.com/pandas-dev/pandas/issues/27398, GH#22793
a = pd.array([0, 1, -1, None], dtype="Int64")
result = a / zero
exp_mask = np.array([False, False, False, True])
if pdep16_nan_behavior:
exp_mask[0] = True
expected = FloatingArray(
np.array([np.nan, np.inf, -np.inf, 1], dtype="float64"),
np.array([False, False, False, True]),
exp_mask,
)
if negative:
expected *= -1
Expand Down Expand Up @@ -99,7 +102,7 @@ def test_mod(dtype):
tm.assert_extension_array_equal(result, expected)


def test_pow_scalar():
def test_pow_scalar(pdep16_nan_behavior):
a = pd.array([-1, 0, 1, None, 2], dtype="Int64")
result = a**0
expected = pd.array([1, 1, 1, 1, 1], dtype="Int64")
Expand All @@ -114,10 +117,13 @@ def test_pow_scalar():
tm.assert_extension_array_equal(result, expected)

result = a**np.nan
expected = FloatingArray(
np.array([np.nan, np.nan, 1, np.nan, np.nan], dtype="float64"),
np.array([False, False, False, True, False]),
)
if pdep16_nan_behavior:
expected = expected.astype("Float64")
else:
expected = FloatingArray(
np.array([np.nan, np.nan, 1, np.nan, np.nan], dtype="float64"),
np.array([False, False, False, True, False]),
)
tm.assert_extension_array_equal(result, expected)

# reversed
Expand All @@ -136,10 +142,13 @@ def test_pow_scalar():
tm.assert_extension_array_equal(result, expected)

result = np.nan**a
expected = FloatingArray(
np.array([1, np.nan, np.nan, np.nan], dtype="float64"),
np.array([False, False, True, False]),
)
if pdep16_nan_behavior:
expected = expected.astype("Float64")
else:
expected = FloatingArray(
np.array([1, np.nan, np.nan, np.nan], dtype="float64"),
np.array([False, False, True, False]),
)
tm.assert_extension_array_equal(result, expected)


Expand Down Expand Up @@ -212,7 +221,7 @@ def test_error_invalid_values(data, all_arithmetic_operators):
# TODO test unsigned overflow


def test_arith_coerce_scalar(data, all_arithmetic_operators):
def test_arith_coerce_scalar(data, all_arithmetic_operators, pdep16_nan_behavior):
op = tm.get_op_from_name(all_arithmetic_operators)
s = pd.Series(data)
other = 0.01
Expand All @@ -222,7 +231,7 @@ def test_arith_coerce_scalar(data, all_arithmetic_operators):
expected = expected.astype("Float64")

# rmod results in NaN that wasn't NA in original nullable Series -> unmask it
if all_arithmetic_operators == "__rmod__":
if all_arithmetic_operators == "__rmod__" and not pdep16_nan_behavior:
mask = (s == 0).fillna(False).to_numpy(bool)
expected.array._mask[mask] = False

Expand Down
7 changes: 5 additions & 2 deletions pandas/tests/arrays/integer/test_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,14 @@ def test_ufuncs_single_int(ufunc):


@pytest.mark.parametrize("ufunc", [np.log, np.exp, np.sin, np.cos, np.sqrt])
def test_ufuncs_single_float(ufunc):
def test_ufuncs_single_float(ufunc, pdep16_nan_behavior):
a = pd.array([1, 2, -3, np.nan])
with np.errstate(invalid="ignore"):
result = ufunc(a)
expected = FloatingArray(ufunc(a.astype(float)), mask=a._mask)
if pdep16_nan_behavior:
expected = pd.array(ufunc(a.astype(float)), dtype="Float64")
else:
expected = FloatingArray(ufunc(a.astype(float)), mask=a._mask)
tm.assert_extension_array_equal(result, expected)

s = pd.Series(a)
Expand Down
12 changes: 10 additions & 2 deletions pandas/tests/extension/base/interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def test_can_hold_na_valid(self, data):
# GH-20761
assert data._can_hold_na is True

def test_contains(self, data, data_missing):
def test_contains(self, data, data_missing, pdep16_nan_behavior):
# GH-37867
# Tests for membership checks. Membership checks for nan-likes is tricky and
# the settled on rule is: `nan_like in arr` is True if nan_like is
Expand All @@ -55,7 +55,15 @@ def test_contains(self, data, data_missing):
# type check for e.g. two instances of Decimal("NAN")
continue
assert na_value_obj not in data
assert na_value_obj not in data_missing
if (
pdep16_nan_behavior
and isinstance(na_value_obj, float)
and isinstance(data, pd.core.arrays.BaseMaskedArray)
):
# TODO: wrong place for this override
assert na_value_obj in data_missing
else:
assert na_value_obj not in data_missing

def test_memory_usage(self, data):
s = pd.Series(data)
Expand Down
Loading
Loading