From 7ef2dba742ac757e82028bc2b3a526cde00f147c Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 24 Dec 2021 14:59:56 -0800 Subject: [PATCH 01/13] ENH/WIP: Index[bool] --- pandas/_libs/index.pyi | 1 + pandas/_libs/index.pyx | 6 +++++ pandas/conftest.py | 3 ++- pandas/core/algorithms.py | 3 --- pandas/core/dtypes/common.py | 2 +- pandas/core/indexes/base.py | 12 ++++++++-- pandas/core/tools/datetimes.py | 2 ++ pandas/core/util/hashing.py | 2 +- pandas/tests/indexes/common.py | 6 ++++- pandas/tests/indexes/test_base.py | 23 ++++++++++++++------ pandas/tests/indexes/test_index_new.py | 2 +- pandas/tests/indexes/test_numpy_compat.py | 4 ++-- pandas/tests/series/indexing/test_setitem.py | 4 ++-- pandas/tests/series/methods/test_drop.py | 3 ++- 14 files changed, 51 insertions(+), 22 deletions(-) diff --git a/pandas/_libs/index.pyi b/pandas/_libs/index.pyi index 446a980487cde..e0eb3d2f60b40 100644 --- a/pandas/_libs/index.pyi +++ b/pandas/_libs/index.pyi @@ -41,6 +41,7 @@ class ObjectEngine(IndexEngine): ... class DatetimeEngine(Int64Engine): ... class TimedeltaEngine(DatetimeEngine): ... class PeriodEngine(Int64Engine): ... +class BoolEngine(Uint8Engine): ... class BaseMultiIndexCodesEngine: levels: list[np.ndarray] diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 969da5aa53e3e..4d91c3bfef13a 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -795,3 +795,9 @@ cdef class BaseMultiIndexCodesEngine: # Generated from template. 
include "index_class_helper.pxi" + + +cdef class BoolEngine(UInt8Engine): + cdef _check_type(self, object val): + if not util.is_bool_object(val): + raise KeyError(val) diff --git a/pandas/conftest.py b/pandas/conftest.py index be28dbe35fcb2..14534cffe129c 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -529,7 +529,8 @@ def _create_mi_with_dt64tz_level(): "num_uint8": tm.makeNumericIndex(100, dtype="uint8"), "num_float64": tm.makeNumericIndex(100, dtype="float64"), "num_float32": tm.makeNumericIndex(100, dtype="float32"), - "bool": tm.makeBoolIndex(10), + "bool-object": tm.makeBoolIndex(10).astype(object), + "bool-dtype": Index(np.random.randn(10) < 0), "categorical": tm.makeCategoricalIndex(100), "interval": tm.makeIntervalIndex(100), "empty": Index([]), diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index a7252b6a7b7a2..541fa26e317ed 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -219,9 +219,6 @@ def _reconstruct_data( elif is_bool_dtype(dtype): values = values.astype(dtype, copy=False) - # we only support object dtypes bool Index - if isinstance(original, ABCIndex): - values = values.astype(object, copy=False) elif dtype is not None: if is_datetime64_dtype(dtype): dtype = np.dtype("datetime64[ns]") diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 60c8426ff3c6c..a17a10c7ccb20 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1330,7 +1330,7 @@ def is_bool_dtype(arr_or_dtype) -> bool: # we don't have a boolean Index class # so its object, we need to infer to # guess this - return arr_or_dtype.is_object() and arr_or_dtype.inferred_type == "boolean" + return arr_or_dtype.inferred_type == "boolean" elif isinstance(dtype, ExtensionDtype): return getattr(dtype, "_is_boolean", False) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 74044e55b5de6..9ea56dee6cb45 100644 --- a/pandas/core/indexes/base.py +++ 
b/pandas/core/indexes/base.py @@ -481,6 +481,10 @@ def __new__( if data.dtype.kind in ["i", "u", "f"]: # maybe coerce to a sub-class arr = data + elif data.dtype.kind == "b": + # No special subclass, and Index._ensure_array won't do this + # for us. + arr = np.asarray(data) else: arr = com.asarray_tuplesafe(data, dtype=np.dtype("object")) @@ -672,7 +676,7 @@ def _with_infer(cls, *args, **kwargs): # "Union[ExtensionArray, ndarray[Any, Any]]"; expected # "ndarray[Any, Any]" values = lib.maybe_convert_objects(result._values) # type: ignore[arg-type] - if values.dtype.kind in ["i", "u", "f"]: + if values.dtype.kind in ["i", "u", "f", "b"]: return Index(values, name=result.name) return result @@ -837,6 +841,8 @@ def _engine(self) -> libindex.IndexEngine: # to avoid a reference cycle, bind `target_values` to a local variable, so # `self` is not passed into the lambda. target_values = self._get_engine_target() + if target_values.dtype == bool: + return libindex.BoolEngine(target_values) return self._engine_type(target_values) @final @@ -2548,6 +2554,8 @@ def _is_all_dates(self) -> bool: """ Whether or not the index values only consist of dates. """ + if self.dtype.kind == "b": + return False return is_datetime_array(ensure_object(self._values)) @cache_readonly @@ -7048,7 +7056,7 @@ def _maybe_cast_data_without_dtype( FutureWarning, stacklevel=3, ) - if result.dtype.kind in ["b", "c"]: + if result.dtype.kind in ["c"]: return subarr result = ensure_wrapped_if_datetimelike(result) return result diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index f40f227259998..1133a0e5e8d7b 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -925,6 +925,8 @@ def to_datetime( result = convert_listlike(arg, format) else: result = convert_listlike(np.array([arg]), format)[0] + if isinstance(arg, bool) and isinstance(result, np.bool_): + result = bool(result) # TODO: avoid this kludge. 
# error: Incompatible return value type (got "Union[Timestamp, NaTType, # Series, Index]", expected "Union[DatetimeIndex, Series, float, str, diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py index 02899bac14bb2..6f57984f2ddba 100644 --- a/pandas/core/util/hashing.py +++ b/pandas/core/util/hashing.py @@ -311,7 +311,7 @@ def _hash_ndarray( # First, turn whatever array this is into unsigned 64-bit ints, if we can # manage it. - elif isinstance(dtype, bool): + elif dtype == bool: vals = vals.astype("u8") elif issubclass(dtype.type, (np.datetime64, np.timedelta64)): vals = vals.view("i8").astype("u8", copy=False) diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 6b6caf1f8affd..4c106b18c5135 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -492,7 +492,11 @@ def test_fillna(self, index): # GH 11343 if len(index) == 0: return - elif isinstance(index, NumericIndex) and is_integer_dtype(index.dtype): + elif ( + isinstance(index, NumericIndex) + and is_integer_dtype(index.dtype) + or index.dtype == bool + ): return elif isinstance(index, MultiIndex): idx = index.copy(deep=True) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index c74a566cc573d..5fc9ce5c989f7 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -320,15 +320,21 @@ def test_view_with_args(self, index): "unicode", "string", pytest.param("categorical", marks=pytest.mark.xfail(reason="gh-25464")), - "bool", + "bool-object", + "bool-dtype", "empty", ], indirect=True, ) def test_view_with_args_object_array_raises(self, index): - msg = "Cannot change data-type for object array" - with pytest.raises(TypeError, match=msg): - index.view("i8") + if index.dtype == bool: + msg = "When changing to a larger dtype" + with pytest.raises(ValueError, match=msg): + index.view("i8") + else: + msg = "Cannot change data-type for object array" + with pytest.raises(TypeError, 
match=msg): + index.view("i8") @pytest.mark.parametrize("index", ["int", "range"], indirect=True) def test_astype(self, index): @@ -587,7 +593,8 @@ def test_append_empty_preserve_name(self, name, expected): "index, expected", [ ("string", False), - ("bool", False), + ("bool-object", False), + ("bool-dtype", False), ("categorical", False), ("int", True), ("datetime", False), @@ -602,7 +609,8 @@ def test_is_numeric(self, index, expected): "index, expected", [ ("string", True), - ("bool", True), + ("bool-object", True), + ("bool-dtype", False), ("categorical", False), ("int", False), ("datetime", False), @@ -617,7 +625,8 @@ def test_is_object(self, index, expected): "index, expected", [ ("string", False), - ("bool", False), + ("bool-object", False), + ("bool-dtype", False), ("categorical", False), ("int", False), ("datetime", True), diff --git a/pandas/tests/indexes/test_index_new.py b/pandas/tests/indexes/test_index_new.py index f44bbac1226e1..20d582bd234c6 100644 --- a/pandas/tests/indexes/test_index_new.py +++ b/pandas/tests/indexes/test_index_new.py @@ -73,7 +73,7 @@ def test_constructor_dtypes_to_object(self, cast_index, vals): index = Index(vals) assert type(index) is Index - assert index.dtype == object + assert index.dtype == bool def test_constructor_categorical_to_object(self): # GH#32167 Categorical data and dtype=object should return object-dtype diff --git a/pandas/tests/indexes/test_numpy_compat.py b/pandas/tests/indexes/test_numpy_compat.py index 573ee987ab4c8..179c44498ef05 100644 --- a/pandas/tests/indexes/test_numpy_compat.py +++ b/pandas/tests/indexes/test_numpy_compat.py @@ -51,7 +51,7 @@ def test_numpy_ufuncs_basic(index, func): with tm.external_error_raised((TypeError, AttributeError)): with np.errstate(all="ignore"): func(index) - elif isinstance(index, NumericIndex): + elif isinstance(index, NumericIndex) or index.dtype == bool: # coerces to float (e.g. 
np.sin) with np.errstate(all="ignore"): result = func(index) @@ -89,7 +89,7 @@ def test_numpy_ufuncs_other(index, func, request): with tm.external_error_raised(TypeError): func(index) - elif isinstance(index, NumericIndex): + elif isinstance(index, NumericIndex) or index.dtype == bool: # Results in bool array result = func(index) assert isinstance(result, np.ndarray) diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index 0dc2a9933cfc4..7d1cd1d8945f2 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -697,7 +697,7 @@ def test_index_where(self, obj, key, expected, val, request): mask[key] = True res = Index(obj).where(~mask, val) - tm.assert_index_equal(res, Index(expected)) + tm.assert_index_equal(res, Index(expected, dtype=expected.dtype)) def test_index_putmask(self, obj, key, expected, val): if Index(obj).dtype != obj.dtype: @@ -708,7 +708,7 @@ def test_index_putmask(self, obj, key, expected, val): mask[key] = True res = Index(obj).putmask(mask, val) - tm.assert_index_equal(res, Index(expected)) + tm.assert_index_equal(res, Index(expected, dtype=expected.dtype)) @pytest.mark.parametrize( diff --git a/pandas/tests/series/methods/test_drop.py b/pandas/tests/series/methods/test_drop.py index a566f8f62d72e..d1d216feea86d 100644 --- a/pandas/tests/series/methods/test_drop.py +++ b/pandas/tests/series/methods/test_drop.py @@ -54,7 +54,8 @@ def test_drop_with_ignore_errors(): # GH 8522 s = Series([2, 3], index=[True, False]) - assert s.index.is_object() + assert not s.index.is_object() + assert s.index.dtype == bool result = s.drop(True) expected = Series([3], index=[False]) tm.assert_series_equal(result, expected) From 5142a2dbee46ce5bdbba18418763c7bdf8a5a4f0 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 28 Dec 2021 12:05:05 -0800 Subject: [PATCH 02/13] failing tests --- pandas/core/algorithms.py | 11 +++++++++-- pandas/core/indexes/base.py | 7 ++++++- 
pandas/tests/base/test_value_counts.py | 2 +- pandas/tests/extension/test_boolean.py | 4 ++-- pandas/tests/indexes/common.py | 2 ++ pandas/tests/indexes/multi/test_indexing.py | 3 ++- pandas/tests/indexes/test_any_index.py | 4 ++++ pandas/tests/indexes/test_common.py | 7 +++++++ pandas/tests/reshape/concat/test_append_common.py | 11 ++++++----- pandas/tests/series/methods/test_value_counts.py | 2 +- pandas/tests/series/test_logical_ops.py | 6 ++++-- pandas/tests/test_algos.py | 8 +++++++- 12 files changed, 51 insertions(+), 16 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 85beef3c97afb..f174cc0a114d9 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -819,7 +819,10 @@ def value_counts( ------- Series """ - from pandas.core.series import Series + from pandas import ( + Index, + Series, + ) name = getattr(values, "name", None) @@ -857,7 +860,11 @@ def value_counts( else: keys, counts = value_counts_arraylike(values, dropna) - result = Series(counts, index=keys, name=name) + keys2 = keys + keys2 = Index._with_infer(keys) + if keys2.dtype.kind == "b" and keys.dtype == object: + keys2 = Index(keys, dtype=object) + result = Series(counts, index=keys2, name=name) if sort: result = result.sort_values(ascending=ascending) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index fa40a73dd4d9b..0dcd3ed8d28ac 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -370,7 +370,6 @@ def _outer_indexer( _comparables: list[str] = ["name"] _attributes: list[str] = ["name"] _is_numeric_dtype: bool = False - _can_hold_na: bool = True _can_hold_strings: bool = True # Whether this index is a NumericIndex, but not a Int64Index, Float64Index, @@ -2099,6 +2098,12 @@ def _get_grouper_for_level(self, mapper, *, level=None): # -------------------------------------------------------------------- # Introspection Methods + @cache_readonly + def _can_hold_na(self) -> bool: + if self.dtype == 
bool: + return False + return True + @final @property def is_monotonic(self) -> bool: diff --git a/pandas/tests/base/test_value_counts.py b/pandas/tests/base/test_value_counts.py index 6130646bb52c5..db10cd303d633 100644 --- a/pandas/tests/base/test_value_counts.py +++ b/pandas/tests/base/test_value_counts.py @@ -276,7 +276,7 @@ def test_value_counts_with_nan(dropna, index_or_series): obj = klass(values) res = obj.value_counts(dropna=dropna) if dropna is True: - expected = Series([1], index=[True]) + expected = Series([1], index=Index([True], dtype=obj.dtype)) else: expected = Series([1, 1, 1], index=[True, pd.NA, np.nan]) tm.assert_series_equal(res, expected) diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py index 0212610ec270f..0aec82966d335 100644 --- a/pandas/tests/extension/test_boolean.py +++ b/pandas/tests/extension/test_boolean.py @@ -261,7 +261,7 @@ def test_groupby_extension_agg(self, as_index, data_for_grouping): _, uniques = pd.factorize(data_for_grouping, sort=True) if as_index: - index = pd.Index(uniques, name="B") + index = pd.Index(uniques.astype(bool), name="B", dtype=bool) expected = pd.Series([3.0, 1.0], index=index, name="A") self.assert_series_equal(result, expected) else: @@ -289,7 +289,7 @@ def test_groupby_extension_no_sort(self, data_for_grouping): result = df.groupby("B", sort=False).A.mean() _, index = pd.factorize(data_for_grouping, sort=False) - index = pd.Index(index, name="B") + index = pd.Index(index.astype(bool), name="B") expected = pd.Series([1.0, 3.0], index=index, name="A") self.assert_series_equal(result, expected) diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 4c106b18c5135..a2c8dac161504 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -215,6 +215,8 @@ def test_ensure_copied_data(self, index): # RangeIndex cannot be initialized from data # MultiIndex and CategoricalIndex are tested separately return + elif 
index.dtype == object and index.inferred_type == "boolean": + init_kwargs["dtype"] = index.dtype index_type = type(index) result = index_type(index.values, copy=True, **init_kwargs) diff --git a/pandas/tests/indexes/multi/test_indexing.py b/pandas/tests/indexes/multi/test_indexing.py index 34722ad388ae0..d11fe5273b492 100644 --- a/pandas/tests/indexes/multi/test_indexing.py +++ b/pandas/tests/indexes/multi/test_indexing.py @@ -623,7 +623,8 @@ def test_get_loc_implicit_cast(self, level, dtypes): def test_get_loc_cast_bool(self): # GH 19086 : int is casted to bool, but not vice-versa - levels = [[False, True], np.arange(2, dtype="int64")] + # TODO: fails if we dont make levels[0] object-dtype + levels = [Index([False, True], dtype=object), np.arange(2, dtype="int64")] idx = MultiIndex.from_product(levels) assert idx.get_loc((0, 1)) == 1 diff --git a/pandas/tests/indexes/test_any_index.py b/pandas/tests/indexes/test_any_index.py index 93dd0f3c0a770..4bb04426eb24d 100644 --- a/pandas/tests/indexes/test_any_index.py +++ b/pandas/tests/indexes/test_any_index.py @@ -46,6 +46,10 @@ def test_mutability(index): def test_map_identity_mapping(index): # GH#12766 result = index.map(lambda x: x) + if index.dtype == object and result.dtype == bool: + assert (index == result).all() + # TODO: could work that into the 'exact="equiv"'? + return # FIXME: doesn't belong in this file anymore! 
tm.assert_index_equal(result, index, exact="equiv") diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py index 50770f5bb38f2..26e2dd00ec2f7 100644 --- a/pandas/tests/indexes/test_common.py +++ b/pandas/tests/indexes/test_common.py @@ -92,6 +92,8 @@ def test_constructor_non_hashable_name(self, index_flat): def test_constructor_unwraps_index(self, index_flat): a = index_flat b = type(a)(a) + if a.dtype == object and b.dtype == bool: + b = b.astype(object) # FIXME: kludge tm.assert_equal(a._data, b._data) def test_to_flat_index(self, index_flat): @@ -432,6 +434,9 @@ def test_hasnans_isnans(self, index_flat): return elif isinstance(index, NumericIndex) and is_integer_dtype(index.dtype): return + elif index.dtype == bool: + # values[1] = np.nan below casts to True! + return values[1] = np.nan @@ -457,6 +462,8 @@ def test_sort_values_with_missing(index_with_missing, na_position): if isinstance(index_with_missing, CategoricalIndex): pytest.skip("missing value sorting order not well-defined") + if index_with_missing.dtype == bool: + pytest.skip("index_with_missing doesn't actually have missing") missing_count = np.sum(index_with_missing.isna()) not_na_vals = index_with_missing[index_with_missing.notna()].values diff --git a/pandas/tests/reshape/concat/test_append_common.py b/pandas/tests/reshape/concat/test_append_common.py index bb8027948c540..0e233df3fb104 100644 --- a/pandas/tests/reshape/concat/test_append_common.py +++ b/pandas/tests/reshape/concat/test_append_common.py @@ -61,10 +61,7 @@ def _check_expected_dtype(self, obj, label): considering not-supported dtypes """ if isinstance(obj, Index): - if label == "bool": - assert obj.dtype == "object" - else: - assert obj.dtype == label + assert obj.dtype == label elif isinstance(obj, Series): if label.startswith("period"): assert obj.dtype == "Period[M]" @@ -185,7 +182,7 @@ def test_concatlike_same_dtypes(self, item): with pytest.raises(TypeError, match=msg): pd.concat([Series(vals1), 
Series(vals2), vals3]) - def test_concatlike_dtypes_coercion(self, item, item2): + def test_concatlike_dtypes_coercion(self, item, item2, request): # GH 13660 typ1, vals1 = item typ2, vals2 = item2 @@ -209,8 +206,12 @@ def test_concatlike_dtypes_coercion(self, item, item2): # series coerces to numeric based on numpy rule # index doesn't because bool is object dtype exp_series_dtype = typ2 + mark = pytest.mark.xfail(reason="GH#39187 casting to object") + request.node.add_marker(mark) elif typ2 == "bool" and typ1 in ("int64", "float64"): exp_series_dtype = typ1 + mark = pytest.mark.xfail(reason="GH#39187 casting to object") + request.node.add_marker(mark) elif ( typ1 == "datetime64[ns, US/Eastern]" or typ2 == "datetime64[ns, US/Eastern]" diff --git a/pandas/tests/series/methods/test_value_counts.py b/pandas/tests/series/methods/test_value_counts.py index c914dba75dc35..1977bf88481a5 100644 --- a/pandas/tests/series/methods/test_value_counts.py +++ b/pandas/tests/series/methods/test_value_counts.py @@ -194,7 +194,7 @@ def test_value_counts_categorical_with_nan(self): ( Series([False, True, True, pd.NA]), True, - Series([2, 1], index=[True, False]), + Series([2, 1], index=pd.Index([True, False], dtype=object)), ), ( Series(range(3), index=[True, False, np.nan]).index, diff --git a/pandas/tests/series/test_logical_ops.py b/pandas/tests/series/test_logical_ops.py index 9648b01492e02..38e3c5ec8a6f2 100644 --- a/pandas/tests/series/test_logical_ops.py +++ b/pandas/tests/series/test_logical_ops.py @@ -268,7 +268,9 @@ def test_logical_ops_with_index(self, op): def test_reversed_xor_with_index_returns_index(self): # GH#22092, GH#19792 ser = Series([True, True, False, False]) - idx1 = Index([True, False, True, False]) + idx1 = Index( + [True, False, True, False], dtype=object + ) # TODO: raises if bool-dtype idx2 = Index([1, 0, 1, 0]) msg = "operating as a set operation" @@ -325,7 +327,7 @@ def test_reversed_logical_op_with_index_returns_series(self, op): [ (ops.rand_, 
Index([False, True])), (ops.ror_, Index([False, True])), - (ops.rxor, Index([])), + (ops.rxor, Index([], dtype=bool)), ], ) def test_reverse_ops_with_index(self, op, expected): diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 0efe4a62c6152..d054864d1b03e 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -56,6 +56,12 @@ def test_factorize(self, index_or_series_obj, sort): if isinstance(obj, MultiIndex): constructor = MultiIndex.from_tuples expected_uniques = constructor(obj.unique()) + if ( + isinstance(obj, Index) + and expected_uniques.dtype == bool + and obj.dtype == object + ): + expected_uniques = expected_uniques.astype(object) if sort: expected_uniques = expected_uniques.sort_values() @@ -1240,7 +1246,7 @@ def test_dropna(self): tm.assert_series_equal( Series([True] * 3 + [False] * 2 + [None] * 5).value_counts(dropna=True), - Series([3, 2], index=[True, False]), + Series([3, 2], index=Index([True, False], dtype=object)), ) tm.assert_series_equal( Series([True] * 5 + [False] * 3 + [None] * 2).value_counts(dropna=False), From 0db360ae8cc038e7525827d795efa541e09fbbd5 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 28 Dec 2021 16:49:50 -0800 Subject: [PATCH 03/13] mypy fixup --- pandas/_libs/index.pyi | 2 +- pandas/core/indexes/numeric.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/index.pyi b/pandas/_libs/index.pyi index e0eb3d2f60b40..aff3680246609 100644 --- a/pandas/_libs/index.pyi +++ b/pandas/_libs/index.pyi @@ -41,7 +41,7 @@ class ObjectEngine(IndexEngine): ... class DatetimeEngine(Int64Engine): ... class TimedeltaEngine(DatetimeEngine): ... class PeriodEngine(Int64Engine): ... -class BoolEngine(Uint8Engine): ... +class BoolEngine(UInt8Engine): ... 
class BaseMultiIndexCodesEngine: levels: list[np.ndarray] diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index bb25813e9742b..80fc12ae5e8f0 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -102,9 +102,8 @@ class NumericIndex(Index): _can_hold_strings = False _is_backward_compat_public_numeric_index: bool = True - # error: Signature of "_can_hold_na" incompatible with supertype "Index" @cache_readonly - def _can_hold_na(self) -> bool: # type: ignore[override] + def _can_hold_na(self) -> bool: if is_float_dtype(self.dtype): return True else: From b67bab1937a42259d4a196e853a59882c06c22eb Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 30 Dec 2021 11:42:34 -0800 Subject: [PATCH 04/13] xfail --- pandas/tests/indexes/test_setops.py | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py index 648b79bd288df..2fb8735650fdd 100644 --- a/pandas/tests/indexes/test_setops.py +++ b/pandas/tests/indexes/test_setops.py @@ -46,12 +46,28 @@ def test_union_same_types(index): assert idx1.union(idx2).dtype == idx1.dtype -def test_union_different_types(index_flat, index_flat2): +def test_union_different_types(index_flat, index_flat2, request): # This test only considers combinations of indices # GH 23525 idx1 = index_flat idx2 = index_flat2 + if ( + not idx1.is_unique + and idx1.dtype.kind == "i" + and idx2.dtype.kind == "b" + and (idx1.has_duplicates and idx2.has_duplicates) + ) or ( + not idx2.is_unique + and idx2.dtype.kind == "i" + and idx1.dtype.kind == "b" + and (idx1.has_duplicates and idx2.has_duplicates) + ): + mark = pytest.mark.xfail( + reason="GH#44000 True==1", raises=ValueError, strict=False + ) + request.node.add_marker(mark) + common_dtype = find_common_type([idx1.dtype, idx2.dtype]) any_uint64 = idx1.dtype == np.uint64 or idx2.dtype == np.uint64 @@ -217,7 +233,11 @@ def test_union_base(self, 
index): def test_difference_base(self, sort, index): first = index[2:] second = index[:4] - if isinstance(index, CategoricalIndex) or index.is_boolean(): + if index.is_boolean(): + # I think (TODO: be sure) there are assumptions baked in about + # the index fixture that don't hold here? + answer = set(first).difference(set(second)) + elif isinstance(index, CategoricalIndex): answer = [] else: answer = index[4:] From 3b2f6e3ded2c4f0c5b18c83bcd20e31392c6e685 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 2 Jan 2022 08:51:27 -0800 Subject: [PATCH 05/13] fix Boolean GroupBy tests --- pandas/tests/extension/test_boolean.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py index 195ccb535ad86..fa712ee19a9d8 100644 --- a/pandas/tests/extension/test_boolean.py +++ b/pandas/tests/extension/test_boolean.py @@ -265,7 +265,7 @@ def test_groupby_extension_agg(self, as_index, data_for_grouping): _, uniques = pd.factorize(data_for_grouping, sort=True) if as_index: - index = pd.Index(uniques.astype(bool), name="B", dtype=bool) + index = pd.Index(uniques, name="B", dtype=bool) expected = pd.Series([3.0, 1.0], index=index, name="A") self.assert_series_equal(result, expected) else: @@ -293,7 +293,7 @@ def test_groupby_extension_no_sort(self, data_for_grouping): result = df.groupby("B", sort=False).A.mean() _, index = pd.factorize(data_for_grouping, sort=False) - index = pd.Index(index.astype(bool), name="B") + index = pd.Index(index, name="B") expected = pd.Series([1.0, 3.0], index=index, name="A") self.assert_series_equal(result, expected) From 298757ccb5a453ee2679c340b2cfcad6281ee4ff Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 7 Jan 2022 14:48:20 -0800 Subject: [PATCH 06/13] document value_counts behavior, skips and xfails --- pandas/conftest.py | 2 +- pandas/core/algorithms.py | 12 +++++++----- pandas/tests/extension/test_boolean.py | 2 +- pandas/tests/indexes/common.py | 9 
++++----- pandas/tests/indexes/multi/test_indexing.py | 20 ++++++++++++++------ pandas/tests/indexes/test_base.py | 4 ++-- pandas/tests/indexes/test_common.py | 8 +++----- 7 files changed, 32 insertions(+), 25 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 6a22a291a77a7..f6443379d0f9c 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -625,7 +625,7 @@ def index_flat_unique(request): key for key in indices_dict if not ( - key in ["int", "uint", "range", "empty", "repeats"] + key in ["int", "uint", "range", "empty", "repeats", "bool-dtype"] or key.startswith("num_") ) and not isinstance(indices_dict[key], MultiIndex) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index dbdc0e78ec8ea..2c85f6cb92f2a 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -859,11 +859,13 @@ def value_counts( else: keys, counts = value_counts_arraylike(values, dropna) - keys2 = keys - keys2 = Index._with_infer(keys) - if keys2.dtype.kind == "b" and keys.dtype == object: - keys2 = Index(keys, dtype=object) - result = Series(counts, index=keys2, name=name) + # For backwards compatibility, we let Index do its normal type + # inference, _except_ for if it infers from object to bool. 
+ idx = Index._with_infer(keys) + if idx.dtype == bool and keys.dtype == object: + idx = idx.astype(object) + + result = Series(counts, index=idx, name=name) if sort: result = result.sort_values(ascending=ascending) diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py index fa712ee19a9d8..1f44889cdd88a 100644 --- a/pandas/tests/extension/test_boolean.py +++ b/pandas/tests/extension/test_boolean.py @@ -265,7 +265,7 @@ def test_groupby_extension_agg(self, as_index, data_for_grouping): _, uniques = pd.factorize(data_for_grouping, sort=True) if as_index: - index = pd.Index(uniques, name="B", dtype=bool) + index = pd.Index(uniques, name="B") expected = pd.Series([3.0, 1.0], index=index, name="A") self.assert_series_equal(result, expected) else: diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 14d4cefddb578..b01d34cc46478 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -522,11 +522,10 @@ def test_fillna(self, index): # GH 11343 if len(index) == 0: return - elif ( - isinstance(index, NumericIndex) - and is_integer_dtype(index.dtype) - or index.dtype == bool - ): + elif index.dtype == bool: + # can't hold NAs + return + elif isinstance(index, NumericIndex) and is_integer_dtype(index.dtype): return elif isinstance(index, MultiIndex): idx = index.copy(deep=True) diff --git a/pandas/tests/indexes/multi/test_indexing.py b/pandas/tests/indexes/multi/test_indexing.py index 49cf935b103ff..4efb8d23b5865 100644 --- a/pandas/tests/indexes/multi/test_indexing.py +++ b/pandas/tests/indexes/multi/test_indexing.py @@ -621,14 +621,22 @@ def test_get_loc_implicit_cast(self, level, dtypes): idx = MultiIndex.from_product(levels) assert idx.get_loc(tuple(key)) == 3 - def test_get_loc_cast_bool(self): - # GH 19086 : int is casted to bool, but not vice-versa - # TODO: fails if we dont make levels[0] object-dtype - levels = [Index([False, True], dtype=object), np.arange(2, 
dtype="int64")] + @pytest.mark.parametrize("dtype", [bool, object]) + def test_get_loc_cast_bool(self, dtype): + # GH 19086 : int is casted to bool, but not vice-versa (for object dtype) + # With bool dtype, we don't cast in either direction. + levels = [Index([False, True], dtype=dtype), np.arange(2, dtype="int64")] idx = MultiIndex.from_product(levels) - assert idx.get_loc((0, 1)) == 1 - assert idx.get_loc((1, 0)) == 2 + if dtype is bool: + with pytest.raises(KeyError, match=r"^\(0, 1\)$"): + assert idx.get_loc((0, 1)) == 1 + with pytest.raises(KeyError, match=r"^\(1, 0\)$"): + assert idx.get_loc((1, 0)) == 2 + else: + # We use python object comparisons, which treat 0 == False and 1 == True + assert idx.get_loc((0, 1)) == 1 + assert idx.get_loc((1, 0)) == 2 with pytest.raises(KeyError, match=r"^\(False, True\)$"): idx.get_loc((False, True)) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 2ac25b4c3aa08..0fe8c7ca2bd23 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -403,9 +403,9 @@ def test_is_(self): def test_asof_numeric_vs_bool_raises(self): left = Index([1, 2, 3]) - right = Index([True, False]) + right = Index([True, False], dtype=object) - msg = "Cannot compare dtypes int64 and object" + msg = "Cannot compare dtypes int64 and bool" with pytest.raises(TypeError, match=msg): left.asof(right[0]) # TODO: should right.asof(left[0]) also raise? diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py index ada6b646e17ba..5a0477ddfa165 100644 --- a/pandas/tests/indexes/test_common.py +++ b/pandas/tests/indexes/test_common.py @@ -91,9 +91,9 @@ def test_constructor_non_hashable_name(self, index_flat): def test_constructor_unwraps_index(self, index_flat): a = index_flat - b = type(a)(a) - if a.dtype == object and b.dtype == bool: - b = b.astype(object) # FIXME: kludge + # Passing dtype is necessary for Index([True, False], dtype=object) + # case. 
+ b = type(a)(a, dtype=a.dtype) tm.assert_equal(a._data, b._data) def test_to_flat_index(self, index_flat): @@ -462,8 +462,6 @@ def test_sort_values_with_missing(index_with_missing, na_position): if isinstance(index_with_missing, CategoricalIndex): pytest.skip("missing value sorting order not well-defined") - if index_with_missing.dtype == bool: - pytest.skip("index_with_missing doesn't actually have missing") missing_count = np.sum(index_with_missing.isna()) not_na_vals = index_with_missing[index_with_missing.notna()].values From ad59ca1957a01d5293793b397dae893d3e33f09d Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 25 Jan 2022 18:19:18 -0800 Subject: [PATCH 07/13] post-merge cleanup --- pandas/core/indexes/base.py | 4 +++- pandas/core/indexes/numeric.py | 7 ------- pandas/tests/series/indexing/test_setitem.py | 8 ++++---- 3 files changed, 7 insertions(+), 12 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 3aaee7e7e17a9..e611179ce14f7 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2213,7 +2213,9 @@ def _get_grouper_for_level(self, mapper, *, level=None): @cache_readonly def _can_hold_na(self) -> bool: - if self.dtype == bool: + if isinstance(self.dtype, ExtensionDtype): + return self.dtype._can_hold_na + if self.dtype.kind in ["i", "u", "b"]: return False return True diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index 606bf0f0e7345..3d7182792aa08 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -92,13 +92,6 @@ class NumericIndex(Index): _can_hold_strings = False _is_backward_compat_public_numeric_index: bool = True - @cache_readonly - def _can_hold_na(self) -> bool: - if is_float_dtype(self.dtype): - return True - else: - return False - _engine_types: dict[np.dtype, type[libindex.IndexEngine]] = { np.dtype(np.int8): libindex.Int8Engine, np.dtype(np.int16): libindex.Int16Engine, diff --git 
a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index 5bdf535479bb6..3fbd25c8daabb 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -715,8 +715,8 @@ def test_series_where(self, obj, key, expected, val, is_inplace): self._check_inplace(is_inplace, orig, arr, obj) def test_index_where(self, obj, key, expected, val, request): - if obj.dtype == bool or obj.dtype.kind == "c" or expected.dtype.kind == "c": - # TODO(GH#45061): Should become unreachable (at least the bool part) + if obj.dtype.kind == "c" or expected.dtype.kind == "c": + # TODO(Index[complex]): Should become unreachable pytest.skip("test not applicable for this dtype") mask = np.zeros(obj.shape, dtype=bool) @@ -726,8 +726,8 @@ def test_index_where(self, obj, key, expected, val, request): tm.assert_index_equal(res, Index(expected, dtype=expected.dtype)) def test_index_putmask(self, obj, key, expected, val): - if obj.dtype == bool or obj.dtype.kind == "c" or expected.dtype.kind == "c": - # TODO(GH#45061): Should become unreachable (at least the bool part) + if obj.dtype.kind == "c" or expected.dtype.kind == "c": + # TODO(Index[complex]): Should become unreachable pytest.skip("test not applicable for this dtype") mask = np.zeros(obj.shape, dtype=bool) From 44a49af5464c0be25d7abbc9f2aec9a99609f99d Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 26 Jan 2022 08:17:32 -0800 Subject: [PATCH 08/13] fix should_compare --- pandas/core/indexes/base.py | 4 ++++ pandas/core/indexes/numeric.py | 5 ----- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index e611179ce14f7..03702c5921704 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -6161,6 +6161,10 @@ def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: """ Can we compare values of the given dtype to our own? 
""" + if self.dtype.kind == "b": + return dtype.kind == "b" + if is_numeric_dtype(self.dtype): + return is_numeric_dtype(dtype) return True @final diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index 3d7182792aa08..f5fffa1234130 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -14,7 +14,6 @@ ) from pandas._typing import ( Dtype, - DtypeObj, npt, ) from pandas.util._decorators import ( @@ -277,10 +276,6 @@ def _convert_tolerance(self, tolerance, target): ) return tolerance - def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: - # If we ever have BoolIndex or ComplexIndex, this may need to be tightened - return is_numeric_dtype(dtype) - @classmethod def _assert_safe_casting(cls, data: np.ndarray, subarr: np.ndarray) -> None: """ From 9b57f1333d03a29ffd185010ff4af6cef7562a1c Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 29 Jan 2022 11:54:02 -0800 Subject: [PATCH 09/13] whatsnew --- doc/source/whatsnew/v1.5.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 1d4054d5ea0f1..2f1fb4ecdf214 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -36,6 +36,7 @@ Other enhancements - Improved the rendering of ``categories`` in :class:`CategoricalIndex` (:issue:`45218`) - :meth:`to_numeric` now preserves float64 arrays when downcasting would generate values not representable in float32 (:issue:`43693`) - :meth:`.GroupBy.min` and :meth:`.GroupBy.max` now supports `Numba `_ execution with the ``engine`` keyword (:issue:`45428`) +- Implemented a ``bool``-dtype :class:`Index`, passing a bool-dtype arraylike to ``pd.Index`` will now retain ``bool`` dtype instead of casting to ``object`` (:issue:`45061`) - .. 
--------------------------------------------------------------------------- From 941986cd7786418a53e98f20a552e7869da76dbd Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 30 Jan 2022 11:05:53 -0800 Subject: [PATCH 10/13] patch IntervalIndex._can_hold_na --- pandas/core/indexes/interval.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 5250f19c839bf..002e5ad7ece46 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -201,6 +201,11 @@ class IntervalIndex(ExtensionIndex): _can_hold_strings = False _data_cls = IntervalArray + # FIXME: this is inaccurate for integer-backed IntervalArray, but + # is the pre-existing behavior before GH#45061 (Index[bool]). + # Without this, other.categories.take raises in IntervalArray._cmp_method + _can_hold_na = True + # -------------------------------------------------------------------- # Constructors From 3dd23bb5a4b4a43c904bcc43e76972f6cdb00a0f Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 30 Jan 2022 16:15:06 -0800 Subject: [PATCH 11/13] remove duplicate line --- pandas/tests/series/methods/test_drop.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/series/methods/test_drop.py b/pandas/tests/series/methods/test_drop.py index ee558d063823b..a625e890393a6 100644 --- a/pandas/tests/series/methods/test_drop.py +++ b/pandas/tests/series/methods/test_drop.py @@ -54,7 +54,6 @@ def test_drop_with_ignore_errors(): # GH 8522 ser = Series([2, 3], index=[True, False]) - ser = Series([2, 3], index=[True, False]) assert not ser.index.is_object() assert ser.index.dtype == bool result = ser.drop(True) From 8c91b1c34017fcb76557c74bf795c429ef2e208e Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 1 Feb 2022 12:20:02 -0800 Subject: [PATCH 12/13] fix _check_type for BoolEngine --- pandas/_libs/index.pyx | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 
6250820934eaa..0e9a330587f07 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -806,6 +806,7 @@ cdef class BoolEngine(UInt8Engine): cdef _check_type(self, object val): if not util.is_bool_object(val): raise KeyError(val) + return val @cython.internal From c46d3a3a98fa671ad99da6717acb663dcd821752 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 1 Feb 2022 18:01:54 -0800 Subject: [PATCH 13/13] remove no-longer-needed --- pandas/core/indexes/interval.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 30c246bc9913a..1e39c1db1a73b 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -201,11 +201,6 @@ class IntervalIndex(ExtensionIndex): _can_hold_strings = False _data_cls = IntervalArray - # FIXME: this is inaccurate for integer-backed IntervalArray, but - # is the pre-existing behavior before GH#45061 (Index[bool]). - # Without this, other.categories.take raises in IntervalArray._cmp_method - _can_hold_na = True - # -------------------------------------------------------------------- # Constructors