From 3867f2d3a183f2bb7ee890b064045cfb0b26b322 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 3 Mar 2020 11:54:24 -0800 Subject: [PATCH 1/3] ENH: Categorical.fillna allow Categorical/ndarray --- pandas/core/algorithms.py | 18 +++++++++++------- pandas/core/arrays/categorical.py | 8 ++++++-- pandas/core/generic.py | 2 ++ pandas/core/internals/managers.py | 6 +----- .../tests/arrays/categorical/test_missing.py | 13 +++++++++++++ 5 files changed, 33 insertions(+), 14 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index f9059054ba59f..1a51101bc8db8 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -11,6 +11,7 @@ from pandas._libs import Timestamp, algos, hashtable as htable, lib from pandas._libs.tslib import iNaT +from pandas._typing import AnyArrayLike from pandas.util._decorators import doc from pandas.core.dtypes.cast import ( @@ -45,10 +46,14 @@ is_unsigned_integer_dtype, needs_i8_conversion, ) -from pandas.core.dtypes.generic import ABCIndex, ABCIndexClass, ABCSeries +from pandas.core.dtypes.generic import ( + ABCExtensionArray, + ABCIndex, + ABCIndexClass, + ABCSeries, +) from pandas.core.dtypes.missing import isna, na_value_for_dtype -import pandas.core.common as com from pandas.core.construction import array, extract_array from pandas.core.indexers import validate_indices @@ -384,7 +389,7 @@ def unique(values): unique1d = unique -def isin(comps, values) -> np.ndarray: +def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray: """ Compute the isin boolean array. @@ -409,15 +414,14 @@ def isin(comps, values) -> np.ndarray: f"to isin(), you passed a [{type(values).__name__}]" ) - if not isinstance(values, (ABCIndex, ABCSeries, np.ndarray)): + if not isinstance(values, (ABCIndex, ABCSeries, ABCExtensionArray, np.ndarray)): values = construct_1d_object_array_from_listlike(list(values)) + comps = extract_array(comps, extract_numpy=True) if is_categorical_dtype(comps): # TODO(extension) # handle categoricals - return comps._values.isin(values) - - comps = com.values_from_object(comps) + return comps.isin(values) # type: ignore comps, dtype = _ensure_data(comps) values, _ = _ensure_data(values, dtype=dtype) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 40a169d03f39c..f7b14f987ff82 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1738,8 +1738,12 @@ def fillna(self, value=None, method=None, limit=None): # If value is a dict or a Series (a dict value has already # been converted to a Series) - if isinstance(value, ABCSeries): - if not value[~value.isin(self.categories)].isna().all(): + if isinstance(value, (np.ndarray, Categorical, ABCSeries)): + # We get ndarray or Categorical if called via Series.fillna, + # where it will unwrap another aligned Series before getting here + + mask = ~algorithms.isin(value, self.categories) + if not isna(value[mask]).all(): raise ValueError("fill value must be in categories") values_codes = _get_codes_for_values(value, self.categories) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index f7eb79a4f1c78..ba9d9209baec9 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5988,6 +5988,8 @@ def fillna( value = create_series_with_explicit_dtype( value, dtype_if_empty=object ) + value = value.reindex(self.index, copy=False) + value = value._values elif not is_list_like(value): pass else: diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index c0e32066a8a70..25b1b605cc7ed 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -364,6 +364,7 @@ def apply(self, f, filter=None, **kwargs) -> "BlockManager": BlockManager """ result_blocks = [] + # fillna: Series/DataFrame is responsible for making sure value is aligned # filter kwarg is used in replace-* family of methods if filter is not None: @@ -388,11 +389,6 @@ def apply(self, f, filter=None, **kwargs) -> "BlockManager": align_keys = ["new", "mask"] else: align_keys = ["mask"] - elif f == "fillna": - # fillna internally does putmask, maybe it's better to do this - # at mgr, not block level? - align_copy = False - align_keys = ["value"] else: align_keys = [] diff --git a/pandas/tests/arrays/categorical/test_missing.py b/pandas/tests/arrays/categorical/test_missing.py index 8889f45a84237..204621f188ad9 100644 --- a/pandas/tests/arrays/categorical/test_missing.py +++ b/pandas/tests/arrays/categorical/test_missing.py @@ -82,3 +82,16 @@ def test_fillna_iterable_category(self, named): expected = Categorical([Point(0, 0), Point(0, 1), Point(0, 0)]) tm.assert_categorical_equal(result, expected) + + def test_fillna_array(self): + # accept Categorical or ndarray value if it holds appropriat values + cat = Categorical([1, 2, 3, None, None]) + + other = cat.fillna(2) + result = cat.fillna(other) + tm.assert_categorical_equal(result, other) + + other = np.array([1, 2, 3, 2, 1]) + result = cat.fillna(other) + expected = Categorical([1, 2, 3, 2, 1], dtype=cat.dtype) + tm.assert_categorical_equal(result, expected) From 7d4ee85b9a951751d6b45eadb10898dfc8c55ca6 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 3 Mar 2020 11:57:24 -0800 Subject: [PATCH 2/3] add test for algos.isin accepting Categorical --- pandas/tests/test_algos.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index a1de9c435c9ba..ad7028702ec8c 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -760,6 +760,16 @@ def test_categorical_from_codes(self): result = algos.isin(Sd, St) tm.assert_numpy_array_equal(expected, result) + def test_categorical_isin(self): + vals = np.array([0, 1, 2, 0]) + cats = ["a", "b", "c"] + cat = Categorical(1).from_codes(vals, cats) + other = Categorical(1).from_codes(np.array([0, 1]), cats) + + expected = np.array([True, True, False, True]) + result = algos.isin(cat, other) + tm.assert_numpy_array_equal(expected, result) + def test_same_nan_is_in(self): # GH 22160 # nan is special, because from " a is b" doesn't follow "a == b" From 12e4c2afc6112ba79189cbaab78f0703c99751f2 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 5 Mar 2020 18:13:59 -0800 Subject: [PATCH 3/3] Fix broken test --- pandas/core/arrays/categorical.py | 1 + pandas/tests/arrays/categorical/test_missing.py | 12 +++++++----- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index f7b14f987ff82..c6e6acfd47e9a 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1748,6 +1748,7 @@ def fillna(self, value=None, method=None, limit=None): values_codes = _get_codes_for_values(value, self.categories) indexer = np.where(codes == -1) + codes = codes.copy() codes[indexer] = values_codes[indexer] # If value is not a dict or Series it should be a scalar diff --git a/pandas/tests/arrays/categorical/test_missing.py b/pandas/tests/arrays/categorical/test_missing.py index 204621f188ad9..9eb3c8b3a8c48 100644 --- a/pandas/tests/arrays/categorical/test_missing.py +++ b/pandas/tests/arrays/categorical/test_missing.py @@ -84,14 +84,16 @@ def test_fillna_iterable_category(self, named): tm.assert_categorical_equal(result, expected) def test_fillna_array(self): - # accept Categorical or ndarray value if it holds appropriat values - cat = Categorical([1, 2, 3, None, None]) + # accept Categorical or ndarray value if it holds appropriate values + cat = Categorical(["A", "B", "C", None, None]) - other = cat.fillna(2) + other = cat.fillna("C") result = cat.fillna(other) tm.assert_categorical_equal(result, other) + assert isna(cat[-1]) # didnt modify original inplace - other = np.array([1, 2, 3, 2, 1]) + other = np.array(["A", "B", "C", "B", "A"]) result = cat.fillna(other) - expected = Categorical([1, 2, 3, 2, 1], dtype=cat.dtype) + expected = Categorical(["A", "B", "C", "B", "A"], dtype=cat.dtype) tm.assert_categorical_equal(result, expected) + assert isna(cat[-1]) # didnt modify original inplace