Add a mask-aware unique path for masked extension arrays (GH#48019).

- {{name}}HashTable._unique/unique accept a mask and can return a result mask.
- algorithms.unique_with_mask forwards the mask to the hashtable.
- BaseMaskedArray.unique is built on algorithms.unique_with_mask.
- Adds an asv benchmark and a regression test.

diff --git a/asv_bench/benchmarks/hash_functions.py b/asv_bench/benchmarks/hash_functions.py
index d9a291dc27125..da752b902b4fd 100644
--- a/asv_bench/benchmarks/hash_functions.py
+++ b/asv_bench/benchmarks/hash_functions.py
@@ -39,6 +39,21 @@ def time_unique(self, exponent):
         pd.unique(self.a2)
 
 
+class Unique:
+    params = ["Int64", "Float64"]
+    param_names = ["dtype"]
+
+    def setup(self, dtype):
+        self.ser = pd.Series(([1, pd.NA, 2] + list(range(100_000))) * 3, dtype=dtype)
+        self.ser_unique = pd.Series(list(range(300_000)) + [pd.NA], dtype=dtype)
+
+    def time_unique_with_duplicates(self, dtype):
+        pd.unique(self.ser)
+
+    def time_unique(self, dtype):
+        pd.unique(self.ser_unique)
+
+
 class NumericSeriesIndexing:
 
     params = [
diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in
index 8a2b9c2f77627..3c9a1f86ad2a1 100644
--- a/pandas/_libs/hashtable_class_helper.pxi.in
+++ b/pandas/_libs/hashtable_class_helper.pxi.in
@@ -521,7 +521,7 @@ cdef class {{name}}HashTable(HashTable):
     def _unique(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
                 Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
                 object na_value=None, bint ignore_na=False,
-                object mask=None, bint return_inverse=False):
+                object mask=None, bint return_inverse=False, bint use_result_mask=False):
         """
         Calculate unique values and labels (no sorting!)
 
@@ -551,6 +551,9 @@ cdef class {{name}}HashTable(HashTable):
         return_inverse : bool, default False
             Whether the mapping of the original array values to their location
             in the vector of uniques should be returned.
+        use_result_mask : bool, default False
+            Whether to create a result mask for the unique values. Not supported
+            with return_inverse=True.
 
         Returns
         -------
@@ -558,6 +561,8 @@ cdef class {{name}}HashTable(HashTable):
             Unique values of input, not sorted
         labels : ndarray[intp_t] (if return_inverse=True)
             The labels from values to uniques
+        result_mask : ndarray[bool], if use_result_mask is true
+            The mask for the result values.
         """
         cdef:
             Py_ssize_t i, idx, count = count_prior, n = len(values)
@@ -566,7 +571,9 @@ cdef class {{name}}HashTable(HashTable):
             {{c_type}} val, na_value2
             khiter_t k
             {{name}}VectorData *ud
-            bint use_na_value, use_mask
+            UInt8Vector result_mask
+            UInt8VectorData *rmd
+            bint use_na_value, use_mask, seen_na = False
             uint8_t[:] mask_values
 
         if return_inverse:
@@ -574,6 +581,14 @@ cdef class {{name}}HashTable(HashTable):
         ud = uniques.data
         use_na_value = na_value is not None
         use_mask = mask is not None
+        if not use_mask and use_result_mask:
+            raise NotImplementedError  # pragma: no cover
+
+        if use_result_mask and return_inverse:
+            raise NotImplementedError  # pragma: no cover
+
+        result_mask = UInt8Vector()
+        rmd = result_mask.data
 
         if use_mask:
             mask_values = mask.view("uint8")
@@ -605,6 +620,27 @@ cdef class {{name}}HashTable(HashTable):
                     # and replace the corresponding label with na_sentinel
                     labels[i] = na_sentinel
                     continue
+                elif not ignore_na and use_result_mask:
+                    if mask_values[i]:
+                        if seen_na:
+                            continue
+
+                        seen_na = True
+                        if needs_resize(ud):
+                            with gil:
+                                if uniques.external_view_exists:
+                                    raise ValueError("external reference to "
+                                                     "uniques held, but "
+                                                     "Vector.resize() needed")
+                                uniques.resize()
+                                if result_mask.external_view_exists:
+                                    raise ValueError("external reference to "
+                                                     "result_mask held, but "
+                                                     "Vector.resize() needed")
+                                result_mask.resize()
+                        append_data_{{dtype}}(ud, val)
+                        append_data_uint8(rmd, 1)
+                        continue
 
                 k = kh_get_{{dtype}}(self.table, val)
 
@@ -619,7 +655,16 @@ cdef class {{name}}HashTable(HashTable):
                                                  "uniques held, but "
                                                  "Vector.resize() needed")
                             uniques.resize()
+                            if use_result_mask:
+                                if result_mask.external_view_exists:
+                                    raise ValueError("external reference to "
+                                                     "result_mask held, but "
+                                                     "Vector.resize() needed")
+                                result_mask.resize()
                     append_data_{{dtype}}(ud, val)
+                    if use_result_mask:
+                        append_data_uint8(rmd, 0)
+
                     if return_inverse:
                         self.table.vals[k] = count
                         labels[i] = count
@@ -632,9 +677,11 @@ cdef class {{name}}HashTable(HashTable):
 
         if return_inverse:
             return uniques.to_array(), labels.base  # .base -> underlying ndarray
+        if use_result_mask:
+            return uniques.to_array(), result_mask.to_array()
         return uniques.to_array()
 
-    def unique(self, const {{dtype}}_t[:] values, bint return_inverse=False):
+    def unique(self, const {{dtype}}_t[:] values, bint return_inverse=False, object mask=None):
         """
         Calculate unique values and labels (no sorting!)
 
@@ -645,6 +692,9 @@ cdef class {{name}}HashTable(HashTable):
         return_inverse : bool, default False
             Whether the mapping of the original array values to their location
             in the vector of uniques should be returned.
+        mask : ndarray[bool], optional
+            If not None, the mask is used as indicator for missing values
+            (True = missing, False = valid).
 
         Returns
         -------
@@ -652,10 +702,13 @@ cdef class {{name}}HashTable(HashTable):
             Unique values of input, not sorted
         labels : ndarray[intp_t] (if return_inverse)
             The labels from values to uniques
+        result_mask : ndarray[bool], if mask is given as input
+            The mask for the result values.
         """
         uniques = {{name}}Vector()
+        use_result_mask = mask is not None
         return self._unique(values, uniques, ignore_na=False,
-                            return_inverse=return_inverse)
+                            return_inverse=return_inverse, mask=mask, use_result_mask=use_result_mask)
 
     def factorize(self, const {{dtype}}_t[:] values, Py_ssize_t na_sentinel=-1,
                   object na_value=None, object mask=None):
@@ -1013,7 +1066,7 @@ cdef class StringHashTable(HashTable):
             return uniques.to_array(), labels.base  # .base -> underlying ndarray
         return uniques.to_array()
 
-    def unique(self, ndarray[object] values, bint return_inverse=False):
+    def unique(self, ndarray[object] values, bint return_inverse=False, object mask=None):
         """
         Calculate unique values and labels (no sorting!)
 
@@ -1024,6 +1077,8 @@ cdef class StringHashTable(HashTable):
         return_inverse : bool, default False
             Whether the mapping of the original array values to their location
             in the vector of uniques should be returned.
+        mask : ndarray[bool], optional
+            Not yet implemented for StringHashTable
 
         Returns
         -------
@@ -1266,7 +1321,7 @@ cdef class PyObjectHashTable(HashTable):
             return uniques.to_array(), labels.base  # .base -> underlying ndarray
         return uniques.to_array()
 
-    def unique(self, ndarray[object] values, bint return_inverse=False):
+    def unique(self, ndarray[object] values, bint return_inverse=False, object mask=None):
         """
         Calculate unique values and labels (no sorting!)
 
@@ -1277,6 +1332,8 @@ cdef class PyObjectHashTable(HashTable):
         return_inverse : bool, default False
             Whether the mapping of the original array values to their location
             in the vector of uniques should be returned.
+        mask : ndarray[bool], optional
+            Not yet implemented for PyObjectHashTable
 
         Returns
         -------
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index a4736c2a141a5..1a5cda357296e 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -404,6 +404,11 @@ def unique(values):
     >>> pd.unique([("a", "b"), ("b", "a"), ("a", "c"), ("b", "a")])
     array([('a', 'b'), ('b', 'a'), ('a', 'c')], dtype=object)
     """
+    return unique_with_mask(values)
+
+
+def unique_with_mask(values, mask: npt.NDArray[np.bool_] | None = None):
+    """See algorithms.unique for docs. Takes a mask for masked arrays."""
     values = _ensure_arraylike(values)
 
     if is_extension_array_dtype(values.dtype):
@@ -414,9 +419,16 @@ def unique(values):
     htable, values = _get_hashtable_algo(values)
 
     table = htable(len(values))
-    uniques = table.unique(values)
-    uniques = _reconstruct_data(uniques, original.dtype, original)
-    return uniques
+    if mask is None:
+        uniques = table.unique(values)
+        uniques = _reconstruct_data(uniques, original.dtype, original)
+        return uniques
+
+    else:
+        uniques, mask = table.unique(values, mask=mask)
+        uniques = _reconstruct_data(uniques, original.dtype, original)
+        assert mask is not None  # for mypy
+        return uniques, mask.astype("bool")
 
 
 unique1d = unique
diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py
index 128c7e44f5075..15946ab9ce80d 100644
--- a/pandas/core/arrays/masked.py
+++ b/pandas/core/arrays/masked.py
@@ -851,6 +851,17 @@ def copy(self: BaseMaskedArrayT) -> BaseMaskedArrayT:
         mask = mask.copy()
         return type(self)(data, mask, copy=False)
 
+    def unique(self: BaseMaskedArrayT) -> BaseMaskedArrayT:
+        """
+        Compute the BaseMaskedArray of unique values.
+
+        Returns
+        -------
+        uniques : BaseMaskedArray
+        """
+        uniques, mask = algos.unique_with_mask(self._data, self._mask)
+        return type(self)(uniques, mask, copy=False)
+
     @doc(ExtensionArray.searchsorted)
     def searchsorted(
         self,
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
index def63c552e059..fd617dcb4565e 100644
--- a/pandas/tests/test_algos.py
+++ b/pandas/tests/test_algos.py
@@ -834,6 +834,13 @@ def test_do_not_mangle_na_values(self, unique_nulls_fixture, unique_nulls_fixtur
         assert a[0] is unique_nulls_fixture
         assert a[1] is unique_nulls_fixture2
 
+    def test_unique_masked(self, any_numeric_ea_dtype):
+        # GH#48019
+        ser = Series([1, pd.NA, 2] * 3, dtype=any_numeric_ea_dtype)
+        result = pd.unique(ser)
+        expected = pd.array([1, pd.NA, 2], dtype=any_numeric_ea_dtype)
+        tm.assert_extension_array_equal(result, expected)
+
 
 class TestIsin:
     def test_invalid(self):