diff --git a/asv_bench/benchmarks/multiindex_object.py b/asv_bench/benchmarks/multiindex_object.py index a498c6b2e4944..782eb15234ea4 100644 --- a/asv_bench/benchmarks/multiindex_object.py +++ b/asv_bench/benchmarks/multiindex_object.py @@ -24,6 +24,13 @@ def setup(self): self.mi_small = MultiIndex.from_product( [np.arange(100), list("A"), list("A")], names=["one", "two", "three"] ) + self.mi_wide = MultiIndex.from_tuples( + [ + np.hstack([2, np.ones(99)]), + np.ones(100), + np.ones(100), + ], + ) def time_large_get_loc(self): self.mi_large.get_loc((999, 19, "Z")) @@ -32,6 +39,9 @@ def time_large_get_loc_warm(self): for _ in range(1000): self.mi_large.get_loc((999, 19, "Z")) + def time_wide_get_loc(self): + self.mi_wide.get_loc((1, 1)) + def time_med_get_loc(self): self.mi_med.get_loc((999, 9, "A")) diff --git a/pandas/_libs/algos.pyi b/pandas/_libs/algos.pyi index 0cc9209fbdfc5..3b9e320bd6889 100644 --- a/pandas/_libs/algos.pyi +++ b/pandas/_libs/algos.pyi @@ -34,6 +34,7 @@ def unique_deltas( arr: np.ndarray, # const int64_t[:] ) -> np.ndarray: ... # np.ndarray[np.int64, ndim=1] def is_lexsorted(list_of_arrays: list[npt.NDArray[np.int64]]) -> bool: ... +def lexsort_depth(list_of_arrays: list[npt.NDArray[np.int64]]) -> int: ... def groupsort_indexer( index: np.ndarray, # const int64_t[:] ngroups: int, diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index c05d6a300ccf0..ee76d4157609d 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -167,12 +167,11 @@ def is_lexsorted(list_of_arrays: list) -> bint: assert arr.dtype.name == 'int64' vecs[i] = cnp.PyArray_DATA(arr) - # Assume uniqueness?? 
with nogil: for i in range(1, n): for k in range(nlevels): cur = vecs[k][i] - pre = vecs[k][i -1] + pre = vecs[k][i-1] if cur == pre: continue elif cur > pre: @@ -186,6 +185,51 @@ def is_lexsorted(list_of_arrays: list) -> bint: return result +@cython.wraparound(False) +@cython.boundscheck(False) +def lexsort_depth(list_of_arrays: list) -> Py_ssize_t: + """ + Same as `is_lexsorted`, but keeps track of lexsort depth + as we iterate through the elements, and exits early if depth + is zero. + """ + cdef: + Py_ssize_t i, depth, k + Py_ssize_t n, nlevels + int64_t cur, pre + ndarray arr + + nlevels = len(list_of_arrays) + n = len(list_of_arrays[0]) + + cdef int64_t **vecs = malloc(nlevels * sizeof(int64_t*)) + for i in range(nlevels): + arr = list_of_arrays[i] + assert arr.dtype.name == 'int64' + vecs[i] = cnp.PyArray_DATA(arr) + + with nogil: + depth = nlevels + for i in range(1, n): + k = 0 + while k < depth: + cur = vecs[k][i] + pre = vecs[k][i-1] + if cur == pre: + k += 1 + continue + elif cur > pre: + break + else: + depth = min(k, depth) + k += 1 + if depth == 0: + # Depth can't increase, so if we've reached 0, break outer loop. 
+ break + free(vecs) + return depth + + @cython.boundscheck(False) @cython.wraparound(False) def groupsort_indexer(const intp_t[:] index, Py_ssize_t ngroups): diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index fd6b6ba63d7e0..0fa9afbd5aa10 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -3851,10 +3851,7 @@ def drop_duplicates(self, keep: str | bool = "first") -> MultiIndex: def _lexsort_depth(codes: list[np.ndarray], nlevels: int) -> int: """Count depth (up to a maximum of `nlevels`) with which codes are lexsorted.""" int64_codes = [ensure_int64(level_codes) for level_codes in codes] - for k in range(nlevels, 0, -1): - if libalgos.is_lexsorted(int64_codes[:k]): - return k - return 0 + return libalgos.lexsort_depth(int64_codes[:nlevels]) def sparsify_labels(label_list, start: int = 0, sentinel=""):