From 294a997a2fa7a631759a85520403e17707b6d9ef Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Sun, 26 Jun 2022 15:20:28 +0200 Subject: [PATCH 1/9] perf: lexsort_depth --- pandas/_libs/algos.pyi | 1 + pandas/_libs/algos.pyx | 41 ++++++++++++++++++++++++++++++++++++ pandas/core/indexes/multi.py | 5 +---- 3 files changed, 43 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/algos.pyi b/pandas/_libs/algos.pyi index 0cc9209fbdfc5..3b9e320bd6889 100644 --- a/pandas/_libs/algos.pyi +++ b/pandas/_libs/algos.pyi @@ -34,6 +34,7 @@ def unique_deltas( arr: np.ndarray, # const int64_t[:] ) -> np.ndarray: ... # np.ndarray[np.int64, ndim=1] def is_lexsorted(list_of_arrays: list[npt.NDArray[np.int64]]) -> bool: ... +def lexsort_depth(list_of_arrays: list[npt.NDArray[np.int64]]) -> int: ... def groupsort_indexer( index: np.ndarray, # const int64_t[:] ngroups: int, diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 29f9a22c9b36e..fc1fedda09c69 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -186,6 +186,47 @@ def is_lexsorted(list_of_arrays: list) -> bint: return result +@cython.wraparound(False) +@cython.boundscheck(False) +def lexsort_depth(list_of_arrays: list) -> int: + cdef: + Py_ssize_t i, depth + Py_ssize_t n, nlevels + int64_t k, cur, pre + ndarray arr + + nlevels = len(list_of_arrays) + n = len(list_of_arrays[0]) + + cdef int64_t **vecs = malloc(nlevels * sizeof(int64_t*)) + for i in range(nlevels): + arr = list_of_arrays[i] + assert arr.dtype.name == 'int64' + vecs[i] = cnp.PyArray_DATA(arr) + + # Assume uniqueness?? + with nogil: + depth = nlevels + for i in range(1, n): + for k in range(nlevels): + if k >= depth: + # No need to check levels for which we know input isn't lexsorted. + break + cur = vecs[k][i] + pre = vecs[k][i -1] + if cur == pre: + continue + elif cur > pre: + break + else: + depth = min(k, depth) + if depth == 0: + # Depth can't increase, so if we've reached 0, break outer loop. + break + free(vecs) + return depth + + @cython.boundscheck(False) @cython.wraparound(False) def groupsort_indexer(const intp_t[:] index, Py_ssize_t ngroups): diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 15d06ef3bc8e5..7a42dc4b8dabb 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -3841,10 +3841,7 @@ def drop_duplicates(self, keep: str | bool = "first") -> MultiIndex: def _lexsort_depth(codes: list[np.ndarray], nlevels: int) -> int: """Count depth (up to a maximum of `nlevels`) with which codes are lexsorted.""" int64_codes = [ensure_int64(level_codes) for level_codes in codes] - for k in range(nlevels, 0, -1): - if libalgos.is_lexsorted(int64_codes[:k]): - return k - return 0 + return libalgos.lexsort_depth(int64_codes) def sparsify_labels(label_list, start: int = 0, sentinel=""): From f945f12bb49178292adfa462772fc721b7f1d786 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Mon, 27 Jun 2022 17:09:06 +0200 Subject: [PATCH 2/9] remove uniqueness comment and extra whitespace --- pandas/_libs/algos.pyx | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index fc1fedda09c69..ac533aa9187a3 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -167,12 +167,11 @@ def is_lexsorted(list_of_arrays: list) -> bint: assert arr.dtype.name == 'int64' vecs[i] = cnp.PyArray_DATA(arr) - # Assume uniqueness?? with nogil: for i in range(1, n): for k in range(nlevels): cur = vecs[k][i] - pre = vecs[k][i -1] + pre = vecs[k][i-1] if cur == pre: continue elif cur > pre: @@ -189,6 +188,11 @@ def is_lexsorted(list_of_arrays: list) -> bint: @cython.wraparound(False) @cython.boundscheck(False) def lexsort_depth(list_of_arrays: list) -> int: + """ + Same as `is_lexsorted`, but keeps track of lexsort depth + as we iterate through the elements, and exits early if depth + is zero. + """ cdef: Py_ssize_t i, depth Py_ssize_t n, nlevels @@ -204,7 +208,6 @@ def lexsort_depth(list_of_arrays: list) -> int: assert arr.dtype.name == 'int64' vecs[i] = cnp.PyArray_DATA(arr) - # Assume uniqueness?? with nogil: depth = nlevels for i in range(1, n): @@ -213,7 +216,7 @@ def lexsort_depth(list_of_arrays: list) -> int: # No need to check levels for which we know input isn't lexsorted. break cur = vecs[k][i] - pre = vecs[k][i -1] + pre = vecs[k][i-1] if cur == pre: continue elif cur > pre: From b7760e8c5bcf4b79162b6209adc355b53e39825e Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Sat, 9 Jul 2022 15:31:35 +0100 Subject: [PATCH 3/9] add mi_wide asv benchmark --- asv_bench/benchmarks/multiindex_object.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/asv_bench/benchmarks/multiindex_object.py b/asv_bench/benchmarks/multiindex_object.py index a498c6b2e4944..782eb15234ea4 100644 --- a/asv_bench/benchmarks/multiindex_object.py +++ b/asv_bench/benchmarks/multiindex_object.py @@ -24,6 +24,13 @@ def setup(self): self.mi_small = MultiIndex.from_product( [np.arange(100), list("A"), list("A")], names=["one", "two", "three"] ) + self.mi_wide = MultiIndex.from_tuples( + [ + np.hstack([2, np.ones(99)]), + np.ones(100), + np.ones(100), + ], + ) def time_large_get_loc(self): self.mi_large.get_loc((999, 19, "Z")) @@ -32,6 +39,9 @@ def time_large_get_loc_warm(self): for _ in range(1000): self.mi_large.get_loc((999, 19, "Z")) + def time_wide_get_loc(self): + self.mi_wide.get_loc((1, 1)) + def time_med_get_loc(self): self.mi_med.get_loc((999, 9, "A")) From a706d9b4a3271eac94ff35f4eddf58bbcc1599e6 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Sat, 9 Jul 2022 20:49:48 +0100 Subject: [PATCH 4/9] better types --- pandas/_libs/algos.pyx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 1bccd3faaeedb..fd028b5aadf1c 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -187,16 +187,16 @@ def is_lexsorted(list_of_arrays: list) -> bint: @cython.wraparound(False) @cython.boundscheck(False) -def lexsort_depth(list_of_arrays: list) -> int: +def lexsort_depth(list_of_arrays: list) -> Py_ssize_t: """ Same as `is_lexsorted`, but keeps track of lexsort depth as we iterate through the elements, and exits early if depth is zero. """ cdef: - Py_ssize_t i, depth + Py_ssize_t i, depth, k Py_ssize_t n, nlevels - int64_t k, cur, pre + int64_t cur, pre ndarray arr nlevels = len(list_of_arrays) From 3d6c56220f8219bbe4a7e0443ea40d7f83c2dd01 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Sun, 10 Jul 2022 09:21:47 +0100 Subject: [PATCH 5/9] use while loop --- pandas/_libs/algos.pyx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index fd028b5aadf1c..ee76d4157609d 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -211,18 +211,18 @@ def lexsort_depth(list_of_arrays: list) -> Py_ssize_t: with nogil: depth = nlevels for i in range(1, n): - for k in range(nlevels): - if k >= depth: - # No need to check levels for which we know input isn't lexsorted. - break + k = 0 + while k < depth: cur = vecs[k][i] pre = vecs[k][i-1] if cur == pre: + k += 1 continue elif cur > pre: break else: depth = min(k, depth) + k += 1 if depth == 0: # Depth can't increase, so if we've reached 0, break outer loop. break From b2be554e513a182111133257562426e9dc79d644 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Sun, 10 Jul 2022 09:35:35 +0100 Subject: [PATCH 6/9] another while loop --- pandas/_libs/algos.pyx | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index ee76d4157609d..c29ab9297c351 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -210,7 +210,8 @@ def lexsort_depth(list_of_arrays: list) -> Py_ssize_t: with nogil: depth = nlevels - for i in range(1, n): + i = 0 + while i < n and depth > 0: k = 0 while k < depth: cur = vecs[k][i] @@ -223,9 +224,7 @@ def lexsort_depth(list_of_arrays: list) -> Py_ssize_t: else: depth = min(k, depth) k += 1 - if depth == 0: - # Depth can't increase, so if we've reached 0, break outer loop. - break + i += 1 free(vecs) return depth From 3f1bcafbbb33dab5f0a3176dbcded29b76ec875b Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Sun, 10 Jul 2022 09:47:08 +0100 Subject: [PATCH 7/9] just check if depth --- pandas/_libs/algos.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index c29ab9297c351..a2239d1dc54ae 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -211,7 +211,7 @@ def lexsort_depth(list_of_arrays: list) -> Py_ssize_t: with nogil: depth = nlevels i = 0 - while i < n and depth > 0: + while i < n and depth: k = 0 while k < depth: cur = vecs[k][i] From 0b33bff83de7360ab7b9a17bb876846103adc88c Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Sun, 10 Jul 2022 09:50:15 +0100 Subject: [PATCH 8/9] use != 0 --- pandas/_libs/algos.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index a2239d1dc54ae..fee35be430714 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -211,7 +211,7 @@ def lexsort_depth(list_of_arrays: list) -> Py_ssize_t: with nogil: depth = nlevels i = 0 - while i < n and depth: + while i < n and depth != 0: k = 0 while k < depth: cur = vecs[k][i] From 39d90f729b7b3b94bedccb2b1930b6d50f5d6827 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Sun, 10 Jul 2022 09:53:53 +0100 Subject: [PATCH 9/9] revert --- pandas/_libs/algos.pyx | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index fee35be430714..ee76d4157609d 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -210,8 +210,7 @@ def lexsort_depth(list_of_arrays: list) -> Py_ssize_t: with nogil: depth = nlevels - i = 0 - while i < n and depth != 0: + for i in range(1, n): k = 0 while k < depth: cur = vecs[k][i] @@ -224,7 +223,9 @@ def lexsort_depth(list_of_arrays: list) -> Py_ssize_t: else: depth = min(k, depth) k += 1 - i += 1 + if depth == 0: + # Depth can't increase, so if we've reached 0, break outer loop. + break free(vecs) return depth