From 0ec13d39d6c00dce5f24753e006d3b5f28a11a91 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Wed, 24 Jan 2024 19:55:11 -0500 Subject: [PATCH 1/5] fix masked indexing regression --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/_libs/index.pyx | 6 +++++- pandas/tests/indexing/test_loc.py | 12 ++++++++++++ 3 files changed, 18 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 75445c978d262..b668cd25fc503 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -14,6 +14,7 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ - Fixed regression in :func:`merge_ordered` raising ``TypeError`` for ``fill_method="ffill"`` and ``how="left"`` (:issue:`57010`) +- Fixed regression in :meth:`DataFrame.loc` raising ``IndexError`` for non-unique, masked dtype indexes where result has more than 10,000 rows (:issue:`57027`) .. --------------------------------------------------------------------------- .. _whatsnew_221.bug_fixes: diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 2ea0e51236534..b4dd6c236bede 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -1220,6 +1220,7 @@ cdef class MaskedIndexEngine(IndexEngine): n_alloc *= 2 if n_alloc > max_alloc: n_alloc = max_alloc + result = np.resize(result, n_alloc) result[count] = na_idx count += 1 @@ -1236,6 +1237,7 @@ cdef class MaskedIndexEngine(IndexEngine): n_alloc *= 2 if n_alloc > max_alloc: n_alloc = max_alloc + result = np.resize(result, n_alloc) result[count] = j count += 1 @@ -1243,7 +1245,9 @@ cdef class MaskedIndexEngine(IndexEngine): # value not found if count >= n_alloc: - n_alloc += 10_000 + n_alloc *= 2 + if n_alloc > max_alloc: + n_alloc = max_alloc result = np.resize(result, n_alloc) result[count] = -1 count += 1 diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index c446f2c44b745..e217174136c11 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -3347,3 +3347,15 @@ def test_getitem_loc_str_periodindex(self): index = pd.period_range(start="2000", periods=20, freq="B") series = Series(range(20), index=index) assert series.loc["2000-01-14"] == 9 + + def test_loc_nonunique_masked_index(self): + # GH 57027 + ids = list(range(11)) + index = Index(ids * 1000, dtype="Int64") + df = DataFrame({"val": range(len(index))}, index=index) + result = df.loc[ids] + expected = DataFrame( + {"val": index.argsort(kind="stable")}, + index=Index(np.array(ids).repeat(1000), dtype="Int64"), + ) + tm.assert_frame_equal(result, expected) From fb0d7bb5bf87bd18fb2db6c04963441d984ad662 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Wed, 24 Jan 2024 21:35:14 -0500 Subject: [PATCH 2/5] fix test --- pandas/tests/indexing/test_loc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index e217174136c11..db73d30b7bd44 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -3352,7 +3352,7 @@ def test_loc_nonunique_masked_index(self): # GH 57027 ids = list(range(11)) index = Index(ids * 1000, dtype="Int64") - df = DataFrame({"val": range(len(index))}, index=index) + df = DataFrame({"val": np.arange(len(index))}, index=index) result = df.loc[ids] expected = DataFrame( {"val": index.argsort(kind="stable")}, From 868140de6685f0d84e84007743f9a3c3123116f1 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Wed, 24 Jan 2024 22:33:06 -0500 Subject: [PATCH 3/5] fix test --- pandas/tests/indexing/test_loc.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index db73d30b7bd44..b155c3aabd287 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -3352,10 +3352,10 @@ def test_loc_nonunique_masked_index(self): # GH 57027 ids = list(range(11)) index = Index(ids * 1000, dtype="Int64") - df = DataFrame({"val": np.arange(len(index))}, index=index) + df = DataFrame({"val": np.arange(len(index), dtype=np.intp)}, index=index) result = df.loc[ids] expected = DataFrame( - {"val": index.argsort(kind="stable")}, + {"val": index.argsort(kind="stable").astype(np.intp)}, index=Index(np.array(ids).repeat(1000), dtype="Int64"), ) tm.assert_frame_equal(result, expected) From ec56a513fc89efa5f18233eb982fdf840bdfb08b Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Thu, 25 Jan 2024 19:30:21 -0500 Subject: [PATCH 4/5] dedup resizing logic --- pandas/_libs/index.pyx | 67 +++++++++++++++++++----------------------- 1 file changed, 31 insertions(+), 36 deletions(-) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index b4dd6c236bede..39d48f43f2bf5 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -98,6 +98,20 @@ cdef ndarray _get_bool_indexer(ndarray values, object val, ndarray mask = None): return indexer.view(bool) +cdef _maybe_resize_array(ndarray values, loc, max_length): + """ + Resize array if loc is out of bounds. + """ + cdef: + Py_ssize_t n = len(values) + + if loc >= n: + while loc >= n: + n *= 2 + values = np.resize(values, min(n, max_length)) + return values + + # Don't populate hash tables in monotonic indexes larger than this _SIZE_CUTOFF = 1_000_000 @@ -456,27 +470,18 @@ cdef class IndexEngine: # found if val in d: key = val - + result = _maybe_resize_array( + result, + count + len(d[key]) - 1, + max_alloc + ) for j in d[key]: - - # realloc if needed - if count >= n_alloc: - n_alloc *= 2 - if n_alloc > max_alloc: - n_alloc = max_alloc - result = np.resize(result, n_alloc) - result[count] = j count += 1 # value not found else: - - if count >= n_alloc: - n_alloc *= 2 - if n_alloc > max_alloc: - n_alloc = max_alloc - result = np.resize(result, n_alloc) + result = _maybe_resize_array(result, count, max_alloc) result[count] = -1 count += 1 missing[count_missing] = i @@ -1214,14 +1219,12 @@ cdef class MaskedIndexEngine(IndexEngine): if PySequence_GetItem(target_mask, i): if na_pos: + result = _maybe_resize_array( + result, + count + len(na_pos) - 1, + max_alloc, + ) for na_idx in na_pos: - # realloc if needed - if count >= n_alloc: - n_alloc *= 2 - if n_alloc > max_alloc: - n_alloc = max_alloc - result = np.resize(result, n_alloc) - result[count] = na_idx count += 1 continue @@ -1229,26 +1232,18 @@ cdef class MaskedIndexEngine(IndexEngine): elif val in d: # found key = val - + result = _maybe_resize_array( + result, + count + len(d[key]) - 1, + max_alloc, + ) for j in d[key]: - - # realloc if needed - if count >= n_alloc: - n_alloc *= 2 - if n_alloc > max_alloc: - n_alloc = max_alloc - result = np.resize(result, n_alloc) - result[count] = j count += 1 continue # value not found - if count >= n_alloc: - n_alloc *= 2 - if n_alloc > max_alloc: - n_alloc = max_alloc - result = np.resize(result, n_alloc) + result = _maybe_resize_array(result, count, max_alloc) result[count] = -1 count += 1 missing[count_missing] = i From 1bf99d5d659d7ecfc2765de580e84b0961844a9d Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Fri, 26 Jan 2024 06:37:17 -0500 Subject: [PATCH 5/5] add types --- pandas/_libs/index.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 39d48f43f2bf5..a277272040a69 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -98,7 +98,7 @@ cdef ndarray _get_bool_indexer(ndarray values, object val, ndarray mask = None): return indexer.view(bool) -cdef _maybe_resize_array(ndarray values, loc, max_length): +cdef _maybe_resize_array(ndarray values, Py_ssize_t loc, Py_ssize_t max_length): """ Resize array if loc is out of bounds. """