From 0ec13d39d6c00dce5f24753e006d3b5f28a11a91 Mon Sep 17 00:00:00 2001
From: Luke Manley <lukemanley@gmail.com>
Date: Wed, 24 Jan 2024 19:55:11 -0500
Subject: [PATCH 1/5] fix masked indexing regression

---
 doc/source/whatsnew/v2.2.1.rst    |  1 +
 pandas/_libs/index.pyx            |  6 +++++-
 pandas/tests/indexing/test_loc.py | 12 ++++++++++++
 3 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst
index 75445c978d262..b668cd25fc503 100644
--- a/doc/source/whatsnew/v2.2.1.rst
+++ b/doc/source/whatsnew/v2.2.1.rst
@@ -14,6 +14,7 @@ including other versions of pandas.
 Fixed regressions
 ~~~~~~~~~~~~~~~~~
 - Fixed regression in :func:`merge_ordered` raising ``TypeError`` for ``fill_method="ffill"`` and ``how="left"`` (:issue:`57010`)
+- Fixed regression in :meth:`DataFrame.loc` raising ``IndexError`` for non-unique, masked dtype indexes where the result has more than 10,000 rows (:issue:`57027`)
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_221.bug_fixes:
diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx
index 2ea0e51236534..b4dd6c236bede 100644
--- a/pandas/_libs/index.pyx
+++ b/pandas/_libs/index.pyx
@@ -1220,6 +1220,7 @@ cdef class MaskedIndexEngine(IndexEngine):
                             n_alloc *= 2
                             if n_alloc > max_alloc:
                                 n_alloc = max_alloc
+                            result = np.resize(result, n_alloc)
 
                         result[count] = na_idx
                         count += 1
@@ -1236,6 +1237,7 @@ cdef class MaskedIndexEngine(IndexEngine):
                         n_alloc *= 2
                         if n_alloc > max_alloc:
                             n_alloc = max_alloc
+                        result = np.resize(result, n_alloc)
 
                     result[count] = j
                     count += 1
@@ -1243,7 +1245,9 @@ cdef class MaskedIndexEngine(IndexEngine):
 
             # value not found
             if count >= n_alloc:
-                n_alloc += 10_000
+                n_alloc *= 2
+                if n_alloc > max_alloc:
+                    n_alloc = max_alloc
                 result = np.resize(result, n_alloc)
             result[count] = -1
             count += 1
diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py
index c446f2c44b745..e217174136c11 100644
--- a/pandas/tests/indexing/test_loc.py
+++ b/pandas/tests/indexing/test_loc.py
@@ -3347,3 +3347,15 @@ def test_getitem_loc_str_periodindex(self):
             index = pd.period_range(start="2000", periods=20, freq="B")
             series = Series(range(20), index=index)
             assert series.loc["2000-01-14"] == 9
+
+    def test_loc_nonunique_masked_index(self):
+        # GH 57027
+        ids = list(range(11))
+        index = Index(ids * 1000, dtype="Int64")
+        df = DataFrame({"val": range(len(index))}, index=index)
+        result = df.loc[ids]
+        expected = DataFrame(
+            {"val": index.argsort(kind="stable")},
+            index=Index(np.array(ids).repeat(1000), dtype="Int64"),
+        )
+        tm.assert_frame_equal(result, expected)

From fb0d7bb5bf87bd18fb2db6c04963441d984ad662 Mon Sep 17 00:00:00 2001
From: Luke Manley <lukemanley@gmail.com>
Date: Wed, 24 Jan 2024 21:35:14 -0500
Subject: [PATCH 2/5] fix test: build "val" column with np.arange

---
 pandas/tests/indexing/test_loc.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py
index e217174136c11..db73d30b7bd44 100644
--- a/pandas/tests/indexing/test_loc.py
+++ b/pandas/tests/indexing/test_loc.py
@@ -3352,7 +3352,7 @@ def test_loc_nonunique_masked_index(self):
         # GH 57027
         ids = list(range(11))
         index = Index(ids * 1000, dtype="Int64")
-        df = DataFrame({"val": range(len(index))}, index=index)
+        df = DataFrame({"val": np.arange(len(index))}, index=index)
         result = df.loc[ids]
         expected = DataFrame(
             {"val": index.argsort(kind="stable")},

From 868140de6685f0d84e84007743f9a3c3123116f1 Mon Sep 17 00:00:00 2001
From: Luke Manley <lukemanley@gmail.com>
Date: Wed, 24 Jan 2024 22:33:06 -0500
Subject: [PATCH 3/5] fix test: use np.intp for "val" column and expected values

---
 pandas/tests/indexing/test_loc.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py
index db73d30b7bd44..b155c3aabd287 100644
--- a/pandas/tests/indexing/test_loc.py
+++ b/pandas/tests/indexing/test_loc.py
@@ -3352,10 +3352,10 @@ def test_loc_nonunique_masked_index(self):
         # GH 57027
         ids = list(range(11))
         index = Index(ids * 1000, dtype="Int64")
-        df = DataFrame({"val": np.arange(len(index))}, index=index)
+        df = DataFrame({"val": np.arange(len(index), dtype=np.intp)}, index=index)
         result = df.loc[ids]
         expected = DataFrame(
-            {"val": index.argsort(kind="stable")},
+            {"val": index.argsort(kind="stable").astype(np.intp)},
             index=Index(np.array(ids).repeat(1000), dtype="Int64"),
         )
         tm.assert_frame_equal(result, expected)

From ec56a513fc89efa5f18233eb982fdf840bdfb08b Mon Sep 17 00:00:00 2001
From: Luke Manley <lukemanley@gmail.com>
Date: Thu, 25 Jan 2024 19:30:21 -0500
Subject: [PATCH 4/5] dedup resizing logic

---
 pandas/_libs/index.pyx | 67 +++++++++++++++++++-----------------------
 1 file changed, 31 insertions(+), 36 deletions(-)

diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx
index b4dd6c236bede..39d48f43f2bf5 100644
--- a/pandas/_libs/index.pyx
+++ b/pandas/_libs/index.pyx
@@ -98,6 +98,20 @@ cdef ndarray _get_bool_indexer(ndarray values, object val, ndarray mask = None):
     return indexer.view(bool)
 
 
+cdef _maybe_resize_array(ndarray values, loc, max_length):
+    """
+    Resize array if loc is out of bounds.
+    """
+    cdef:
+        Py_ssize_t n = len(values)
+
+    if loc >= n:
+        while loc >= n:
+            n *= 2
+        values = np.resize(values, min(n, max_length))
+    return values
+
+
 # Don't populate hash tables in monotonic indexes larger than this
 _SIZE_CUTOFF = 1_000_000
 
@@ -456,27 +470,18 @@ cdef class IndexEngine:
             # found
             if val in d:
                 key = val
-
+                result = _maybe_resize_array(
+                    result,
+                    count + len(d[key]) - 1,
+                    max_alloc
+                )
                 for j in d[key]:
-
-                    # realloc if needed
-                    if count >= n_alloc:
-                        n_alloc *= 2
-                        if n_alloc > max_alloc:
-                            n_alloc = max_alloc
-                        result = np.resize(result, n_alloc)
-
                     result[count] = j
                     count += 1
 
             # value not found
             else:
-
-                if count >= n_alloc:
-                    n_alloc *= 2
-                    if n_alloc > max_alloc:
-                        n_alloc = max_alloc
-                    result = np.resize(result, n_alloc)
+                result = _maybe_resize_array(result, count, max_alloc)
                 result[count] = -1
                 count += 1
                 missing[count_missing] = i
@@ -1214,14 +1219,12 @@ cdef class MaskedIndexEngine(IndexEngine):
 
             if PySequence_GetItem(target_mask, i):
                 if na_pos:
+                    result = _maybe_resize_array(
+                        result,
+                        count + len(na_pos) - 1,
+                        max_alloc,
+                    )
                     for na_idx in na_pos:
-                        # realloc if needed
-                        if count >= n_alloc:
-                            n_alloc *= 2
-                            if n_alloc > max_alloc:
-                                n_alloc = max_alloc
-                            result = np.resize(result, n_alloc)
-
                         result[count] = na_idx
                         count += 1
                     continue
@@ -1229,26 +1232,18 @@ cdef class MaskedIndexEngine(IndexEngine):
             elif val in d:
                 # found
                 key = val
-
+                result = _maybe_resize_array(
+                    result,
+                    count + len(d[key]) - 1,
+                    max_alloc,
+                )
                 for j in d[key]:
-
-                    # realloc if needed
-                    if count >= n_alloc:
-                        n_alloc *= 2
-                        if n_alloc > max_alloc:
-                            n_alloc = max_alloc
-                        result = np.resize(result, n_alloc)
-
                     result[count] = j
                     count += 1
                 continue
 
             # value not found
-            if count >= n_alloc:
-                n_alloc *= 2
-                if n_alloc > max_alloc:
-                    n_alloc = max_alloc
-                result = np.resize(result, n_alloc)
+            result = _maybe_resize_array(result, count, max_alloc)
             result[count] = -1
             count += 1
             missing[count_missing] = i

From 1bf99d5d659d7ecfc2765de580e84b0961844a9d Mon Sep 17 00:00:00 2001
From: Luke Manley <lukemanley@gmail.com>
Date: Fri, 26 Jan 2024 06:37:17 -0500
Subject: [PATCH 5/5] add types

---
 pandas/_libs/index.pyx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx
index 39d48f43f2bf5..a277272040a69 100644
--- a/pandas/_libs/index.pyx
+++ b/pandas/_libs/index.pyx
@@ -98,7 +98,7 @@ cdef ndarray _get_bool_indexer(ndarray values, object val, ndarray mask = None):
     return indexer.view(bool)
 
 
-cdef _maybe_resize_array(ndarray values, loc, max_length):
+cdef _maybe_resize_array(ndarray values, Py_ssize_t loc, Py_ssize_t max_length):
     """
     Resize array if loc is out of bounds.
     """