Skip to content

Commit 9893c43

Browse files
authored
Series.str.find fix for arrow strings when start < 0 (#56412)
1 parent 124b671 commit 9893c43

File tree

3 files changed

+12
-2
lines changed

3 files changed

+12
-2
lines changed

doc/source/whatsnew/v2.2.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -579,6 +579,7 @@ Strings
579579
- Bug in :meth:`DataFrame.apply` failing when ``engine="numba"`` and columns or index have ``StringDtype`` (:issue:`56189`)
580580
- Bug in :meth:`Index.str.cat` always casting result to object dtype (:issue:`56157`)
581581
- Bug in :meth:`Series.__mul__` for :class:`ArrowDtype` with ``pyarrow.string`` dtype and ``string[pyarrow]`` for the pyarrow backend (:issue:`51970`)
582+
- Bug in :meth:`Series.str.find` returning an incorrect position when ``start < 0`` for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`56411`)
582583
- Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`56404`)
583584
- Bug in :meth:`Series.str.startswith` and :meth:`Series.str.endswith` with arguments of type ``tuple[str, ...]`` for ``string[pyarrow]`` (:issue:`54942`)
584585

pandas/core/arrays/arrow/array.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2193,7 +2193,8 @@ def _str_find(self, sub: str, start: int = 0, end: int | None = None):
21932193
slices = pc.utf8_slice_codeunits(self._pa_array, start, stop=end)
21942194
result = pc.find_substring(slices, sub)
21952195
not_found = pc.equal(result, -1)
2196-
offset_result = pc.add(result, end - start)
2196+
start_offset = max(0, start)
2197+
offset_result = pc.add(result, start_offset)
21972198
result = pc.if_else(not_found, result, offset_result)
21982199
elif start == 0 and end is None:
21992200
slices = self._pa_array

pandas/tests/extension/test_arrow.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1835,7 +1835,7 @@ def test_str_fullmatch(pat, case, na, exp):
18351835

18361836
@pytest.mark.parametrize(
18371837
"sub, start, end, exp, exp_typ",
1838-
[["ab", 0, None, [0, None], pa.int32()], ["bc", 1, 3, [2, None], pa.int64()]],
1838+
[["ab", 0, None, [0, None], pa.int32()], ["bc", 1, 3, [1, None], pa.int64()]],
18391839
)
18401840
def test_str_find(sub, start, end, exp, exp_typ):
18411841
ser = pd.Series(["abc", None], dtype=ArrowDtype(pa.string()))
@@ -1844,6 +1844,14 @@ def test_str_find(sub, start, end, exp, exp_typ):
18441844
tm.assert_series_equal(result, expected)
18451845

18461846

1847+
def test_str_find_negative_start():
1848+
# GH 56411
1849+
ser = pd.Series(["abc", None], dtype=ArrowDtype(pa.string()))
1850+
result = ser.str.find(sub="b", start=-1000, end=3)
1851+
expected = pd.Series([1, None], dtype=ArrowDtype(pa.int64()))
1852+
tm.assert_series_equal(result, expected)
1853+
1854+
18471855
def test_str_find_notimplemented():
18481856
ser = pd.Series(["abc", None], dtype=ArrowDtype(pa.string()))
18491857
with pytest.raises(NotImplementedError, match="find not implemented"):

0 commit comments

Comments
 (0)