Skip to content

Commit 8dfb6cb

Browse files
committed
REF (string): de-duplicate str_endswith, startswith
1 parent 220c18d commit 8dfb6cb

File tree

3 files changed

+65
-72
lines changed

3 files changed

+65
-72
lines changed

pandas/core/arrays/_arrow_string_mixins.py

Lines changed: 63 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -9,16 +9,27 @@
99

1010
from pandas.compat import pa_version_under10p1
1111

12+
from pandas.core.dtypes.missing import isna
13+
1214
if not pa_version_under10p1:
1315
import pyarrow as pa
1416
import pyarrow.compute as pc
1517

1618
if TYPE_CHECKING:
17-
from pandas._typing import Self
19+
from collections.abc import Sized
20+
21+
from pandas._typing import (
22+
Scalar,
23+
Self,
24+
)
1825

1926

2027
class ArrowStringArrayMixin:
21-
_pa_array = None
28+
# _object_compat specifies whether we should 1) attempt to match behaviors
29+
# of the object-backed StringDtype and 2) fall back to object-based
30+
# computation for cases that pyarrow does not support natively.
31+
_object_compat = False
32+
_pa_array: Sized
2233

2334
def __init__(self, *args, **kwargs) -> None:
2435
raise NotImplementedError
@@ -97,3 +108,53 @@ def _str_removesuffix(self, suffix: str):
97108
removed = pc.utf8_slice_codeunits(self._pa_array, 0, stop=-len(suffix))
98109
result = pc.if_else(ends_with, removed, self._pa_array)
99110
return type(self)(result)
111+
112+
def _str_startswith(self, pat: str | tuple[str, ...], na: Scalar | None = None):
113+
if isinstance(pat, str):
114+
result = pc.starts_with(self._pa_array, pattern=pat)
115+
else:
116+
if len(pat) == 0:
117+
if self._object_compat:
118+
# mimic existing behaviour of string extension array
119+
# and python string method
120+
result = pa.array(
121+
np.zeros(len(self._pa_array), dtype=np.bool_),
122+
mask=isna(self._pa_array),
123+
)
124+
else:
125+
# For empty tuple, pd.StringDtype() returns null for missing values
126+
# and false for valid values.
127+
result = pc.if_else(pc.is_null(self._pa_array), None, False)
128+
else:
129+
result = pc.starts_with(self._pa_array, pattern=pat[0])
130+
131+
for p in pat[1:]:
132+
result = pc.or_(result, pc.starts_with(self._pa_array, pattern=p))
133+
if not isna(na):
134+
result = result.fill_null(na)
135+
return self._convert_bool_result(result)
136+
137+
def _str_endswith(self, pat: str | tuple[str, ...], na: Scalar | None = None):
138+
if isinstance(pat, str):
139+
result = pc.ends_with(self._pa_array, pattern=pat)
140+
else:
141+
if len(pat) == 0:
142+
if self._object_compat:
143+
# mimic existing behaviour of string extension array
144+
# and python string method
145+
result = pa.array(
146+
np.zeros(len(self._pa_array), dtype=np.bool_),
147+
mask=isna(self._pa_array),
148+
)
149+
else:
150+
# For empty tuple, pd.StringDtype() returns null for missing values
151+
# and false for valid values.
152+
result = pc.if_else(pc.is_null(self._pa_array), None, False)
153+
else:
154+
result = pc.ends_with(self._pa_array, pattern=pat[0])
155+
156+
for p in pat[1:]:
157+
result = pc.or_(result, pc.ends_with(self._pa_array, pattern=p))
158+
if not isna(na):
159+
result = result.fill_null(na)
160+
return self._convert_bool_result(result)

pandas/core/arrays/arrow/array.py

Lines changed: 1 addition & 32 deletions
Original file line number | Diff line number | Diff line change
@@ -2337,38 +2337,7 @@ def _str_contains(
23372337
result = result.fill_null(na)
23382338
return type(self)(result)
23392339

2340-
def _str_startswith(self, pat: str | tuple[str, ...], na=None) -> Self:
2341-
if isinstance(pat, str):
2342-
result = pc.starts_with(self._pa_array, pattern=pat)
2343-
else:
2344-
if len(pat) == 0:
2345-
# For empty tuple, pd.StringDtype() returns null for missing values
2346-
# and false for valid values.
2347-
result = pc.if_else(pc.is_null(self._pa_array), None, False)
2348-
else:
2349-
result = pc.starts_with(self._pa_array, pattern=pat[0])
2350-
2351-
for p in pat[1:]:
2352-
result = pc.or_(result, pc.starts_with(self._pa_array, pattern=p))
2353-
if not isna(na):
2354-
result = result.fill_null(na)
2355-
return type(self)(result)
2356-
2357-
def _str_endswith(self, pat: str | tuple[str, ...], na=None) -> Self:
2358-
if isinstance(pat, str):
2359-
result = pc.ends_with(self._pa_array, pattern=pat)
2360-
else:
2361-
if len(pat) == 0:
2362-
# For empty tuple, pd.StringDtype() returns null for missing values
2363-
# and false for valid values.
2364-
result = pc.if_else(pc.is_null(self._pa_array), None, False)
2365-
else:
2366-
result = pc.ends_with(self._pa_array, pattern=pat[0])
2367-
2368-
for p in pat[1:]:
2369-
result = pc.or_(result, pc.ends_with(self._pa_array, pattern=p))
2370-
if not isna(na):
2371-
result = result.fill_null(na)
2340+
def _result_converter(self, result):
23722341
return type(self)(result)
23732342

23742343
def _str_replace(

pandas/core/arrays/string_arrow.py

Lines changed: 1 addition & 38 deletions
Original file line number | Diff line number | Diff line change
@@ -278,6 +278,7 @@ def astype(self, dtype, copy: bool = True):
278278

279279
# ------------------------------------------------------------------------
280280
# String methods interface
281+
_object_compat = True
281282

282283
_str_map = BaseStringArray._str_map
283284

@@ -298,44 +299,6 @@ def _str_contains(
298299
result[isna(result)] = bool(na)
299300
return result
300301

301-
def _str_startswith(self, pat: str | tuple[str, ...], na: Scalar | None = None):
302-
if isinstance(pat, str):
303-
result = pc.starts_with(self._pa_array, pattern=pat)
304-
else:
305-
if len(pat) == 0:
306-
# mimic existing behaviour of string extension array
307-
# and python string method
308-
result = pa.array(
309-
np.zeros(len(self._pa_array), dtype=bool), mask=isna(self._pa_array)
310-
)
311-
else:
312-
result = pc.starts_with(self._pa_array, pattern=pat[0])
313-
314-
for p in pat[1:]:
315-
result = pc.or_(result, pc.starts_with(self._pa_array, pattern=p))
316-
if not isna(na):
317-
result = result.fill_null(na)
318-
return self._convert_bool_result(result)
319-
320-
def _str_endswith(self, pat: str | tuple[str, ...], na: Scalar | None = None):
321-
if isinstance(pat, str):
322-
result = pc.ends_with(self._pa_array, pattern=pat)
323-
else:
324-
if len(pat) == 0:
325-
# mimic existing behaviour of string extension array
326-
# and python string method
327-
result = pa.array(
328-
np.zeros(len(self._pa_array), dtype=bool), mask=isna(self._pa_array)
329-
)
330-
else:
331-
result = pc.ends_with(self._pa_array, pattern=pat[0])
332-
333-
for p in pat[1:]:
334-
result = pc.or_(result, pc.ends_with(self._pa_array, pattern=p))
335-
if not isna(na):
336-
result = result.fill_null(na)
337-
return self._convert_bool_result(result)
338-
339302
def _str_replace(
340303
self,
341304
pat: str | re.Pattern,

0 commit comments

Comments
 (0)