From d8e811ae9d9ac8405b0636979dfe4330f04f90a5 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 22 Mar 2021 21:23:30 -0700 Subject: [PATCH 1/3] PERF: Define Block.__init__ in cython --- pandas/_libs/internals.pyx | 50 +++++++++++++++++++++++++++++++ pandas/core/internals/api.py | 4 +-- pandas/core/internals/blocks.py | 52 ++++----------------------------- 3 files changed, 58 insertions(+), 48 deletions(-) diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx index 5352ca53e1b54..31b6935e9b2ba 100644 --- a/pandas/_libs/internals.pyx +++ b/pandas/_libs/internals.pyx @@ -455,3 +455,53 @@ def get_blkno_placements(blknos, group: bool = True): for blkno, indexer in get_blkno_indexers(blknos, group): yield blkno, BlockPlacement(indexer) + + +@cython.freelist(64) +cdef class Block: + """ + Defining __init__ in a cython class significantly improves performance. + """ + cdef: + public BlockPlacement _mgr_locs + readonly int ndim + public object values + + def __cinit__(self, values, placement: BlockPlacement, ndim: int): + """ + Parameters + ---------- + values : np.ndarray or ExtensionArray + We assume maybe_coerce_values has already been called. + placement : BlockPlacement + ndim : int + 1 for SingleBlockManager/Series, 2 for BlockManager/DataFrame + """ + self._mgr_locs = placement + self.ndim = ndim + self.values = values + + cpdef __reduce__(self): + # We have to do some gymnastics b/c "ndim" is keyword-only + from functools import partial + + from pandas.core.internals.blocks import new_block + + args = (self.values, self.mgr_locs.indexer) + func = partial(new_block, ndim=self.ndim) + return func, args + + cpdef __setstate__(self, state): + from pandas.core.construction import extract_array + + self.mgr_locs = BlockPlacement(state[0]) + self.values = extract_array(state[1], extract_numpy=True) + if len(state) > 2: + # we stored ndim + self.ndim = state[2] + else: + # older pickle + from pandas.core.internals.api import maybe_infer_ndim + + ndim = maybe_infer_ndim(self.values, self.mgr_locs) + self.ndim = ndim diff --git a/pandas/core/internals/api.py b/pandas/core/internals/api.py index aab8273b1e213..d6b76510c68ab 100644 --- a/pandas/core/internals/api.py +++ b/pandas/core/internals/api.py @@ -59,13 +59,13 @@ def make_block( if not isinstance(placement, BlockPlacement): placement = BlockPlacement(placement) - ndim = _maybe_infer_ndim(values, placement, ndim) + ndim = maybe_infer_ndim(values, placement, ndim) check_ndim(values, placement, ndim) values = maybe_coerce_values(values) return klass(values, ndim=ndim, placement=placement) -def _maybe_infer_ndim(values, placement: BlockPlacement, ndim: Optional[int]) -> int: +def maybe_infer_ndim(values, placement: BlockPlacement, ndim: Optional[int]) -> int: """ If `ndim` is not provided, infer it from placment and values. """ diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index c13eb3f109354..7b12f30622178 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -144,7 +144,7 @@ def newfunc(self, *args, **kwargs) -> List[Block]: return cast(F, newfunc) -class Block(PandasObject): +class Block(libinternals.Block, PandasObject): """ Canonical n-dimensional unit of homogeneous dtype contained in a pandas data structure @@ -154,7 +154,7 @@ class Block(PandasObject): values: Union[np.ndarray, ExtensionArray] - __slots__ = ["_mgr_locs", "values", "ndim"] + __slots__ = () is_numeric = False is_bool = False is_object = False @@ -162,35 +162,6 @@ class Block(PandasObject): _can_consolidate = True _validate_ndim = True - @classmethod - def _simple_new( - cls, values: ArrayLike, placement: BlockPlacement, ndim: int - ) -> Block: - """ - Fastpath constructor, does *no* validation - """ - obj = object.__new__(cls) - obj.ndim = ndim - obj.values = values - obj._mgr_locs = placement - return obj - - def __init__(self, values, placement: BlockPlacement, ndim: int): - """ - Parameters - ---------- - values : np.ndarray or ExtensionArray - We assume maybe_coerce_values has already been called. - placement : BlockPlacement (or castable) - ndim : int - 1 for SingleBlockManager/Series, 2 for BlockManager/DataFrame - """ - assert isinstance(ndim, int) - assert isinstance(placement, BlockPlacement) - self.ndim = ndim - self._mgr_locs = placement - self.values = values - @property def _holder(self): """ @@ -278,7 +249,6 @@ def mgr_locs(self) -> BlockPlacement: @mgr_locs.setter def mgr_locs(self, new_mgr_locs: BlockPlacement): - assert isinstance(new_mgr_locs, BlockPlacement) self._mgr_locs = new_mgr_locs @final @@ -323,16 +293,6 @@ def __repr__(self) -> str: def __len__(self) -> int: return len(self.values) - @final - def __getstate__(self): - return self.mgr_locs.indexer, self.values - - @final - def __setstate__(self, state): - self.mgr_locs = libinternals.BlockPlacement(state[0]) - self.values = extract_array(state[1], extract_numpy=True) - self.ndim = self.values.ndim - def _slice(self, slicer): """ return a slice of my values """ @@ -353,7 +313,7 @@ def getitem_block(self, slicer) -> Block: if new_values.ndim != self.values.ndim: raise ValueError("Only same dim slicing is allowed") - return type(self)._simple_new(new_values, new_mgr_locs, self.ndim) + return type(self)(new_values, new_mgr_locs, self.ndim) @final def getitem_block_index(self, slicer: slice) -> Block: @@ -365,7 +325,7 @@ def getitem_block_index(self, slicer: slice) -> Block: # error: Invalid index type "Tuple[ellipsis, slice]" for # "Union[ndarray, ExtensionArray]"; expected type "Union[int, slice, ndarray]" new_values = self.values[..., slicer] # type: ignore[index] - return type(self)._simple_new(new_values, self._mgr_locs, ndim=self.ndim) + return type(self)(new_values, self._mgr_locs, ndim=self.ndim) @final def getitem_block_columns(self, slicer, new_mgr_locs: BlockPlacement) -> Block: @@ -379,7 +339,7 @@ def getitem_block_columns(self, slicer, new_mgr_locs: BlockPlacement) -> Block: if new_values.ndim != self.values.ndim: raise ValueError("Only same dim slicing is allowed") - return type(self)._simple_new(new_values, new_mgr_locs, self.ndim) + return type(self)(new_values, new_mgr_locs, self.ndim) @property def shape(self) -> Shape: @@ -1921,7 +1881,7 @@ def set_inplace(self, locs, values): self.values[locs] = values -class DatetimeTZBlock(ExtensionBlock, DatetimeBlock): +class DatetimeTZBlock(ExtensionBlock, DatetimeLikeBlockMixin): """ implement a datetime64 block with a tz attribute """ values: DatetimeArray From 617db55a5792dc55967d858151c08e8ef033169b Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 1 Apr 2021 11:02:09 -0700 Subject: [PATCH 2/3] implement NumpyBlock (mypy failing) --- pandas/_libs/internals.pyi | 9 ++++++++- pandas/_libs/internals.pyx | 34 ++++++++++++++++++++++++++++++--- pandas/core/internals/blocks.py | 19 +++++++++--------- 3 files changed, 49 insertions(+), 13 deletions(-) diff --git a/pandas/_libs/internals.pyi b/pandas/_libs/internals.pyi index 446ee299698c5..79058a8240182 100644 --- a/pandas/_libs/internals.pyi +++ b/pandas/_libs/internals.pyi @@ -50,9 +50,16 @@ class BlockPlacement: def append(self, others: list[BlockPlacement]) -> BlockPlacement: ... -class Block: +class SharedBlock: _mgr_locs: BlockPlacement ndim: int values: ArrayLike def __init__(self, values: ArrayLike, placement: BlockPlacement, ndim: int): ... + +class NumpyBlock(SharedBlock): + values: np.ndarray + def getitem_block_index(self, slicer: slice) -> NumpyBlock: ... + +class Block(SharedBlock): + ... diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx index 31b6935e9b2ba..bbaec16d69ae2 100644 --- a/pandas/_libs/internals.pyx +++ b/pandas/_libs/internals.pyx @@ -458,14 +458,13 @@ def get_blkno_placements(blknos, group: bool = True): @cython.freelist(64) -cdef class Block: +cdef class SharedBlock: """ Defining __init__ in a cython class significantly improves performance. """ cdef: public BlockPlacement _mgr_locs readonly int ndim - public object values def __cinit__(self, values, placement: BlockPlacement, ndim: int): """ @@ -479,7 +478,6 @@ cdef class Block: """ self._mgr_locs = placement self.ndim = ndim - self.values = values cpdef __reduce__(self): # We have to do some gymnastics b/c "ndim" is keyword-only @@ -505,3 +503,33 @@ cdef class Block: ndim = maybe_infer_ndim(self.values, self.mgr_locs) self.ndim = ndim + + +cdef class NumpyBlock(SharedBlock): + cdef: + public ndarray values + + def __cinit__(self, ndarray values, BlockPlacement placement, int ndim): + # set values here the (implicit) call to SharedBlock.__cinit__ will + # set placement and ndim + self.values = values + + # @final # not useful in cython, but we _would_ annotate with @final + def getitem_block_index(self, slicer: slice) -> NumpyBlock: + """ + Perform __getitem__-like specialized to slicing along index. + + Assumes self.ndim == 2 + """ + new_values = self.values[..., slicer] + return type(self)(new_values, self._mgr_locs, ndim=self.ndim) + + +cdef class Block(SharedBlock): + cdef: + public object values + + def __cinit__(self, object values, BlockPlacement placement, int ndim): + # set values here the (implicit) call to SharedBlock.__cinit__ will + # set placement and ndim + self.values = values diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index a77ea61d9e6de..dd9d997d26dc0 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -5,7 +5,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, List, Optional, Tuple, @@ -146,7 +145,7 @@ def newfunc(self, *args, **kwargs) -> List[Block]: return cast(F, newfunc) -class Block(libinternals.Block, PandasObject): +class Block(PandasObject): """ Canonical n-dimensional unit of homogeneous dtype contained in a pandas data structure @@ -155,6 +154,7 @@ class Block(libinternals.Block, PandasObject): """ values: Union[np.ndarray, ExtensionArray] + ndim: int __slots__ = () is_numeric = False @@ -1438,7 +1438,7 @@ def quantile( return new_block(result, placement=self._mgr_locs, ndim=2) -class ExtensionBlock(Block): +class ExtensionBlock(libinternals.Block, Block): """ Block for holding extension types. @@ -1751,10 +1751,10 @@ class HybridMixin: Mixin for Blocks backed (maybe indirectly) by ExtensionArrays. """ - array_values: Callable + values: ExtensionArray # type: ignore[misc] def _can_hold_element(self, element: Any) -> bool: - values = self.array_values + values = self.values try: # error: "Callable[..., Any]" has no attribute "_validate_setitem_value" @@ -1775,7 +1775,7 @@ class ObjectValuesExtensionBlock(HybridMixin, ExtensionBlock): pass -class NumericBlock(Block): +class NumericBlock(libinternals.NumpyBlock, Block): __slots__ = () is_numeric = True @@ -1906,7 +1906,7 @@ def array_values(self): return self.values -class DatetimeBlock(DatetimeLikeBlockMixin): +class DatetimeBlock(libinternals.Block, DatetimeLikeBlockMixin): __slots__ = () @@ -1932,11 +1932,12 @@ class DatetimeTZBlock(ExtensionBlock, DatetimeLikeBlockMixin): is_view = NDArrayBackedExtensionBlock.is_view # type: ignore[assignment] -class TimeDeltaBlock(DatetimeLikeBlockMixin): +class TimeDeltaBlock(libinternals.Block, DatetimeLikeBlockMixin): __slots__ = () + values: TimedeltaArray -class ObjectBlock(Block): +class ObjectBlock(libinternals.NumpyBlock, Block): __slots__ = () is_object = True From c610ebc201a3e97e4a4d9cead86aca2bfb324fac Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 1 Apr 2021 16:23:27 -0700 Subject: [PATCH 3/3] merge master --- pandas/_libs/groupby.pyx | 47 ++++++++++++-------- pandas/_libs/internals.pyx | 2 +- pandas/_libs/lib.pyx | 2 +- pandas/_libs/reshape.pyx | 2 +- pandas/_libs/tslibs/conversion.pyx | 2 +- pandas/_libs/tslibs/timedeltas.pyx | 4 ++ pandas/core/algorithms.py | 4 +- pandas/core/arrays/categorical.py | 4 +- pandas/core/arrays/datetimes.py | 2 +- pandas/core/arrays/timedeltas.py | 2 +- pandas/core/dtypes/cast.py | 54 +++++++++++++++++++++-- pandas/core/frame.py | 2 +- pandas/core/generic.py | 20 ++++----- pandas/core/groupby/categorical.py | 2 +- pandas/core/groupby/ops.py | 2 +- pandas/core/indexes/base.py | 8 ++-- pandas/core/internals/blocks.py | 66 +++++----------------------- pandas/core/internals/managers.py | 9 ---- pandas/core/series.py | 2 +- pandas/core/tools/datetimes.py | 2 +- pandas/core/window/rolling.py | 2 +- pandas/io/formats/excel.py | 14 +++--- pandas/tests/extension/test_numpy.py | 13 +++++- 23 files changed, 141 insertions(+), 126 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index a9feaee825a4b..ed8911b6cd929 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -154,9 +154,9 @@ def group_cumprod_float64(float64_t[:, ::1] out, Parameters ---------- - out : float64 array + out : np.ndarray[np.float64, ndim=2] Array to store cumprod in. - values : float64 array + values : np.ndarray[np.float64, ndim=2] Values to take cumprod of. labels : np.ndarray[np.intp] Labels to group by. @@ -211,9 +211,9 @@ def group_cumsum(numeric[:, ::1] out, Parameters ---------- - out : array + out : np.ndarray[ndim=2] Array to store cumsum in. - values : array + values : np.ndarray[ndim=2] Values to take cumsum of. labels : np.ndarray[np.intp] Labels to group by. @@ -329,12 +329,15 @@ def group_fillna_indexer(ndarray[int64_t] out, ndarray[intp_t] labels, Parameters ---------- - out : array of int64_t values which this method will write its results to - Missing values will be written to with a value of -1 + out : np.ndarray[np.uint8] + Values into which this method will write its results. labels : np.ndarray[np.intp] Array containing unique label for each group, with its ordering matching up to the corresponding record in `values`. - mask : array of int64_t values where a 1 indicates a missing value + values : np.ndarray[np.uint8] + Containing the truth value of each element. + mask : np.ndarray[np.uint8] + Indicating whether a value is na or not. direction : {'ffill', 'bfill'} Direction for fill to be applied (forwards or backwards, respectively) limit : Consecutive values to fill before stopping, or -1 for no limit @@ -396,12 +399,15 @@ def group_any_all(uint8_t[::1] out, Parameters ---------- - out : array of values which this method will write its results to + out : np.ndarray[np.uint8] + Values into which this method will write its results. labels : np.ndarray[np.intp] Array containing unique label for each group, with its ordering matching up to the corresponding record in `values` - values : array containing the truth value of each element - mask : array indicating whether a value is na or not + values : np.ndarray[np.uint8] + Containing the truth value of each element. + mask : np.ndarray[np.uint8] + Indicating whether a value is na or not. val_test : {'any', 'all'} String object dictating whether to use any or all truth testing skipna : bool @@ -721,14 +727,17 @@ def group_quantile(ndarray[float64_t] out, Parameters ---------- - out : ndarray + out : np.ndarray[np.float64] Array of aggregated values that will be written to. + values : np.ndarray + Array containing the values to apply the function against. labels : ndarray[np.intp] Array containing the unique group labels. values : ndarray Array containing the values to apply the function against. q : float The quantile value to search for. + interpolation : {'linear', 'lower', 'highest', 'nearest', 'midpoint'} Notes ----- @@ -1048,8 +1057,9 @@ def group_rank(float64_t[:, ::1] out, Parameters ---------- - out : array of float64_t values which this method will write its results to - values : array of rank_t values to be ranked + out : np.ndarray[np.float64, ndim=2] + Values to which this method will write its results. + values : np.ndarray of rank_t values to be ranked labels : np.ndarray[np.intp] Array containing unique label for each group, with its ordering matching up to the corresponding record in `values` @@ -1058,8 +1068,7 @@ def group_rank(float64_t[:, ::1] out, groupby functions. is_datetimelike : bool True if `values` contains datetime-like entries. - ties_method : {'average', 'min', 'max', 'first', 'dense'}, default - 'average' + ties_method : {'average', 'min', 'max', 'first', 'dense'}, default 'average' * average: average rank of group * min: lowest rank in group * max: highest rank in group @@ -1120,9 +1129,9 @@ cdef group_min_max(groupby_t[:, ::1] out, Parameters ---------- - out : array + out : np.ndarray[groupby_t, ndim=2] Array to store result in. - counts : int64 array + counts : np.ndarray[int64] Input as a zeroed array, populated by group sizes during algorithm values : array Values to find column-wise min/max of. @@ -1241,9 +1250,9 @@ def group_cummin_max(groupby_t[:, ::1] out, Parameters ---------- - out : array + out : np.ndarray[groupby_t, ndim=2] Array to store cummin/max in. - values : array + values : np.ndarray[groupby_t, ndim=2] Values to take cummin/max of. labels : np.ndarray[np.intp] Labels to group by. diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx index bbaec16d69ae2..d7c5882e92f97 100644 --- a/pandas/_libs/internals.pyx +++ b/pandas/_libs/internals.pyx @@ -385,7 +385,7 @@ def get_blkno_indexers(int64_t[:] blknos, bint group=True): Returns ------- - iter : iterator of (int, slice or array) + list[tuple[int, slice | np.ndarray]] """ # There's blkno in this function's name because it's used in block & # blockno handling. diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 646b5a05afcad..b8d79d0835fb8 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -916,7 +916,7 @@ def indices_fast(ndarray[intp_t] index, const int64_t[:] labels, list keys, """ Parameters ---------- - index : ndarray + index : ndarray[intp] labels : ndarray[int64] keys : list sorted_labels : list[ndarray[int64]] diff --git a/pandas/_libs/reshape.pyx b/pandas/_libs/reshape.pyx index 8cf48ef04ac31..959d83a55d4f3 100644 --- a/pandas/_libs/reshape.pyx +++ b/pandas/_libs/reshape.pyx @@ -52,7 +52,7 @@ def unstack(reshape_t[:, :] values, const uint8_t[:] mask, stride : int length : int width : int - new_values : typed ndarray + new_values : np.ndarray[bool] result array new_mask : np.ndarray[bool] result mask diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index 893c0fa52cd15..1bda35206ccef 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -316,7 +316,7 @@ def datetime_to_datetime64(ndarray[object] values): Returns ------- - result : ndarray[int64_t] + result : ndarray[datetime64ns] inferred_tz : tzinfo or None """ cdef: diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index bf8acfb459cb8..1d99ebba3b9f0 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -315,6 +315,10 @@ def array_to_timedelta64(ndarray[object] values, str unit=None, str errors="rais """ Convert an ndarray to an array of timedeltas. If errors == 'coerce', coerce non-convertible objects to NaT. Otherwise, raise. + + Returns + ------- + np.ndarray[timedelta64ns] """ cdef: diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 91a7584b975c3..541dd8abee3c3 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -943,7 +943,7 @@ def duplicated(values: ArrayLike, keep: Union[str, bool] = "first") -> np.ndarra Returns ------- - duplicated : ndarray + duplicated : ndarray[bool] """ values, _ = _ensure_data(values) ndtype = values.dtype.name @@ -1631,7 +1631,7 @@ def diff(arr, n: int, axis: int = 0, stacklevel=3): number of periods axis : {0, 1} axis to shift on - stacklevel : int + stacklevel : int, default 3 The stacklevel for the lost dtype warning. Returns diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 08646c4d25a50..9aafea4b998a1 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1452,7 +1452,7 @@ def isna(self): Returns ------- - a boolean array of whether my values are null + np.ndarray[bool] of whether my values are null See Also -------- @@ -1474,7 +1474,7 @@ def notna(self): Returns ------- - a boolean array of whether my values are not null + np.ndarray[bool] of whether my values are not null See Also -------- diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index c0a8c20832fa8..d1f0f506766a8 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -1031,7 +1031,7 @@ def to_pydatetime(self) -> np.ndarray: Returns ------- - datetimes : ndarray + datetimes : ndarray[object] """ return ints_to_pydatetime(self.asi8, tz=self.tz) diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index f3889ff360aa8..1bf822c1ae3e5 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -853,7 +853,7 @@ def to_pytimedelta(self) -> np.ndarray: Returns ------- - datetimes : ndarray + timedeltas : ndarray[object] """ return tslibs.ints_to_pytimedelta(self.asi8) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 979b70c30d1b0..edc43bc68b2a8 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -113,6 +113,9 @@ from pandas.core.arrays import ( DatetimeArray, ExtensionArray, + IntervalArray, + PeriodArray, + TimedeltaArray, ) _int8_max = np.iinfo(np.int8).max @@ -2169,24 +2172,51 @@ def validate_numeric_casting(dtype: np.dtype, value: Scalar) -> None: raise ValueError(f"Cannot assign {type(value).__name__} to bool series") -def can_hold_element(dtype: np.dtype, element: Any) -> bool: +def can_hold_element(arr: ArrayLike, element: Any) -> bool: """ Can we do an inplace setitem with this element in an array with this dtype? Parameters ---------- - dtype : np.dtype + arr : np.ndarray or ExtensionArray element : Any Returns ------- bool """ + dtype = arr.dtype + if not isinstance(dtype, np.dtype) or dtype.kind in ["m", "M"]: + if isinstance(dtype, (PeriodDtype, IntervalDtype, DatetimeTZDtype, np.dtype)): + # np.dtype here catches datetime64ns and timedelta64ns; we assume + # in this case that we have DatetimeArray/TimedeltaArray + arr = cast( + "PeriodArray | DatetimeArray | TimedeltaArray | IntervalArray", arr + ) + try: + arr._validate_setitem_value(element) + return True + except (ValueError, TypeError): + return False + + # This is technically incorrect, but maintains the behavior of + # ExtensionBlock._can_hold_element + return True + tipo = maybe_infer_dtype_type(element) if dtype.kind in ["i", "u"]: if tipo is not None: - return tipo.kind in ["i", "u"] and dtype.itemsize >= tipo.itemsize + if tipo.kind not in ["i", "u"]: + # Anything other than integer we cannot hold + return False + elif dtype.itemsize < tipo.itemsize: + return False + elif not isinstance(tipo, np.dtype): + # i.e. nullable IntegerDtype; we can put this into an ndarray + # losslessly iff it has no NAs + return not element._mask.any() + return True # We have not inferred an integer from the dtype # check if we have a builtin int or a float equal to an int @@ -2194,7 +2224,16 @@ def can_hold_element(dtype: np.dtype, element: Any) -> bool: elif dtype.kind == "f": if tipo is not None: - return tipo.kind in ["f", "i", "u"] + # TODO: itemsize check? + if tipo.kind not in ["f", "i", "u"]: + # Anything other than float/integer we cannot hold + return False + elif not isinstance(tipo, np.dtype): + # i.e. nullable IntegerDtype or FloatingDtype; + # we can put this into an ndarray losslessly iff it has no NAs + return not element._mask.any() + return True + return lib.is_integer(element) or lib.is_float(element) elif dtype.kind == "c": @@ -2212,4 +2251,11 @@ def can_hold_element(dtype: np.dtype, element: Any) -> bool: elif dtype == object: return True + elif dtype.kind == "S": + # TODO: test tests.frame.methods.test_replace tests get here, + # need more targeted tests. xref phofl has a PR about this + if tipo is not None: + return tipo.kind == "S" and tipo.itemsize <= dtype.itemsize + return isinstance(element, bytes) and len(element) <= dtype.itemsize + raise NotImplementedError(dtype) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b22fcbd9229e7..484b01f2c04f0 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8191,7 +8191,7 @@ def _gotitem( Parameters ---------- key : string / list of selections - ndim : 1,2 + ndim : {1, 2} requested ndim of result subset : object, default None subset to act on diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 8524907a84099..6b4e3c7caef50 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1656,9 +1656,9 @@ def _is_label_reference(self, key, axis=0) -> bool_t: Parameters ---------- - key: str + key : str Potential label name - axis: int, default 0 + axis : int, default 0 Axis perpendicular to the axis that labels are associated with (0 means search for column labels, 1 means search for index labels) @@ -1687,14 +1687,14 @@ def _is_label_or_level_reference(self, key: str, axis: int = 0) -> bool_t: Parameters ---------- - key: str + key : str Potential label or level name - axis: int, default 0 + axis : int, default 0 Axis that levels are associated with (0 for index, 1 for columns) Returns ------- - is_label_or_level: bool + bool """ return self._is_level_reference(key, axis=axis) or self._is_label_reference( key, axis=axis @@ -1710,9 +1710,9 @@ def _check_label_or_level_ambiguity(self, key, axis: int = 0) -> None: Parameters ---------- - key: str or object + key : str or object Label or level name. - axis: int, default 0 + axis : int, default 0 Axis that levels are associated with (0 for index, 1 for columns). Raises @@ -1760,14 +1760,14 @@ def _get_label_or_level_values(self, key: str, axis: int = 0) -> np.ndarray: Parameters ---------- - key: str + key : str Label or level name. - axis: int, default 0 + axis : int, default 0 Axis that levels are associated with (0 for index, 1 for columns) Returns ------- - values: np.ndarray + values : np.ndarray Raises ------ diff --git a/pandas/core/groupby/categorical.py b/pandas/core/groupby/categorical.py index ffe31147fe87d..6de8c1d789097 100644 --- a/pandas/core/groupby/categorical.py +++ b/pandas/core/groupby/categorical.py @@ -41,7 +41,7 @@ def recode_for_groupby( Returns ------- - New Categorical + Categorical If sort=False, the new categories are set to the order of appearance in codes (unless ordered=True, in which case the original order is preserved), followed by any unrepresented diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 4c086f3b8612e..8dd5e3b771f2f 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -808,7 +808,7 @@ class BinGrouper(BaseGrouper): binlabels : the label list filter_empty : bool, default False mutated : bool, default False - indexer : a intp array + indexer : np.ndarray[np.intp] Examples -------- diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index e85d09a479d16..29c2f7cfcf00d 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2412,7 +2412,7 @@ def isna(self) -> np.ndarray: Returns ------- - numpy.ndarray + numpy.ndarray[bool] A boolean array of whether my values are NA. See Also @@ -2470,7 +2470,7 @@ def notna(self) -> np.ndarray: Returns ------- - numpy.ndarray + numpy.ndarray[bool] Boolean array to indicate which entries are not NA. See Also @@ -4482,7 +4482,7 @@ def _validate_fill_value(self, value): TypeError If the value cannot be inserted into an array of this dtype. """ - if not can_hold_element(self.dtype, value): + if not can_hold_element(self._values, value): raise TypeError return value @@ -5499,7 +5499,7 @@ def isin(self, values, level=None): Returns ------- - is_contained : ndarray + is_contained : ndarray[bool] NumPy array of boolean values. See Also diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index dd9d997d26dc0..b6a34ac228a2b 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -17,8 +17,6 @@ import numpy as np from pandas._libs import ( - Interval, - Period, Timestamp, algos as libalgos, internals as libinternals, @@ -101,6 +99,7 @@ PeriodArray, TimedeltaArray, ) +from pandas.core.arrays._mixins import NDArrayBackedExtensionArray from pandas.core.base import PandasObject import pandas.core.common as com import pandas.core.computation.expressions as expressions @@ -121,7 +120,6 @@ Float64Index, Index, ) - from pandas.core.arrays._mixins import NDArrayBackedExtensionArray # comparison is faster than is_object_dtype _dtype_obj = np.dtype("object") @@ -625,9 +623,11 @@ def convert( """ return [self.copy()] if copy else [self] + @final def _can_hold_element(self, element: Any) -> bool: """ require the same dtype as ourselves """ - raise NotImplementedError("Implemented on subclasses") + element = extract_array(element, extract_numpy=True) + return can_hold_element(self.values, element) @final def should_store(self, value: ArrayLike) -> bool: @@ -1545,7 +1545,7 @@ def setitem(self, indexer, value): be a compatible shape. """ if not self._can_hold_element(value): - # This is only relevant for DatetimeTZBlock, ObjectValuesExtensionBlock, + # This is only relevant for DatetimeTZBlock, PeriodDtype, IntervalDtype, # which has a non-trivial `_can_hold_element`. # https://github.com/pandas-dev/pandas/issues/24020 # Need a dedicated setitem until GH#24020 (type promotion in setitem @@ -1597,10 +1597,6 @@ def take_nd( return self.make_block_same_class(new_values, new_mgr_locs) - def _can_hold_element(self, element: Any) -> bool: - # TODO: We may need to think about pushing this onto the array. - return True - def _slice(self, slicer): """ Return a slice of my values. @@ -1746,54 +1742,22 @@ def _unstack(self, unstacker, fill_value, new_placement): return blocks, mask -class HybridMixin: - """ - Mixin for Blocks backed (maybe indirectly) by ExtensionArrays. - """ - - values: ExtensionArray # type: ignore[misc] - - def _can_hold_element(self, element: Any) -> bool: - values = self.values - - try: - # error: "Callable[..., Any]" has no attribute "_validate_setitem_value" - values._validate_setitem_value(element) # type: ignore[attr-defined] - return True - except (ValueError, TypeError): - return False - - -class ObjectValuesExtensionBlock(HybridMixin, ExtensionBlock): - """ - Block providing backwards-compatibility for `.values`. - - Used by PeriodArray and IntervalArray to ensure that - Series[T].values is an ndarray of objects. - """ - - pass - - class NumericBlock(libinternals.NumpyBlock, Block): __slots__ = () is_numeric = True - def _can_hold_element(self, element: Any) -> bool: - element = extract_array(element, extract_numpy=True) - if isinstance(element, (IntegerArray, FloatingArray)): - if element._mask.any(): - return False - return can_hold_element(self.dtype, element) - -class NDArrayBackedExtensionBlock(HybridMixin, Block): +class NDArrayBackedExtensionBlock(Block): """ Block backed by an NDArrayBackedExtensionArray """ values: NDArrayBackedExtensionArray + @property + def array_values(self) -> NDArrayBackedExtensionArray: + return self.values + @property def is_view(self) -> bool: """ return a boolean if I am possibly a view """ @@ -1901,10 +1865,6 @@ class DatetimeLikeBlockMixin(NDArrayBackedExtensionBlock): is_numeric = False - @cache_readonly - def array_values(self): - return self.values - class DatetimeBlock(libinternals.Block, DatetimeLikeBlockMixin): __slots__ = () @@ -1920,7 +1880,6 @@ class DatetimeTZBlock(ExtensionBlock, DatetimeLikeBlockMixin): is_numeric = False internal_values = Block.internal_values - _can_hold_element = DatetimeBlock._can_hold_element diff = DatetimeBlock.diff where = DatetimeBlock.where putmask = DatetimeLikeBlockMixin.putmask @@ -1984,9 +1943,6 @@ def convert( res_values = ensure_block_shape(res_values, self.ndim) return [self.make_block(res_values)] - def _can_hold_element(self, element: Any) -> bool: - return True - class CategoricalBlock(ExtensionBlock): # this Block type is kept for backwards-compatibility @@ -2053,8 +2009,6 @@ def get_block_type(values, dtype: Optional[Dtype] = None): cls = CategoricalBlock elif vtype is Timestamp: cls = DatetimeTZBlock - elif vtype is Interval or vtype is Period: - cls = ObjectValuesExtensionBlock elif isinstance(dtype, ExtensionDtype): # Note: need to be sure PandasArray is unwrapped before we get here cls = ExtensionBlock diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 6681015856d6b..de0a5687aeb8b 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -73,7 +73,6 @@ CategoricalBlock, DatetimeTZBlock, ExtensionBlock, - ObjectValuesExtensionBlock, ensure_block_shape, extend_blocks, get_block_type, @@ -1841,14 +1840,6 @@ def _form_blocks( blocks.extend(external_blocks) - if len(items_dict["ObjectValuesExtensionBlock"]): - external_blocks = [ - new_block(array, klass=ObjectValuesExtensionBlock, placement=i, ndim=2) - for i, array in items_dict["ObjectValuesExtensionBlock"] - ] - - blocks.extend(external_blocks) - if len(extra_locs): shape = (len(extra_locs),) + tuple(len(x) for x in axes[1:]) diff --git a/pandas/core/series.py b/pandas/core/series.py index 4b89c09cdb898..27ebf7f228bc0 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3999,7 +3999,7 @@ def _gotitem(self, key, ndim, subset=None) -> Series: Parameters ---------- key : string / list of selections - ndim : 1,2 + ndim : {1, 2} Requested ndim of result. subset : object, default None Subset to act on. diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 66dc80159af16..7619623bb9eda 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -576,7 +576,7 @@ def _adjust_to_origin(arg, origin, unit): date to be adjusted origin : 'julian' or Timestamp origin offset for the arg - unit : string + unit : str passed unit from to_datetime, must be 'D' Returns diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index b482934dd25d2..b90722857938e 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -239,7 +239,7 @@ def _gotitem(self, key, ndim, subset=None): Parameters ---------- key : str / list of selections - ndim : 1,2 + ndim : {1, 2} requested ndim of result subset : object, default None subset to act on diff --git a/pandas/io/formats/excel.py b/pandas/io/formats/excel.py index dbe483d021c63..648df0ff2b6d9 100644 --- a/pandas/io/formats/excel.py +++ b/pandas/io/formats/excel.py @@ -431,8 +431,8 @@ class ExcelFormatter: ---------- df : DataFrame or Styler na_rep: na representation - float_format : string, default None - Format string for floating point numbers + float_format : str, default None + Format string for floating point numbers cols : sequence, optional Columns to write header : bool or sequence of str, default True @@ -440,12 +440,12 @@ class ExcelFormatter: assumed to be aliases for the column names index : bool, default True output row names (index) - index_label : string or sequence, default None - Column label for index column(s) if desired. If None is given, and - `header` and `index` are True, then the index names are used. A - sequence should be given if the DataFrame uses MultiIndex. + index_label : str or sequence, default None + Column label for index column(s) if desired. If None is given, and + `header` and `index` are True, then the index names are used. A + sequence should be given if the DataFrame uses MultiIndex. merge_cells : bool, default False - Format MultiIndex and Hierarchical Rows as merged cells. + Format MultiIndex and Hierarchical Rows as merged cells. inf_rep : str, default `'inf'` representation for np.inf values (which aren't representable in Excel) A `'-'` sign will be added in front of -inf. diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index 051871513a14e..e11e74f16030c 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -18,6 +18,7 @@ import pandas.util._test_decorators as td +from pandas.core.dtypes.cast import can_hold_element from pandas.core.dtypes.dtypes import ( ExtensionDtype, PandasDtype, @@ -27,7 +28,10 @@ import pandas as pd import pandas._testing as tm from pandas.core.arrays.numpy_ import PandasArray -from pandas.core.internals import managers +from pandas.core.internals import ( + blocks, + managers, +) from pandas.tests.extension import base # TODO(ArrayManager) PandasArray @@ -45,6 +49,12 @@ def _extract_array_patched(obj): return obj +def _can_hold_element_patched(obj, element) -> bool: + if isinstance(element, PandasArray): + element = element.to_numpy() + return can_hold_element(obj, element) + + @pytest.fixture(params=["float", "object"]) def dtype(request): return PandasDtype(np.dtype(request.param)) @@ -70,6 +80,7 @@ def allow_in_pandas(monkeypatch): with monkeypatch.context() as m: m.setattr(PandasArray, "_typ", "extension") m.setattr(managers, "_extract_array", _extract_array_patched) + m.setattr(blocks, "can_hold_element", _can_hold_element_patched) yield