From 654f58487a741058fe411bacd5a9730629b02de5 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 14 Feb 2022 20:07:56 -0800 Subject: [PATCH 1/5] REF: _IntegerDtype -> IntegerDtype --- pandas/core/arrays/integer.py | 30 +++++++++++++++--------------- pandas/core/arrays/string_.py | 4 ++-- pandas/core/groupby/ops.py | 10 +++++----- pandas/io/stata.py | 6 +++--- 4 files changed, 25 insertions(+), 25 deletions(-) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 056669f40ca87..68501689b5e39 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -14,12 +14,12 @@ ) -class _IntegerDtype(NumericDtype): +class IntegerDtype(NumericDtype): """ An ExtensionDtype to hold a single size & kind of integer dtype. These specific implementations are subclasses of the non-public - _IntegerDtype. For example we have Int8Dtype to represent signed int 8s. + IntegerDtype. For example we have Int8Dtype to represent signed int 8s. The attributes name & type are set when these subclasses are created. 
""" @@ -86,7 +86,7 @@ def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None: return None @classmethod - def _standardize_dtype(cls, dtype) -> _IntegerDtype: + def _standardize_dtype(cls, dtype) -> IntegerDtype: if isinstance(dtype, str) and ( dtype.startswith("Int") or dtype.startswith("UInt") ): @@ -94,7 +94,7 @@ def _standardize_dtype(cls, dtype) -> _IntegerDtype: # https://github.com/numpy/numpy/pull/7476 dtype = dtype.lower() - if not issubclass(type(dtype), _IntegerDtype): + if not issubclass(type(dtype), IntegerDtype): try: dtype = INT_STR_TO_DTYPE[str(np.dtype(dtype))] except KeyError as err: @@ -189,7 +189,7 @@ class IntegerArray(NumericArray): Length: 3, dtype: UInt16 """ - _dtype_cls = _IntegerDtype + _dtype_cls = IntegerDtype # The value used to fill '_data' to avoid upcasting _internal_fill_value = 1 @@ -198,7 +198,7 @@ class IntegerArray(NumericArray): _falsey_value = 0 @cache_readonly - def dtype(self) -> _IntegerDtype: + def dtype(self) -> IntegerDtype: return INT_STR_TO_DTYPE[str(self._data.dtype)] def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False): @@ -231,62 +231,62 @@ def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False): @register_extension_dtype -class Int8Dtype(_IntegerDtype): +class Int8Dtype(IntegerDtype): type = np.int8 name = "Int8" __doc__ = _dtype_docstring.format(dtype="int8") @register_extension_dtype -class Int16Dtype(_IntegerDtype): +class Int16Dtype(IntegerDtype): type = np.int16 name = "Int16" __doc__ = _dtype_docstring.format(dtype="int16") @register_extension_dtype -class Int32Dtype(_IntegerDtype): +class Int32Dtype(IntegerDtype): type = np.int32 name = "Int32" __doc__ = _dtype_docstring.format(dtype="int32") @register_extension_dtype -class Int64Dtype(_IntegerDtype): +class Int64Dtype(IntegerDtype): type = np.int64 name = "Int64" __doc__ = _dtype_docstring.format(dtype="int64") @register_extension_dtype -class UInt8Dtype(_IntegerDtype): +class 
UInt8Dtype(IntegerDtype): type = np.uint8 name = "UInt8" __doc__ = _dtype_docstring.format(dtype="uint8") @register_extension_dtype -class UInt16Dtype(_IntegerDtype): +class UInt16Dtype(IntegerDtype): type = np.uint16 name = "UInt16" __doc__ = _dtype_docstring.format(dtype="uint16") @register_extension_dtype -class UInt32Dtype(_IntegerDtype): +class UInt32Dtype(IntegerDtype): type = np.uint32 name = "UInt32" __doc__ = _dtype_docstring.format(dtype="uint32") @register_extension_dtype -class UInt64Dtype(_IntegerDtype): +class UInt64Dtype(IntegerDtype): type = np.uint64 name = "UInt64" __doc__ = _dtype_docstring.format(dtype="uint64") -INT_STR_TO_DTYPE: dict[str, _IntegerDtype] = { +INT_STR_TO_DTYPE: dict[str, IntegerDtype] = { "int8": Int8Dtype(), "int16": Int16Dtype(), "int32": Int32Dtype(), diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index af1756470a9c0..ca4348e3bd06a 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -45,7 +45,7 @@ ) from pandas.core.arrays.base import ExtensionArray from pandas.core.arrays.floating import FloatingDtype -from pandas.core.arrays.integer import _IntegerDtype +from pandas.core.arrays.integer import IntegerDtype from pandas.core.construction import extract_array from pandas.core.indexers import check_array_indexer from pandas.core.missing import isna @@ -432,7 +432,7 @@ def astype(self, dtype, copy: bool = True): return self.copy() return self - elif isinstance(dtype, _IntegerDtype): + elif isinstance(dtype, IntegerDtype): arr = self._ndarray.copy() mask = self.isna() arr[mask] = 0 diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index d4aa6ae9f4059..cf046d92dd6f3 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -76,7 +76,7 @@ ) from pandas.core.arrays.integer import ( Int64Dtype, - _IntegerDtype, + IntegerDtype, ) from pandas.core.arrays.masked import ( BaseMaskedArray, @@ -300,10 +300,10 @@ def _get_result_dtype(self, dtype: 
DtypeObj) -> DtypeObj: if how in ["add", "cumsum", "sum", "prod"]: if dtype == np.dtype(bool): return np.dtype(np.int64) - elif isinstance(dtype, (BooleanDtype, _IntegerDtype)): + elif isinstance(dtype, (BooleanDtype, IntegerDtype)): return Int64Dtype() elif how in ["mean", "median", "var"]: - if isinstance(dtype, (BooleanDtype, _IntegerDtype)): + if isinstance(dtype, (BooleanDtype, IntegerDtype)): return Float64Dtype() elif is_float_dtype(dtype) or is_complex_dtype(dtype): return dtype @@ -341,7 +341,7 @@ def _ea_wrap_cython_operation( # All of the functions implemented here are ordinal, so we can # operate on the tz-naive equivalents npvalues = values._ndarray.view("M8[ns]") - elif isinstance(values.dtype, (BooleanDtype, _IntegerDtype)): + elif isinstance(values.dtype, (BooleanDtype, IntegerDtype)): # IntegerArray or BooleanArray npvalues = values.to_numpy("float64", na_value=np.nan) elif isinstance(values.dtype, FloatingDtype): @@ -378,7 +378,7 @@ def _reconstruct_ea_result(self, values, res_values): # TODO: allow EAs to override this logic if isinstance( - values.dtype, (BooleanDtype, _IntegerDtype, FloatingDtype, StringDtype) + values.dtype, (BooleanDtype, IntegerDtype, FloatingDtype, StringDtype) ): dtype = self._get_result_dtype(values.dtype) cls = dtype.construct_array_type() diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 883cc36e4c1f1..60c4634662296 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -62,7 +62,7 @@ to_timedelta, ) from pandas.core.arrays.boolean import BooleanDtype -from pandas.core.arrays.integer import _IntegerDtype +from pandas.core.arrays.integer import IntegerDtype from pandas.core.frame import DataFrame from pandas.core.indexes.base import Index from pandas.core.series import Series @@ -585,7 +585,7 @@ def _cast_to_stata_types(data: DataFrame) -> DataFrame: for col in data: # Cast from unsupported types to supported types - is_nullable_int = isinstance(data[col].dtype, (_IntegerDtype, BooleanDtype)) + 
is_nullable_int = isinstance(data[col].dtype, (IntegerDtype, BooleanDtype)) orig = data[col] # We need to find orig_missing before altering data below orig_missing = orig.isna() @@ -593,7 +593,7 @@ def _cast_to_stata_types(data: DataFrame) -> DataFrame: missing_loc = data[col].isna() if missing_loc.any(): # Replace with always safe value - fv = 0 if isinstance(data[col].dtype, _IntegerDtype) else False + fv = 0 if isinstance(data[col].dtype, IntegerDtype) else False data.loc[missing_loc, col] = fv # Replace with NumPy-compatible column data[col] = data[col].astype(data[col].dtype.numpy_dtype) From b8a30915a21fff5e3d0326c4580ad6c820abae52 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 14 Feb 2022 20:11:30 -0800 Subject: [PATCH 2/5] REF: share dtype methods --- pandas/core/arrays/floating.py | 7 ------- pandas/core/arrays/integer.py | 16 ---------------- pandas/core/arrays/numeric.py | 16 ++++++++++++++++ 3 files changed, 16 insertions(+), 23 deletions(-) diff --git a/pandas/core/arrays/floating.py b/pandas/core/arrays/floating.py index d55aef953b5b5..1779a9ced9feb 100644 --- a/pandas/core/arrays/floating.py +++ b/pandas/core/arrays/floating.py @@ -25,13 +25,6 @@ class FloatingDtype(NumericDtype): _default_np_dtype = np.dtype(np.float64) - def __repr__(self) -> str: - return f"{self.name}Dtype()" - - @property - def _is_numeric(self) -> bool: - return True - @classmethod def construct_array_type(cls) -> type[FloatingArray]: """ diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 68501689b5e39..2765b10112fdd 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -26,22 +26,6 @@ class IntegerDtype(NumericDtype): _default_np_dtype = np.dtype(np.int64) - def __repr__(self) -> str: - sign = "U" if self.is_unsigned_integer else "" - return f"{sign}Int{8 * self.itemsize}Dtype()" - - @cache_readonly - def is_signed_integer(self) -> bool: - return self.kind == "i" - - @cache_readonly - def is_unsigned_integer(self) -> 
bool: - return self.kind == "u" - - @property - def _is_numeric(self) -> bool: - return True - @classmethod def construct_array_type(cls) -> type[IntegerArray]: """ diff --git a/pandas/core/arrays/numeric.py b/pandas/core/arrays/numeric.py index 5ab1a9908fd02..7d465228a1330 100644 --- a/pandas/core/arrays/numeric.py +++ b/pandas/core/arrays/numeric.py @@ -17,6 +17,7 @@ DtypeObj, ) from pandas.errors import AbstractMethodError +from pandas.util._decorators import cache_readonly from pandas.core.dtypes.common import ( is_bool_dtype, @@ -42,6 +43,21 @@ class NumericDtype(BaseMaskedDtype): _default_np_dtype: np.dtype + def __repr__(self) -> str: + return f"{self.name}Dtype()" + + @cache_readonly + def is_signed_integer(self) -> bool: + return self.kind == "i" + + @cache_readonly + def is_unsigned_integer(self) -> bool: + return self.kind == "u" + + @property + def _is_numeric(self) -> bool: + return True + def __from_arrow__( self, array: pyarrow.Array | pyarrow.ChunkedArray ) -> BaseMaskedArray: From ac2e0e857e1cbfcbaa82bff83d7732455a573930 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 14 Feb 2022 20:16:28 -0800 Subject: [PATCH 3/5] REF: share _standardize_dtype --- pandas/core/arrays/floating.py | 14 ++------------ pandas/core/arrays/integer.py | 16 ++-------------- pandas/core/arrays/numeric.py | 17 ++++++++++++++++- 3 files changed, 20 insertions(+), 27 deletions(-) diff --git a/pandas/core/arrays/floating.py b/pandas/core/arrays/floating.py index 1779a9ced9feb..5b65fa4f58a82 100644 --- a/pandas/core/arrays/floating.py +++ b/pandas/core/arrays/floating.py @@ -51,18 +51,8 @@ def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None: return None @classmethod - def _standardize_dtype(cls, dtype) -> FloatingDtype: - if isinstance(dtype, str) and dtype.startswith("Float"): - # Avoid DeprecationWarning from NumPy about np.dtype("Float64") - # https://github.com/numpy/numpy/pull/7476 - dtype = dtype.lower() - - if not issubclass(type(dtype), FloatingDtype): 
- try: - dtype = FLOAT_STR_TO_DTYPE[str(np.dtype(dtype))] - except KeyError as err: - raise ValueError(f"invalid dtype specified {dtype}") from err - return dtype + def _str_to_dtype_mapping(cls): + return FLOAT_STR_TO_DTYPE @classmethod def _safe_cast(cls, values: np.ndarray, dtype: np.dtype, copy: bool) -> np.ndarray: diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 2765b10112fdd..2e36dcbb2ab2c 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -70,20 +70,8 @@ def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None: return None @classmethod - def _standardize_dtype(cls, dtype) -> IntegerDtype: - if isinstance(dtype, str) and ( - dtype.startswith("Int") or dtype.startswith("UInt") - ): - # Avoid DeprecationWarning from NumPy about np.dtype("Int64") - # https://github.com/numpy/numpy/pull/7476 - dtype = dtype.lower() - - if not issubclass(type(dtype), IntegerDtype): - try: - dtype = INT_STR_TO_DTYPE[str(np.dtype(dtype))] - except KeyError as err: - raise ValueError(f"invalid dtype specified {dtype}") from err - return dtype + def _str_to_dtype_mapping(cls): + return INT_STR_TO_DTYPE @classmethod def _safe_cast(cls, values: np.ndarray, dtype: np.dtype, copy: bool) -> np.ndarray: diff --git a/pandas/core/arrays/numeric.py b/pandas/core/arrays/numeric.py index 7d465228a1330..df01779e93234 100644 --- a/pandas/core/arrays/numeric.py +++ b/pandas/core/arrays/numeric.py @@ -106,12 +106,27 @@ def __from_arrow__( else: return array_class._concat_same_type(results) + @classmethod + def _str_to_dtype_mapping(cls): + raise AbstractMethodError(cls) + @classmethod def _standardize_dtype(cls, dtype) -> NumericDtype: """ Convert a string representation or a numpy dtype to NumericDtype. 
""" - raise AbstractMethodError(cls) + if isinstance(dtype, str) and (dtype.startswith(("Int", "UInt", "Float"))): + # Avoid DeprecationWarning from NumPy about np.dtype("Int64") + # https://github.com/numpy/numpy/pull/7476 + dtype = dtype.lower() + + if not issubclass(type(dtype), cls): + mapping = cls._str_to_dtype_mapping() + try: + dtype = mapping[str(np.dtype(dtype))] + except KeyError as err: + raise ValueError(f"invalid dtype specified {dtype}") from err + return dtype @classmethod def _safe_cast(cls, values: np.ndarray, dtype: np.dtype, copy: bool) -> np.ndarray: From c5e694c5871471888f745fb84e683988e8011000 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 14 Feb 2022 20:18:22 -0800 Subject: [PATCH 4/5] REF: share dtype --- pandas/core/arrays/floating.py | 5 ----- pandas/core/arrays/integer.py | 5 ----- pandas/core/arrays/numeric.py | 5 +++++ 3 files changed, 5 insertions(+), 10 deletions(-) diff --git a/pandas/core/arrays/floating.py b/pandas/core/arrays/floating.py index 5b65fa4f58a82..dfff1fc908623 100644 --- a/pandas/core/arrays/floating.py +++ b/pandas/core/arrays/floating.py @@ -3,7 +3,6 @@ import numpy as np from pandas._typing import DtypeObj -from pandas.util._decorators import cache_readonly from pandas.core.dtypes.dtypes import register_extension_dtype @@ -134,10 +133,6 @@ class FloatingArray(NumericArray): _truthy_value = 1.0 _falsey_value = 0.0 - @cache_readonly - def dtype(self) -> FloatingDtype: - return FLOAT_STR_TO_DTYPE[str(self._data.dtype)] - def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False): if not (isinstance(values, np.ndarray) and values.dtype.kind == "f"): raise TypeError( diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 2e36dcbb2ab2c..7fccedec6f6b3 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -3,7 +3,6 @@ import numpy as np from pandas._typing import DtypeObj -from pandas.util._decorators import cache_readonly from pandas.core.dtypes.base 
import register_extension_dtype @@ -169,10 +168,6 @@ class IntegerArray(NumericArray): _truthy_value = 1 _falsey_value = 0 - @cache_readonly - def dtype(self) -> IntegerDtype: - return INT_STR_TO_DTYPE[str(self._data.dtype)] - def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False): if not (isinstance(values, np.ndarray) and values.dtype.kind in ["i", "u"]): raise TypeError( diff --git a/pandas/core/arrays/numeric.py b/pandas/core/arrays/numeric.py index df01779e93234..3370f7bec4ae4 100644 --- a/pandas/core/arrays/numeric.py +++ b/pandas/core/arrays/numeric.py @@ -219,6 +219,11 @@ class NumericArray(BaseMaskedArray): _dtype_cls: type[NumericDtype] + @cache_readonly + def dtype(self) -> NumericDtype: + mapping = self._dtype_cls._str_to_dtype_mapping() + return mapping[str(self._data.dtype)] + @classmethod def _coerce_to_array( cls, value, *, dtype: DtypeObj, copy: bool = False From 78d3ffc2de04a4ddc23e051008b4d96bf3fa8ce7 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 14 Feb 2022 22:09:22 -0800 Subject: [PATCH 5/5] share constructor --- pandas/core/arrays/floating.py | 14 ++---------- pandas/core/arrays/integer.py | 10 ++------- pandas/core/arrays/numeric.py | 26 +++++++++++++++++++---- pandas/tests/frame/methods/test_astype.py | 5 ----- 4 files changed, 26 insertions(+), 29 deletions(-) diff --git a/pandas/core/arrays/floating.py b/pandas/core/arrays/floating.py index dfff1fc908623..49a71922f331b 100644 --- a/pandas/core/arrays/floating.py +++ b/pandas/core/arrays/floating.py @@ -4,6 +4,7 @@ from pandas._typing import DtypeObj +from pandas.core.dtypes.common import is_float_dtype from pandas.core.dtypes.dtypes import register_extension_dtype from pandas.core.arrays.numeric import ( @@ -23,6 +24,7 @@ class FloatingDtype(NumericDtype): """ _default_np_dtype = np.dtype(np.float64) + _checker = is_float_dtype @classmethod def construct_array_type(cls) -> type[FloatingArray]: @@ -133,18 +135,6 @@ class FloatingArray(NumericArray): _truthy_value = 
1.0 _falsey_value = 0.0 - def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False): - if not (isinstance(values, np.ndarray) and values.dtype.kind == "f"): - raise TypeError( - "values should be floating numpy array. Use " - "the 'pd.array' function instead" - ) - if values.dtype == np.float16: - # If we don't raise here, then accessing self.dtype would raise - raise TypeError("FloatingArray does not support np.float16 dtype.") - - super().__init__(values, mask, copy=copy) - _dtype_docstring = """ An ExtensionDtype for {dtype} data. diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 7fccedec6f6b3..9ef3939656ecd 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -5,6 +5,7 @@ from pandas._typing import DtypeObj from pandas.core.dtypes.base import register_extension_dtype +from pandas.core.dtypes.common import is_integer_dtype from pandas.core.arrays.masked import BaseMaskedDtype from pandas.core.arrays.numeric import ( @@ -24,6 +25,7 @@ class IntegerDtype(NumericDtype): """ _default_np_dtype = np.dtype(np.int64) + _checker = is_integer_dtype @classmethod def construct_array_type(cls) -> type[IntegerArray]: @@ -168,14 +170,6 @@ class IntegerArray(NumericArray): _truthy_value = 1 _falsey_value = 0 - def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False): - if not (isinstance(values, np.ndarray) and values.dtype.kind in ["i", "u"]): - raise TypeError( - "values should be integer numpy array. Use " - "the 'pd.array' function instead" - ) - super().__init__(values, mask, copy=copy) - _dtype_docstring = """ An ExtensionDtype for {dtype} integer data. 
diff --git a/pandas/core/arrays/numeric.py b/pandas/core/arrays/numeric.py index 3370f7bec4ae4..958c9f7b0b3f1 100644 --- a/pandas/core/arrays/numeric.py +++ b/pandas/core/arrays/numeric.py @@ -3,6 +3,8 @@ import numbers from typing import ( TYPE_CHECKING, + Any, + Callable, TypeVar, ) @@ -42,6 +44,7 @@ class NumericDtype(BaseMaskedDtype): _default_np_dtype: np.dtype + _checker: Callable[[Any], bool] # is_foo_dtype def __repr__(self) -> str: return f"{self.name}Dtype()" @@ -139,10 +142,7 @@ def _safe_cast(cls, values: np.ndarray, dtype: np.dtype, copy: bool) -> np.ndarr def _coerce_to_data_and_mask(values, mask, dtype, copy, dtype_cls, default_dtype): - if default_dtype.kind == "f": - checker = is_float_dtype - else: - checker = is_integer_dtype + checker = dtype_cls._checker inferred_type = None @@ -219,6 +219,24 @@ class NumericArray(BaseMaskedArray): _dtype_cls: type[NumericDtype] + def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False): + checker = self._dtype_cls._checker + if not (isinstance(values, np.ndarray) and checker(values.dtype)): + descr = ( + "floating" + if self._dtype_cls._default_np_dtype.kind == "f" + else "integer" + ) + raise TypeError( + f"values should be {descr} numpy array. 
Use " + "the 'pd.array' function instead" + ) + if values.dtype == np.float16: + # If we don't raise here, then accessing self.dtype would raise + raise TypeError("FloatingArray does not support np.float16 dtype.") + + super().__init__(values, mask, copy=copy) + @cache_readonly def dtype(self) -> NumericDtype: mapping = self._dtype_cls._str_to_dtype_mapping() diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index 0e7e4b537c719..6d343de9f5d3a 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -762,11 +762,6 @@ def test_astype_categorical_to_string_missing(self): class IntegerArrayNoCopy(pd.core.arrays.IntegerArray): # GH 42501 - @classmethod - def _from_sequence(cls, scalars, *, dtype=None, copy=False): - values, mask = cls._coerce_to_array(scalars, dtype=dtype, copy=copy) - return IntegerArrayNoCopy(values, mask) - def copy(self): assert False