From 50185f4acf199f142bdf179918058022706e1455 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Fri, 2 Oct 2020 22:54:47 +0700 Subject: [PATCH 1/9] REF: refactor describe --- pandas/core/generic.py | 153 +------------- pandas/io/formats/describe.py | 380 ++++++++++++++++++++++++++++++++++ 2 files changed, 390 insertions(+), 143 deletions(-) create mode 100644 pandas/io/formats/describe.py diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 6f0aa70625c1d..281a4933277e2 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -56,11 +56,7 @@ from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError, InvalidIndexError from pandas.util._decorators import Appender, doc, rewrite_axis_style_signature -from pandas.util._validators import ( - validate_bool_kwarg, - validate_fillna_kwargs, - validate_percentile, -) +from pandas.util._validators import validate_bool_kwarg, validate_fillna_kwargs from pandas.core.dtypes.common import ( ensure_int64, @@ -106,7 +102,8 @@ from pandas.core.window import Expanding, ExponentialMovingWindow, Rolling, Window from pandas.io.formats import format as fmt -from pandas.io.formats.format import DataFrameFormatter, format_percentiles +from pandas.io.formats.describe import DataFrameDescriber +from pandas.io.formats.format import DataFrameFormatter from pandas.io.formats.printing import pprint_thing if TYPE_CHECKING: @@ -10105,143 +10102,13 @@ def describe( 75% NaN 2.5 max NaN 3.0 """ - if self.ndim == 2 and self.columns.size == 0: - raise ValueError("Cannot describe a DataFrame without columns") - - if percentiles is not None: - # explicit conversion of `percentiles` to list - percentiles = list(percentiles) - - # get them all to be in [0, 1] - validate_percentile(percentiles) - - # median should always be included - if 0.5 not in percentiles: - percentiles.append(0.5) - percentiles = np.asarray(percentiles) - else: - percentiles = np.array([0.25, 0.5, 0.75]) - - # sort and check for duplicates - unique_pcts = np.unique(percentiles) - if len(unique_pcts) < len(percentiles): - raise ValueError("percentiles cannot contain duplicates") - percentiles = unique_pcts - - formatted_percentiles = format_percentiles(percentiles) - - def describe_numeric_1d(series): - stat_index = ( - ["count", "mean", "std", "min"] + formatted_percentiles + ["max"] - ) - d = ( - [series.count(), series.mean(), series.std(), series.min()] - + series.quantile(percentiles).tolist() - + [series.max()] - ) - return pd.Series(d, index=stat_index, name=series.name) - - def describe_categorical_1d(data): - names = ["count", "unique"] - objcounts = data.value_counts() - count_unique = len(objcounts[objcounts != 0]) - result = [data.count(), count_unique] - dtype = None - if result[1] > 0: - top, freq = objcounts.index[0], objcounts.iloc[0] - if is_datetime64_any_dtype(data.dtype): - if self.ndim == 1: - stacklevel = 4 - else: - stacklevel = 5 - warnings.warn( - "Treating datetime data as categorical rather than numeric in " - "`.describe` is deprecated and will be removed in a future " - "version of pandas. Specify `datetime_is_numeric=True` to " - "silence this warning and adopt the future behavior now.", - FutureWarning, - stacklevel=stacklevel, - ) - tz = data.dt.tz - asint = data.dropna().values.view("i8") - top = Timestamp(top) - if top.tzinfo is not None and tz is not None: - # Don't tz_localize(None) if key is already tz-aware - top = top.tz_convert(tz) - else: - top = top.tz_localize(tz) - names += ["top", "freq", "first", "last"] - result += [ - top, - freq, - Timestamp(asint.min(), tz=tz), - Timestamp(asint.max(), tz=tz), - ] - else: - names += ["top", "freq"] - result += [top, freq] - - # If the DataFrame is empty, set 'top' and 'freq' to None - # to maintain output shape consistency - else: - names += ["top", "freq"] - result += [np.nan, np.nan] - dtype = "object" - - return pd.Series(result, index=names, name=data.name, dtype=dtype) - - def describe_timestamp_1d(data): - # GH-30164 - stat_index = ["count", "mean", "min"] + formatted_percentiles + ["max"] - d = ( - [data.count(), data.mean(), data.min()] - + data.quantile(percentiles).tolist() - + [data.max()] - ) - return pd.Series(d, index=stat_index, name=data.name) - - def describe_1d(data): - if is_bool_dtype(data.dtype): - return describe_categorical_1d(data) - elif is_numeric_dtype(data): - return describe_numeric_1d(data) - elif is_datetime64_any_dtype(data.dtype) and datetime_is_numeric: - return describe_timestamp_1d(data) - elif is_timedelta64_dtype(data.dtype): - return describe_numeric_1d(data) - else: - return describe_categorical_1d(data) - - if self.ndim == 1: - return describe_1d(self) - elif (include is None) and (exclude is None): - # when some numerics are found, keep only numerics - default_include = [np.number] - if datetime_is_numeric: - default_include.append("datetime") - data = self.select_dtypes(include=default_include) - if len(data.columns) == 0: - data = self - elif include == "all": - if exclude is not None: - msg = "exclude must be None when include is 'all'" - raise ValueError(msg) - data = self - else: - data = self.select_dtypes(include=include, exclude=exclude) - - ldesc = [describe_1d(s) for _, s in data.items()] - # set a convenient order for rows - names: List[Label] = [] - ldesc_indexes = sorted((x.index for x in ldesc), key=len) - for idxnames in ldesc_indexes: - for name in idxnames: - if name not in names: - names.append(name) - - d = pd.concat([x.reindex(names, copy=False) for x in ldesc], axis=1, sort=False) - d.columns = data.columns.copy() - return d + describer = DataFrameDescriber( + data=self, + include=include, + exclude=exclude, + datetime_is_numeric=datetime_is_numeric, + ) + return describer.describe(percentiles) def pct_change( self: FrameOrSeries, diff --git a/pandas/io/formats/describe.py b/pandas/io/formats/describe.py new file mode 100644 index 0000000000000..d3bb600a79b4f --- /dev/null +++ b/pandas/io/formats/describe.py @@ -0,0 +1,380 @@ +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING, List, Optional, Sequence, Type, Union, cast +import warnings + +import numpy as np + +from pandas._libs.tslibs import Timestamp +from pandas._typing import Dtype, FrameOrSeries, FrameOrSeriesUnion, Label +from pandas.util._validators import validate_percentile + +from pandas.core.dtypes.common import ( + is_bool_dtype, + is_datetime64_any_dtype, + is_numeric_dtype, + is_timedelta64_dtype, +) + +import pandas as pd + +from pandas.io.formats.format import format_percentiles + +if TYPE_CHECKING: + from pandas import DataFrame, Series + + +class DataFrameDescriber: + """Class responsible for creating dataframe/series description. + + Called from pandas.core.generic.NDFrame.describe() + + Parameters + ---------- + data : FrameOrSeries + Dataframe or Series to be described. + include : 'all', list-like of dtypes or None (default), optional + A white list of data types to include in the result. Ignored for ``Series``. + exclude : list-like of dtypes or None (default), optional, + A black list of data types to omit from the result. Ignored for ``Series``. + datetime_is_numeric : bool, default False + Whether to treat datetime dtypes as numeric. + """ + + def __init__( + self, + *, + data: FrameOrSeries, + include: Optional[Union[str, Sequence[str]]], + exclude: Optional[Union[str, Sequence[str]]], + datetime_is_numeric: bool, + ): + self.include = include + self.exclude = exclude + self.datetime_is_numeric = datetime_is_numeric + self.data: FrameOrSeries = self._initialize_data(data) + + def _initialize_data(self, data) -> FrameOrSeries: + _validate_dframe_size(data) + + if data.ndim == 1: + return data + + if self.include is None and self.exclude is None: + # when some numerics are found, keep only numerics + include = [np.number] + if self.datetime_is_numeric: + include.append("datetime") + truncated = data.select_dtypes(include=include) + if len(truncated.columns) == 0: + return data + else: + return truncated + elif self.include == "all": + if self.exclude is not None: + msg = "exclude must be None when include is 'all'" + raise ValueError(msg) + return data + else: + return data.select_dtypes(include=self.include, exclude=self.exclude) + + def _select_strategy( + self, + series: "Series", + percentiles: Optional[Sequence[float]], + ) -> "StrategyAbstract": + """Select strategy for description.""" + strategy: Type[StrategyAbstract] = CategoricalStrategy + if is_bool_dtype(series.dtype): + strategy = CategoricalStrategy + elif is_numeric_dtype(series): + strategy = NumericStrategy + elif is_datetime64_any_dtype(series.dtype) and self.datetime_is_numeric: + strategy = TimestampStrategy + elif is_timedelta64_dtype(series.dtype): + strategy = NumericStrategy + + if strategy == CategoricalStrategy and is_datetime64_any_dtype(series.dtype): + strategy = TimestampAsCategoricalStrategy + warnings.warn( + "Treating datetime data as categorical rather than numeric in " + "`.describe` is deprecated and will be removed in a future " + "version of pandas. Specify `datetime_is_numeric=True` to " + "silence this warning and adopt the future behavior now.", + FutureWarning, + stacklevel=5, + ) + + return strategy(series, percentiles) + + def describe(self, percentiles: Optional[Sequence[float]]) -> FrameOrSeries: + """Do describe + + Parameters + ---------- + percentiles : list-like of numbers, optional + The percentiles to include in the output. All should fall between 0 and 1. + The default is ``[.25, .5, .75]``, which returns the 25th, 50th, and + 75th percentiles. + + Returns + ------- + result : FrameOrSeries + Either dataframe (if ``self.data`` is dataframe) + or series (if ``self.data`` is series). + """ + result: FrameOrSeriesUnion + if self.data.ndim == 1: + series = cast("Series", self.data) + result = self._describe_series(series, percentiles) + else: + dataframe = cast("DataFrame", self.data) + result = self._describe_dataframe(dataframe, percentiles) + return cast(FrameOrSeries, result) + + def _describe_series( + self, + series: "Series", + percentiles: Optional[Sequence[float]], + ) -> "Series": + """Describe series.""" + strategy = self._select_strategy(series, percentiles) + return strategy.describe() + + def _describe_dataframe( + self, + dataframe: "DataFrame", + percentiles: Optional[Sequence[float]], + ) -> "DataFrame": + """Describe dataframe by describing series and concating them together.""" + ldesc: List["Series"] = [] + for _, series in dataframe.items(): + # Could use _describe_series here to avoid code duplication, + # but there will be an error regarding warning stacklevel + strategy = self._select_strategy(series, percentiles) + ldesc.append(strategy.describe()) + + df = pd.concat( + self._reindex_columns(ldesc), + axis=1, + sort=False, + ) + df.columns = dataframe.columns.copy() + return cast("DataFrame", df) + + def _reindex_columns(self, column_data) -> List["Series"]: + """Set a convenient order for rows.""" + names: List[Label] = [] + ldesc_indexes = sorted((x.index for x in column_data), key=len) + for idxnames in ldesc_indexes: + for name in idxnames: + if name not in names: + names.append(name) + return [x.reindex(names, copy=False) for x in column_data] + + +class StrategyAbstract(ABC): + """Abstract strategy for describing series.""" + + def __init__( + self, + data: "Series", + percentiles: Optional[Sequence[float]], + ): + self.data = data + self.percentiles = self._initialize_percentiles(percentiles) + + def describe(self) -> "Series": + """Describe series.""" + return pd.Series( + self.array, + index=self.names, + name=self.data.name, + dtype=self.dtype, + ) + + @property + @abstractmethod + def array(self) -> List[object]: + """Series data.""" + + @property + @abstractmethod + def names(self) -> List[str]: + """Series index names.""" + + @property + @abstractmethod + def dtype(self) -> Optional[Dtype]: + """Series dtype.""" + + @property + def formatted_percentiles(self) -> List[str]: + """Percentiles formatted as strings, rounded.""" + return format_percentiles(self.percentiles) + + @staticmethod + def _initialize_percentiles( + percentiles: Optional[Sequence[float]], + ) -> Sequence[float]: + if percentiles is None: + return np.array([0.25, 0.5, 0.75]) + + # explicit conversion of `percentiles` to list + percentiles = list(percentiles) + + # get them all to be in [0, 1] + validate_percentile(percentiles) + + # median should always be included + if 0.5 not in percentiles: + percentiles.append(0.5) + percentiles = np.asarray(percentiles) + + # sort and check for duplicates + unique_pcts = np.unique(percentiles) + assert percentiles is not None + if len(unique_pcts) < len(percentiles): + raise ValueError("percentiles cannot contain duplicates") + return unique_pcts + + +class CategoricalStrategy(StrategyAbstract): + """Strategy for series with categorical values.""" + + def __init__(self, data, percentiles): + self.data = data + super().__init__(data, percentiles) + self.objcounts = self.data.value_counts() + + @property + def count(self) -> "Series": + return self.data.count() + + @property + def count_unique(self) -> int: + return len(self.objcounts[self.objcounts != 0]) + + @property + def names(self) -> List[str]: + return ["count", "unique", "top", "freq"] + + @property + def array(self) -> List[object]: + result = [self.count, self.count_unique] + if self.count_unique > 0: + top, freq = self.objcounts.index[0], self.objcounts.iloc[0] + result += [top, freq] + + # If the DataFrame is empty, set 'top' and 'freq' to None + # to maintain output shape consistency + else: + result += [np.nan, np.nan] + return result + + @property + def dtype(self) -> Optional[Dtype]: + if self.count_unique == 0: + return "object" + return None + + +class TimestampAsCategoricalStrategy(CategoricalStrategy): + """Strategy for series with timestamp values treated as categorical values.""" + + @property + def array(self) -> List[object]: + result = [self.count, self.count_unique] + if self.count_unique > 0: + top, freq = self.objcounts.index[0], self.objcounts.iloc[0] + tz = self.data.dt.tz + asint = self.data.dropna().values.view("i8") + top = Timestamp(top) + if top.tzinfo is not None and tz is not None: + # Don't tz_localize(None) if key is already tz-aware + top = top.tz_convert(tz) + else: + top = top.tz_localize(tz) + + result += [ + top, + freq, + Timestamp(asint.min(), tz=tz), + Timestamp(asint.max(), tz=tz), + ] + + # If the DataFrame is empty, set 'top' and 'freq' to None + # to maintain output shape consistency + else: + result += [np.nan, np.nan] + return result + + @property + def names(self) -> List[str]: + names = ["count", "unique"] + if self.count_unique > 0: + names += ["top", "freq", "first", "last"] + return names + + +class NumericStrategy(StrategyAbstract): + """Strategy for series with numeric values.""" + + @property + def names(self) -> List[str]: + return [ + "count", + "mean", + "std", + "min", + *self.formatted_percentiles, + "max", + ] + + @property + def array(self) -> List[object]: + return [ + self.data.count(), + self.data.mean(), + self.data.std(), + self.data.min(), + *self.data.quantile(self.percentiles).tolist(), + self.data.max(), + ] + + @property + def dtype(self) -> Optional[Dtype]: + return None + + +class TimestampStrategy(StrategyAbstract): + """Strategy for series with timestamp values.""" + + @property + def array(self) -> List[object]: + return [ + self.data.count(), + self.data.mean(), + self.data.min(), + *self.data.quantile(self.percentiles).tolist(), + self.data.max(), + ] + + @property + def names(self) -> List[str]: + return [ + "count", + "mean", + "min", + *self.formatted_percentiles, + "max", + ] + + @property + def dtype(self) -> Optional[Dtype]: + return None + + +def _validate_dframe_size(df: FrameOrSeriesUnion) -> None: + """Validate correct size of dataframe.""" + if df.ndim == 2 and df.columns.size == 0: + raise ValueError("Cannot describe a DataFrame without columns") From 2fc029d93885f57bb28817509cfc1943e98dc8ee Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Sat, 3 Oct 2020 13:12:40 +0700 Subject: [PATCH 2/9] REF: split describer into series/dataframe --- pandas/core/generic.py | 6 +- pandas/io/formats/describe.py | 207 +++++++++++++++++++++++----------- 2 files changed, 144 insertions(+), 69 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 281a4933277e2..d54b29cd186e0 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -102,7 +102,7 @@ from pandas.core.window import Expanding, ExponentialMovingWindow, Rolling, Window from pandas.io.formats import format as fmt -from pandas.io.formats.describe import DataFrameDescriber +from pandas.io.formats.describe import describe_ndframe from pandas.io.formats.format import DataFrameFormatter from pandas.io.formats.printing import pprint_thing @@ -10102,13 +10102,13 @@ def describe( 75% NaN 2.5 max NaN 3.0 """ - describer = DataFrameDescriber( + return describe_ndframe( data=self, include=include, exclude=exclude, datetime_is_numeric=datetime_is_numeric, + percentiles=percentiles, ) - return describer.describe(percentiles) def pct_change( self: FrameOrSeries, diff --git a/pandas/io/formats/describe.py b/pandas/io/formats/describe.py index d3bb600a79b4f..8765ef4bb2332 100644 --- a/pandas/io/formats/describe.py +++ b/pandas/io/formats/describe.py @@ -23,21 +23,154 @@ from pandas import DataFrame, Series -class DataFrameDescriber: - """Class responsible for creating dataframe/series description. +def describe_ndframe( + *, + data: FrameOrSeries, + include: Optional[Union[str, Sequence[str]]], + exclude: Optional[Union[str, Sequence[str]]], + datetime_is_numeric: bool, + percentiles: Optional[Sequence[float]], +) -> FrameOrSeries: + """Describe series or dataframe. Called from pandas.core.generic.NDFrame.describe() Parameters ---------- data : FrameOrSeries - Dataframe or Series to be described. + Either dataframe or series. include : 'all', list-like of dtypes or None (default), optional A white list of data types to include in the result. Ignored for ``Series``. exclude : list-like of dtypes or None (default), optional, A black list of data types to omit from the result. Ignored for ``Series``. datetime_is_numeric : bool, default False Whether to treat datetime dtypes as numeric. + percentiles : list-like of numbers, optional + The percentiles to include in the output. All should + fall between 0 and 1. The default is + ``[.25, .5, .75]``, which returns the 25th, 50th, and + 75th percentiles. + + Returns + ------- + FrameOrSeries + Dataframe or series described. + """ + describer: "NDFrameDescriber" + if data.ndim == 1: + describer = SeriesDescriber( + data=data, + datetime_is_numeric=datetime_is_numeric, + ) + else: + describer = DataFrameDescriber( + data=data, + include=include, + exclude=exclude, + datetime_is_numeric=datetime_is_numeric, + ) + result = describer.describe(percentiles) + return cast(FrameOrSeries, result) + + +class StrategyCreatorMixin: + datetime_is_numeric: bool + + def create_strategy( + self, + series: "Series", + percentiles: Optional[Sequence[float]], + ) -> "StrategyAbstract": + """Create strategy instance for description.""" + klass = self._select_strategy(series) + return klass(series, percentiles) + + def _select_strategy(self, series: "Series") -> Type["StrategyAbstract"]: + """Select strategy for description.""" + strategy: Type[StrategyAbstract] = CategoricalStrategy + if is_bool_dtype(series.dtype): + strategy = CategoricalStrategy + elif is_numeric_dtype(series): + strategy = NumericStrategy + elif is_datetime64_any_dtype(series.dtype) and self.datetime_is_numeric: + strategy = TimestampStrategy + elif is_timedelta64_dtype(series.dtype): + strategy = NumericStrategy + + if strategy == CategoricalStrategy and is_datetime64_any_dtype(series.dtype): + strategy = TimestampAsCategoricalStrategy + warnings.warn( + "Treating datetime data as categorical rather than numeric in " + "`.describe` is deprecated and will be removed in a future " + "version of pandas. Specify `datetime_is_numeric=True` to " + "silence this warning and adopt the future behavior now.", + FutureWarning, + stacklevel=6, + ) + return strategy + + +class NDFrameDescriber(ABC): + """Abstract class for describing dataframe or series.""" + + @abstractmethod + def describe(self, percentiles: Optional[Sequence[float]]) -> FrameOrSeriesUnion: + pass + + +class SeriesDescriber(NDFrameDescriber, StrategyCreatorMixin): + """Class responsible for creating series description. + + Parameters + ---------- + data : FrameOrSeries + Dataframe or Series to be described. + datetime_is_numeric : bool, default False + Whether to treat datetime dtypes as numeric. + """ + + def __init__( + self, + *, + data: FrameOrSeries, + datetime_is_numeric: bool, + ): + self.data = data + self.datetime_is_numeric = datetime_is_numeric + + def describe(self, percentiles: Optional[Sequence[float]]) -> "Series": + """Do describe. + + Parameters + ---------- + percentiles : list-like of numbers, optional + The percentiles to include in the output. All should fall between 0 and 1. + The default is ``[.25, .5, .75]``, which returns the 25th, 50th, and + 75th percentiles. + + Returns + ------- + result : Series + """ + series = cast("Series", self.data) + strategy = self.create_strategy(series, percentiles) + result = strategy.describe() + return result + + +class DataFrameDescriber(NDFrameDescriber, StrategyCreatorMixin): + """Class responsible for creating dataframe description. + + Parameters + ---------- + data : FrameOrSeries + Dataframe or Series to be described. + include : 'all', list-like of dtypes or None (default), optional + A white list of data types to include in the result. + exclude : list-like of dtypes or None (default), optional, + A black list of data types to omit from the result. + datetime_is_numeric : bool, default False + Whether to treat datetime dtypes as numeric. """ def __init__( @@ -56,9 +189,6 @@ def __init__( def _initialize_data(self, data) -> FrameOrSeries: _validate_dframe_size(data) - if data.ndim == 1: - return data - if self.include is None and self.exclude is None: # when some numerics are found, keep only numerics include = [np.number] @@ -77,37 +207,8 @@ def _initialize_data(self, data) -> FrameOrSeries: else: return data.select_dtypes(include=self.include, exclude=self.exclude) - def _select_strategy( - self, - series: "Series", - percentiles: Optional[Sequence[float]], - ) -> "StrategyAbstract": - """Select strategy for description.""" - strategy: Type[StrategyAbstract] = CategoricalStrategy - if is_bool_dtype(series.dtype): - strategy = CategoricalStrategy - elif is_numeric_dtype(series): - strategy = NumericStrategy - elif is_datetime64_any_dtype(series.dtype) and self.datetime_is_numeric: - strategy = TimestampStrategy - elif is_timedelta64_dtype(series.dtype): - strategy = NumericStrategy - - if strategy == CategoricalStrategy and is_datetime64_any_dtype(series.dtype): - strategy = TimestampAsCategoricalStrategy - warnings.warn( - "Treating datetime data as categorical rather than numeric in " - "`.describe` is deprecated and will be removed in a future " - "version of pandas. Specify `datetime_is_numeric=True` to " - "silence this warning and adopt the future behavior now.", - FutureWarning, - stacklevel=5, - ) - - return strategy(series, percentiles) - - def describe(self, percentiles: Optional[Sequence[float]]) -> FrameOrSeries: - """Do describe + def describe(self, percentiles: Optional[Sequence[float]]) -> "DataFrame": + """Do describe. Parameters ---------- @@ -118,39 +219,13 @@ def describe(self, percentiles: Optional[Sequence[float]]) -> FrameOrSeries: Returns ------- - result : FrameOrSeries - Either dataframe (if ``self.data`` is dataframe) - or series (if ``self.data`` is series). + result : DataFrame """ - result: FrameOrSeriesUnion - if self.data.ndim == 1: - series = cast("Series", self.data) - result = self._describe_series(series, percentiles) - else: - dataframe = cast("DataFrame", self.data) - result = self._describe_dataframe(dataframe, percentiles) - return cast(FrameOrSeries, result) + dataframe = cast("DataFrame", self.data) - def _describe_series( - self, - series: "Series", - percentiles: Optional[Sequence[float]], - ) -> "Series": - """Describe series.""" - strategy = self._select_strategy(series, percentiles) - return strategy.describe() - - def _describe_dataframe( - self, - dataframe: "DataFrame", - percentiles: Optional[Sequence[float]], - ) -> "DataFrame": - """Describe dataframe by describing series and concating them together.""" ldesc: List["Series"] = [] for _, series in dataframe.items(): - # Could use _describe_series here to avoid code duplication, - # but there will be an error regarding warning stacklevel - strategy = self._select_strategy(series, percentiles) + strategy = self.create_strategy(series, percentiles) ldesc.append(strategy.describe()) df = pd.concat( From cc9de705d4173fceb427fcac481ae59d2abfba99 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Sat, 3 Oct 2020 21:10:16 +0700 Subject: [PATCH 3/9] REF: extract method _get_top_and_freq --- pandas/io/formats/describe.py | 116 ++++++++++++++++------------------ 1 file changed, 56 insertions(+), 60 deletions(-) diff --git a/pandas/io/formats/describe.py b/pandas/io/formats/describe.py index 8765ef4bb2332..79ba1f78da4c8 100644 --- a/pandas/io/formats/describe.py +++ b/pandas/io/formats/describe.py @@ -1,3 +1,12 @@ +"""Module responsible for execution of NDFrame.describe() method. + +Method NDFrame.describe() delegates actual execution to function describe_ndframe(). + +Strategy pattern is utilized. + - The appropriate strategy is selected based on the series datatype. + - The strategy is responsible for running proper description. +""" + from abc import ABC, abstractmethod from typing import TYPE_CHECKING, List, Optional, Sequence, Type, Union, cast import warnings @@ -46,9 +55,8 @@ def describe_ndframe( datetime_is_numeric : bool, default False Whether to treat datetime dtypes as numeric. percentiles : list-like of numbers, optional - The percentiles to include in the output. All should - fall between 0 and 1. The default is - ``[.25, .5, .75]``, which returns the 25th, 50th, and + The percentiles to include in the output. All should fall between 0 and 1. + The default is ``[.25, .5, .75]``, which returns the 25th, 50th, and 75th percentiles. Returns @@ -74,6 +82,8 @@ def describe_ndframe( class StrategyCreatorMixin: + """Mixin for creating instance of appropriate strategy for describing series.""" + datetime_is_numeric: bool def create_strategy( @@ -115,7 +125,15 @@ class NDFrameDescriber(ABC): @abstractmethod def describe(self, percentiles: Optional[Sequence[float]]) -> FrameOrSeriesUnion: - pass + """Do describe either series or dataframe. + + Parameters + ---------- + percentiles : list-like of numbers, optional + The percentiles to include in the output. All should fall between 0 and 1. + The default is ``[.25, .5, .75]``, which returns the 25th, 50th, and + 75th percentiles. + """ class SeriesDescriber(NDFrameDescriber, StrategyCreatorMixin): @@ -139,19 +157,7 @@ def __init__( self.datetime_is_numeric = datetime_is_numeric def describe(self, percentiles: Optional[Sequence[float]]) -> "Series": - """Do describe. - - Parameters - ---------- - percentiles : list-like of numbers, optional - The percentiles to include in the output. All should fall between 0 and 1. - The default is ``[.25, .5, .75]``, which returns the 25th, 50th, and - 75th percentiles. - - Returns - ------- - result : Series - """ + """Do describe series.""" series = cast("Series", self.data) strategy = self.create_strategy(series, percentiles) result = strategy.describe() @@ -208,19 +214,7 @@ def _initialize_data(self, data) -> FrameOrSeries: return data.select_dtypes(include=self.include, exclude=self.exclude) def describe(self, percentiles: Optional[Sequence[float]]) -> "DataFrame": - """Do describe. - - Parameters - ---------- - percentiles : list-like of numbers, optional - The percentiles to include in the output. All should fall between 0 and 1. - The default is ``[.25, .5, .75]``, which returns the 25th, 50th, and - 75th percentiles. - - Returns - ------- - result : DataFrame - """ + """Do describe dataframe.""" dataframe = cast("DataFrame", self.data) ldesc: List["Series"] = [] @@ -322,36 +316,38 @@ def __init__(self, data, percentiles): self.objcounts = self.data.value_counts() @property - def count(self) -> "Series": - return self.data.count() - - @property - def count_unique(self) -> int: - return len(self.objcounts[self.objcounts != 0]) + def array(self) -> List[object]: + top, freq = self._get_top_and_freq() + return [ + self.count, + self.count_unique, + top, + freq, + ] @property def names(self) -> List[str]: return ["count", "unique", "top", "freq"] - @property - def array(self) -> List[object]: - result = [self.count, self.count_unique] - if self.count_unique > 0: - top, freq = self.objcounts.index[0], self.objcounts.iloc[0] - result += [top, freq] - - # If the DataFrame is empty, set 'top' and 'freq' to None - # to maintain output shape consistency - else: - result += [np.nan, np.nan] - return result - @property def dtype(self) -> Optional[Dtype]: if self.count_unique == 0: return "object" return None + @property + def count(self) -> "Series": + return self.data.count() + + @property + def count_unique(self) -> int: + return len(self.objcounts[self.objcounts != 0]) + + def _get_top_and_freq(self): + if self.count_unique > 0: + return self.objcounts.index[0], self.objcounts.iloc[0] + return np.nan, np.nan + class TimestampAsCategoricalStrategy(CategoricalStrategy): """Strategy for series with timestamp values treated as categorical values.""" @@ -394,17 +390,6 @@ def names(self) -> List[str]: class NumericStrategy(StrategyAbstract): """Strategy for series with numeric values.""" - @property - def names(self) -> List[str]: - return [ - "count", - "mean", - "std", - "min", - *self.formatted_percentiles, - "max", - ] - @property def array(self) -> List[object]: return [ @@ -416,6 +401,17 @@ def array(self) -> List[object]: self.data.max(), ] + @property + def names(self) -> List[str]: + return [ + "count", + "mean", + "std", + "min", + *self.formatted_percentiles, + "max", + ] + @property def dtype(self) -> Optional[Dtype]: return None From 6cc722408a0ff0364cda42a261682bdbc2117a34 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Sat, 3 Oct 2020 21:14:39 +0700 Subject: [PATCH 4/9] REF: simplify logic in _initialize_data --- pandas/io/formats/describe.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/io/formats/describe.py b/pandas/io/formats/describe.py index 79ba1f78da4c8..fec2aeca8eb35 100644 --- a/pandas/io/formats/describe.py +++ b/pandas/io/formats/describe.py @@ -205,13 +205,14 @@ def _initialize_data(self, data) -> FrameOrSeries: return data else: return truncated - elif self.include == "all": + + if self.include == "all": if self.exclude is not None: msg = "exclude must be None when include is 'all'" raise ValueError(msg) return data - else: - return data.select_dtypes(include=self.include, exclude=self.exclude) + + return data.select_dtypes(include=self.include, exclude=self.exclude) def describe(self, percentiles: Optional[Sequence[float]]) -> "DataFrame": """Do describe dataframe.""" From 4e30a8e0aab12e2107894cbcecf649f17c490351 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Sat, 3 Oct 2020 21:22:01 +0700 Subject: [PATCH 5/9] TYP: clear typing for Series vs DataFrame --- pandas/io/formats/describe.py | 39 +++++++++++++++++------------------ 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/pandas/io/formats/describe.py b/pandas/io/formats/describe.py index fec2aeca8eb35..7815407e09d4c 100644 --- a/pandas/io/formats/describe.py +++ b/pandas/io/formats/describe.py @@ -62,17 +62,19 @@ def describe_ndframe( Returns ------- FrameOrSeries - Dataframe or series described. + Dataframe or series description. """ describer: "NDFrameDescriber" if data.ndim == 1: + series = cast("Series", data) describer = SeriesDescriber( - data=data, + data=series, datetime_is_numeric=datetime_is_numeric, ) else: + dataframe = cast("DataFrame", data) describer = DataFrameDescriber( - data=data, + data=dataframe, include=include, exclude=exclude, datetime_is_numeric=datetime_is_numeric, @@ -141,8 +143,8 @@ class SeriesDescriber(NDFrameDescriber, StrategyCreatorMixin): Parameters ---------- - data : FrameOrSeries - Dataframe or Series to be described. + data : Series + Series to be described. datetime_is_numeric : bool, default False Whether to treat datetime dtypes as numeric. """ @@ -150,7 +152,7 @@ class SeriesDescriber(NDFrameDescriber, StrategyCreatorMixin): def __init__( self, *, - data: FrameOrSeries, + data: "Series", datetime_is_numeric: bool, ): self.data = data @@ -158,8 +160,7 @@ def __init__( def describe(self, percentiles: Optional[Sequence[float]]) -> "Series": """Do describe series.""" - series = cast("Series", self.data) - strategy = self.create_strategy(series, percentiles) + strategy = self.create_strategy(self.data, percentiles) result = strategy.describe() return result @@ -169,8 +170,8 @@ class DataFrameDescriber(NDFrameDescriber, StrategyCreatorMixin): Parameters ---------- - data : FrameOrSeries - Dataframe or Series to be described. + data : DataFrame + Dataframe to be described. include : 'all', list-like of dtypes or None (default), optional A white list of data types to include in the result. exclude : list-like of dtypes or None (default), optional, @@ -182,7 +183,7 @@ class DataFrameDescriber(NDFrameDescriber, StrategyCreatorMixin): def __init__( self, *, - data: FrameOrSeries, + data: "DataFrame", include: Optional[Union[str, Sequence[str]]], exclude: Optional[Union[str, Sequence[str]]], datetime_is_numeric: bool, @@ -190,9 +191,9 @@ def __init__( self.include = include self.exclude = exclude self.datetime_is_numeric = datetime_is_numeric - self.data: FrameOrSeries = self._initialize_data(data) + self.data: "DataFrame" = self._initialize_data(data) - def _initialize_data(self, data) -> FrameOrSeries: + def _initialize_data(self, data) -> "DataFrame": _validate_dframe_size(data) if self.include is None and self.exclude is None: @@ -200,11 +201,11 @@ def _initialize_data(self, data) -> FrameOrSeries: include = [np.number] if self.datetime_is_numeric: include.append("datetime") - truncated = data.select_dtypes(include=include) - if len(truncated.columns) == 0: + numeric_only = data.select_dtypes(include=include) + if len(numeric_only.columns) == 0: return data else: - return truncated + return numeric_only if self.include == "all": if self.exclude is not None: @@ -216,10 +217,8 @@ def _initialize_data(self, data) -> FrameOrSeries: def describe(self, percentiles: Optional[Sequence[float]]) -> "DataFrame": """Do describe dataframe.""" - dataframe = cast("DataFrame", self.data) - ldesc: List["Series"] = [] - for _, series in dataframe.items(): + for _, series in self.data.items(): strategy = self.create_strategy(series, percentiles) ldesc.append(strategy.describe()) @@ -228,7 +227,7 @@ def describe(self, percentiles: Optional[Sequence[float]]) -> "DataFrame": axis=1, sort=False, ) - df.columns = dataframe.columns.copy() + df.columns = self.data.columns.copy() return cast("DataFrame", df) def _reindex_columns(self, column_data) -> List["Series"]: From 3916fcda081b3cf2903f9a071281541352a677fd Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Sat, 3 Oct 2020 21:28:35 +0700 Subject: [PATCH 6/9] REF: extract method _extract_numeric_data --- pandas/io/formats/describe.py | 61 +++++++++++++++++++++-------------- 1 file changed, 37 insertions(+), 24 deletions(-) diff --git a/pandas/io/formats/describe.py b/pandas/io/formats/describe.py index 7815407e09d4c..2dc7a954821cd 100644 --- a/pandas/io/formats/describe.py +++ b/pandas/io/formats/describe.py @@ -8,7 +8,17 @@ """ from abc import ABC, abstractmethod -from typing import TYPE_CHECKING, List, Optional, Sequence, Type, Union, cast +from typing import ( + TYPE_CHECKING, + Any, + List, + Optional, + Sequence, + Tuple, + Type, + Union, + cast, +) import warnings import numpy as np @@ -193,28 +203,6 @@ def __init__( self.datetime_is_numeric = datetime_is_numeric self.data: "DataFrame" = self._initialize_data(data) - def _initialize_data(self, data) -> "DataFrame": - _validate_dframe_size(data) - - if self.include is None and self.exclude is None: - # when some numerics are found, keep only numerics - include = [np.number] - if self.datetime_is_numeric: - include.append("datetime") - numeric_only = data.select_dtypes(include=include) - if len(numeric_only.columns) == 0: - return data - else: - return numeric_only - - if self.include == "all": - if self.exclude is not None: - msg = "exclude must be None when include is 'all'" - raise ValueError(msg) - return data - - return data.select_dtypes(include=self.include, exclude=self.exclude) - def describe(self, percentiles: Optional[Sequence[float]]) -> "DataFrame": """Do describe dataframe.""" ldesc: List["Series"] = [] @@ -240,6 +228,31 @@ def _reindex_columns(self, column_data) -> List["Series"]: names.append(name) return [x.reindex(names, copy=False) for x in column_data] + def _initialize_data(self, data: "DataFrame") -> "DataFrame": + _validate_dframe_size(data) + + if self.include is None and self.exclude is None: + return self._extract_numeric_data(data) + + if self.include == "all": + if self.exclude is not None: + msg = "exclude must be None when include is 'all'" + raise ValueError(msg) + return data + + return data.select_dtypes(include=self.include, exclude=self.exclude) + + def _extract_numeric_data(self, data: "DataFrame") -> "DataFrame": + """When some numerics are found, keep only numerics.""" + include = [np.number] + if self.datetime_is_numeric: + include.append("datetime") + numeric_only = data.select_dtypes(include=include) + if len(numeric_only.columns) == 0: + return data + else: + return numeric_only + class StrategyAbstract(ABC): """Abstract strategy for describing series.""" @@ -343,7 +356,7 @@ def count(self) -> "Series": def count_unique(self) -> int: return len(self.objcounts[self.objcounts != 0]) - def _get_top_and_freq(self): + def _get_top_and_freq(self) -> Tuple[Any, Any]: if self.count_unique > 0: return self.objcounts.index[0], self.objcounts.iloc[0] return np.nan, np.nan From 464ba1ef3d21947641cb6436daf1f5243cbe720a Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Wed, 7 Oct 2020 03:10:40 +0700 Subject: [PATCH 7/9] REF: replace import as pd with concrete imports --- pandas/io/formats/describe.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/io/formats/describe.py b/pandas/io/formats/describe.py index 2dc7a954821cd..d9fa236c54899 100644 --- a/pandas/io/formats/describe.py +++ b/pandas/io/formats/describe.py @@ -34,7 +34,7 @@ is_timedelta64_dtype, ) -import pandas as pd +from pandas.core.reshape.concat import concat from pandas.io.formats.format import format_percentiles @@ -210,7 +210,7 @@ def describe(self, percentiles: Optional[Sequence[float]]) -> "DataFrame": strategy = self.create_strategy(series, percentiles) ldesc.append(strategy.describe()) - df = pd.concat( + df = concat( self._reindex_columns(ldesc), axis=1, sort=False, @@ -267,7 +267,9 @@ def __init__( def describe(self) -> "Series": """Describe series.""" - return pd.Series( + from pandas.core.series import Series + + return Series( self.array, index=self.names, name=self.data.name, From 90bafe04aadc2b08b7d37cb79224c2bc27cbd443 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Wed, 7 Oct 2020 03:19:32 +0700 Subject: [PATCH 8/9] TST: add test for covering exclude is not None --- pandas/tests/frame/methods/test_describe.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/pandas/tests/frame/methods/test_describe.py b/pandas/tests/frame/methods/test_describe.py index 0b70bead375da..496b05dec6dff 100644 --- a/pandas/tests/frame/methods/test_describe.py +++ b/pandas/tests/frame/methods/test_describe.py @@ -1,4 +1,5 @@ import numpy as np +import pytest import pandas as pd from pandas import Categorical, DataFrame, Series, Timestamp, date_range @@ -332,6 +333,16 @@ def test_describe_tz_values2(self): result = df.describe(include="all") tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("exclude", ["x", "y", ["x", "y"], ["x", "z"]]) + def test_describe_when_include_all_exclude_not_allowed(self, exclude): + """ + When include is 'all', then setting exclude != None is not allowed. + """ + df = pd.DataFrame({"x": [1], "y": [2], "z": [3]}) + msg = "exclude must be None when include is 'all'" + with pytest.raises(ValueError, match=msg): + df.describe(include="all", exclude=exclude) + def test_describe_percentiles_integer_idx(self): # GH#26660 df = pd.DataFrame({"x": [1]}) From 574d22fc6f86538bb8ad7a9fa56d6e2974321947 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov Date: Thu, 22 Oct 2020 03:20:57 +0700 Subject: [PATCH 9/9] CLN: pd.DataFrame -> DataFrame --- pandas/tests/frame/methods/test_describe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/frame/methods/test_describe.py b/pandas/tests/frame/methods/test_describe.py index 1a311bfb7a620..986e4333ccbc4 100644 --- a/pandas/tests/frame/methods/test_describe.py +++ b/pandas/tests/frame/methods/test_describe.py @@ -338,7 +338,7 @@ def test_describe_when_include_all_exclude_not_allowed(self, exclude): """ When include is 'all', then setting exclude != None is not allowed. """ - df = pd.DataFrame({"x": [1], "y": [2], "z": [3]}) + df = DataFrame({"x": [1], "y": [2], "z": [3]}) msg = "exclude must be None when include is 'all'" with pytest.raises(ValueError, match=msg): df.describe(include="all", exclude=exclude)