From 50185f4acf199f142bdf179918058022706e1455 Mon Sep 17 00:00:00 2001
From: Maxim Ivanov <ivanovmg@gmail.com>
Date: Fri, 2 Oct 2020 22:54:47 +0700
Subject: [PATCH 1/9] REF: refactor describe

---
 pandas/core/generic.py        | 153 +-------------
 pandas/io/formats/describe.py | 380 ++++++++++++++++++++++++++++++++++
 2 files changed, 390 insertions(+), 143 deletions(-)
 create mode 100644 pandas/io/formats/describe.py

diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 6f0aa70625c1d..281a4933277e2 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -56,11 +56,7 @@
 from pandas.compat.numpy import function as nv
 from pandas.errors import AbstractMethodError, InvalidIndexError
 from pandas.util._decorators import Appender, doc, rewrite_axis_style_signature
-from pandas.util._validators import (
-    validate_bool_kwarg,
-    validate_fillna_kwargs,
-    validate_percentile,
-)
+from pandas.util._validators import validate_bool_kwarg, validate_fillna_kwargs
 
 from pandas.core.dtypes.common import (
     ensure_int64,
@@ -106,7 +102,8 @@
 from pandas.core.window import Expanding, ExponentialMovingWindow, Rolling, Window
 
 from pandas.io.formats import format as fmt
-from pandas.io.formats.format import DataFrameFormatter, format_percentiles
+from pandas.io.formats.describe import DataFrameDescriber
+from pandas.io.formats.format import DataFrameFormatter
 from pandas.io.formats.printing import pprint_thing
 
 if TYPE_CHECKING:
@@ -10105,143 +10102,13 @@ def describe(
         75%            NaN      2.5
         max            NaN      3.0
         """
-        if self.ndim == 2 and self.columns.size == 0:
-            raise ValueError("Cannot describe a DataFrame without columns")
-
-        if percentiles is not None:
-            # explicit conversion of `percentiles` to list
-            percentiles = list(percentiles)
-
-            # get them all to be in [0, 1]
-            validate_percentile(percentiles)
-
-            # median should always be included
-            if 0.5 not in percentiles:
-                percentiles.append(0.5)
-            percentiles = np.asarray(percentiles)
-        else:
-            percentiles = np.array([0.25, 0.5, 0.75])
-
-        # sort and check for duplicates
-        unique_pcts = np.unique(percentiles)
-        if len(unique_pcts) < len(percentiles):
-            raise ValueError("percentiles cannot contain duplicates")
-        percentiles = unique_pcts
-
-        formatted_percentiles = format_percentiles(percentiles)
-
-        def describe_numeric_1d(series):
-            stat_index = (
-                ["count", "mean", "std", "min"] + formatted_percentiles + ["max"]
-            )
-            d = (
-                [series.count(), series.mean(), series.std(), series.min()]
-                + series.quantile(percentiles).tolist()
-                + [series.max()]
-            )
-            return pd.Series(d, index=stat_index, name=series.name)
-
-        def describe_categorical_1d(data):
-            names = ["count", "unique"]
-            objcounts = data.value_counts()
-            count_unique = len(objcounts[objcounts != 0])
-            result = [data.count(), count_unique]
-            dtype = None
-            if result[1] > 0:
-                top, freq = objcounts.index[0], objcounts.iloc[0]
-                if is_datetime64_any_dtype(data.dtype):
-                    if self.ndim == 1:
-                        stacklevel = 4
-                    else:
-                        stacklevel = 5
-                    warnings.warn(
-                        "Treating datetime data as categorical rather than numeric in "
-                        "`.describe` is deprecated and will be removed in a future "
-                        "version of pandas. Specify `datetime_is_numeric=True` to "
-                        "silence this warning and adopt the future behavior now.",
-                        FutureWarning,
-                        stacklevel=stacklevel,
-                    )
-                    tz = data.dt.tz
-                    asint = data.dropna().values.view("i8")
-                    top = Timestamp(top)
-                    if top.tzinfo is not None and tz is not None:
-                        # Don't tz_localize(None) if key is already tz-aware
-                        top = top.tz_convert(tz)
-                    else:
-                        top = top.tz_localize(tz)
-                    names += ["top", "freq", "first", "last"]
-                    result += [
-                        top,
-                        freq,
-                        Timestamp(asint.min(), tz=tz),
-                        Timestamp(asint.max(), tz=tz),
-                    ]
-                else:
-                    names += ["top", "freq"]
-                    result += [top, freq]
-
-            # If the DataFrame is empty, set 'top' and 'freq' to None
-            # to maintain output shape consistency
-            else:
-                names += ["top", "freq"]
-                result += [np.nan, np.nan]
-                dtype = "object"
-
-            return pd.Series(result, index=names, name=data.name, dtype=dtype)
-
-        def describe_timestamp_1d(data):
-            # GH-30164
-            stat_index = ["count", "mean", "min"] + formatted_percentiles + ["max"]
-            d = (
-                [data.count(), data.mean(), data.min()]
-                + data.quantile(percentiles).tolist()
-                + [data.max()]
-            )
-            return pd.Series(d, index=stat_index, name=data.name)
-
-        def describe_1d(data):
-            if is_bool_dtype(data.dtype):
-                return describe_categorical_1d(data)
-            elif is_numeric_dtype(data):
-                return describe_numeric_1d(data)
-            elif is_datetime64_any_dtype(data.dtype) and datetime_is_numeric:
-                return describe_timestamp_1d(data)
-            elif is_timedelta64_dtype(data.dtype):
-                return describe_numeric_1d(data)
-            else:
-                return describe_categorical_1d(data)
-
-        if self.ndim == 1:
-            return describe_1d(self)
-        elif (include is None) and (exclude is None):
-            # when some numerics are found, keep only numerics
-            default_include = [np.number]
-            if datetime_is_numeric:
-                default_include.append("datetime")
-            data = self.select_dtypes(include=default_include)
-            if len(data.columns) == 0:
-                data = self
-        elif include == "all":
-            if exclude is not None:
-                msg = "exclude must be None when include is 'all'"
-                raise ValueError(msg)
-            data = self
-        else:
-            data = self.select_dtypes(include=include, exclude=exclude)
-
-        ldesc = [describe_1d(s) for _, s in data.items()]
-        # set a convenient order for rows
-        names: List[Label] = []
-        ldesc_indexes = sorted((x.index for x in ldesc), key=len)
-        for idxnames in ldesc_indexes:
-            for name in idxnames:
-                if name not in names:
-                    names.append(name)
-
-        d = pd.concat([x.reindex(names, copy=False) for x in ldesc], axis=1, sort=False)
-        d.columns = data.columns.copy()
-        return d
+        describer = DataFrameDescriber(
+            data=self,
+            include=include,
+            exclude=exclude,
+            datetime_is_numeric=datetime_is_numeric,
+        )
+        return describer.describe(percentiles)
 
     def pct_change(
         self: FrameOrSeries,
diff --git a/pandas/io/formats/describe.py b/pandas/io/formats/describe.py
new file mode 100644
index 0000000000000..d3bb600a79b4f
--- /dev/null
+++ b/pandas/io/formats/describe.py
@@ -0,0 +1,380 @@
+from abc import ABC, abstractmethod
+from typing import TYPE_CHECKING, List, Optional, Sequence, Type, Union, cast
+import warnings
+
+import numpy as np
+
+from pandas._libs.tslibs import Timestamp
+from pandas._typing import Dtype, FrameOrSeries, FrameOrSeriesUnion, Label
+from pandas.util._validators import validate_percentile
+
+from pandas.core.dtypes.common import (
+    is_bool_dtype,
+    is_datetime64_any_dtype,
+    is_numeric_dtype,
+    is_timedelta64_dtype,
+)
+
+import pandas as pd
+
+from pandas.io.formats.format import format_percentiles
+
+if TYPE_CHECKING:
+    from pandas import DataFrame, Series
+
+
+class DataFrameDescriber:
+    """Class responsible for creating dataframe/series description.
+
+    Called from pandas.core.generic.NDFrame.describe()
+
+    Parameters
+    ----------
+    data : FrameOrSeries
+        Dataframe or Series to be described.
+    include : 'all', list-like of dtypes or None (default), optional
+        A white list of data types to include in the result. Ignored for ``Series``.
+    exclude : list-like of dtypes or None (default), optional,
+        A black list of data types to omit from the result. Ignored for ``Series``.
+    datetime_is_numeric : bool, default False
+        Whether to treat datetime dtypes as numeric.
+    """
+
+    def __init__(
+        self,
+        *,
+        data: FrameOrSeries,
+        include: Optional[Union[str, Sequence[str]]],
+        exclude: Optional[Union[str, Sequence[str]]],
+        datetime_is_numeric: bool,
+    ):
+        self.include = include
+        self.exclude = exclude
+        self.datetime_is_numeric = datetime_is_numeric
+        self.data: FrameOrSeries = self._initialize_data(data)
+
+    def _initialize_data(self, data) -> FrameOrSeries:
+        _validate_dframe_size(data)
+
+        if data.ndim == 1:
+            return data
+
+        if self.include is None and self.exclude is None:
+            # when some numerics are found, keep only numerics
+            include = [np.number]
+            if self.datetime_is_numeric:
+                include.append("datetime")
+            truncated = data.select_dtypes(include=include)
+            if len(truncated.columns) == 0:
+                return data
+            else:
+                return truncated
+        elif self.include == "all":
+            if self.exclude is not None:
+                msg = "exclude must be None when include is 'all'"
+                raise ValueError(msg)
+            return data
+        else:
+            return data.select_dtypes(include=self.include, exclude=self.exclude)
+
+    def _select_strategy(
+        self,
+        series: "Series",
+        percentiles: Optional[Sequence[float]],
+    ) -> "StrategyAbstract":
+        """Select strategy for description."""
+        strategy: Type[StrategyAbstract] = CategoricalStrategy
+        if is_bool_dtype(series.dtype):
+            strategy = CategoricalStrategy
+        elif is_numeric_dtype(series):
+            strategy = NumericStrategy
+        elif is_datetime64_any_dtype(series.dtype) and self.datetime_is_numeric:
+            strategy = TimestampStrategy
+        elif is_timedelta64_dtype(series.dtype):
+            strategy = NumericStrategy
+
+        if strategy == CategoricalStrategy and is_datetime64_any_dtype(series.dtype):
+            strategy = TimestampAsCategoricalStrategy
+            warnings.warn(
+                "Treating datetime data as categorical rather than numeric in "
+                "`.describe` is deprecated and will be removed in a future "
+                "version of pandas. Specify `datetime_is_numeric=True` to "
+                "silence this warning and adopt the future behavior now.",
+                FutureWarning,
+                stacklevel=5,
+            )
+
+        return strategy(series, percentiles)
+
+    def describe(self, percentiles: Optional[Sequence[float]]) -> FrameOrSeries:
+        """Do describe
+
+        Parameters
+        ----------
+        percentiles : list-like of numbers, optional
+            The percentiles to include in the output. All should fall between 0 and 1.
+            The default is ``[.25, .5, .75]``, which returns the 25th, 50th, and
+            75th percentiles.
+
+        Returns
+        -------
+        result : FrameOrSeries
+            Either dataframe (if ``self.data`` is dataframe)
+            or series (if ``self.data`` is series).
+        """
+        result: FrameOrSeriesUnion
+        if self.data.ndim == 1:
+            series = cast("Series", self.data)
+            result = self._describe_series(series, percentiles)
+        else:
+            dataframe = cast("DataFrame", self.data)
+            result = self._describe_dataframe(dataframe, percentiles)
+        return cast(FrameOrSeries, result)
+
+    def _describe_series(
+        self,
+        series: "Series",
+        percentiles: Optional[Sequence[float]],
+    ) -> "Series":
+        """Describe series."""
+        strategy = self._select_strategy(series, percentiles)
+        return strategy.describe()
+
+    def _describe_dataframe(
+        self,
+        dataframe: "DataFrame",
+        percentiles: Optional[Sequence[float]],
+    ) -> "DataFrame":
+        """Describe dataframe by describing series and concating them together."""
+        ldesc: List["Series"] = []
+        for _, series in dataframe.items():
+            # Could use _describe_series here to avoid code duplication,
+            # but there will be an error regarding warning stacklevel
+            strategy = self._select_strategy(series, percentiles)
+            ldesc.append(strategy.describe())
+
+        df = pd.concat(
+            self._reindex_columns(ldesc),
+            axis=1,
+            sort=False,
+        )
+        df.columns = dataframe.columns.copy()
+        return cast("DataFrame", df)
+
+    def _reindex_columns(self, column_data) -> List["Series"]:
+        """Reindex every description onto one shared, convenient row order."""
+        row_order: List[Label] = []
+        # sorted() is stable: labels of the shortest descriptions come first
+        for index in sorted((desc.index for desc in column_data), key=len):
+            for label in index:
+                if label not in row_order:
+                    row_order.append(label)
+        return [desc.reindex(row_order, copy=False) for desc in column_data]
+
+
+class StrategyAbstract(ABC):
+    """Base class turning one series into its description (array/names/dtype)."""
+
+    def __init__(
+        self,
+        data: "Series",
+        percentiles: Optional[Sequence[float]],
+    ):
+        self.data = data
+        self.percentiles = self._initialize_percentiles(percentiles)
+
+    def describe(self) -> "Series":
+        """Build the description from ``array``, ``names`` and ``dtype``."""
+        return pd.Series(
+            self.array,
+            index=self.names,
+            name=self.data.name,
+            dtype=self.dtype,
+        )
+
+    @property
+    @abstractmethod
+    def array(self) -> List[object]:
+        """Values of the description; must have one entry per label in ``names``."""
+
+    @property
+    @abstractmethod
+    def names(self) -> List[str]:
+        """Index labels of the description."""
+
+    @property
+    @abstractmethod
+    def dtype(self) -> Optional[Dtype]:
+        """Dtype handed to the ``pd.Series`` constructor in ``describe``."""
+
+    @property
+    def formatted_percentiles(self) -> List[str]:
+        """Percentile labels as produced by ``format_percentiles``."""
+        return format_percentiles(self.percentiles)
+
+    @staticmethod
+    def _initialize_percentiles(
+        percentiles: Optional[Sequence[float]],
+    ) -> Sequence[float]:
+        if percentiles is None:
+            return np.array([0.25, 0.5, 0.75])
+
+        # copy into a new list so the append below never mutates the caller's input
+        percentiles = list(percentiles)
+
+        # get them all to be in [0, 1]
+        validate_percentile(percentiles)
+
+        # median should always be included
+        if 0.5 not in percentiles:
+            percentiles.append(0.5)
+        percentiles = np.asarray(percentiles)
+
+        # np.unique sorts; a shorter result than the input means duplicates
+        unique_pcts = np.unique(percentiles)
+        assert percentiles is not None
+        if len(unique_pcts) < len(percentiles):
+            raise ValueError("percentiles cannot contain duplicates")
+        return unique_pcts
+
+
+class CategoricalStrategy(StrategyAbstract):
+    """Strategy for categorical-like series: count, unique, top, freq."""
+
+    def __init__(self, data, percentiles):
+        # the base initializer stores ``data`` and normalizes ``percentiles``
+        super().__init__(data, percentiles)
+        self.objcounts = self.data.value_counts()
+
+    @property
+    def count(self) -> int:
+        return self.data.count()
+
+    @property
+    def count_unique(self) -> int:
+        return len(self.objcounts[self.objcounts != 0])
+
+    @property
+    def names(self) -> List[str]:
+        return ["count", "unique", "top", "freq"]
+
+    @property
+    def array(self) -> List[object]:
+        result = [self.count, self.count_unique]
+        if self.count_unique > 0:
+            top, freq = self.objcounts.index[0], self.objcounts.iloc[0]
+            result += [top, freq]
+
+        # No observed values: fill 'top' and 'freq' with NaN
+        # to maintain output shape consistency
+        else:
+            result += [np.nan, np.nan]
+        return result
+
+    @property
+    def dtype(self) -> Optional[Dtype]:
+        if self.count_unique == 0:
+            return "object"
+        return None
+
+
+class TimestampAsCategoricalStrategy(CategoricalStrategy):
+    """Strategy for series with timestamp values treated as categorical values."""
+
+    @property
+    def array(self) -> List[object]:
+        result = [self.count, self.count_unique]
+        if self.count_unique > 0:
+            top, freq = self.objcounts.index[0], self.objcounts.iloc[0]
+            tz = self.data.dt.tz
+            asint = self.data.dropna().values.view("i8")
+            top = Timestamp(top)
+            if top.tzinfo is not None and tz is not None:
+                # Don't tz_localize(None) if key is already tz-aware
+                top = top.tz_convert(tz)
+            else:
+                top = top.tz_localize(tz)
+
+            result += [
+                top,
+                freq,
+                Timestamp(asint.min(), tz=tz),
+                Timestamp(asint.max(), tz=tz),
+            ]
+
+        # No observed values: fill 'top' and 'freq' with NaN
+        # so that ``array`` stays aligned with ``names``
+        else:
+            result += [np.nan, np.nan]
+        return result
+
+    @property
+    def names(self) -> List[str]:
+        names = ["count", "unique", "top", "freq"]
+        if self.count_unique > 0:
+            names += ["first", "last"]
+        return names
+
+
+class NumericStrategy(StrategyAbstract):
+    """Strategy for numeric series: count, mean, std, min, percentiles, max."""
+
+    @property
+    def names(self) -> List[str]:
+        """Return the row labels.
+
+        The formatted percentile labels are placed between ``min`` and ``max``.
+        """
+        leading = ["count", "mean", "std", "min"]
+        labels = [*leading, *self.formatted_percentiles, "max"]
+        return labels
+
+    @property
+    def array(self) -> List[object]:
+        """Return the statistics, ordered like ``names``."""
+        series = self.data
+        stats = [series.count(), series.mean(), series.std(), series.min()]
+        # quantile() is evaluated at every requested percentile
+        stats.extend(series.quantile(self.percentiles).tolist())
+        # ``max`` always closes the description
+        stats.append(series.max())
+        return stats
+
+    @property
+    def dtype(self) -> Optional[Dtype]:
+        """Return None so that the result dtype is inferred."""
+        return None
+
+
+class TimestampStrategy(StrategyAbstract):
+    """Strategy for datetime series described numerically (no ``std`` row)."""
+
+    @property
+    def array(self) -> List[object]:
+        """Return the statistics, ordered like ``names``."""
+        series = self.data
+        stats = [series.count(), series.mean(), series.min()]
+        # quantile() is evaluated at every requested percentile
+        stats.extend(series.quantile(self.percentiles).tolist())
+        stats.append(series.max())
+        return stats
+
+    @property
+    def names(self) -> List[str]:
+        """Return the row labels.
+
+        The formatted percentile labels are placed between ``min`` and ``max``.
+        """
+        leading = ["count", "mean", "min"]
+        return [*leading, *self.formatted_percentiles, "max"]
+
+    @property
+    def dtype(self) -> Optional[Dtype]:
+        """Return None so that the result dtype is inferred."""
+        return None
+
+
+def _validate_dframe_size(df: FrameOrSeriesUnion) -> None:
+    """Raise ValueError if ``df`` is 2-dimensional and has no columns."""
+    if df.ndim == 2 and df.columns.size == 0:
+        raise ValueError("Cannot describe a DataFrame without columns")

From 2fc029d93885f57bb28817509cfc1943e98dc8ee Mon Sep 17 00:00:00 2001
From: Maxim Ivanov <ivanovmg@gmail.com>
Date: Sat, 3 Oct 2020 13:12:40 +0700
Subject: [PATCH 2/9] REF: split describer into series/dataframe

---
 pandas/core/generic.py        |   6 +-
 pandas/io/formats/describe.py | 207 +++++++++++++++++++++++-----------
 2 files changed, 144 insertions(+), 69 deletions(-)

diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 281a4933277e2..d54b29cd186e0 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -102,7 +102,7 @@
 from pandas.core.window import Expanding, ExponentialMovingWindow, Rolling, Window
 
 from pandas.io.formats import format as fmt
-from pandas.io.formats.describe import DataFrameDescriber
+from pandas.io.formats.describe import describe_ndframe
 from pandas.io.formats.format import DataFrameFormatter
 from pandas.io.formats.printing import pprint_thing
 
@@ -10102,13 +10102,13 @@ def describe(
         75%            NaN      2.5
         max            NaN      3.0
         """
-        describer = DataFrameDescriber(
+        return describe_ndframe(
             data=self,
             include=include,
             exclude=exclude,
             datetime_is_numeric=datetime_is_numeric,
+            percentiles=percentiles,
         )
-        return describer.describe(percentiles)
 
     def pct_change(
         self: FrameOrSeries,
diff --git a/pandas/io/formats/describe.py b/pandas/io/formats/describe.py
index d3bb600a79b4f..8765ef4bb2332 100644
--- a/pandas/io/formats/describe.py
+++ b/pandas/io/formats/describe.py
@@ -23,21 +23,154 @@
     from pandas import DataFrame, Series
 
 
-class DataFrameDescriber:
-    """Class responsible for creating dataframe/series description.
+def describe_ndframe(
+    *,
+    data: FrameOrSeries,
+    include: Optional[Union[str, Sequence[str]]],
+    exclude: Optional[Union[str, Sequence[str]]],
+    datetime_is_numeric: bool,
+    percentiles: Optional[Sequence[float]],
+) -> FrameOrSeries:
+    """Describe series or dataframe.
 
     Called from pandas.core.generic.NDFrame.describe()
 
     Parameters
     ----------
     data : FrameOrSeries
-        Dataframe or Series to be described.
+        Either dataframe or series.
     include : 'all', list-like of dtypes or None (default), optional
         A white list of data types to include in the result. Ignored for ``Series``.
     exclude : list-like of dtypes or None (default), optional,
         A black list of data types to omit from the result. Ignored for ``Series``.
     datetime_is_numeric : bool, default False
         Whether to treat datetime dtypes as numeric.
+    percentiles : list-like of numbers, optional
+        The percentiles to include in the output. All should
+        fall between 0 and 1. The default is
+        ``[.25, .5, .75]``, which returns the 25th, 50th, and
+        75th percentiles.
+
+    Returns
+    -------
+    FrameOrSeries
+        Dataframe or series described.
+    """
+    describer: "NDFrameDescriber"
+    if data.ndim == 1:
+        describer = SeriesDescriber(
+            data=data,
+            datetime_is_numeric=datetime_is_numeric,
+        )
+    else:
+        describer = DataFrameDescriber(
+            data=data,
+            include=include,
+            exclude=exclude,
+            datetime_is_numeric=datetime_is_numeric,
+        )
+    result = describer.describe(percentiles)
+    return cast(FrameOrSeries, result)
+
+
+class StrategyCreatorMixin:
+    datetime_is_numeric: bool
+
+    def create_strategy(
+        self,
+        series: "Series",
+        percentiles: Optional[Sequence[float]],
+    ) -> "StrategyAbstract":
+        """Instantiate the strategy class selected for ``series``."""
+        klass = self._select_strategy(series)
+        return klass(series, percentiles)
+
+    def _select_strategy(self, series: "Series") -> Type["StrategyAbstract"]:
+        """Pick the strategy class by dtype; may emit a FutureWarning."""
+        strategy: Type[StrategyAbstract] = CategoricalStrategy  # fallback
+        if is_bool_dtype(series.dtype):
+            strategy = CategoricalStrategy
+        elif is_numeric_dtype(series):
+            strategy = NumericStrategy
+        elif is_datetime64_any_dtype(series.dtype) and self.datetime_is_numeric:
+            strategy = TimestampStrategy
+        elif is_timedelta64_dtype(series.dtype):
+            strategy = NumericStrategy
+
+        if strategy == CategoricalStrategy and is_datetime64_any_dtype(series.dtype):
+            strategy = TimestampAsCategoricalStrategy
+            warnings.warn(
+                "Treating datetime data as categorical rather than numeric in "
+                "`.describe` is deprecated and will be removed in a future "
+                "version of pandas. Specify `datetime_is_numeric=True` to "
+                "silence this warning and adopt the future behavior now.",
+                FutureWarning,
+                stacklevel=6,  # must track the call depth from NDFrame.describe
+            )
+        return strategy
+
+
+class NDFrameDescriber(ABC):
+    """Abstract interface implemented by the series and dataframe describers."""
+
+    @abstractmethod
+    def describe(self, percentiles: Optional[Sequence[float]]) -> FrameOrSeriesUnion:
+        pass
+
+
+class SeriesDescriber(NDFrameDescriber, StrategyCreatorMixin):
+    """Class responsible for creating series description.
+
+    Parameters
+    ----------
+    data : Series
+        Series to be described.
+    datetime_is_numeric : bool, default False
+        Whether to treat datetime dtypes as numeric.
+    """
+
+    def __init__(
+        self,
+        *,
+        data: FrameOrSeries,
+        datetime_is_numeric: bool,
+    ):
+        self.data = data
+        self.datetime_is_numeric = datetime_is_numeric
+
+    def describe(self, percentiles: Optional[Sequence[float]]) -> "Series":
+        """Do describe.
+
+        Parameters
+        ----------
+        percentiles : list-like of numbers, optional
+            The percentiles to include in the output. All should fall between 0 and 1.
+            The default is ``[.25, .5, .75]``, which returns the 25th, 50th, and
+            75th percentiles.
+
+        Returns
+        -------
+        result : Series
+        """
+        series = cast("Series", self.data)
+        strategy = self.create_strategy(series, percentiles)
+        result = strategy.describe()
+        return result
+
+
+class DataFrameDescriber(NDFrameDescriber, StrategyCreatorMixin):
+    """Class responsible for creating dataframe description.
+
+    Parameters
+    ----------
+    data : FrameOrSeries
+        Dataframe or Series to be described.
+    include : 'all', list-like of dtypes or None (default), optional
+        A white list of data types to include in the result.
+    exclude : list-like of dtypes or None (default), optional,
+        A black list of data types to omit from the result.
+    datetime_is_numeric : bool, default False
+        Whether to treat datetime dtypes as numeric.
     """
 
     def __init__(
@@ -56,9 +189,6 @@ def __init__(
     def _initialize_data(self, data) -> FrameOrSeries:
         _validate_dframe_size(data)
 
-        if data.ndim == 1:
-            return data
-
         if self.include is None and self.exclude is None:
             # when some numerics are found, keep only numerics
             include = [np.number]
@@ -77,37 +207,8 @@ def _initialize_data(self, data) -> FrameOrSeries:
         else:
             return data.select_dtypes(include=self.include, exclude=self.exclude)
 
-    def _select_strategy(
-        self,
-        series: "Series",
-        percentiles: Optional[Sequence[float]],
-    ) -> "StrategyAbstract":
-        """Select strategy for description."""
-        strategy: Type[StrategyAbstract] = CategoricalStrategy
-        if is_bool_dtype(series.dtype):
-            strategy = CategoricalStrategy
-        elif is_numeric_dtype(series):
-            strategy = NumericStrategy
-        elif is_datetime64_any_dtype(series.dtype) and self.datetime_is_numeric:
-            strategy = TimestampStrategy
-        elif is_timedelta64_dtype(series.dtype):
-            strategy = NumericStrategy
-
-        if strategy == CategoricalStrategy and is_datetime64_any_dtype(series.dtype):
-            strategy = TimestampAsCategoricalStrategy
-            warnings.warn(
-                "Treating datetime data as categorical rather than numeric in "
-                "`.describe` is deprecated and will be removed in a future "
-                "version of pandas. Specify `datetime_is_numeric=True` to "
-                "silence this warning and adopt the future behavior now.",
-                FutureWarning,
-                stacklevel=5,
-            )
-
-        return strategy(series, percentiles)
-
-    def describe(self, percentiles: Optional[Sequence[float]]) -> FrameOrSeries:
-        """Do describe
+    def describe(self, percentiles: Optional[Sequence[float]]) -> "DataFrame":
+        """Do describe.
 
         Parameters
         ----------
@@ -118,39 +219,13 @@ def describe(self, percentiles: Optional[Sequence[float]]) -> FrameOrSeries:
 
         Returns
         -------
-        result : FrameOrSeries
-            Either dataframe (if ``self.data`` is dataframe)
-            or series (if ``self.data`` is series).
+        result : DataFrame
         """
-        result: FrameOrSeriesUnion
-        if self.data.ndim == 1:
-            series = cast("Series", self.data)
-            result = self._describe_series(series, percentiles)
-        else:
-            dataframe = cast("DataFrame", self.data)
-            result = self._describe_dataframe(dataframe, percentiles)
-        return cast(FrameOrSeries, result)
+        dataframe = cast("DataFrame", self.data)
 
-    def _describe_series(
-        self,
-        series: "Series",
-        percentiles: Optional[Sequence[float]],
-    ) -> "Series":
-        """Describe series."""
-        strategy = self._select_strategy(series, percentiles)
-        return strategy.describe()
-
-    def _describe_dataframe(
-        self,
-        dataframe: "DataFrame",
-        percentiles: Optional[Sequence[float]],
-    ) -> "DataFrame":
-        """Describe dataframe by describing series and concating them together."""
         ldesc: List["Series"] = []
         for _, series in dataframe.items():
-            # Could use _describe_series here to avoid code duplication,
-            # but there will be an error regarding warning stacklevel
-            strategy = self._select_strategy(series, percentiles)
+            strategy = self.create_strategy(series, percentiles)
             ldesc.append(strategy.describe())
 
         df = pd.concat(

From cc9de705d4173fceb427fcac481ae59d2abfba99 Mon Sep 17 00:00:00 2001
From: Maxim Ivanov <ivanovmg@gmail.com>
Date: Sat, 3 Oct 2020 21:10:16 +0700
Subject: [PATCH 3/9] REF: extract method _get_top_and_freq

---
 pandas/io/formats/describe.py | 116 ++++++++++++++++------------------
 1 file changed, 56 insertions(+), 60 deletions(-)

diff --git a/pandas/io/formats/describe.py b/pandas/io/formats/describe.py
index 8765ef4bb2332..79ba1f78da4c8 100644
--- a/pandas/io/formats/describe.py
+++ b/pandas/io/formats/describe.py
@@ -1,3 +1,12 @@
+"""Module responsible for execution of NDFrame.describe() method.
+
+Method NDFrame.describe() delegates actual execution to function describe_ndframe().
+
+Strategy pattern is utilized.
+ - The appropriate strategy is selected based on the series datatype.
+ - The strategy is responsible for running proper description.
+"""
+
 from abc import ABC, abstractmethod
 from typing import TYPE_CHECKING, List, Optional, Sequence, Type, Union, cast
 import warnings
@@ -46,9 +55,8 @@ def describe_ndframe(
     datetime_is_numeric : bool, default False
         Whether to treat datetime dtypes as numeric.
     percentiles : list-like of numbers, optional
-        The percentiles to include in the output. All should
-        fall between 0 and 1. The default is
-        ``[.25, .5, .75]``, which returns the 25th, 50th, and
+        The percentiles to include in the output. All should fall between 0 and 1.
+        The default is ``[.25, .5, .75]``, which returns the 25th, 50th, and
         75th percentiles.
 
     Returns
@@ -74,6 +82,8 @@ def describe_ndframe(
 
 
 class StrategyCreatorMixin:
+    """Mixin for creating instance of appropriate strategy for describing series."""
+
     datetime_is_numeric: bool
 
     def create_strategy(
@@ -115,7 +125,15 @@ class NDFrameDescriber(ABC):
 
     @abstractmethod
     def describe(self, percentiles: Optional[Sequence[float]]) -> FrameOrSeriesUnion:
-        pass
+        """Describe either a series or a dataframe.
+
+        Parameters
+        ----------
+        percentiles : list-like of numbers, optional
+            The percentiles to include in the output. All should fall between 0 and 1.
+            The default is ``[.25, .5, .75]``, which returns the 25th, 50th, and
+            75th percentiles.
+        """
 
 
 class SeriesDescriber(NDFrameDescriber, StrategyCreatorMixin):
@@ -139,19 +157,7 @@ def __init__(
         self.datetime_is_numeric = datetime_is_numeric
 
     def describe(self, percentiles: Optional[Sequence[float]]) -> "Series":
-        """Do describe.
-
-        Parameters
-        ----------
-        percentiles : list-like of numbers, optional
-            The percentiles to include in the output. All should fall between 0 and 1.
-            The default is ``[.25, .5, .75]``, which returns the 25th, 50th, and
-            75th percentiles.
-
-        Returns
-        -------
-        result : Series
-        """
+        """Do describe series."""
         series = cast("Series", self.data)
         strategy = self.create_strategy(series, percentiles)
         result = strategy.describe()
@@ -208,19 +214,7 @@ def _initialize_data(self, data) -> FrameOrSeries:
             return data.select_dtypes(include=self.include, exclude=self.exclude)
 
     def describe(self, percentiles: Optional[Sequence[float]]) -> "DataFrame":
-        """Do describe.
-
-        Parameters
-        ----------
-        percentiles : list-like of numbers, optional
-            The percentiles to include in the output. All should fall between 0 and 1.
-            The default is ``[.25, .5, .75]``, which returns the 25th, 50th, and
-            75th percentiles.
-
-        Returns
-        -------
-        result : DataFrame
-        """
+        """Do describe dataframe."""
         dataframe = cast("DataFrame", self.data)
 
         ldesc: List["Series"] = []
@@ -322,36 +316,38 @@ def __init__(self, data, percentiles):
         self.objcounts = self.data.value_counts()
 
     @property
-    def count(self) -> "Series":
-        return self.data.count()
-
-    @property
-    def count_unique(self) -> int:
-        return len(self.objcounts[self.objcounts != 0])
+    def array(self) -> List[object]:
+        top, freq = self._get_top_and_freq()
+        return [
+            self.count,
+            self.count_unique,
+            top,
+            freq,
+        ]
 
     @property
     def names(self) -> List[str]:
         return ["count", "unique", "top", "freq"]
 
-    @property
-    def array(self) -> List[object]:
-        result = [self.count, self.count_unique]
-        if self.count_unique > 0:
-            top, freq = self.objcounts.index[0], self.objcounts.iloc[0]
-            result += [top, freq]
-
-        # If the DataFrame is empty, set 'top' and 'freq' to None
-        # to maintain output shape consistency
-        else:
-            result += [np.nan, np.nan]
-        return result
-
     @property
     def dtype(self) -> Optional[Dtype]:
         if self.count_unique == 0:
             return "object"
         return None
 
+    @property
+    def count(self) -> "Series":
+        return self.data.count()
+
+    @property
+    def count_unique(self) -> int:
+        return len(self.objcounts[self.objcounts != 0])
+
+    def _get_top_and_freq(self):
+        if self.count_unique > 0:
+            return self.objcounts.index[0], self.objcounts.iloc[0]
+        return np.nan, np.nan
+
 
 class TimestampAsCategoricalStrategy(CategoricalStrategy):
     """Strategy for series with timestamp values treated as categorical values."""
@@ -394,17 +390,6 @@ def names(self) -> List[str]:
 class NumericStrategy(StrategyAbstract):
     """Strategy for series with numeric values."""
 
-    @property
-    def names(self) -> List[str]:
-        return [
-            "count",
-            "mean",
-            "std",
-            "min",
-            *self.formatted_percentiles,
-            "max",
-        ]
-
     @property
     def array(self) -> List[object]:
         return [
@@ -416,6 +401,17 @@ def array(self) -> List[object]:
             self.data.max(),
         ]
 
+    @property
+    def names(self) -> List[str]:
+        return [
+            "count",
+            "mean",
+            "std",
+            "min",
+            *self.formatted_percentiles,
+            "max",
+        ]
+
     @property
     def dtype(self) -> Optional[Dtype]:
         return None

From 6cc722408a0ff0364cda42a261682bdbc2117a34 Mon Sep 17 00:00:00 2001
From: Maxim Ivanov <ivanovmg@gmail.com>
Date: Sat, 3 Oct 2020 21:14:39 +0700
Subject: [PATCH 4/9] REF: simplify logic in _initialize_data

---
 pandas/io/formats/describe.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/pandas/io/formats/describe.py b/pandas/io/formats/describe.py
index 79ba1f78da4c8..fec2aeca8eb35 100644
--- a/pandas/io/formats/describe.py
+++ b/pandas/io/formats/describe.py
@@ -205,13 +205,14 @@ def _initialize_data(self, data) -> FrameOrSeries:
                 return data
             else:
                 return truncated
-        elif self.include == "all":
+
+        if self.include == "all":
             if self.exclude is not None:
                 msg = "exclude must be None when include is 'all'"
                 raise ValueError(msg)
             return data
-        else:
-            return data.select_dtypes(include=self.include, exclude=self.exclude)
+
+        return data.select_dtypes(include=self.include, exclude=self.exclude)
 
     def describe(self, percentiles: Optional[Sequence[float]]) -> "DataFrame":
         """Do describe dataframe."""

From 4e30a8e0aab12e2107894cbcecf649f17c490351 Mon Sep 17 00:00:00 2001
From: Maxim Ivanov <ivanovmg@gmail.com>
Date: Sat, 3 Oct 2020 21:22:01 +0700
Subject: [PATCH 5/9] TYP: clear typing for Series vs DataFrame

---
 pandas/io/formats/describe.py | 39 +++++++++++++++++------------------
 1 file changed, 19 insertions(+), 20 deletions(-)

diff --git a/pandas/io/formats/describe.py b/pandas/io/formats/describe.py
index fec2aeca8eb35..7815407e09d4c 100644
--- a/pandas/io/formats/describe.py
+++ b/pandas/io/formats/describe.py
@@ -62,17 +62,19 @@ def describe_ndframe(
     Returns
     -------
     FrameOrSeries
-        Dataframe or series described.
+        Dataframe or series description.
     """
     describer: "NDFrameDescriber"
     if data.ndim == 1:
+        series = cast("Series", data)
         describer = SeriesDescriber(
-            data=data,
+            data=series,
             datetime_is_numeric=datetime_is_numeric,
         )
     else:
+        dataframe = cast("DataFrame", data)
         describer = DataFrameDescriber(
-            data=data,
+            data=dataframe,
             include=include,
             exclude=exclude,
             datetime_is_numeric=datetime_is_numeric,
@@ -141,8 +143,8 @@ class SeriesDescriber(NDFrameDescriber, StrategyCreatorMixin):
 
     Parameters
     ----------
-    data : FrameOrSeries
-        Dataframe or Series to be described.
+    data : Series
+        Series to be described.
     datetime_is_numeric : bool, default False
         Whether to treat datetime dtypes as numeric.
     """
@@ -150,7 +152,7 @@ class SeriesDescriber(NDFrameDescriber, StrategyCreatorMixin):
     def __init__(
         self,
         *,
-        data: FrameOrSeries,
+        data: "Series",
         datetime_is_numeric: bool,
     ):
         self.data = data
@@ -158,8 +160,7 @@ def __init__(
 
     def describe(self, percentiles: Optional[Sequence[float]]) -> "Series":
         """Do describe series."""
-        series = cast("Series", self.data)
-        strategy = self.create_strategy(series, percentiles)
+        strategy = self.create_strategy(self.data, percentiles)
         result = strategy.describe()
         return result
 
@@ -169,8 +170,8 @@ class DataFrameDescriber(NDFrameDescriber, StrategyCreatorMixin):
 
     Parameters
     ----------
-    data : FrameOrSeries
-        Dataframe or Series to be described.
+    data : DataFrame
+        Dataframe to be described.
     include : 'all', list-like of dtypes or None (default), optional
         A white list of data types to include in the result.
     exclude : list-like of dtypes or None (default), optional,
@@ -182,7 +183,7 @@ class DataFrameDescriber(NDFrameDescriber, StrategyCreatorMixin):
     def __init__(
         self,
         *,
-        data: FrameOrSeries,
+        data: "DataFrame",
         include: Optional[Union[str, Sequence[str]]],
         exclude: Optional[Union[str, Sequence[str]]],
         datetime_is_numeric: bool,
@@ -190,9 +191,9 @@ def __init__(
         self.include = include
         self.exclude = exclude
         self.datetime_is_numeric = datetime_is_numeric
-        self.data: FrameOrSeries = self._initialize_data(data)
+        self.data: "DataFrame" = self._initialize_data(data)
 
-    def _initialize_data(self, data) -> FrameOrSeries:
+    def _initialize_data(self, data) -> "DataFrame":
         _validate_dframe_size(data)
 
         if self.include is None and self.exclude is None:
@@ -200,11 +201,11 @@ def _initialize_data(self, data) -> FrameOrSeries:
             include = [np.number]
             if self.datetime_is_numeric:
                 include.append("datetime")
-            truncated = data.select_dtypes(include=include)
-            if len(truncated.columns) == 0:
+            numeric_only = data.select_dtypes(include=include)
+            if len(numeric_only.columns) == 0:
                 return data
             else:
-                return truncated
+                return numeric_only
 
         if self.include == "all":
             if self.exclude is not None:
@@ -216,10 +217,8 @@ def _initialize_data(self, data) -> FrameOrSeries:
 
     def describe(self, percentiles: Optional[Sequence[float]]) -> "DataFrame":
         """Do describe dataframe."""
-        dataframe = cast("DataFrame", self.data)
-
         ldesc: List["Series"] = []
-        for _, series in dataframe.items():
+        for _, series in self.data.items():
             strategy = self.create_strategy(series, percentiles)
             ldesc.append(strategy.describe())
 
@@ -228,7 +227,7 @@ def describe(self, percentiles: Optional[Sequence[float]]) -> "DataFrame":
             axis=1,
             sort=False,
         )
-        df.columns = dataframe.columns.copy()
+        df.columns = self.data.columns.copy()
         return cast("DataFrame", df)
 
     def _reindex_columns(self, column_data) -> List["Series"]:

From 3916fcda081b3cf2903f9a071281541352a677fd Mon Sep 17 00:00:00 2001
From: Maxim Ivanov <ivanovmg@gmail.com>
Date: Sat, 3 Oct 2020 21:28:35 +0700
Subject: [PATCH 6/9] REF: extract method _extract_numeric_data

---
 pandas/io/formats/describe.py | 61 +++++++++++++++++++++--------------
 1 file changed, 37 insertions(+), 24 deletions(-)

diff --git a/pandas/io/formats/describe.py b/pandas/io/formats/describe.py
index 7815407e09d4c..2dc7a954821cd 100644
--- a/pandas/io/formats/describe.py
+++ b/pandas/io/formats/describe.py
@@ -8,7 +8,17 @@
 """
 
 from abc import ABC, abstractmethod
-from typing import TYPE_CHECKING, List, Optional, Sequence, Type, Union, cast
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    List,
+    Optional,
+    Sequence,
+    Tuple,
+    Type,
+    Union,
+    cast,
+)
 import warnings
 
 import numpy as np
@@ -193,28 +203,6 @@ def __init__(
         self.datetime_is_numeric = datetime_is_numeric
         self.data: "DataFrame" = self._initialize_data(data)
 
-    def _initialize_data(self, data) -> "DataFrame":
-        _validate_dframe_size(data)
-
-        if self.include is None and self.exclude is None:
-            # when some numerics are found, keep only numerics
-            include = [np.number]
-            if self.datetime_is_numeric:
-                include.append("datetime")
-            numeric_only = data.select_dtypes(include=include)
-            if len(numeric_only.columns) == 0:
-                return data
-            else:
-                return numeric_only
-
-        if self.include == "all":
-            if self.exclude is not None:
-                msg = "exclude must be None when include is 'all'"
-                raise ValueError(msg)
-            return data
-
-        return data.select_dtypes(include=self.include, exclude=self.exclude)
-
     def describe(self, percentiles: Optional[Sequence[float]]) -> "DataFrame":
         """Do describe dataframe."""
         ldesc: List["Series"] = []
@@ -240,6 +228,31 @@ def _reindex_columns(self, column_data) -> List["Series"]:
                     names.append(name)
         return [x.reindex(names, copy=False) for x in column_data]
 
+    def _initialize_data(self, data: "DataFrame") -> "DataFrame":
+        _validate_dframe_size(data)
+
+        if self.include is None and self.exclude is None:
+            return self._extract_numeric_data(data)
+
+        if self.include == "all":
+            if self.exclude is not None:
+                msg = "exclude must be None when include is 'all'"
+                raise ValueError(msg)
+            return data
+
+        return data.select_dtypes(include=self.include, exclude=self.exclude)
+
+    def _extract_numeric_data(self, data: "DataFrame") -> "DataFrame":
+        """Keep only numeric columns, or return data as is when there are none."""
+        include = [np.number]
+        if self.datetime_is_numeric:
+            include.append("datetime")
+        numeric_only = data.select_dtypes(include=include)
+        if len(numeric_only.columns) == 0:
+            return data
+        else:
+            return numeric_only
+
 
 class StrategyAbstract(ABC):
     """Abstract strategy for describing series."""
@@ -343,7 +356,7 @@ def count(self) -> "Series":
     def count_unique(self) -> int:
         return len(self.objcounts[self.objcounts != 0])
 
-    def _get_top_and_freq(self):
+    def _get_top_and_freq(self) -> Tuple[Any, Any]:
         if self.count_unique > 0:
             return self.objcounts.index[0], self.objcounts.iloc[0]
         return np.nan, np.nan

From 464ba1ef3d21947641cb6436daf1f5243cbe720a Mon Sep 17 00:00:00 2001
From: Maxim Ivanov <ivanovmg@gmail.com>
Date: Wed, 7 Oct 2020 03:10:40 +0700
Subject: [PATCH 7/9] REF: replace import as pd with concrete imports

---
 pandas/io/formats/describe.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/pandas/io/formats/describe.py b/pandas/io/formats/describe.py
index 2dc7a954821cd..d9fa236c54899 100644
--- a/pandas/io/formats/describe.py
+++ b/pandas/io/formats/describe.py
@@ -34,7 +34,7 @@
     is_timedelta64_dtype,
 )
 
-import pandas as pd
+from pandas.core.reshape.concat import concat
 
 from pandas.io.formats.format import format_percentiles
 
@@ -210,7 +210,7 @@ def describe(self, percentiles: Optional[Sequence[float]]) -> "DataFrame":
             strategy = self.create_strategy(series, percentiles)
             ldesc.append(strategy.describe())
 
-        df = pd.concat(
+        df = concat(
             self._reindex_columns(ldesc),
             axis=1,
             sort=False,
@@ -267,7 +267,9 @@ def __init__(
 
     def describe(self) -> "Series":
         """Describe series."""
-        return pd.Series(
+        from pandas.core.series import Series
+
+        return Series(
             self.array,
             index=self.names,
             name=self.data.name,

From 90bafe04aadc2b08b7d37cb79224c2bc27cbd443 Mon Sep 17 00:00:00 2001
From: Maxim Ivanov <ivanovmg@gmail.com>
Date: Wed, 7 Oct 2020 03:19:32 +0700
Subject: [PATCH 8/9] TST: add test for covering exclude is not None

---
 pandas/tests/frame/methods/test_describe.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/pandas/tests/frame/methods/test_describe.py b/pandas/tests/frame/methods/test_describe.py
index 0b70bead375da..496b05dec6dff 100644
--- a/pandas/tests/frame/methods/test_describe.py
+++ b/pandas/tests/frame/methods/test_describe.py
@@ -1,4 +1,5 @@
 import numpy as np
+import pytest
 
 import pandas as pd
 from pandas import Categorical, DataFrame, Series, Timestamp, date_range
@@ -332,6 +333,16 @@ def test_describe_tz_values2(self):
             result = df.describe(include="all")
         tm.assert_frame_equal(result, expected)
 
+    @pytest.mark.parametrize("exclude", ["x", "y", ["x", "y"], ["x", "z"]])
+    def test_describe_when_include_all_exclude_not_allowed(self, exclude):
+        """
+        When include is 'all', then setting exclude != None is not allowed.
+        """
+        df = pd.DataFrame({"x": [1], "y": [2], "z": [3]})
+        msg = "exclude must be None when include is 'all'"
+        with pytest.raises(ValueError, match=msg):
+            df.describe(include="all", exclude=exclude)
+
     def test_describe_percentiles_integer_idx(self):
         # GH#26660
         df = pd.DataFrame({"x": [1]})

From 574d22fc6f86538bb8ad7a9fa56d6e2974321947 Mon Sep 17 00:00:00 2001
From: Maxim Ivanov <ivanovmg@gmail.com>
Date: Thu, 22 Oct 2020 03:20:57 +0700
Subject: [PATCH 9/9] CLN: pd.DataFrame -> DataFrame

---
 pandas/tests/frame/methods/test_describe.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/tests/frame/methods/test_describe.py b/pandas/tests/frame/methods/test_describe.py
index 1a311bfb7a620..986e4333ccbc4 100644
--- a/pandas/tests/frame/methods/test_describe.py
+++ b/pandas/tests/frame/methods/test_describe.py
@@ -338,7 +338,7 @@ def test_describe_when_include_all_exclude_not_allowed(self, exclude):
         """
         When include is 'all', then setting exclude != None is not allowed.
         """
-        df = pd.DataFrame({"x": [1], "y": [2], "z": [3]})
+        df = DataFrame({"x": [1], "y": [2], "z": [3]})
         msg = "exclude must be None when include is 'all'"
         with pytest.raises(ValueError, match=msg):
             df.describe(include="all", exclude=exclude)