From e50beb7574cf929529c7fc61d3a225f8e750baab Mon Sep 17 00:00:00 2001
From: Iva Koevska <admatha@gmail.com>
Date: Sat, 10 Mar 2018 12:05:15 +0200
Subject: [PATCH 1/6] DOC: Update docs for pandas.cut

---
 pandas/core/reshape/tile.py | 79 +++++++++++++++++++++----------------
 1 file changed, 45 insertions(+), 34 deletions(-)

diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py
index 30132ddc05c40..6d55e2a3a2fb1 100644
--- a/pandas/core/reshape/tile.py
+++ b/pandas/core/reshape/tile.py
@@ -26,53 +26,64 @@
 def cut(x, bins, right=True, labels=None, retbins=False, precision=3,
         include_lowest=False):
     """
-    Return indices of half-open bins to which each value of `x` belongs.
+    Return indices of half-open `bins` to which each value of `x` belongs.
+
+    Use `cut` when you need to segment and sort data values into bins or
+    buckets of data. This function is also useful for going from a continuous
+    variable to a categorical variable. For example, `cut` could convert ages
+    to groups of age ranges.
 
     Parameters
     ----------
     x : array-like
         Input array to be binned. It has to be 1-dimensional.
-    bins : int, sequence of scalars, or IntervalIndex
-        If `bins` is an int, it defines the number of equal-width bins in the
-        range of `x`. However, in this case, the range of `x` is extended
-        by .1% on each side to include the min or max values of `x`. If
-        `bins` is a sequence it defines the bin edges allowing for
-        non-uniform bin width. No extension of the range of `x` is done in
-        this case.
-    right : bool, optional
-        Indicates whether the bins include the rightmost edge or not. If
-        right == True (the default), then the bins [1,2,3,4] indicate
+    bins : int, sequence of scalars, or pandas.IntervalIndex
+        If `bins` is an int, defines the number of equal-width bins in the
+        range of `x`. The range of `x` is extended by .1% on each side to
+        include the min or max values of `x`.
+        If `bins` is a sequence, defines the bin edges allowing for
+        non-uniform bin width. No extension of the range of `x` is done.
+    right : bool, optional, default 'True'
+        Indicates whether the `bins` include the rightmost edge or not. If
+        `right == True` (the default), then the `bins` [1,2,3,4] indicate
         (1,2], (2,3], (3,4].
-    labels : array or boolean, default None
-        Used as labels for the resulting bins. Must be of the same length as
-        the resulting bins. If False, return only integer indicators of the
-        bins.
-    retbins : bool, optional
-        Whether to return the bins or not. Can be useful if bins is given
+    labels : array or bool, optional
+        Used as labels for the resulting `bins`. Must be of the same length as
+        the resulting `bins`. If False, returns only integer indicators of the
+        `bins`.
+    retbins : bool, optional, default 'False'
+        Whether to return the `bins` or not. Useful when `bins` is provided
         as a scalar.
-    precision : int, optional
-        The precision at which to store and display the bins labels
-    include_lowest : bool, optional
+    precision : int, optional, default '3'
+        The precision at which to store and display the `bins` labels.
+    include_lowest : bool, optional, default 'False'
         Whether the first interval should be left-inclusive or not.
 
     Returns
     -------
-    out : Categorical or Series or array of integers if labels is False
-        The return type (Categorical or Series) depends on the input: a Series
-        of type category if input is a Series else Categorical. Bins are
-        represented as categories when categorical data is returned.
-    bins : ndarray of floats
-        Returned only if `retbins` is True.
+    out : pandas.Categorical or Series, or array of int if `labels` is 'False'
+        The return type depends on the input.
+        If the input is a Series, a Series of type category is returned.
+        Else - pandas.Categorical is returned. `Bins` are represented as
+        categories when categorical data is returned.
+    bins : numpy.ndarray of floats
+        Returned only if `retbins` is 'True'.
+
+    See Also
+    --------
+    qcut : Discretize variable into equal-sized buckets based on rank
+        or based on sample quantiles.
+    pandas.Categorical : Represents a categorical variable in
+        classic R / S-plus fashion.
+    Series : One-dimensional ndarray with axis labels (including time series).
+    pandas.IntervalIndex : Immutable Index implementing an ordered,
+        sliceable set. IntervalIndex represents an Index of intervals that
+        are all closed on the same side.
 
     Notes
     -----
-    The `cut` function can be useful for going from a continuous variable to
-    a categorical variable. For example, `cut` could convert ages to groups
-    of age ranges.
-
-    Any NA values will be NA in the result.  Out of bounds values will be NA in
-    the resulting Categorical object
-
+    Any NA values will be NA in the result. Out of bounds values will be NA in
+    the resulting pandas.Categorical object.
 
     Examples
     --------
@@ -88,7 +99,7 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3,
     Categories (3, object): [good < medium < bad]
 
     >>> pd.cut(np.ones(5), 4, labels=False)
-    array([1, 1, 1, 1, 1])
+    array([1, 1, 1, 1, 1], dtype=int64)
     """
     # NOTE: this binning code is changed a bit from histogram for var(x) == 0
 

From 49e002fb856f05e9c2beb313e9e4f31e08e9b41a Mon Sep 17 00:00:00 2001
From: Iva Koevska <admatha@gmail.com>
Date: Sat, 10 Mar 2018 13:51:38 +0200
Subject: [PATCH 2/6] Updated with comments from Joris

---
 pandas/core/reshape/tile.py | 71 ++++++++++++++++++++++++-------------
 1 file changed, 46 insertions(+), 25 deletions(-)

diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py
index 6d55e2a3a2fb1..0eff1729d8b6e 100644
--- a/pandas/core/reshape/tile.py
+++ b/pandas/core/reshape/tile.py
@@ -26,37 +26,41 @@
 def cut(x, bins, right=True, labels=None, retbins=False, precision=3,
         include_lowest=False):
     """
-    Return indices of half-open `bins` to which each value of `x` belongs.
+    Bin `x` and return data about the bin to which each `x` value belongs.
 
-    Use `cut` when you need to segment and sort data values into bins or
-    buckets of data. This function is also useful for going from a continuous
-    variable to a categorical variable. For example, `cut` could convert ages
-    to groups of age ranges.
+    This function splits `x` into the specified number of equal-width half-
+    open bins. Based on the parameters specified and the input, returns
+    information about the half-open bins to which each value of `x` belongs
+    or the bins themselves.
+    Use `cut` when you need to segment and sort data values into bins. This
+    function is also useful for going from a continuous variable to a
+    categorical variable. For example, `cut` could convert ages to groups
+    of age ranges.
 
     Parameters
     ----------
     x : array-like
-        Input array to be binned. It has to be 1-dimensional.
+        The input array to be binned. Must be 1-dimensional.
     bins : int, sequence of scalars, or pandas.IntervalIndex
-        If `bins` is an int, defines the number of equal-width bins in the
-        range of `x`. The range of `x` is extended by .1% on each side to
-        include the min or max values of `x`.
-        If `bins` is a sequence, defines the bin edges allowing for
-        non-uniform bin width. No extension of the range of `x` is done.
-    right : bool, optional, default 'True'
+        If int, defines the number of equal-width bins in the range of `x`.
+        The range of `x` is extended by .1% on each side to include the min or
+        max values of `x`.
+        If a sequence, defines the bin edges allowing for non-uniform width.
+        No extension of the range of `x` is done.
+    right : bool, default 'True'
         Indicates whether the `bins` include the rightmost edge or not. If
         `right == True` (the default), then the `bins` [1,2,3,4] indicate
         (1,2], (2,3], (3,4].
     labels : array or bool, optional
-        Used as labels for the resulting `bins`. Must be of the same length as
-        the resulting `bins`. If False, returns only integer indicators of the
-        `bins`.
-    retbins : bool, optional, default 'False'
-        Whether to return the `bins` or not. Useful when `bins` is provided
+        Specifies the labels for the returned bins. Must be the same length as
+        the resulting bins. If False, returns only integer indicators of the
+        bins.
+    retbins : bool, default 'False'
+        Whether to return the bins or not. Useful when bins is provided
         as a scalar.
-    precision : int, optional, default '3'
-        The precision at which to store and display the `bins` labels.
-    include_lowest : bool, optional, default 'False'
+    precision : int, default '3'
+        The precision at which to store and display the bins labels.
+    include_lowest : bool, default 'False'
         Whether the first interval should be left-inclusive or not.
 
     Returns
@@ -64,10 +68,10 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3,
     out : pandas.Categorical or Series, or array of int if `labels` is 'False'
         The return type depends on the input.
         If the input is a Series, a Series of type category is returned.
-        Else - pandas.Categorical is returned. `Bins` are represented as
+        Else - pandas.Categorical is returned. Bins are represented as
         categories when categorical data is returned.
     bins : numpy.ndarray of floats
-        Returned only if `retbins` is 'True'.
+        Returned when `retbins` is 'True'.
 
     See Also
     --------
@@ -87,10 +91,16 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3,
 
     Examples
     --------
-    >>> pd.cut(np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1]), 3, retbins=True)
+    >>> pd.cut(np.array([1,7,5,4,6,3]), 3)
+    ... # doctest: +ELLIPSIS
+    [(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], (5.0, 7.0], ...
+    Categories (3, interval[float64]): [(0.994, 3.0] < (3.0, 5.0] ...
+
+    >>> pd.cut(np.array([1,7,5,4,6,3]), 3, retbins=True)
     ... # doctest: +ELLIPSIS
-    ([(0.19, 3.367], (0.19, 3.367], (0.19, 3.367], (3.367, 6.533], ...
-    Categories (3, interval[float64]): [(0.19, 3.367] < (3.367, 6.533] ...
+    ([(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], (5.0, 7.0], ...
+    Categories (3, interval[float64]): [(0.994, 3.0] < (3.0, 5.0] ...
+    array([0.994, 3.   , 5.   , 7.   ]))
 
     >>> pd.cut(np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1]),
     ...        3, labels=["good", "medium", "bad"])
@@ -100,6 +110,17 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3,
 
     >>> pd.cut(np.ones(5), 4, labels=False)
     array([1, 1, 1, 1, 1], dtype=int64)
+
+    >>> s = pd.Series(np.array([2,4,6,8,10]), index=['a', 'b', 'c', 'd', 'e'])
+    >>> pd.cut(s, 3)
+    ... # doctest: +ELLIPSIS
+    a    (1.992, 4.667]
+    b    (1.992, 4.667]
+    c    (4.667, 7.333]
+    d     (7.333, 10.0]
+    e     (7.333, 10.0]
+    dtype: category
+    Categories (3, interval[float64]): [(1.992, 4.667] < (4.667, ...
     """
     # NOTE: this binning code is changed a bit from histogram for var(x) == 0
 

From d24c749b0088d3c9651aa82624041bf94b43cd9b Mon Sep 17 00:00:00 2001
From: Iva Koevska <admatha@gmail.com>
Date: Mon, 12 Mar 2018 19:52:41 +0200
Subject: [PATCH 3/6] Updated as per comments

---
 pandas/core/reshape/tile.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py
index 0eff1729d8b6e..903ef8ad4e34d 100644
--- a/pandas/core/reshape/tile.py
+++ b/pandas/core/reshape/tile.py
@@ -28,10 +28,10 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3,
     """
     Bin `x` and return data about the bin to which each `x` value belongs.
 
-    This function splits `x` into the specified number of equal-width half-
-    open bins. Based on the parameters specified and the input, returns
-    information about the half-open bins to which each value of `x` belongs
-    or the bins themselves.
+    Splits `x` into the specified number of equal-width half-open bins.
+    Based on the parameters specified and the input, returns data about
+    the half-open bins to which each value of `x` belongs or the bins
+    themselves.
     Use `cut` when you need to segment and sort data values into bins. This
     function is also useful for going from a continuous variable to a
     categorical variable. For example, `cut` could convert ages to groups
@@ -108,7 +108,7 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3,
     [good, good, good, medium, bad, good]
     Categories (3, object): [good < medium < bad]
 
-    >>> pd.cut(np.ones(5), 4, labels=False)
+    >>> pd.cut(np.ones(5, dtype='int64'), 4, labels=False)
     array([1, 1, 1, 1, 1], dtype=int64)
 
     >>> s = pd.Series(np.array([2,4,6,8,10]), index=['a', 'b', 'c', 'd', 'e'])

From 1f3caf6a24d49d654f25d13badd5e09a0260c228 Mon Sep 17 00:00:00 2001
From: Iva Koevska <admatha@gmail.com>
Date: Mon, 12 Mar 2018 21:07:33 +0200
Subject: [PATCH 4/6] Updated as per comments

---
 pandas/core/reshape/tile.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py
index 903ef8ad4e34d..a03628654638e 100644
--- a/pandas/core/reshape/tile.py
+++ b/pandas/core/reshape/tile.py
@@ -65,11 +65,14 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3,
 
     Returns
     -------
-    out : pandas.Categorical or Series, or array of int if `labels` is 'False'
-        The return type depends on the input.
-        If the input is a Series, a Series of type category is returned.
-        Else - pandas.Categorical is returned. Bins are represented as
-        categories when categorical data is returned.
+    out : pandas.Categorical, Series, or ndarray
+        An array-like object representing the respective bin for each value
+        of `x`. The type depends on the value of `labels`.
+
+        * True : returns a Series for Series `x` or a pandas.Categorical for
+        pandas.Categorial `x`.
+        
+        * False : returns an ndarray of integers.
     bins : numpy.ndarray of floats
         Returned when `retbins` is 'True'.
 

From 544af0e57f6e1462b21fe0670a852184123f3f83 Mon Sep 17 00:00:00 2001
From: Iva Koevska <admatha@gmail.com>
Date: Mon, 12 Mar 2018 21:20:03 +0200
Subject: [PATCH 5/6] Fixed whitespace issue

---
 pandas/core/reshape/tile.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py
index a03628654638e..eb90e41be2933 100644
--- a/pandas/core/reshape/tile.py
+++ b/pandas/core/reshape/tile.py
@@ -71,7 +71,7 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3,
 
         * True : returns a Series for Series `x` or a pandas.Categorical for
         pandas.Categorial `x`.
-        
+
         * False : returns an ndarray of integers.
     bins : numpy.ndarray of floats
         Returned when `retbins` is 'True'.

From f22e45fb5880550a276ff4fa4ebdb7ad3c2926d4 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <tom.w.augspurger@gmail.com>
Date: Thu, 15 Mar 2018 15:28:04 -0500
Subject: [PATCH 6/6] Updated [ci skip]

[ci skip]
---
 pandas/core/reshape/tile.py | 109 +++++++++++++++++++++++-------------
 1 file changed, 69 insertions(+), 40 deletions(-)

diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py
index eb90e41be2933..be28f7091712f 100644
--- a/pandas/core/reshape/tile.py
+++ b/pandas/core/reshape/tile.py
@@ -26,41 +26,44 @@
 def cut(x, bins, right=True, labels=None, retbins=False, precision=3,
         include_lowest=False):
     """
-    Bin `x` and return data about the bin to which each `x` value belongs.
+    Bin values into discrete intervals.
 
-    Splits `x` into the specified number of equal-width half-open bins.
-    Based on the parameters specified and the input, returns data about
-    the half-open bins to which each value of `x` belongs or the bins
-    themselves.
     Use `cut` when you need to segment and sort data values into bins. This
     function is also useful for going from a continuous variable to a
-    categorical variable. For example, `cut` could convert ages to groups
-    of age ranges.
+    categorical variable. For example, `cut` could convert ages to groups of
+    age ranges. Supports binning into a number of equal-width bins, or a
+    pre-specified array of bin edges.
 
     Parameters
     ----------
     x : array-like
         The input array to be binned. Must be 1-dimensional.
     bins : int, sequence of scalars, or pandas.IntervalIndex
-        If int, defines the number of equal-width bins in the range of `x`.
-        The range of `x` is extended by .1% on each side to include the min or
-        max values of `x`.
-        If a sequence, defines the bin edges allowing for non-uniform width.
-        No extension of the range of `x` is done.
-    right : bool, default 'True'
-        Indicates whether the `bins` include the rightmost edge or not. If
-        `right == True` (the default), then the `bins` [1,2,3,4] indicate
-        (1,2], (2,3], (3,4].
+        The criteria to bin by.
+
+        * int : Defines the number of equal-width bins in the range of `x`. The
+          range of `x` is extended by .1% on each side to include the minimum
+          and maximum values of `x`.
+        * sequence of scalars : Defines the bin edges allowing for non-uniform
+          width. No extension of the range of `x` is done.
+        * IntervalIndex : Defines the exact bins to be used.
+
+    right : bool, default True
+        Indicates whether `bins` includes the rightmost edge or not. If
+        ``right == True`` (the default), then the `bins` ``[1, 2, 3, 4]``
+        indicate (1,2], (2,3], (3,4]. This argument is ignored when
+        `bins` is an IntervalIndex.
     labels : array or bool, optional
         Specifies the labels for the returned bins. Must be the same length as
         the resulting bins. If False, returns only integer indicators of the
-        bins.
-    retbins : bool, default 'False'
+        bins. This affects the type of the output container (see below).
+        This argument is ignored when `bins` is an IntervalIndex.
+    retbins : bool, default False
         Whether to return the bins or not. Useful when bins is provided
         as a scalar.
-    precision : int, default '3'
+    precision : int, default 3
         The precision at which to store and display the bins labels.
-    include_lowest : bool, default 'False'
+    include_lowest : bool, default False
         Whether the first interval should be left-inclusive or not.
 
     Returns
@@ -69,52 +72,68 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3,
         An array-like object representing the respective bin for each value
         of `x`. The type depends on the value of `labels`.
 
-        * True : returns a Series for Series `x` or a pandas.Categorical for
-        pandas.Categorial `x`.
+        * True (default) : returns a Series for Series `x` or a
+          pandas.Categorical for all other inputs. The values stored within
+          are Interval dtype.
+
+        * sequence of scalars : returns a Series for Series `x` or a
+          pandas.Categorical for all other inputs. The values stored within
+          are whatever the type in the sequence is.
 
         * False : returns an ndarray of integers.
-    bins : numpy.ndarray of floats
-        Returned when `retbins` is 'True'.
+
+    bins : numpy.ndarray or IntervalIndex
+        The computed or specified bins. Only returned when `retbins=True`.
+        For scalar or sequence `bins`, this is an ndarray with the computed
+        bins. For an IntervalIndex `bins`, this is equal to `bins`.
 
     See Also
     --------
     qcut : Discretize variable into equal-sized buckets based on rank
         or based on sample quantiles.
-    pandas.Categorical : Represents a categorical variable in
-        classic R / S-plus fashion.
-    Series : One-dimensional ndarray with axis labels (including time series).
+    pandas.Categorical : Array type for storing data that come from a
+        fixed set of values.
+    Series : One-dimensional array with axis labels (including time series).
     pandas.IntervalIndex : Immutable Index implementing an ordered,
-        sliceable set. IntervalIndex represents an Index of intervals that
-        are all closed on the same side.
+        sliceable set.
 
     Notes
     -----
     Any NA values will be NA in the result. Out of bounds values will be NA in
-    the resulting pandas.Categorical object.
+    the resulting Series or pandas.Categorical object.
 
     Examples
     --------
-    >>> pd.cut(np.array([1,7,5,4,6,3]), 3)
+    Discretize into three equal-sized bins.
+
+    >>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3)
     ... # doctest: +ELLIPSIS
     [(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], (5.0, 7.0], ...
     Categories (3, interval[float64]): [(0.994, 3.0] < (3.0, 5.0] ...
 
-    >>> pd.cut(np.array([1,7,5,4,6,3]), 3, retbins=True)
+    >>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3, retbins=True)
     ... # doctest: +ELLIPSIS
     ([(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], (5.0, 7.0], ...
     Categories (3, interval[float64]): [(0.994, 3.0] < (3.0, 5.0] ...
     array([0.994, 3.   , 5.   , 7.   ]))
 
-    >>> pd.cut(np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1]),
-    ...        3, labels=["good", "medium", "bad"])
-    ... # doctest: +SKIP
-    [good, good, good, medium, bad, good]
-    Categories (3, object): [good < medium < bad]
+    Discover the same bins, but assign them specific labels. Notice that
+    the returned Categorical's categories are `labels` and it is ordered.
 
-    >>> pd.cut(np.ones(5, dtype='int64'), 4, labels=False)
-    array([1, 1, 1, 1, 1], dtype=int64)
+    >>> pd.cut(np.array([1, 7, 5, 4, 6, 3]),
+    ...        3, labels=["bad", "medium", "good"])
+    [bad, good, medium, medium, good, bad]
+    Categories (3, object): [bad < medium < good]
 
-    >>> s = pd.Series(np.array([2,4,6,8,10]), index=['a', 'b', 'c', 'd', 'e'])
+    ``labels=False`` implies you just want the bins back.
+
+    >>> pd.cut([0, 1, 1, 2], bins=4, labels=False)
+    array([0, 1, 1, 3])
+
+    Passing a Series as an input returns a Series with categorical dtype:
+
+    >>> s = pd.Series(np.array([2, 4, 6, 8, 10]),
+    ...               index=['a', 'b', 'c', 'd', 'e'])
     >>> pd.cut(s, 3)
     ... # doctest: +ELLIPSIS
     a    (1.992, 4.667]
@@ -124,6 +143,16 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3,
     e     (7.333, 10.0]
     dtype: category
     Categories (3, interval[float64]): [(1.992, 4.667] < (4.667, ...
+
+    Passing an IntervalIndex for `bins` results in those categories exactly.
+    Notice that values not covered by the IntervalIndex are set to NaN. 0
+    is to the left of the first bin (which is closed on the right), and 1.5
+    falls between two bins.
+
+    >>> bins = pd.IntervalIndex.from_tuples([(0, 1), (2, 3), (4, 5)])
+    >>> pd.cut([0, 0.5, 1.5, 2.5, 4.5], bins)
+    [NaN, (0, 1], NaN, (2, 3], (4, 5]]
+    Categories (3, interval[int64]): [(0, 1] < (2, 3] < (4, 5]]
     """
     # NOTE: this binning code is changed a bit from histogram for var(x) == 0