From e50beb7574cf929529c7fc61d3a225f8e750baab Mon Sep 17 00:00:00 2001 From: Iva Koevska Date: Sat, 10 Mar 2018 12:05:15 +0200 Subject: [PATCH 1/6] DOC: Update docs for pandas.cut --- pandas/core/reshape/tile.py | 79 +++++++++++++++++++++---------------- 1 file changed, 45 insertions(+), 34 deletions(-) diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 30132ddc05c40..6d55e2a3a2fb1 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -26,53 +26,64 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, include_lowest=False): """ - Return indices of half-open bins to which each value of `x` belongs. + Return indices of half-open `bins` to which each value of `x` belongs. + + Use `cut` when you need to segment and sort data values into bins or + buckets of data. This function is also useful for going from a continuous + variable to a categorical variable. For example, `cut` could convert ages + to groups of age ranges. Parameters ---------- x : array-like Input array to be binned. It has to be 1-dimensional. - bins : int, sequence of scalars, or IntervalIndex - If `bins` is an int, it defines the number of equal-width bins in the - range of `x`. However, in this case, the range of `x` is extended - by .1% on each side to include the min or max values of `x`. If - `bins` is a sequence it defines the bin edges allowing for - non-uniform bin width. No extension of the range of `x` is done in - this case. - right : bool, optional - Indicates whether the bins include the rightmost edge or not. If - right == True (the default), then the bins [1,2,3,4] indicate + bins : int, sequence of scalars, or pandas.IntervalIndex + If `bins` is an int, defines the number of equal-width bins in the + range of `x`. The range of `x` is extended by .1% on each side to + include the min or max values of `x`. + If `bins` is a sequence, defines the bin edges allowing for + non-uniform bin width. 
No extension of the range of `x` is done. + right : bool, optional, default 'True' + Indicates whether the `bins` include the rightmost edge or not. If + `right == True` (the default), then the `bins` [1,2,3,4] indicate (1,2], (2,3], (3,4]. - labels : array or boolean, default None - Used as labels for the resulting bins. Must be of the same length as - the resulting bins. If False, return only integer indicators of the - bins. - retbins : bool, optional - Whether to return the bins or not. Can be useful if bins is given + labels : array or bool, optional + Used as labels for the resulting `bins`. Must be of the same length as + the resulting `bins`. If False, returns only integer indicators of the + `bins`. + retbins : bool, optional, default 'False' + Whether to return the `bins` or not. Useful when `bins` is provided as a scalar. - precision : int, optional - The precision at which to store and display the bins labels - include_lowest : bool, optional + precision : int, optional, default '3' + The precision at which to store and display the `bins` labels. + include_lowest : bool, optional, default 'False' Whether the first interval should be left-inclusive or not. Returns ------- - out : Categorical or Series or array of integers if labels is False - The return type (Categorical or Series) depends on the input: a Series - of type category if input is a Series else Categorical. Bins are - represented as categories when categorical data is returned. - bins : ndarray of floats - Returned only if `retbins` is True. + out : pandas.Categorical or Series, or array of int if `labels` is 'False' + The return type depends on the input. + If the input is a Series, a Series of type category is returned. + Else - pandas.Categorical is returned. `Bins` are represented as + categories when categorical data is returned. + bins : numpy.ndarray of floats + Returned only if `retbins` is 'True'. 
+ + See Also + -------- + qcut : Discretize variable into equal-sized buckets based on rank + or based on sample quantiles. + pandas.Categorical : Represents a categorical variable in + classic R / S-plus fashion. + Series : One-dimensional ndarray with axis labels (including time series). + pandas.IntervalIndex : Immutable Index implementing an ordered, + sliceable set. IntervalIndex represents an Index of intervals that + are all closed on the same side. Notes ----- - The `cut` function can be useful for going from a continuous variable to - a categorical variable. For example, `cut` could convert ages to groups - of age ranges. - - Any NA values will be NA in the result. Out of bounds values will be NA in - the resulting Categorical object - + Any NA values will be NA in the result. Out of bounds values will be NA in + the resulting pandas.Categorical object. Examples -------- @@ -88,7 +99,7 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, Categories (3, object): [good < medium < bad] >>> pd.cut(np.ones(5), 4, labels=False) - array([1, 1, 1, 1, 1]) + array([1, 1, 1, 1, 1], dtype=int64) """ # NOTE: this binning code is changed a bit from histogram for var(x) == 0 From 49e002fb856f05e9c2beb313e9e4f31e08e9b41a Mon Sep 17 00:00:00 2001 From: Iva Koevska Date: Sat, 10 Mar 2018 13:51:38 +0200 Subject: [PATCH 2/6] Udated with comments from Joris --- pandas/core/reshape/tile.py | 71 ++++++++++++++++++++++++------------- 1 file changed, 46 insertions(+), 25 deletions(-) diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 6d55e2a3a2fb1..0eff1729d8b6e 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -26,37 +26,41 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, include_lowest=False): """ - Return indices of half-open `bins` to which each value of `x` belongs. + Bin `x` and return data about the bin to which each `x` value belongs. 
- Use `cut` when you need to segment and sort data values into bins or - buckets of data. This function is also useful for going from a continuous - variable to a categorical variable. For example, `cut` could convert ages - to groups of age ranges. + This function splits `x` into the specified number of equal-width half- + open bins. Based on the parameters specified and the input, returns + information about the half-open bins to which each value of `x` belongs + or the bins themselves. + Use `cut` when you need to segment and sort data values into bins. This + function is also useful for going from a continuous variable to a + categorical variable. For example, `cut` could convert ages to groups + of age ranges. Parameters ---------- x : array-like - Input array to be binned. It has to be 1-dimensional. + The input array to be binned. Must be 1-dimensional. bins : int, sequence of scalars, or pandas.IntervalIndex - If `bins` is an int, defines the number of equal-width bins in the - range of `x`. The range of `x` is extended by .1% on each side to - include the min or max values of `x`. - If `bins` is a sequence, defines the bin edges allowing for - non-uniform bin width. No extension of the range of `x` is done. - right : bool, optional, default 'True' + If int, defines the number of equal-width bins in the range of `x`. + The range of `x` is extended by .1% on each side to include the min or + max values of `x`. + If a sequence, defines the bin edges allowing for non-uniform width. + No extension of the range of `x` is done. + right : bool, default 'True' Indicates whether the `bins` include the rightmost edge or not. If `right == True` (the default), then the `bins` [1,2,3,4] indicate (1,2], (2,3], (3,4]. labels : array or bool, optional - Used as labels for the resulting `bins`. Must be of the same length as - the resulting `bins`. If False, returns only integer indicators of the - `bins`. 
- retbins : bool, optional, default 'False' - Whether to return the `bins` or not. Useful when `bins` is provided + Specifies the labels for the returned bins. Must be the same length as + the resulting bins. If False, returns only integer indicators of the + bins. + retbins : bool, default 'False' + Whether to return the bins or not. Useful when bins is provided as a scalar. - precision : int, optional, default '3' - The precision at which to store and display the `bins` labels. - include_lowest : bool, optional, default 'False' + precision : int, default '3' + The precision at which to store and display the bins labels. + include_lowest : bool, default 'False' Whether the first interval should be left-inclusive or not. Returns @@ -64,10 +68,10 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, out : pandas.Categorical or Series, or array of int if `labels` is 'False' The return type depends on the input. If the input is a Series, a Series of type category is returned. - Else - pandas.Categorical is returned. `Bins` are represented as + Else - pandas.Categorical is returned. Bins are represented as categories when categorical data is returned. bins : numpy.ndarray of floats - Returned only if `retbins` is 'True'. + Returned when `retbins` is 'True'. See Also -------- @@ -87,10 +91,16 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, Examples -------- - >>> pd.cut(np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1]), 3, retbins=True) + >>> pd.cut(np.array([1,7,5,4,6,3]), 3) + ... # doctest: +ELLIPSIS + [(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], (5.0, 7.0], ... + Categories (3, interval[float64]): [(0.994, 3.0] < (3.0, 5.0] ... + + >>> pd.cut(np.array([1,7,5,4,6,3]), 3, retbins=True) ... # doctest: +ELLIPSIS - ([(0.19, 3.367], (0.19, 3.367], (0.19, 3.367], (3.367, 6.533], ... - Categories (3, interval[float64]): [(0.19, 3.367] < (3.367, 6.533] ... + ([(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], (5.0, 7.0], ... 
+ Categories (3, interval[float64]): [(0.994, 3.0] < (3.0, 5.0] ... + array([0.994, 3. , 5. , 7. ])) >>> pd.cut(np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1]), ... 3, labels=["good", "medium", "bad"]) @@ -100,6 +110,17 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, >>> pd.cut(np.ones(5), 4, labels=False) array([1, 1, 1, 1, 1], dtype=int64) + + >>> s = pd.Series(np.array([2,4,6,8,10]), index=['a', 'b', 'c', 'd', 'e']) + >>> pd.cut(s, 3) + ... # doctest: +ELLIPSIS + a (1.992, 4.667] + b (1.992, 4.667] + c (4.667, 7.333] + d (7.333, 10.0] + e (7.333, 10.0] + dtype: category + Categories (3, interval[float64]): [(1.992, 4.667] < (4.667, ... """ # NOTE: this binning code is changed a bit from histogram for var(x) == 0 From d24c749b0088d3c9651aa82624041bf94b43cd9b Mon Sep 17 00:00:00 2001 From: Iva Koevska Date: Mon, 12 Mar 2018 19:52:41 +0200 Subject: [PATCH 3/6] Updated as per comments --- pandas/core/reshape/tile.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 0eff1729d8b6e..903ef8ad4e34d 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -28,10 +28,10 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, """ Bin `x` and return data about the bin to which each `x` value belongs. - This function splits `x` into the specified number of equal-width half- - open bins. Based on the parameters specified and the input, returns - information about the half-open bins to which each value of `x` belongs - or the bins themselves. + Splits `x` into the specified number of equal-width half-open bins. + Based on the parameters specified and the input, returns data about + the half-open bins to which each value of `x` belongs or the bins + themselves. Use `cut` when you need to segment and sort data values into bins. This function is also useful for going from a continuous variable to a categorical variable. 
For example, `cut` could convert ages to groups @@ -108,7 +108,7 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, [good, good, good, medium, bad, good] Categories (3, object): [good < medium < bad] - >>> pd.cut(np.ones(5), 4, labels=False) + >>> pd.cut(np.ones(5, dtype='int64'), 4, labels=False) array([1, 1, 1, 1, 1], dtype=int64) >>> s = pd.Series(np.array([2,4,6,8,10]), index=['a', 'b', 'c', 'd', 'e']) From 1f3caf6a24d49d654f25d13badd5e09a0260c228 Mon Sep 17 00:00:00 2001 From: Iva Koevska Date: Mon, 12 Mar 2018 21:07:33 +0200 Subject: [PATCH 4/6] Updated as per comments --- pandas/core/reshape/tile.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 903ef8ad4e34d..a03628654638e 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -65,11 +65,14 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, Returns ------- - out : pandas.Categorical or Series, or array of int if `labels` is 'False' - The return type depends on the input. - If the input is a Series, a Series of type category is returned. - Else - pandas.Categorical is returned. Bins are represented as - categories when categorical data is returned. + out : pandas.Categorical, Series, or ndarray + An array-like object representing the respective bin for each value + of `x`. The type depends on the value of `labels`. + + * True : returns a Series for Series `x` or a pandas.Categorical for + pandas.Categorial `x`. + + * False : returns an ndarray of integers. bins : numpy.ndarray of floats Returned when `retbins` is 'True'. 
From 544af0e57f6e1462b21fe0670a852184123f3f83 Mon Sep 17 00:00:00 2001 From: Iva Koevska Date: Mon, 12 Mar 2018 21:20:03 +0200 Subject: [PATCH 5/6] Fixed whitespace issue --- pandas/core/reshape/tile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index a03628654638e..eb90e41be2933 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -71,7 +71,7 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, * True : returns a Series for Series `x` or a pandas.Categorical for pandas.Categorial `x`. - + * False : returns an ndarray of integers. bins : numpy.ndarray of floats Returned when `retbins` is 'True'. From f22e45fb5880550a276ff4fa4ebdb7ad3c2926d4 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 15 Mar 2018 15:28:04 -0500 Subject: [PATCH 6/6] Updated [ci skip] [ci skip] --- pandas/core/reshape/tile.py | 109 +++++++++++++++++++++++------------- 1 file changed, 69 insertions(+), 40 deletions(-) diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index eb90e41be2933..be28f7091712f 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -26,41 +26,44 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, include_lowest=False): """ - Bin `x` and return data about the bin to which each `x` value belongs. + Bin values into discrete intervals. - Splits `x` into the specified number of equal-width half-open bins. - Based on the parameters specified and the input, returns data about - the half-open bins to which each value of `x` belongs or the bins - themselves. Use `cut` when you need to segment and sort data values into bins. This function is also useful for going from a continuous variable to a - categorical variable. For example, `cut` could convert ages to groups - of age ranges. + categorical variable. For example, `cut` could convert ages to groups of + age ranges. 
Supports binning into an equal number of bins, or a + pre-specified array of bins. Parameters ---------- x : array-like The input array to be binned. Must be 1-dimensional. bins : int, sequence of scalars, or pandas.IntervalIndex - If int, defines the number of equal-width bins in the range of `x`. - The range of `x` is extended by .1% on each side to include the min or - max values of `x`. - If a sequence, defines the bin edges allowing for non-uniform width. - No extension of the range of `x` is done. - right : bool, default 'True' - Indicates whether the `bins` include the rightmost edge or not. If - `right == True` (the default), then the `bins` [1,2,3,4] indicate - (1,2], (2,3], (3,4]. + The criteria to bin by. + + * int : Defines the number of equal-width bins in the range of `x`. The + range of `x` is extended by .1% on each side to include the minimum + and maximum values of `x`. + * sequence of scalars : Defines the bin edges allowing for non-uniform + width. No extension of the range of `x` is done. + * IntervalIndex : Defines the exact bins to be used. + + right : bool, default True + Indicates whether `bins` includes the rightmost edge or not. If + ``right == True`` (the default), then the `bins` ``[1, 2, 3, 4]`` + indicate (1,2], (2,3], (3,4]. This argument is ignored when + `bins` is an IntervalIndex. labels : array or bool, optional Specifies the labels for the returned bins. Must be the same length as the resulting bins. If False, returns only integer indicators of the - bins. - retbins : bool, default 'False' + bins. This affects the type of the output container (see below). + This argument is ignored when `bins` is an IntervalIndex. + retbins : bool, default False Whether to return the bins or not. Useful when bins is provided as a scalar. - precision : int, default '3' + precision : int, default 3 The precision at which to store and display the bins labels. 
- include_lowest : bool, default 'False' + include_lowest : bool, default False Whether the first interval should be left-inclusive or not. Returns @@ -69,52 +72,68 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, An array-like object representing the respective bin for each value of `x`. The type depends on the value of `labels`. - * True : returns a Series for Series `x` or a pandas.Categorical for - pandas.Categorial `x`. + * None (default) : returns a Series for Series `x` or a + pandas.Categorical for all other inputs. The values stored within + are Interval dtype. + + * sequence of scalars : returns a Series for Series `x` or a + pandas.Categorical for all other inputs. The values stored within + are whatever the type in the sequence is. * False : returns an ndarray of integers. - bins : numpy.ndarray of floats - Returned when `retbins` is 'True'. + + bins : numpy.ndarray or IntervalIndex + The computed or specified bins. Only returned when `retbins=True`. + For scalar or sequence `bins`, this is an ndarray with the computed + bins. For an IntervalIndex `bins`, this is equal to `bins`. See Also -------- qcut : Discretize variable into equal-sized buckets based on rank or based on sample quantiles. - pandas.Categorical : Represents a categorical variable in - classic R / S-plus fashion. - Series : One-dimensional ndarray with axis labels (including time series). + pandas.Categorical : Array type for storing data that come from a + fixed set of values. + Series : One-dimensional array with axis labels (including time series). pandas.IntervalIndex : Immutable Index implementing an ordered, - sliceable set. IntervalIndex represents an Index of intervals that - are all closed on the same side. + sliceable set. Notes ----- Any NA values will be NA in the result. Out of bounds values will be NA in - the resulting pandas.Categorical object. + the resulting Series or pandas.Categorical object.
Examples -------- - >>> pd.cut(np.array([1,7,5,4,6,3]), 3) + Discretize into three equal-width bins. + + >>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3) ... # doctest: +ELLIPSIS [(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], (5.0, 7.0], ... Categories (3, interval[float64]): [(0.994, 3.0] < (3.0, 5.0] ... - >>> pd.cut(np.array([1,7,5,4,6,3]), 3, retbins=True) + >>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3, retbins=True) ... # doctest: +ELLIPSIS ([(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], (5.0, 7.0], ... Categories (3, interval[float64]): [(0.994, 3.0] < (3.0, 5.0] ... array([0.994, 3. , 5. , 7. ])) - >>> pd.cut(np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1]), - ... 3, labels=["good", "medium", "bad"]) - ... # doctest: +SKIP - [good, good, good, medium, bad, good] - Categories (3, object): [good < medium < bad] + Discover the same bins, but assign them specific labels. Notice that + the returned Categorical's categories are `labels` and that it is ordered. - >>> pd.cut(np.ones(5, dtype='int64'), 4, labels=False) - array([1, 1, 1, 1, 1], dtype=int64) + >>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), + ... 3, labels=["bad", "medium", "good"]) + [bad, good, medium, medium, good, bad] + Categories (3, object): [bad < medium < good] - >>> s = pd.Series(np.array([2,4,6,8,10]), index=['a', 'b', 'c', 'd', 'e']) + ``labels=False`` implies you just want the bins back. + + >>> pd.cut([0, 1, 1, 2], bins=4, labels=False) + array([0, 1, 1, 3]) + + Passing a Series as an input returns a Series with categorical dtype: + + >>> s = pd.Series(np.array([2, 4, 6, 8, 10]), + ... index=['a', 'b', 'c', 'd', 'e']) >>> pd.cut(s, 3) ... # doctest: +ELLIPSIS a (1.992, 4.667] @@ -124,6 +143,16 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, e (7.333, 10.0] dtype: category Categories (3, interval[float64]): [(1.992, 4.667] < (4.667, ... + + Passing an IntervalIndex for `bins` results in those categories exactly. + Notice that values not covered by the IntervalIndex are set to NaN.
0 + is to the left of the first bin (which is closed on the right), and 1.5 + falls between two bins. + + >>> bins = pd.IntervalIndex.from_tuples([(0, 1), (2, 3), (4, 5)]) + >>> pd.cut([0, 0.5, 1.5, 2.5, 4.5], bins) + [NaN, (0, 1], NaN, (2, 3], (4, 5]] + Categories (3, interval[int64]): [(0, 1] < (2, 3] < (4, 5]] """ # NOTE: this binning code is changed a bit from histogram for var(x) == 0