From 8a90d6d21403ffcdbb450ba80fac517d8577ba2f Mon Sep 17 00:00:00 2001 From: Iva Koevska Date: Fri, 9 Mar 2018 00:19:00 +0200 Subject: [PATCH 01/18] Reworked doc string for pandas.cut --- pandas/core/reshape/tile.py | 71 +++++++++++++++++++++---------------- 1 file changed, 40 insertions(+), 31 deletions(-) diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 359c030157bd3..d21eaba0f4983 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -24,53 +24,62 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, include_lowest=False): """ - Return indices of half-open bins to which each value of `x` belongs. - + Return indices of half-open `bins` to which each value of `x` belongs. + Parameters ---------- x : array-like Input array to be binned. It has to be 1-dimensional. - bins : int, sequence of scalars, or IntervalIndex - If `bins` is an int, it defines the number of equal-width bins in the - range of `x`. However, in this case, the range of `x` is extended - by .1% on each side to include the min or max values of `x`. If - `bins` is a sequence it defines the bin edges allowing for - non-uniform bin width. No extension of the range of `x` is done in - this case. - right : bool, optional - Indicates whether the bins include the rightmost edge or not. If - right == True (the default), then the bins [1,2,3,4] indicate + bins : int, sequence of scalars, or pandas.IntervalIndex + If `bins` is an int, defines the number of equal-width bins in the + range of `x`. The range of `x` is extended by .1% on each side to + include the min or max values of `x`. + If `bins` is a sequence, defines the bin edges allowing for + non-uniform bin width. No extension of the range of `x` is done. + right : bool, optional, default 'True' + Indicates whether the `bins` include the rightmost edge or not. If + `right == True` (the default), then the `bins` [1,2,3,4] indicate (1,2], (2,3], (3,4]. 
- labels : array or boolean, default None - Used as labels for the resulting bins. Must be of the same length as - the resulting bins. If False, return only integer indicators of the - bins. - retbins : bool, optional - Whether to return the bins or not. Can be useful if bins is given + labels : array or bool, optional + Used as labels for the resulting `bins`. Must be of the same length as + the resulting `bins`. If False, returns only integer indicators of the + `bins`. + retbins : bool, optional, default 'False' + Whether to return the `bins` or not. Useful when `bins` is provided as a scalar. - precision : int, optional - The precision at which to store and display the bins labels - include_lowest : bool, optional + precision : int, optional, default '3' + The precision at which to store and display the `bins` labels. + include_lowest : bool, optional, default 'False' Whether the first interval should be left-inclusive or not. Returns ------- - out : Categorical or Series or array of integers if labels is False - The return type (Categorical or Series) depends on the input: a Series - of type category if input is a Series else Categorical. Bins are - represented as categories when categorical data is returned. - bins : ndarray of floats - Returned only if `retbins` is True. + out : pandas.Categorical or Series, or array of integers if `labels` is 'False' + The return type depends on the input. If the input is a Series, a Series + of type category is returned. Else - pandas.Categorical is returned. + `Bins` are represented as categories when categorical data is returned. + bins : numpy.ndarray of floats + Returned only if `retbins` is 'True'. + + See Also + -------- + qcut : Discretize variable into equal-sized buckets based on rank + or based on sample quantiles. + pandas.Categorical : Represents a categorical variable in + classic R / S-plus fashion. + Series : One-dimensional ndarray with axis labels (including time series). 
+ pandas.IntervalIndex : Immutable Index implementing an ordered, sliceable set. + IntervalIndex represents an Index of intervals that are all closed on the + same side. Notes ----- - The `cut` function can be useful for going from a continuous variable to + The `cut` function is useful for going from a continuous variable to a categorical variable. For example, `cut` could convert ages to groups of age ranges. - Any NA values will be NA in the result. Out of bounds values will be NA in - the resulting Categorical object - + Any NA values will be NA in the result. Out of bounds values will be NA in + the resulting pandas.Categorical object. Examples -------- From 690dbc5a5ab99f04f2486e25829daad6fa5cd528 Mon Sep 17 00:00:00 2001 From: Iva Koevska Date: Fri, 9 Mar 2018 00:52:50 +0200 Subject: [PATCH 02/18] Fixed example and extended descr --- pandas/core/reshape/tile.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index d21eaba0f4983..d79eda71436ff 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -25,6 +25,11 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, include_lowest=False): """ Return indices of half-open `bins` to which each value of `x` belongs. + + Use `cut` when you need to segment and sort data values into bins or + buckets of data. This function is also useful for going from a continuous + variable to a categorical variable. For example, `cut` could convert ages + to groups of age ranges. Parameters ---------- @@ -74,10 +79,6 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, Notes ----- - The `cut` function is useful for going from a continuous variable to - a categorical variable. For example, `cut` could convert ages to groups - of age ranges. - Any NA values will be NA in the result. Out of bounds values will be NA in the resulting pandas.Categorical object. 
@@ -95,7 +96,7 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, Categories (3, object): [good < medium < bad] >>> pd.cut(np.ones(5), 4, labels=False) - array([1, 1, 1, 1, 1]) + array([1, 1, 1, 1, 1], dtype=int64) """ # NOTE: this binning code is changed a bit from histogram for var(x) == 0 From 00f35fbab77d1534432b050336b0af02e21c59ca Mon Sep 17 00:00:00 2001 From: Iva Koevska Date: Fri, 9 Mar 2018 00:52:50 +0200 Subject: [PATCH 03/18] DOC: Fixed example & description for pandas.cut --- pandas/core/reshape/tile.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index d21eaba0f4983..d79eda71436ff 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -25,6 +25,11 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, include_lowest=False): """ Return indices of half-open `bins` to which each value of `x` belongs. + + Use `cut` when you need to segment and sort data values into bins or + buckets of data. This function is also useful for going from a continuous + variable to a categorical variable. For example, `cut` could convert ages + to groups of age ranges. Parameters ---------- @@ -74,10 +79,6 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, Notes ----- - The `cut` function is useful for going from a continuous variable to - a categorical variable. For example, `cut` could convert ages to groups - of age ranges. - Any NA values will be NA in the result. Out of bounds values will be NA in the resulting pandas.Categorical object. 
@@ -95,7 +96,7 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, Categories (3, object): [good < medium < bad] >>> pd.cut(np.ones(5), 4, labels=False) - array([1, 1, 1, 1, 1]) + array([1, 1, 1, 1, 1], dtype=int64) """ # NOTE: this binning code is changed a bit from histogram for var(x) == 0 From 54df8d3c3a4b24b9f0d89b0c61aeab22fc576ff2 Mon Sep 17 00:00:00 2001 From: Iva Koevska Date: Fri, 9 Mar 2018 07:45:44 +0200 Subject: [PATCH 04/18] DOC: Fixed issues with panda.cut after flake8 --- pandas/core/reshape/tile.py | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index d79eda71436ff..5925413bd26b8 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -26,19 +26,19 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, """ Return indices of half-open `bins` to which each value of `x` belongs. - Use `cut` when you need to segment and sort data values into bins or + Use `cut` when you need to segment and sort data values into bins or buckets of data. This function is also useful for going from a continuous variable to a categorical variable. For example, `cut` could convert ages to groups of age ranges. - + Parameters ---------- x : array-like Input array to be binned. It has to be 1-dimensional. bins : int, sequence of scalars, or pandas.IntervalIndex If `bins` is an int, defines the number of equal-width bins in the - range of `x`. The range of `x` is extended by .1% on each side to - include the min or max values of `x`. + range of `x`. The range of `x` is extended by .1% on each side to + include the min or max values of `x`. If `bins` is a sequence, defines the bin edges allowing for non-uniform bin width. No extension of the range of `x` is done. 
right : bool, optional, default 'True' @@ -59,23 +59,24 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, Returns ------- - out : pandas.Categorical or Series, or array of integers if `labels` is 'False' - The return type depends on the input. If the input is a Series, a Series - of type category is returned. Else - pandas.Categorical is returned. - `Bins` are represented as categories when categorical data is returned. + out : pandas.Categorical or Series, or array of int if `labels` is 'False' + The return type depends on the input. + If the input is a Series, a Series of type category is returned. + Else - pandas.Categorical is returned. `Bins` are represented as + categories when categorical data is returned. bins : numpy.ndarray of floats Returned only if `retbins` is 'True'. - + See Also -------- - qcut : Discretize variable into equal-sized buckets based on rank + qcut : Discretize variable into equal-sized buckets based on rank or based on sample quantiles. - pandas.Categorical : Represents a categorical variable in + pandas.Categorical : Represents a categorical variable in classic R / S-plus fashion. Series : One-dimensional ndarray with axis labels (including time series). - pandas.IntervalIndex : Immutable Index implementing an ordered, sliceable set. - IntervalIndex represents an Index of intervals that are all closed on the - same side. + pandas.IntervalIndex : Immutable Index implementing an ordered, + sliceable set. IntervalIndex represents an Index of intervals that + are all closed on the same side. 
Notes ----- From 747501a810173b95124c34f0ac6c25957cff5e12 Mon Sep 17 00:00:00 2001 From: Aly Sivji <4369343+alysivji@users.noreply.github.com> Date: Fri, 9 Mar 2018 02:19:59 -0600 Subject: [PATCH 05/18] DOC: Improve docstring for pandas.Index.repeat (#19985) --- pandas/core/indexes/base.py | 34 ++++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 9cc8c0dfe2a90..69a07a91838e1 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -696,12 +696,38 @@ def memory_usage(self, deep=False): @deprecate_kwarg(old_arg_name='n', new_arg_name='repeats') def repeat(self, repeats, *args, **kwargs): """ - Repeat elements of an Index. Refer to `numpy.ndarray.repeat` - for more information about the `repeats` argument. + Repeat elements of an Index. - See also + Returns a new index where each element of the current index + is repeated consecutively a given number of times. + + Parameters + ---------- + repeats : int + The number of repetitions for each element. + **kwargs + Additional keywords have no effect but might be accepted for + compatibility with numpy. + + Returns + ------- + pandas.Index + Newly created Index with repeated elements. 
+ + See Also + -------- + Series.repeat : Equivalent function for Series + numpy.repeat : Underlying implementation + + Examples -------- - numpy.ndarray.repeat + >>> idx = pd.Index([1, 2, 3]) + >>> idx + Int64Index([1, 2, 3], dtype='int64') + >>> idx.repeat(2) + Int64Index([1, 1, 2, 2, 3, 3], dtype='int64') + >>> idx.repeat(3) + Int64Index([1, 1, 1, 2, 2, 2, 3, 3, 3], dtype='int64') """ nv.validate_repeat(args, kwargs) return self._shallow_copy(self._values.repeat(repeats)) From 9119d076837dfcfe843726eb58cca4e449245562 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 9 Mar 2018 10:03:44 +0100 Subject: [PATCH 06/18] Temporary github PR template for sprint (#20055) --- .github/PULL_REQUEST_TEMPLATE.md | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 4e1e9ce017408..c1e02bd8eafc4 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,3 +1,27 @@ +Checklist for the pandas documentation sprint (ignore this if you are doing +an unrelated PR): + +- [ ] PR title is "DOC: update the docstring" +- [ ] The validation script passes: `scripts/validate_docstrings.py ` +- [ ] The PEP8 style check passes: `git diff upstream/master -u -- "*.py" | flake8 --diff` +- [ ] The html version looks good: `python doc/make.py --single ` +- [ ] It has been proofread on language by another sprint participant + +Please include the output of the validation script below between the "```" ticks: + +``` +# paste output of "scripts/validate_docstrings.py " here +# between the "```" (remove this comment, but keep the "```") + +``` + +If the validation script still gives errors, but you think there is a good reason +to deviate in this case (and there are certainly such cases), please state this +explicitly. 
+ + +Checklist for other PRs (remove this part if you are doing a PR for the pandas documentation sprint): + - [ ] closes #xxxx - [ ] tests added / passed - [ ] passes `git diff upstream/master -u -- "*.py" | flake8 --diff` From c730d08eca4c7f4455d0fc61802bb712db07aea7 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Fri, 9 Mar 2018 02:37:27 -0800 Subject: [PATCH 07/18] DOC: Update Kurt Docstr (#20044) --- pandas/core/window.py | 41 ++++++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/pandas/core/window.py b/pandas/core/window.py index cef012bb33e9f..c41b07759d555 100644 --- a/pandas/core/window.py +++ b/pandas/core/window.py @@ -906,21 +906,23 @@ def skew(self, **kwargs): Parameters ---------- - kwargs : Under Review + **kwargs + Under Review. Returns ------- - Series or DataFrame (matches input) - Like-indexed object containing the result of function application + Series or DataFrame + Returned object type is determined by the caller of the %(name)s + calculation See Also -------- - pandas.Series.%(name)s - pandas.DataFrame.%(name)s - pandas.Series.kurtosis - pandas.DataFrame.kurtosis - scipy.stats.skew - scipy.stats.kurtosis + Series.%(name)s : Calling object with Series data + DataFrame.%(name)s : Calling object with DataFrames + Series.kurt : Equivalent method for Series + DataFrame.kurt : Equivalent method for DataFrame + scipy.stats.skew : Third moment of a probability density + scipy.stats.kurtosis : Reference SciPy method Notes ----- @@ -932,19 +934,20 @@ def skew(self, **kwargs): four matching the equivalent function call using `scipy.stats`. 
>>> arr = [1, 2, 3, 4, 999] + >>> fmt = "{0:.6f}" # limit the printed precision to 6 digits >>> import scipy.stats - >>> print("{0:.6f}".format(scipy.stats.kurtosis(arr[:-1], bias=False))) + >>> print(fmt.format(scipy.stats.kurtosis(arr[:-1], bias=False))) -1.200000 - >>> print("{0:.6f}".format(scipy.stats.kurtosis(arr[1:], bias=False))) + >>> print(fmt.format(scipy.stats.kurtosis(arr[1:], bias=False))) 3.999946 - >>> df = pd.DataFrame(arr) - >>> df.rolling(4).kurt() - 0 - 0 NaN - 1 NaN - 2 NaN - 3 -1.200000 - 4 3.999946 + >>> s = pd.Series(arr) + >>> s.rolling(4).kurt() + 0 NaN + 1 NaN + 2 NaN + 3 -1.200000 + 4 3.999946 + dtype: float64 """) def kurt(self, **kwargs): From cc1b934a1be61ff5f1f2776426870b86c384819c Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Fri, 9 Mar 2018 03:13:50 -0800 Subject: [PATCH 08/18] BUG: Retain timezone dtype with cut and qcut (#19890) --- doc/source/whatsnew/v0.23.0.txt | 1 + pandas/core/reshape/tile.py | 30 ++++++--- pandas/tests/reshape/test_tile.py | 108 ++++++++++++++++++++++-------- 3 files changed, 102 insertions(+), 37 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index bb513605b1c94..302f8043f3ba7 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -1019,6 +1019,7 @@ Reshaping - Bug in :func:`DataFrame.iterrows`, which would infers strings not compliant to `ISO8601 `_ to datetimes (:issue:`19671`) - Bug in :class:`Series` constructor with ``Categorical`` where a ```ValueError`` is not raised when an index of different length is given (:issue:`19342`) - Bug in :meth:`DataFrame.astype` where column metadata is lost when converting to categorical or a dictionary of dtypes (:issue:`19920`) +- Bug in :func:`cut` and :func:`qcut` where timezone information was dropped (:issue:`19872`) Other ^^^^^ diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 359c030157bd3..30132ddc05c40 100644 --- a/pandas/core/reshape/tile.py +++ 
b/pandas/core/reshape/tile.py @@ -1,6 +1,7 @@ """ Quantilization functions and related stuff """ +from functools import partial from pandas.core.dtypes.missing import isna from pandas.core.dtypes.common import ( @@ -9,6 +10,7 @@ is_categorical_dtype, is_datetime64_dtype, is_timedelta64_dtype, + is_datetime64tz_dtype, _ensure_int64) import pandas.core.algorithms as algos @@ -239,7 +241,8 @@ def _bins_to_cuts(x, bins, right=True, labels=None, ids = _ensure_int64(bins.searchsorted(x, side=side)) if include_lowest: - ids[x == bins[0]] = 1 + # Numpy 1.9 support: ensure this mask is a Numpy array + ids[np.asarray(x == bins[0])] = 1 na_mask = isna(x) | (ids == len(bins)) | (ids == 0) has_nas = na_mask.any() @@ -284,12 +287,14 @@ def _coerce_to_type(x): """ dtype = None - if is_timedelta64_dtype(x): - x = to_timedelta(x) - dtype = np.timedelta64 + if is_datetime64tz_dtype(x): + dtype = x.dtype elif is_datetime64_dtype(x): x = to_datetime(x) dtype = np.datetime64 + elif is_timedelta64_dtype(x): + x = to_timedelta(x) + dtype = np.timedelta64 if dtype is not None: # GH 19768: force NaT to NaN during integer conversion @@ -305,7 +310,7 @@ def _convert_bin_to_numeric_type(bins, dtype): Parameters ---------- - bins : list-liek of bins + bins : list-like of bins dtype : dtype of data Raises @@ -318,7 +323,7 @@ def _convert_bin_to_numeric_type(bins, dtype): bins = to_timedelta(bins).view(np.int64) else: raise ValueError("bins must be of timedelta64 dtype") - elif is_datetime64_dtype(dtype): + elif is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype): if bins_dtype in ['datetime', 'datetime64']: bins = to_datetime(bins).view(np.int64) else: @@ -333,7 +338,10 @@ def _format_labels(bins, precision, right=True, closed = 'right' if right else 'left' - if is_datetime64_dtype(dtype): + if is_datetime64tz_dtype(dtype): + formatter = partial(Timestamp, tz=dtype.tz) + adjust = lambda x: x - Timedelta('1ns') + elif is_datetime64_dtype(dtype): formatter = Timestamp adjust = lambda x: x 
- Timedelta('1ns') elif is_timedelta64_dtype(dtype): @@ -372,7 +380,13 @@ def _preprocess_for_cut(x): series_index = x.index name = x.name - x = np.asarray(x) + # Check that the passed array is a Pandas or Numpy object + # We don't want to strip away a Pandas data-type here (e.g. datetimetz) + ndim = getattr(x, 'ndim', None) + if ndim is None: + x = np.asarray(x) + if x.ndim != 1: + raise ValueError("Input array must be 1 dimensional") return x_is_series, series_index, name, x diff --git a/pandas/tests/reshape/test_tile.py b/pandas/tests/reshape/test_tile.py index ff914273d47b1..8d093f2784ba1 100644 --- a/pandas/tests/reshape/test_tile.py +++ b/pandas/tests/reshape/test_tile.py @@ -4,7 +4,7 @@ import numpy as np from pandas.compat import zip -from pandas import (Series, isna, to_datetime, DatetimeIndex, +from pandas import (DataFrame, Series, isna, to_datetime, DatetimeIndex, Index, Timestamp, Interval, IntervalIndex, Categorical, cut, qcut, date_range, NaT, TimedeltaIndex) from pandas.tseries.offsets import Nano, Day @@ -104,6 +104,12 @@ def test_cut_corner(self): pytest.raises(ValueError, cut, [1, 2, 3], 0.5) + @pytest.mark.parametrize('arg', [2, np.eye(2), DataFrame(np.eye(2))]) + @pytest.mark.parametrize('cut_func', [cut, qcut]) + def test_cut_not_1d_arg(self, arg, cut_func): + with pytest.raises(ValueError): + cut_func(arg, 2) + def test_cut_out_of_range_more(self): # #1511 s = Series([0, -1, 0, 1, -3], name='x') @@ -251,18 +257,6 @@ def test_qcut_nas(self): result = qcut(arr, 4) assert isna(result[:20]).all() - @pytest.mark.parametrize('s', [ - Series(DatetimeIndex(['20180101', NaT, '20180103'])), - Series(TimedeltaIndex(['0 days', NaT, '2 days']))], - ids=lambda x: str(x.dtype)) - def test_qcut_nat(self, s): - # GH 19768 - intervals = IntervalIndex.from_tuples( - [(s[0] - Nano(), s[2] - Day()), np.nan, (s[2] - Day(), s[2])]) - expected = Series(Categorical(intervals, ordered=True)) - result = qcut(s, 2) - tm.assert_series_equal(result, expected) - def 
test_qcut_index(self): result = qcut([0, 2], 2) intervals = [Interval(-0.001, 1), Interval(1, 2)] @@ -452,6 +446,37 @@ def test_single_bin(self): result = cut(s, 1, labels=False) tm.assert_series_equal(result, expected) + @pytest.mark.parametrize( + "array_1_writeable, array_2_writeable", + [(True, True), (True, False), (False, False)]) + def test_cut_read_only(self, array_1_writeable, array_2_writeable): + # issue 18773 + array_1 = np.arange(0, 100, 10) + array_1.flags.writeable = array_1_writeable + + array_2 = np.arange(0, 100, 10) + array_2.flags.writeable = array_2_writeable + + hundred_elements = np.arange(100) + + tm.assert_categorical_equal(cut(hundred_elements, array_1), + cut(hundred_elements, array_2)) + + +class TestDatelike(object): + + @pytest.mark.parametrize('s', [ + Series(DatetimeIndex(['20180101', NaT, '20180103'])), + Series(TimedeltaIndex(['0 days', NaT, '2 days']))], + ids=lambda x: str(x.dtype)) + def test_qcut_nat(self, s): + # GH 19768 + intervals = IntervalIndex.from_tuples( + [(s[0] - Nano(), s[2] - Day()), np.nan, (s[2] - Day(), s[2])]) + expected = Series(Categorical(intervals, ordered=True)) + result = qcut(s, 2) + tm.assert_series_equal(result, expected) + def test_datetime_cut(self): # GH 14714 # testing for time data to be present as series @@ -488,6 +513,47 @@ def test_datetime_cut(self): result, bins = cut(data, 3, retbins=True) tm.assert_series_equal(Series(result), expected) + @pytest.mark.parametrize('bins', [ + 3, [Timestamp('2013-01-01 04:57:07.200000'), + Timestamp('2013-01-01 21:00:00'), + Timestamp('2013-01-02 13:00:00'), + Timestamp('2013-01-03 05:00:00')]]) + @pytest.mark.parametrize('box', [list, np.array, Index, Series]) + def test_datetimetz_cut(self, bins, box): + # GH 19872 + tz = 'US/Eastern' + s = Series(date_range('20130101', periods=3, tz=tz)) + if not isinstance(bins, int): + bins = box(bins) + result = cut(s, bins) + expected = ( + Series(IntervalIndex([ + Interval(Timestamp('2012-12-31 23:57:07.200000', 
tz=tz), + Timestamp('2013-01-01 16:00:00', tz=tz)), + Interval(Timestamp('2013-01-01 16:00:00', tz=tz), + Timestamp('2013-01-02 08:00:00', tz=tz)), + Interval(Timestamp('2013-01-02 08:00:00', tz=tz), + Timestamp('2013-01-03 00:00:00', tz=tz))])) + .astype(CDT(ordered=True))) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize('bins', [3, np.linspace(0, 1, 4)]) + def test_datetimetz_qcut(self, bins): + # GH 19872 + tz = 'US/Eastern' + s = Series(date_range('20130101', periods=3, tz=tz)) + result = qcut(s, bins) + expected = ( + Series(IntervalIndex([ + Interval(Timestamp('2012-12-31 23:59:59.999999999', tz=tz), + Timestamp('2013-01-01 16:00:00', tz=tz)), + Interval(Timestamp('2013-01-01 16:00:00', tz=tz), + Timestamp('2013-01-02 08:00:00', tz=tz)), + Interval(Timestamp('2013-01-02 08:00:00', tz=tz), + Timestamp('2013-01-03 00:00:00', tz=tz))])) + .astype(CDT(ordered=True))) + tm.assert_series_equal(result, expected) + def test_datetime_bin(self): data = [np.datetime64('2012-12-13'), np.datetime64('2012-12-15')] bin_data = ['2012-12-12', '2012-12-14', '2012-12-16'] @@ -523,19 +589,3 @@ def f(): mask = result.isna() tm.assert_numpy_array_equal( mask, np.array([False, True, True, True, True])) - - @pytest.mark.parametrize( - "array_1_writeable, array_2_writeable", - [(True, True), (True, False), (False, False)]) - def test_cut_read_only(self, array_1_writeable, array_2_writeable): - # issue 18773 - array_1 = np.arange(0, 100, 10) - array_1.flags.writeable = array_1_writeable - - array_2 = np.arange(0, 100, 10) - array_2.flags.writeable = array_2_writeable - - hundred_elements = np.arange(100) - - tm.assert_categorical_equal(cut(hundred_elements, array_1), - cut(hundred_elements, array_2)) From 731d97164ba04150174c4e05f56d40f716044219 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Fri, 9 Mar 2018 03:30:22 -0800 Subject: [PATCH 09/18] Fix typo in apply.py (#20058) --- pandas/core/apply.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) 
diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 9056f78ee02ed..8fb74e2e87174 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -191,7 +191,7 @@ def apply_broadcast(self, target): for i, col in enumerate(target.columns): res = self.f(target[col]) - ares = np. asarray(res).ndim + ares = np.asarray(res).ndim # must be a scalar or 1d if ares > 1: From 7c14e4f14aff216be558bf5d4d2d00b4838c2360 Mon Sep 17 00:00:00 2001 From: Kyle Barron Date: Fri, 9 Mar 2018 11:31:14 -0500 Subject: [PATCH 10/18] DOC: Add syntax highlighting to SAS code blocks in comparison_with_sas.rst (#20080) * Add syntax highlighting to SAS code blocks * Fix typo --- doc/source/comparison_with_sas.rst | 44 +++++++++++++++--------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/doc/source/comparison_with_sas.rst b/doc/source/comparison_with_sas.rst index 214667119f7e0..0354ad473544b 100644 --- a/doc/source/comparison_with_sas.rst +++ b/doc/source/comparison_with_sas.rst @@ -25,7 +25,7 @@ As is customary, we import pandas and NumPy as follows: This is often used in interactive work (e.g. `Jupyter notebook `_ or terminal) - the equivalent in SAS would be: - .. code-block:: none + .. code-block:: sas proc print data=df(obs=5); run; @@ -65,7 +65,7 @@ in the ``DATA`` step. Every ``DataFrame`` and ``Series`` has an ``Index`` - which are labels on the *rows* of the data. SAS does not have an exactly analogous concept. A data set's -row are essentially unlabeled, other than an implicit integer index that can be +rows are essentially unlabeled, other than an implicit integer index that can be accessed during the ``DATA`` step (``_N_``). In pandas, if no index is specified, an integer index is also used by default @@ -87,7 +87,7 @@ A SAS data set can be built from specified values by placing the data after a ``datalines`` statement and specifying the column names. -.. code-block:: none +.. 
code-block:: sas data df; input x y; @@ -121,7 +121,7 @@ will be used in many of the following examples. SAS provides ``PROC IMPORT`` to read csv data into a data set. -.. code-block:: none +.. code-block:: sas proc import datafile='tips.csv' dbms=csv out=tips replace; getnames=yes; @@ -156,7 +156,7 @@ Exporting Data The inverse of ``PROC IMPORT`` in SAS is ``PROC EXPORT`` -.. code-block:: none +.. code-block:: sas proc export data=tips outfile='tips2.csv' dbms=csv; run; @@ -178,7 +178,7 @@ Operations on Columns In the ``DATA`` step, arbitrary math expressions can be used on new or existing columns. -.. code-block:: none +.. code-block:: sas data tips; set tips; @@ -207,7 +207,7 @@ Filtering Filtering in SAS is done with an ``if`` or ``where`` statement, on one or more columns. -.. code-block:: none +.. code-block:: sas data tips; set tips; @@ -233,7 +233,7 @@ If/Then Logic In SAS, if/then logic can be used to create new columns. -.. code-block:: none +.. code-block:: sas data tips; set tips; @@ -262,7 +262,7 @@ Date Functionality SAS provides a variety of functions to do operations on date/datetime columns. -.. code-block:: none +.. code-block:: sas data tips; set tips; @@ -307,7 +307,7 @@ Selection of Columns SAS provides keywords in the ``DATA`` step to select, drop, and rename columns. -.. code-block:: none +.. code-block:: sas data tips; set tips; @@ -343,7 +343,7 @@ Sorting by Values Sorting in SAS is accomplished via ``PROC SORT`` -.. code-block:: none +.. code-block:: sas proc sort data=tips; by sex total_bill; @@ -369,7 +369,7 @@ SAS determines the length of a character string with the and `LENGTHC `__ functions. ``LENGTHN`` excludes trailing blanks and ``LENGTHC`` includes trailing blanks. -.. code-block:: none +.. 
code-block:: sas data _null_; set tips; @@ -395,7 +395,7 @@ SAS determines the position of a character in a string with the ``FINDW`` takes the string defined by the first argument and searches for the first position of the substring you supply as the second argument. -.. code-block:: none +.. code-block:: sas data _null_; set tips; @@ -419,7 +419,7 @@ Substring SAS extracts a substring from a string based on its position with the `SUBSTR `__ function. -.. code-block:: none +.. code-block:: sas data _null_; set tips; @@ -442,7 +442,7 @@ The SAS `SCAN `__ functions change the case of the argument. -.. code-block:: none +.. code-block:: sas data firstlast; input String $60.; @@ -516,7 +516,7 @@ types of joins are accomplished using the ``in=`` dummy variables to track whether a match was found in one or both input frames. -.. code-block:: none +.. code-block:: sas proc sort data=df1; by key; @@ -572,7 +572,7 @@ operations, and is ignored by default for aggregations. One difference is that missing data cannot be compared to its sentinel value. For example, in SAS you could do this to filter missing values. -.. code-block:: none +.. code-block:: sas data outer_join_nulls; set outer_join; @@ -615,7 +615,7 @@ SAS's PROC SUMMARY can be used to group by one or more key variables and compute aggregations on numeric columns. -.. code-block:: none +.. code-block:: sas proc summary data=tips nway; class sex smoker; @@ -640,7 +640,7 @@ In SAS, if the group aggregations need to be used with the original frame, it must be merged back together. For example, to subtract the mean for each observation by smoker group. -.. code-block:: none +.. code-block:: sas proc summary data=tips missing nway; class smoker; @@ -679,7 +679,7 @@ replicate most other by group processing from SAS. For example, this ``DATA`` step reads the data by sex/smoker group and filters to the first entry for each. -.. code-block:: none +.. 
code-block:: sas proc sort data=tips; by sex smoker; @@ -719,7 +719,7 @@ Data Interop pandas provides a :func:`read_sas` method that can read SAS data saved in the XPORT or SAS7BDAT binary format. -.. code-block:: none +.. code-block:: sas libname xportout xport 'transport-file.xpt'; data xportout.tips; From ed96567dacefc6017ca981c964c852d1e7609a14 Mon Sep 17 00:00:00 2001 From: Ksenia Date: Sat, 10 Mar 2018 02:40:10 +0100 Subject: [PATCH 11/18] TST: series/indexing tests parametrization + moving test methods (#20059) --- .../tests/series/indexing/test_alter_index.py | 201 +++++---- pandas/tests/series/indexing/test_boolean.py | 121 +++-- pandas/tests/series/indexing/test_callable.py | 33 ++ pandas/tests/series/indexing/test_datetime.py | 10 +- pandas/tests/series/indexing/test_iloc.py | 2 - pandas/tests/series/indexing/test_indexing.py | 419 +++++++----------- pandas/tests/series/indexing/test_loc.py | 3 - pandas/tests/series/indexing/test_numeric.py | 5 + 8 files changed, 378 insertions(+), 416 deletions(-) create mode 100644 pandas/tests/series/indexing/test_callable.py diff --git a/pandas/tests/series/indexing/test_alter_index.py b/pandas/tests/series/indexing/test_alter_index.py index 2629cfde9b4af..c1b6d0a452232 100644 --- a/pandas/tests/series/indexing/test_alter_index.py +++ b/pandas/tests/series/indexing/test_alter_index.py @@ -21,88 +21,72 @@ JOIN_TYPES = ['inner', 'outer', 'left', 'right'] -def test_align(test_data): - def _check_align(a, b, how='left', fill=None): - aa, ab = a.align(b, join=how, fill_value=fill) - - join_index = a.index.join(b.index, how=how) - if fill is not None: - diff_a = aa.index.difference(join_index) - diff_b = ab.index.difference(join_index) - if len(diff_a) > 0: - assert (aa.reindex(diff_a) == fill).all() - if len(diff_b) > 0: - assert (ab.reindex(diff_b) == fill).all() - - ea = a.reindex(join_index) - eb = b.reindex(join_index) - - if fill is not None: - ea = ea.fillna(fill) - eb = eb.fillna(fill) - - assert_series_equal(aa, 
ea) - assert_series_equal(ab, eb) - assert aa.name == 'ts' - assert ea.name == 'ts' - assert ab.name == 'ts' - assert eb.name == 'ts' - - for kind in JOIN_TYPES: - _check_align(test_data.ts[2:], test_data.ts[:-5], how=kind) - _check_align(test_data.ts[2:], test_data.ts[:-5], how=kind, fill=-1) - - # empty left - _check_align(test_data.ts[:0], test_data.ts[:-5], how=kind) - _check_align(test_data.ts[:0], test_data.ts[:-5], how=kind, fill=-1) - - # empty right - _check_align(test_data.ts[:-5], test_data.ts[:0], how=kind) - _check_align(test_data.ts[:-5], test_data.ts[:0], how=kind, fill=-1) - - # both empty - _check_align(test_data.ts[:0], test_data.ts[:0], how=kind) - _check_align(test_data.ts[:0], test_data.ts[:0], how=kind, fill=-1) - - -def test_align_fill_method(test_data): - def _check_align(a, b, how='left', method='pad', limit=None): - aa, ab = a.align(b, join=how, method=method, limit=limit) - - join_index = a.index.join(b.index, how=how) - ea = a.reindex(join_index) - eb = b.reindex(join_index) - - ea = ea.fillna(method=method, limit=limit) - eb = eb.fillna(method=method, limit=limit) - - assert_series_equal(aa, ea) - assert_series_equal(ab, eb) - - for kind in JOIN_TYPES: - for meth in ['pad', 'bfill']: - _check_align(test_data.ts[2:], test_data.ts[:-5], - how=kind, method=meth) - _check_align(test_data.ts[2:], test_data.ts[:-5], - how=kind, method=meth, limit=1) - - # empty left - _check_align(test_data.ts[:0], test_data.ts[:-5], - how=kind, method=meth) - _check_align(test_data.ts[:0], test_data.ts[:-5], - how=kind, method=meth, limit=1) - - # empty right - _check_align(test_data.ts[:-5], test_data.ts[:0], - how=kind, method=meth) - _check_align(test_data.ts[:-5], test_data.ts[:0], - how=kind, method=meth, limit=1) - - # both empty - _check_align(test_data.ts[:0], test_data.ts[:0], - how=kind, method=meth) - _check_align(test_data.ts[:0], test_data.ts[:0], - how=kind, method=meth, limit=1) +@pytest.mark.parametrize( + 'first_slice,second_slice', [ + [[2, 
None], [None, -5]], + [[None, 0], [None, -5]], + [[None, -5], [None, 0]], + [[None, 0], [None, 0]] + ]) +@pytest.mark.parametrize('join_type', JOIN_TYPES) +@pytest.mark.parametrize('fill', [None, -1]) +def test_align(test_data, first_slice, second_slice, join_type, fill): + a = test_data.ts[slice(*first_slice)] + b = test_data.ts[slice(*second_slice)] + + aa, ab = a.align(b, join=join_type, fill_value=fill) + + join_index = a.index.join(b.index, how=join_type) + if fill is not None: + diff_a = aa.index.difference(join_index) + diff_b = ab.index.difference(join_index) + if len(diff_a) > 0: + assert (aa.reindex(diff_a) == fill).all() + if len(diff_b) > 0: + assert (ab.reindex(diff_b) == fill).all() + + ea = a.reindex(join_index) + eb = b.reindex(join_index) + + if fill is not None: + ea = ea.fillna(fill) + eb = eb.fillna(fill) + + assert_series_equal(aa, ea) + assert_series_equal(ab, eb) + assert aa.name == 'ts' + assert ea.name == 'ts' + assert ab.name == 'ts' + assert eb.name == 'ts' + + +@pytest.mark.parametrize( + 'first_slice,second_slice', [ + [[2, None], [None, -5]], + [[None, 0], [None, -5]], + [[None, -5], [None, 0]], + [[None, 0], [None, 0]] + ]) +@pytest.mark.parametrize('join_type', JOIN_TYPES) +@pytest.mark.parametrize('method', ['pad', 'bfill']) +@pytest.mark.parametrize('limit', [None, 1]) +def test_align_fill_method(test_data, + first_slice, second_slice, + join_type, method, limit): + a = test_data.ts[slice(*first_slice)] + b = test_data.ts[slice(*second_slice)] + + aa, ab = a.align(b, join=join_type, method=method, limit=limit) + + join_index = a.index.join(b.index, how=join_type) + ea = a.reindex(join_index) + eb = b.reindex(join_index) + + ea = ea.fillna(method=method, limit=limit) + eb = eb.fillna(method=method, limit=limit) + + assert_series_equal(aa, ea) + assert_series_equal(ab, eb) def test_align_nocopy(test_data): @@ -481,3 +465,56 @@ def test_rename(): assert_series_equal(result, expected) assert result.name == expected.name + + +def 
test_drop(): + # unique + s = Series([1, 2], index=['one', 'two']) + expected = Series([1], index=['one']) + result = s.drop(['two']) + assert_series_equal(result, expected) + result = s.drop('two', axis='rows') + assert_series_equal(result, expected) + + # non-unique + # GH 5248 + s = Series([1, 1, 2], index=['one', 'two', 'one']) + expected = Series([1, 2], index=['one', 'one']) + result = s.drop(['two'], axis=0) + assert_series_equal(result, expected) + result = s.drop('two') + assert_series_equal(result, expected) + + expected = Series([1], index=['two']) + result = s.drop(['one']) + assert_series_equal(result, expected) + result = s.drop('one') + assert_series_equal(result, expected) + + # single string/tuple-like + s = Series(range(3), index=list('abc')) + pytest.raises(KeyError, s.drop, 'bc') + pytest.raises(KeyError, s.drop, ('a',)) + + # errors='ignore' + s = Series(range(3), index=list('abc')) + result = s.drop('bc', errors='ignore') + assert_series_equal(result, s) + result = s.drop(['a', 'd'], errors='ignore') + expected = s.iloc[1:] + assert_series_equal(result, expected) + + # bad axis + pytest.raises(ValueError, s.drop, 'one', axis='columns') + + # GH 8522 + s = Series([2, 3], index=[True, False]) + assert s.index.is_object() + result = s.drop(True) + expected = Series([3], index=[False]) + assert_series_equal(result, expected) + + # GH 16877 + s = Series([2, 3], index=[0, 1]) + with tm.assert_raises_regex(KeyError, 'not contained in axis'): + s.drop([False, True]) diff --git a/pandas/tests/series/indexing/test_boolean.py b/pandas/tests/series/indexing/test_boolean.py index 75aa2898ae773..f1f4a5a05697d 100644 --- a/pandas/tests/series/indexing/test_boolean.py +++ b/pandas/tests/series/indexing/test_boolean.py @@ -283,34 +283,30 @@ def test_where_error(): []) -def test_where_array_like(): +@pytest.mark.parametrize('klass', [list, tuple, np.array, Series]) +def test_where_array_like(klass): # see gh-15414 s = Series([1, 2, 3]) cond = [False, True, 
True] expected = Series([np.nan, 2, 3]) - klasses = [list, tuple, np.array, Series] - for klass in klasses: - result = s.where(klass(cond)) - assert_series_equal(result, expected) + result = s.where(klass(cond)) + assert_series_equal(result, expected) -def test_where_invalid_input(): +@pytest.mark.parametrize('cond', [ + [1, 0, 1], + Series([2, 5, 7]), + ["True", "False", "True"], + [Timestamp("2017-01-01"), pd.NaT, Timestamp("2017-01-02")] +]) +def test_where_invalid_input(cond): # see gh-15414: only boolean arrays accepted s = Series([1, 2, 3]) msg = "Boolean array expected for the condition" - conds = [ - [1, 0, 1], - Series([2, 5, 7]), - ["True", "False", "True"], - [Timestamp("2017-01-01"), - pd.NaT, Timestamp("2017-01-02")] - ] - - for cond in conds: - with tm.assert_raises_regex(ValueError, msg): - s.where(cond) + with tm.assert_raises_regex(ValueError, msg): + s.where(cond) msg = "Array conditional must be same shape as self" with tm.assert_raises_regex(ValueError, msg): @@ -403,37 +399,43 @@ def f(): assert_series_equal(s, expected) -def test_where_broadcast(): - # Test a variety of differently sized series - for size in range(2, 6): - # Test a variety of boolean indices - for selection in [ - # First element should be set - np.resize([True, False, False, False, False], size), - # Set alternating elements] - np.resize([True, False], size), - # No element should be set - np.resize([False], size) - ]: - - # Test a variety of different numbers as content - for item in [2.0, np.nan, np.finfo(np.float).max, - np.finfo(np.float).min]: - # Test numpy arrays, lists and tuples as the input to be - # broadcast - for arr in [np.array([item]), [item], (item,)]: - data = np.arange(size, dtype=float) - s = Series(data) - s[selection] = arr - # Construct the expected series by taking the source - # data or item based on the selection - expected = Series([item if use_item else data[ - i] for i, use_item in enumerate(selection)]) - assert_series_equal(s, expected) - - s = 
Series(data) - result = s.where(~selection, arr) - assert_series_equal(result, expected) +@pytest.mark.parametrize('size', range(2, 6)) +@pytest.mark.parametrize('mask', [ + [True, False, False, False, False], + [True, False], + [False] +]) +@pytest.mark.parametrize('item', [ + 2.0, np.nan, np.finfo(np.float).max, np.finfo(np.float).min +]) +# Test numpy arrays, lists and tuples as the input to be +# broadcast +@pytest.mark.parametrize('box', [ + lambda x: np.array([x]), + lambda x: [x], + lambda x: (x,) +]) +def test_broadcast(size, mask, item, box): + selection = np.resize(mask, size) + + data = np.arange(size, dtype=float) + + # Construct the expected series by taking the source + # data or item based on the selection + expected = Series([item if use_item else data[ + i] for i, use_item in enumerate(selection)]) + + s = Series(data) + s[selection] = box(item) + assert_series_equal(s, expected) + + s = Series(data) + result = s.where(~selection, box(item)) + assert_series_equal(result, expected) + + s = Series(data) + result = s.mask(selection, box(item)) + assert_series_equal(result, expected) def test_where_inplace(): @@ -587,29 +589,6 @@ def test_mask(): assert_series_equal(result, expected) -def test_mask_broadcast(): - # GH 8801 - # copied from test_where_broadcast - for size in range(2, 6): - for selection in [ - # First element should be set - np.resize([True, False, False, False, False], size), - # Set alternating elements] - np.resize([True, False], size), - # No element should be set - np.resize([False], size) - ]: - for item in [2.0, np.nan, np.finfo(np.float).max, - np.finfo(np.float).min]: - for arr in [np.array([item]), [item], (item,)]: - data = np.arange(size, dtype=float) - s = Series(data) - result = s.mask(selection, arr) - expected = Series([item if use_item else data[ - i] for i, use_item in enumerate(selection)]) - assert_series_equal(result, expected) - - def test_mask_inplace(): s = Series(np.random.randn(5)) cond = s > 0 diff --git 
a/pandas/tests/series/indexing/test_callable.py b/pandas/tests/series/indexing/test_callable.py new file mode 100644 index 0000000000000..b656137545903 --- /dev/null +++ b/pandas/tests/series/indexing/test_callable.py @@ -0,0 +1,33 @@ +import pandas as pd +import pandas.util.testing as tm + + +def test_getitem_callable(): + # GH 12533 + s = pd.Series(4, index=list('ABCD')) + result = s[lambda x: 'A'] + assert result == s.loc['A'] + + result = s[lambda x: ['A', 'B']] + tm.assert_series_equal(result, s.loc[['A', 'B']]) + + result = s[lambda x: [True, False, True, True]] + tm.assert_series_equal(result, s.iloc[[0, 2, 3]]) + + +def test_setitem_callable(): + # GH 12533 + s = pd.Series([1, 2, 3, 4], index=list('ABCD')) + s[lambda x: 'A'] = -1 + tm.assert_series_equal(s, pd.Series([-1, 2, 3, 4], index=list('ABCD'))) + + +def test_setitem_other_callable(): + # GH 13299 + inc = lambda x: x + 1 + + s = pd.Series([1, 2, -1, 4]) + s[s < 0] = inc + + expected = pd.Series([1, 2, inc, 4]) + tm.assert_series_equal(s, expected) diff --git a/pandas/tests/series/indexing/test_datetime.py b/pandas/tests/series/indexing/test_datetime.py index db8118384f6f6..f484cdea2e09f 100644 --- a/pandas/tests/series/indexing/test_datetime.py +++ b/pandas/tests/series/indexing/test_datetime.py @@ -700,11 +700,11 @@ def test_nat_operations(): assert s.max() == exp -def test_round_nat(): +@pytest.mark.parametrize('method', ["round", "floor", "ceil"]) +@pytest.mark.parametrize('freq', ["s", "5s", "min", "5min", "h", "5h"]) +def test_round_nat(method, freq): # GH14940 s = Series([pd.NaT]) expected = Series(pd.NaT) - for method in ["round", "floor", "ceil"]: - round_method = getattr(s.dt, method) - for freq in ["s", "5s", "min", "5min", "h", "5h"]: - assert_series_equal(round_method(freq), expected) + round_method = getattr(s.dt, method) + assert_series_equal(round_method(freq), expected) diff --git a/pandas/tests/series/indexing/test_iloc.py b/pandas/tests/series/indexing/test_iloc.py index 
5908a7708c426..648a37ce0262b 100644 --- a/pandas/tests/series/indexing/test_iloc.py +++ b/pandas/tests/series/indexing/test_iloc.py @@ -9,8 +9,6 @@ from pandas.util.testing import (assert_series_equal, assert_almost_equal) -JOIN_TYPES = ['inner', 'outer', 'left', 'right'] - def test_iloc(): s = Series(np.random.randn(10), index=lrange(0, 20, 2)) diff --git a/pandas/tests/series/indexing/test_indexing.py b/pandas/tests/series/indexing/test_indexing.py index 9005ac8e97929..5cc1a8ff1c451 100644 --- a/pandas/tests/series/indexing/test_indexing.py +++ b/pandas/tests/series/indexing/test_indexing.py @@ -21,7 +21,58 @@ import pandas.util.testing as tm -JOIN_TYPES = ['inner', 'outer', 'left', 'right'] +def test_basic_indexing(): + s = Series(np.random.randn(5), index=['a', 'b', 'a', 'a', 'b']) + + pytest.raises(IndexError, s.__getitem__, 5) + pytest.raises(IndexError, s.__setitem__, 5, 0) + + pytest.raises(KeyError, s.__getitem__, 'c') + + s = s.sort_index() + + pytest.raises(IndexError, s.__getitem__, 5) + pytest.raises(IndexError, s.__setitem__, 5, 0) + + +def test_basic_getitem_with_labels(test_data): + indices = test_data.ts.index[[5, 10, 15]] + + result = test_data.ts[indices] + expected = test_data.ts.reindex(indices) + assert_series_equal(result, expected) + + result = test_data.ts[indices[0]:indices[2]] + expected = test_data.ts.loc[indices[0]:indices[2]] + assert_series_equal(result, expected) + + # integer indexes, be careful + s = Series(np.random.randn(10), index=lrange(0, 20, 2)) + inds = [0, 2, 5, 7, 8] + arr_inds = np.array([0, 2, 5, 7, 8]) + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + result = s[inds] + expected = s.reindex(inds) + assert_series_equal(result, expected) + + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + result = s[arr_inds] + expected = s.reindex(arr_inds) + assert_series_equal(result, expected) + + # GH12089 + # with tz for values + s = Series(pd.date_range("2011-01-01", 
periods=3, tz="US/Eastern"), + index=['a', 'b', 'c']) + expected = Timestamp('2011-01-01', tz='US/Eastern') + result = s.loc['a'] + assert result == expected + result = s.iloc[0] + assert result == expected + result = s['a'] + assert result == expected def test_getitem_setitem_ellipsis(): @@ -36,18 +87,6 @@ def test_getitem_setitem_ellipsis(): assert (result == 5).all() -def test_pop(): - # GH 6600 - df = DataFrame({'A': 0, 'B': np.arange(5, dtype='int64'), 'C': 0, }) - k = df.iloc[4] - - result = k.pop('B') - assert result == 4 - - expected = Series([0, 0], index=['A', 'C'], name=4) - assert_series_equal(k, expected) - - def test_getitem_get(test_data): test_series = test_data.series test_obj_series = test_data.objSeries @@ -75,11 +114,6 @@ def test_getitem_get(test_data): assert result is None -def test_getitem_int64(test_data): - idx = np.int64(5) - assert test_data.ts[idx] == test_data.ts[5] - - def test_getitem_fancy(test_data): slice1 = test_data.series[[1, 2, 3]] slice2 = test_data.objSeries[[1, 2, 3]] @@ -199,26 +233,6 @@ def test_getitem_dups(): assert_series_equal(result, expected) -def test_getitem_dataframe(): - rng = list(range(10)) - s = pd.Series(10, index=rng) - df = pd.DataFrame(rng, index=rng) - pytest.raises(TypeError, s.__getitem__, df > 5) - - -def test_getitem_callable(): - # GH 12533 - s = pd.Series(4, index=list('ABCD')) - result = s[lambda x: 'A'] - assert result == s.loc['A'] - - result = s[lambda x: ['A', 'B']] - tm.assert_series_equal(result, s.loc[['A', 'B']]) - - result = s[lambda x: [True, False, True, True]] - tm.assert_series_equal(result, s.iloc[[0, 2, 3]]) - - def test_setitem_ambiguous_keyerror(): s = Series(lrange(10), index=lrange(0, 20, 2)) @@ -234,48 +248,11 @@ def test_setitem_ambiguous_keyerror(): assert_series_equal(s2, expected) -def test_setitem_callable(): - # GH 12533 - s = pd.Series([1, 2, 3, 4], index=list('ABCD')) - s[lambda x: 'A'] = -1 - tm.assert_series_equal(s, pd.Series([-1, 2, 3, 4], index=list('ABCD'))) - - 
-def test_setitem_other_callable(): - # GH 13299 - inc = lambda x: x + 1 - - s = pd.Series([1, 2, -1, 4]) - s[s < 0] = inc - - expected = pd.Series([1, 2, inc, 4]) - tm.assert_series_equal(s, expected) - - -def test_slice(test_data): - numSlice = test_data.series[10:20] - numSliceEnd = test_data.series[-10:] - objSlice = test_data.objSeries[10:20] - - assert test_data.series.index[9] not in numSlice.index - assert test_data.objSeries.index[9] not in objSlice.index - - assert len(numSlice) == len(numSlice.index) - assert test_data.series[numSlice.index[0]] == numSlice[numSlice.index[0]] - - assert numSlice.index[1] == test_data.series.index[11] - assert tm.equalContents(numSliceEnd, np.array(test_data.series)[-10:]) - - # Test return view. - sl = test_data.series[10:20] - sl[:] = 0 - - assert (test_data.series[10:20] == 0).all() - - -def test_slice_can_reorder_not_uniquely_indexed(): - s = Series(1, index=['a', 'a', 'b', 'b', 'c']) - s[::-1] # it works! +def test_getitem_dataframe(): + rng = list(range(10)) + s = pd.Series(10, index=rng) + df = pd.DataFrame(rng, index=rng) + pytest.raises(TypeError, s.__getitem__, df > 5) def test_setitem(test_data): @@ -389,86 +366,46 @@ def test_basic_getitem_setitem_corner(test_data): [5, slice(None, None)], 2) -def test_basic_getitem_with_labels(test_data): - indices = test_data.ts.index[[5, 10, 15]] - - result = test_data.ts[indices] - expected = test_data.ts.reindex(indices) - assert_series_equal(result, expected) - - result = test_data.ts[indices[0]:indices[2]] - expected = test_data.ts.loc[indices[0]:indices[2]] - assert_series_equal(result, expected) - - # integer indexes, be careful - s = Series(np.random.randn(10), index=lrange(0, 20, 2)) - inds = [0, 2, 5, 7, 8] - arr_inds = np.array([0, 2, 5, 7, 8]) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - result = s[inds] - expected = s.reindex(inds) - assert_series_equal(result, expected) - - with tm.assert_produces_warning(FutureWarning, - 
check_stacklevel=False): - result = s[arr_inds] - expected = s.reindex(arr_inds) - assert_series_equal(result, expected) - - # GH12089 - # with tz for values - s = Series(pd.date_range("2011-01-01", periods=3, tz="US/Eastern"), - index=['a', 'b', 'c']) - expected = Timestamp('2011-01-01', tz='US/Eastern') - result = s.loc['a'] - assert result == expected - result = s.iloc[0] - assert result == expected - result = s['a'] - assert result == expected - - -def test_setitem_with_tz(): - for tz in ['US/Eastern', 'UTC', 'Asia/Tokyo']: - orig = pd.Series(pd.date_range('2016-01-01', freq='H', periods=3, - tz=tz)) - assert orig.dtype == 'datetime64[ns, {0}]'.format(tz) +@pytest.mark.parametrize('tz', ['US/Eastern', 'UTC', 'Asia/Tokyo']) +def test_setitem_with_tz(tz): + orig = pd.Series(pd.date_range('2016-01-01', freq='H', periods=3, + tz=tz)) + assert orig.dtype == 'datetime64[ns, {0}]'.format(tz) - # scalar - s = orig.copy() - s[1] = pd.Timestamp('2011-01-01', tz=tz) - exp = pd.Series([pd.Timestamp('2016-01-01 00:00', tz=tz), - pd.Timestamp('2011-01-01 00:00', tz=tz), - pd.Timestamp('2016-01-01 02:00', tz=tz)]) - tm.assert_series_equal(s, exp) + # scalar + s = orig.copy() + s[1] = pd.Timestamp('2011-01-01', tz=tz) + exp = pd.Series([pd.Timestamp('2016-01-01 00:00', tz=tz), + pd.Timestamp('2011-01-01 00:00', tz=tz), + pd.Timestamp('2016-01-01 02:00', tz=tz)]) + tm.assert_series_equal(s, exp) - s = orig.copy() - s.loc[1] = pd.Timestamp('2011-01-01', tz=tz) - tm.assert_series_equal(s, exp) + s = orig.copy() + s.loc[1] = pd.Timestamp('2011-01-01', tz=tz) + tm.assert_series_equal(s, exp) - s = orig.copy() - s.iloc[1] = pd.Timestamp('2011-01-01', tz=tz) - tm.assert_series_equal(s, exp) + s = orig.copy() + s.iloc[1] = pd.Timestamp('2011-01-01', tz=tz) + tm.assert_series_equal(s, exp) - # vector - vals = pd.Series([pd.Timestamp('2011-01-01', tz=tz), - pd.Timestamp('2012-01-01', tz=tz)], index=[1, 2]) - assert vals.dtype == 'datetime64[ns, {0}]'.format(tz) + # vector + vals = 
pd.Series([pd.Timestamp('2011-01-01', tz=tz), + pd.Timestamp('2012-01-01', tz=tz)], index=[1, 2]) + assert vals.dtype == 'datetime64[ns, {0}]'.format(tz) - s[[1, 2]] = vals - exp = pd.Series([pd.Timestamp('2016-01-01 00:00', tz=tz), - pd.Timestamp('2011-01-01 00:00', tz=tz), - pd.Timestamp('2012-01-01 00:00', tz=tz)]) - tm.assert_series_equal(s, exp) + s[[1, 2]] = vals + exp = pd.Series([pd.Timestamp('2016-01-01 00:00', tz=tz), + pd.Timestamp('2011-01-01 00:00', tz=tz), + pd.Timestamp('2012-01-01 00:00', tz=tz)]) + tm.assert_series_equal(s, exp) - s = orig.copy() - s.loc[[1, 2]] = vals - tm.assert_series_equal(s, exp) + s = orig.copy() + s.loc[[1, 2]] = vals + tm.assert_series_equal(s, exp) - s = orig.copy() - s.iloc[[1, 2]] = vals - tm.assert_series_equal(s, exp) + s = orig.copy() + s.iloc[[1, 2]] = vals + tm.assert_series_equal(s, exp) def test_setitem_with_tz_dst(): @@ -550,22 +487,30 @@ def test_categorial_assigning_ops(): tm.assert_series_equal(s, exp) -def test_take(): - s = Series([-1, 5, 6, 2, 4]) +def test_slice(test_data): + numSlice = test_data.series[10:20] + numSliceEnd = test_data.series[-10:] + objSlice = test_data.objSeries[10:20] - actual = s.take([1, 3, 4]) - expected = Series([5, 2, 4], index=[1, 3, 4]) - tm.assert_series_equal(actual, expected) + assert test_data.series.index[9] not in numSlice.index + assert test_data.objSeries.index[9] not in objSlice.index - actual = s.take([-1, 3, 4]) - expected = Series([4, 2, 4], index=[4, 3, 4]) - tm.assert_series_equal(actual, expected) + assert len(numSlice) == len(numSlice.index) + assert test_data.series[numSlice.index[0]] == numSlice[numSlice.index[0]] - pytest.raises(IndexError, s.take, [1, 10]) - pytest.raises(IndexError, s.take, [2, 5]) + assert numSlice.index[1] == test_data.series.index[11] + assert tm.equalContents(numSliceEnd, np.array(test_data.series)[-10:]) - with tm.assert_produces_warning(FutureWarning): - s.take([-1, 3, 4], convert=False) + # Test return view. 
+ sl = test_data.series[10:20] + sl[:] = 0 + + assert (test_data.series[10:20] == 0).all() + + +def test_slice_can_reorder_not_uniquely_indexed(): + s = Series(1, index=['a', 'a', 'b', 'b', 'c']) + s[::-1] # it works! def test_ix_setitem(test_data): @@ -615,20 +560,6 @@ def test_setitem_na(): assert_series_equal(s, expected) -def test_basic_indexing(): - s = Series(np.random.randn(5), index=['a', 'b', 'a', 'a', 'b']) - - pytest.raises(IndexError, s.__getitem__, 5) - pytest.raises(IndexError, s.__setitem__, 5, 0) - - pytest.raises(KeyError, s.__getitem__, 'c') - - s = s.sort_index() - - pytest.raises(IndexError, s.__getitem__, 5) - pytest.raises(IndexError, s.__setitem__, 5, 0) - - def test_timedelta_assignment(): # GH 8209 s = Series([]) @@ -700,73 +631,6 @@ def test_preserve_refs(test_data): assert not np.isnan(test_data.ts[10]) -def test_drop(): - # unique - s = Series([1, 2], index=['one', 'two']) - expected = Series([1], index=['one']) - result = s.drop(['two']) - assert_series_equal(result, expected) - result = s.drop('two', axis='rows') - assert_series_equal(result, expected) - - # non-unique - # GH 5248 - s = Series([1, 1, 2], index=['one', 'two', 'one']) - expected = Series([1, 2], index=['one', 'one']) - result = s.drop(['two'], axis=0) - assert_series_equal(result, expected) - result = s.drop('two') - assert_series_equal(result, expected) - - expected = Series([1], index=['two']) - result = s.drop(['one']) - assert_series_equal(result, expected) - result = s.drop('one') - assert_series_equal(result, expected) - - # single string/tuple-like - s = Series(range(3), index=list('abc')) - pytest.raises(KeyError, s.drop, 'bc') - pytest.raises(KeyError, s.drop, ('a',)) - - # errors='ignore' - s = Series(range(3), index=list('abc')) - result = s.drop('bc', errors='ignore') - assert_series_equal(result, s) - result = s.drop(['a', 'd'], errors='ignore') - expected = s.iloc[1:] - assert_series_equal(result, expected) - - # bad axis - pytest.raises(ValueError, s.drop, 
'one', axis='columns') - - # GH 8522 - s = Series([2, 3], index=[True, False]) - assert s.index.is_object() - result = s.drop(True) - expected = Series([3], index=[False]) - assert_series_equal(result, expected) - - # GH 16877 - s = Series([2, 3], index=[0, 1]) - with tm.assert_raises_regex(KeyError, 'not contained in axis'): - s.drop([False, True]) - - -def test_select(test_data): - # deprecated: gh-12410 - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - n = len(test_data.ts) - result = test_data.ts.select(lambda x: x >= test_data.ts.index[n // 2]) - expected = test_data.ts.reindex(test_data.ts.index[n // 2:]) - assert_series_equal(result, expected) - - result = test_data.ts.select(lambda x: x.weekday() == 2) - expected = test_data.ts[test_data.ts.index.weekday == 2] - assert_series_equal(result, expected) - - def test_cast_on_putmask(): # GH 2746 @@ -799,13 +663,6 @@ def test_type_promote_putmask(): assert_series_equal(s, Series([0, 'foo', 'bar', 0])) -def test_head_tail(test_data): - assert_series_equal(test_data.series.head(), test_data.series[:5]) - assert_series_equal(test_data.series.head(0), test_data.series[0:0]) - assert_series_equal(test_data.series.tail(), test_data.series[-5:]) - assert_series_equal(test_data.series.tail(0), test_data.series[0:0]) - - def test_multilevel_preserve_name(): index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', 'three']], @@ -845,3 +702,59 @@ def test_setitem_slice_into_readonly_backing_data(): series[1:3] = 1 assert not array.any() + + +""" +miscellaneous methods +""" + + +def test_select(test_data): + # deprecated: gh-12410 + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + n = len(test_data.ts) + result = test_data.ts.select(lambda x: x >= test_data.ts.index[n // 2]) + expected = test_data.ts.reindex(test_data.ts.index[n // 2:]) + assert_series_equal(result, expected) + + result = test_data.ts.select(lambda x: x.weekday() == 2) + expected = 
test_data.ts[test_data.ts.index.weekday == 2] + assert_series_equal(result, expected) + + +def test_pop(): + # GH 6600 + df = DataFrame({'A': 0, 'B': np.arange(5, dtype='int64'), 'C': 0, }) + k = df.iloc[4] + + result = k.pop('B') + assert result == 4 + + expected = Series([0, 0], index=['A', 'C'], name=4) + assert_series_equal(k, expected) + + +def test_take(): + s = Series([-1, 5, 6, 2, 4]) + + actual = s.take([1, 3, 4]) + expected = Series([5, 2, 4], index=[1, 3, 4]) + tm.assert_series_equal(actual, expected) + + actual = s.take([-1, 3, 4]) + expected = Series([4, 2, 4], index=[4, 3, 4]) + tm.assert_series_equal(actual, expected) + + pytest.raises(IndexError, s.take, [1, 10]) + pytest.raises(IndexError, s.take, [2, 5]) + + with tm.assert_produces_warning(FutureWarning): + s.take([-1, 3, 4], convert=False) + + +def test_head_tail(test_data): + assert_series_equal(test_data.series.head(), test_data.series[:5]) + assert_series_equal(test_data.series.head(0), test_data.series[0:0]) + assert_series_equal(test_data.series.tail(), test_data.series[-5:]) + assert_series_equal(test_data.series.tail(0), test_data.series[0:0]) diff --git a/pandas/tests/series/indexing/test_loc.py b/pandas/tests/series/indexing/test_loc.py index d78b09a3c6ccb..088406e0a1db6 100644 --- a/pandas/tests/series/indexing/test_loc.py +++ b/pandas/tests/series/indexing/test_loc.py @@ -12,9 +12,6 @@ from pandas.util.testing import (assert_series_equal) -JOIN_TYPES = ['inner', 'outer', 'left', 'right'] - - def test_loc_getitem(test_data): inds = test_data.series.index[[3, 4, 7]] assert_series_equal( diff --git a/pandas/tests/series/indexing/test_numeric.py b/pandas/tests/series/indexing/test_numeric.py index e6035ccf2d569..b964ec3874998 100644 --- a/pandas/tests/series/indexing/test_numeric.py +++ b/pandas/tests/series/indexing/test_numeric.py @@ -229,3 +229,8 @@ def test_int_indexing(): pytest.raises(KeyError, s.__getitem__, 5) pytest.raises(KeyError, s.__getitem__, 'c') + + +def 
test_getitem_int64(test_data): + idx = np.int64(5) + assert test_data.ts[idx] == test_data.ts[5] From bd31f716b7cd439c5d4a222c307414d6f6efb752 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Fri, 9 Mar 2018 17:53:34 -0800 Subject: [PATCH 12/18] Added 'displayed_only' option to 'read_html' (#20047) --- doc/source/whatsnew/v0.23.0.txt | 1 + pandas/io/html.py | 71 ++++++++++++++++++++++++++++++--- pandas/tests/io/test_html.py | 66 ++++++++++++++++++++++++++++++ 3 files changed, 133 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 302f8043f3ba7..bea897e1b88e6 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -343,6 +343,7 @@ Other Enhancements - :meth:`Timestamp.day_name` and :meth:`DatetimeIndex.day_name` are now available to return day names with a specified locale (:issue:`12806`) - :meth:`DataFrame.to_sql` now performs a multivalue insert if the underlying connection supports itk rather than inserting row by row. ``SQLAlchemy`` dialects supporting multivalue inserts include: ``mysql``, ``postgresql``, ``sqlite`` and any dialect with ``supports_multivalues_insert``. (:issue:`14315`, :issue:`8953`) +- :func:`read_html` now accepts a ``displayed_only`` keyword argument to control whether or not hidden elements are parsed (``True`` by default) (:issue:`20027`) .. _whatsnew_0230.api_breaking: diff --git a/pandas/io/html.py b/pandas/io/html.py index be4854bc19cc6..300a5a151f5d2 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -160,6 +160,14 @@ class _HtmlFrameParser(object): attrs : dict List of HTML element attributes to match. + encoding : str + Encoding to be used by parser + + displayed_only : bool + Whether or not items with "display:none" should be ignored + + ..
versionadded:: 0.23.0 + Attributes ---------- io : str or file-like @@ -172,6 +180,14 @@ class _HtmlFrameParser(object): A dictionary of valid table attributes to use to search for table elements. + encoding : str + Encoding to be used by parser + + displayed_only : bool + Whether or not items with "display:none" should be ignored + + .. versionadded:: 0.23.0 + Notes ----- To subclass this class effectively you must override the following methods: @@ -187,11 +203,12 @@ class _HtmlFrameParser(object): functionality. """ - def __init__(self, io, match, attrs, encoding): + def __init__(self, io, match, attrs, encoding, displayed_only): self.io = io self.match = match self.attrs = attrs self.encoding = encoding + self.displayed_only = displayed_only def parse_tables(self): tables = self._parse_tables(self._build_doc(), self.match, self.attrs) @@ -380,6 +397,27 @@ def _parse_raw_tbody(self, table): res = self._parse_tr(table) return self._parse_raw_data(res) + def _handle_hidden_tables(self, tbl_list, attr_name): + """Returns list of tables, potentially removing hidden elements + + Parameters + ---------- + tbl_list : list of Tag or list of Element + Type of list elements will vary depending upon parser used + attr_name : str + Name of the accessor for retrieving HTML attributes + + Returns + ------- + list of Tag or list of Element + Return type matches `tbl_list` + """ + if not self.displayed_only: + return tbl_list + + return [x for x in tbl_list if "display:none" not in + getattr(x, attr_name).get('style', '').replace(" ", "")] + class _BeautifulSoupHtml5LibFrameParser(_HtmlFrameParser): """HTML to DataFrame parser that uses BeautifulSoup under the hood. 
@@ -431,8 +469,14 @@ def _parse_tables(self, doc, match, attrs): result = [] unique_tables = set() + tables = self._handle_hidden_tables(tables, "attrs") for table in tables: + if self.displayed_only: + for elem in table.find_all( + style=re.compile(r"display:\s*none")): + elem.decompose() + if (table not in unique_tables and table.find(text=match) is not None): result.append(table) @@ -528,6 +572,17 @@ def _parse_tables(self, doc, match, kwargs): tables = doc.xpath(xpath_expr, namespaces=_re_namespace) + tables = self._handle_hidden_tables(tables, "attrib") + if self.displayed_only: + for table in tables: + # lxml utilizes XPATH 1.0 which does not have regex + # support. As a result, we find all elements with a style + # attribute and iterate them to check for display:none + for elem in table.xpath('.//*[@style]'): + if "display:none" in elem.attrib.get( + "style", "").replace(" ", ""): + elem.getparent().remove(elem) + if not tables: raise ValueError("No tables found matching regex {patt!r}" .format(patt=pattern)) @@ -729,7 +784,7 @@ def _validate_flavor(flavor): return flavor -def _parse(flavor, io, match, attrs, encoding, **kwargs): +def _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs): flavor = _validate_flavor(flavor) compiled_match = re.compile(match) # you can pass a compiled regex here @@ -737,7 +792,7 @@ def _parse(flavor, io, match, attrs, encoding, **kwargs): retained = None for flav in flavor: parser = _parser_dispatch(flav) - p = parser(io, compiled_match, attrs, encoding) + p = parser(io, compiled_match, attrs, encoding, displayed_only) try: tables = p.parse_tables() @@ -773,7 +828,7 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None, skiprows=None, attrs=None, parse_dates=False, tupleize_cols=None, thousands=',', encoding=None, decimal='.', converters=None, na_values=None, - keep_default_na=True): + keep_default_na=True, displayed_only=True): r"""Read HTML tables into a ``list`` of ``DataFrame`` objects. 
Parameters @@ -877,6 +932,11 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None, .. versionadded:: 0.19.0 + displayed_only : bool, default True + Whether elements with "display: none" should be ignored + + .. versionadded:: 0.23.0 + Returns ------- dfs : list of DataFrames @@ -924,4 +984,5 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None, parse_dates=parse_dates, tupleize_cols=tupleize_cols, thousands=thousands, attrs=attrs, encoding=encoding, decimal=decimal, converters=converters, na_values=na_values, - keep_default_na=keep_default_na) + keep_default_na=keep_default_na, + displayed_only=displayed_only) diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 151a0750b7f6e..b18104e951504 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -674,6 +674,39 @@ def test_wikipedia_states_table(self): result = self.read_html(data, 'Arizona', header=1)[0] assert result['sq mi'].dtype == np.dtype('float64') + @pytest.mark.parametrize("displayed_only,exp0,exp1", [ + (True, DataFrame(["foo"]), None), + (False, DataFrame(["foo bar baz qux"]), DataFrame(["foo"]))]) + def test_displayed_only(self, displayed_only, exp0, exp1): + # GH 20027 + data = StringIO(""" + +
+ + + +
+ foo + bar + baz + qux +
+ + + + +
foo
+ + """) + + dfs = self.read_html(data, displayed_only=displayed_only) + tm.assert_frame_equal(dfs[0], exp0) + + if exp1 is not None: + tm.assert_frame_equal(dfs[1], exp1) + else: + assert len(dfs) == 1 # Should not parse hidden table + def test_decimal_rows(self): # GH 12907 @@ -896,6 +929,39 @@ def test_computer_sales_page(self): data = os.path.join(DATA_PATH, 'computer_sales_page.html') self.read_html(data, header=[0, 1]) + @pytest.mark.parametrize("displayed_only,exp0,exp1", [ + (True, DataFrame(["foo"]), None), + (False, DataFrame(["foo bar baz qux"]), DataFrame(["foo"]))]) + def test_displayed_only(self, displayed_only, exp0, exp1): + # GH 20027 + data = StringIO(""" + + + + + +
+ foo + bar + baz + qux +
+ + + + +
foo
+ + """) + + dfs = self.read_html(data, displayed_only=displayed_only) + tm.assert_frame_equal(dfs[0], exp0) + + if exp1 is not None: + tm.assert_frame_equal(dfs[1], exp1) + else: + assert len(dfs) == 1 # Should not parse hidden table + def test_invalid_flavor(): url = 'google.com' From da6f827053a5ef7c4d4af0e704a3b152d9c99280 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Fri, 9 Mar 2018 18:03:50 -0800 Subject: [PATCH 13/18] Refactored GroupBy ASVs (#20043) --- asv_bench/benchmarks/groupby.py | 137 ++++++++++++++------------------ 1 file changed, 58 insertions(+), 79 deletions(-) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 3e7e5c821b14c..7777322071957 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -14,7 +14,10 @@ method_blacklist = { 'object': {'median', 'prod', 'sem', 'cumsum', 'sum', 'cummin', 'mean', 'max', 'skew', 'cumprod', 'cummax', 'rank', 'pct_change', 'min', - 'var', 'mad', 'describe', 'std'} + 'var', 'mad', 'describe', 'std'}, + 'datetime': {'median', 'prod', 'sem', 'cumsum', 'sum', 'mean', 'skew', + 'cumprod', 'cummax', 'pct_change', 'var', 'mad', 'describe', + 'std'} } @@ -90,45 +93,6 @@ def time_series_groups(self, data, key): self.ser.groupby(self.ser).groups -class FirstLast(object): - - goal_time = 0.2 - - param_names = ['dtype'] - params = ['float32', 'float64', 'datetime', 'object'] - - def setup(self, dtype): - N = 10**5 - # with datetimes (GH7555) - if dtype == 'datetime': - self.df = DataFrame({'values': date_range('1/1/2011', - periods=N, - freq='s'), - 'key': range(N)}) - elif dtype == 'object': - self.df = DataFrame({'values': ['foo'] * N, - 'key': range(N)}) - else: - labels = np.arange(N / 10).repeat(10) - data = Series(np.random.randn(len(labels)), dtype=dtype) - data[::3] = np.nan - data[1::3] = np.nan - labels = labels.take(np.random.permutation(len(labels))) - self.df = DataFrame({'values': data, 'key': labels}) - - def time_groupby_first(self, dtype): - 
self.df.groupby('key').first() - - def time_groupby_last(self, dtype): - self.df.groupby('key').last() - - def time_groupby_nth_all(self, dtype): - self.df.groupby('key').nth(0, dropna='all') - - def time_groupby_nth_none(self, dtype): - self.df.groupby('key').nth(0) - - class GroupManyLabels(object): goal_time = 0.2 @@ -149,39 +113,40 @@ class Nth(object): goal_time = 0.2 - def setup_cache(self): - df = DataFrame(np.random.randint(1, 100, (10000, 2))) - df.iloc[1, 1] = np.nan - return df - - def time_frame_nth_any(self, df): - df.groupby(0).nth(0, dropna='any') - - def time_frame_nth(self, df): - df.groupby(0).nth(0) - + param_names = ['dtype'] + params = ['float32', 'float64', 'datetime', 'object'] - def time_series_nth_any(self, df): - df[1].groupby(df[0]).nth(0, dropna='any') + def setup(self, dtype): + N = 10**5 + # with datetimes (GH7555) + if dtype == 'datetime': + values = date_range('1/1/2011', periods=N, freq='s') + elif dtype == 'object': + values = ['foo'] * N + else: + values = np.arange(N).astype(dtype) - def time_series_nth(self, df): - df[1].groupby(df[0]).nth(0) + key = np.arange(N) + self.df = DataFrame({'key': key, 'values': values}) + self.df.iloc[1, 1] = np.nan # insert missing data + def time_frame_nth_any(self, dtype): + self.df.groupby('key').nth(0, dropna='any') -class NthObject(object): + def time_groupby_nth_all(self, dtype): + self.df.groupby('key').nth(0, dropna='all') - goal_time = 0.2 + def time_frame_nth(self, dtype): + self.df.groupby('key').nth(0) - def setup_cache(self): - df = DataFrame(np.random.randint(1, 100, (10000,)), columns=['g']) - df['obj'] = ['a'] * 5000 + ['b'] * 5000 - return df + def time_series_nth_any(self, dtype): + self.df['values'].groupby(self.df['key']).nth(0, dropna='any') - def time_nth(self, df): - df.groupby('g').nth(5) + def time_groupby_nth_all(self, dtype): + self.df['values'].groupby(self.df['key']).nth(0, dropna='all') - def time_nth_last(self, df): - df.groupby('g').last() + def time_series_nth(self, 
dtype): + self.df['values'].groupby(self.df['key']).nth(0) class DateAttributes(object): @@ -243,7 +208,7 @@ def time_multi_count(self, df): df.groupby(['key1', 'key2']).count() -class CountInt(object): +class CountMultiInt(object): goal_time = 0.2 @@ -255,10 +220,10 @@ def setup_cache(self): 'ints2': np.random.randint(0, 1000, size=n)}) return df - def time_int_count(self, df): + def time_multi_int_count(self, df): df.groupby(['key1', 'key2']).count() - def time_int_nunique(self, df): + def time_multi_int_nunique(self, df): df.groupby(['key1', 'key2']).nunique() @@ -266,7 +231,7 @@ class AggFunctions(object): goal_time = 0.2 - def setup_cache(self): + def setup_cache(): N = 10**5 fac1 = np.array(['A', 'B', 'C'], dtype='O') fac2 = np.array(['one', 'two'], dtype='O') @@ -361,9 +326,6 @@ def setup(self): def time_multi_size(self): self.df.groupby(['key1', 'key2']).size() - def time_dt_size(self): - self.df.groupby(['dates']).size() - def time_dt_timegrouper_size(self): with warnings.catch_warnings(record=True): self.df.groupby(TimeGrouper(key='dates', freq='M')).size() @@ -376,15 +338,16 @@ class GroupByMethods(object): goal_time = 0.2 - param_names = ['dtype', 'method'] - params = [['int', 'float', 'object'], + param_names = ['dtype', 'method', 'application'] + params = [['int', 'float', 'object', 'datetime'], ['all', 'any', 'bfill', 'count', 'cumcount', 'cummax', 'cummin', 'cumprod', 'cumsum', 'describe', 'ffill', 'first', 'head', 'last', 'mad', 'max', 'min', 'median', 'mean', 'nunique', 'pct_change', 'prod', 'rank', 'sem', 'shift', 'size', 'skew', - 'std', 'sum', 'tail', 'unique', 'value_counts', 'var']] + 'std', 'sum', 'tail', 'unique', 'value_counts', 'var'], + ['direct', 'transformation']] - def setup(self, dtype, method): + def setup(self, dtype, method, application): if method in method_blacklist.get(dtype, {}): raise NotImplementedError # skip benchmark ngroups = 1000 @@ -398,12 +361,28 @@ def setup(self, dtype, method): np.random.random(ngroups) * 10.0]) 
elif dtype == 'object': key = ['foo'] * size + elif dtype == 'datetime': + key = date_range('1/1/2011', periods=size, freq='s') df = DataFrame({'values': values, 'key': key}) - self.df_groupby_method = getattr(df.groupby('key')['values'], method) - def time_method(self, dtype, method): - self.df_groupby_method() + if application == 'transform': + if method == 'describe': + raise NotImplementedError + + self.as_group_method = lambda: df.groupby( + 'key')['values'].transform(method) + self.as_field_method = lambda: df.groupby( + 'values')['key'].transform(method) + else: + self.as_group_method = getattr(df.groupby('key')['values'], method) + self.as_field_method = getattr(df.groupby('values')['key'], method) + + def time_dtype_as_group(self, dtype, method, application): + self.as_group_method() + + def time_dtype_as_field(self, dtype, method, application): + self.as_field_method() class Float32(object): From 52cffa3b3b2a510c30ed7f8cc8525c03d62e9130 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Fri, 9 Mar 2018 18:06:43 -0800 Subject: [PATCH 14/18] Cythonized GroupBy pct_change (#19919) --- doc/source/whatsnew/v0.23.0.txt | 1 + pandas/core/groupby.py | 24 +++++++ pandas/tests/groupby/test_groupby.py | 55 ---------------- pandas/tests/groupby/test_transform.py | 87 ++++++++++++++++++++++++++ 4 files changed, 112 insertions(+), 55 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index bea897e1b88e6..3afd9cff10e86 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -795,6 +795,7 @@ Performance Improvements - Improved performance of variable ``.rolling()`` on ``.min()`` and ``.max()`` (:issue:`19521`) - Improved performance of :func:`pandas.core.groupby.GroupBy.ffill` and :func:`pandas.core.groupby.GroupBy.bfill` (:issue:`11296`) - Improved performance of :func:`pandas.core.groupby.GroupBy.any` and :func:`pandas.core.groupby.GroupBy.all` (:issue:`15435`) +- Improved performance of 
:func:`pandas.core.groupby.GroupBy.pct_change` (:issue:`19165`) .. _whatsnew_0230.docs: diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 6b10d2ca3b5b2..285c5786b532b 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -2044,6 +2044,23 @@ def shift(self, periods=1, freq=None, axis=0): result_is_index=True, periods=periods) + @Substitution(name='groupby') + @Appender(_doc_template) + def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None, + axis=0): + """Calculate pct_change of each value to previous entry in group""" + if freq is not None or axis != 0: + return self.apply(lambda x: x.pct_change(periods=periods, + fill_method=fill_method, + limit=limit, freq=freq, + axis=axis)) + + filled = getattr(self, fill_method)(limit=limit).drop( + self.grouper.names, axis=1) + shifted = filled.shift(periods=periods, freq=freq) + + return (filled / shifted) - 1 + @Substitution(name='groupby') @Appender(_doc_template) def head(self, n=5): @@ -3884,6 +3901,13 @@ def _apply_to_column_groupbys(self, func): """ return a pass thru """ return func(self) + def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None): + """Calculate percent change of each value to previous entry in group""" + filled = getattr(self, fill_method)(limit=limit) + shifted = filled.shift(periods=periods, freq=freq) + + return (filled / shifted) - 1 + class NDFrameGroupBy(GroupBy): diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 0561b3a1d8592..be0c32cefa6ff 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2062,61 +2062,6 @@ def test_rank_object_raises(self, ties_method, ascending, na_option, ascending=ascending, na_option=na_option, pct=pct) - @pytest.mark.parametrize("mix_groupings", [True, False]) - @pytest.mark.parametrize("as_series", [True, False]) - @pytest.mark.parametrize("val1,val2", [ - ('foo', 'bar'), (1, 2), (1., 2.)]) -
@pytest.mark.parametrize("fill_method,limit,exp_vals", [ - ("ffill", None, - [np.nan, np.nan, 'val1', 'val1', 'val1', 'val2', 'val2', 'val2']), - ("ffill", 1, - [np.nan, np.nan, 'val1', 'val1', np.nan, 'val2', 'val2', np.nan]), - ("bfill", None, - ['val1', 'val1', 'val1', 'val2', 'val2', 'val2', np.nan, np.nan]), - ("bfill", 1, - [np.nan, 'val1', 'val1', np.nan, 'val2', 'val2', np.nan, np.nan]) - ]) - def test_group_fill_methods(self, mix_groupings, as_series, val1, val2, - fill_method, limit, exp_vals): - vals = [np.nan, np.nan, val1, np.nan, np.nan, val2, np.nan, np.nan] - _exp_vals = list(exp_vals) - # Overwrite placeholder values - for index, exp_val in enumerate(_exp_vals): - if exp_val == 'val1': - _exp_vals[index] = val1 - elif exp_val == 'val2': - _exp_vals[index] = val2 - - # Need to modify values and expectations depending on the - # Series / DataFrame that we ultimately want to generate - if mix_groupings: # ['a', 'b', 'a, 'b', ...] - keys = ['a', 'b'] * len(vals) - - def interweave(list_obj): - temp = list() - for x in list_obj: - temp.extend([x, x]) - - return temp - - _exp_vals = interweave(_exp_vals) - vals = interweave(vals) - else: # ['a', 'a', 'a', ... 
'b', 'b', 'b'] - keys = ['a'] * len(vals) + ['b'] * len(vals) - _exp_vals = _exp_vals * 2 - vals = vals * 2 - - df = DataFrame({'key': keys, 'val': vals}) - if as_series: - result = getattr( - df.groupby('key')['val'], fill_method)(limit=limit) - exp = Series(_exp_vals, name='val') - assert_series_equal(result, exp) - else: - result = getattr(df.groupby('key'), fill_method)(limit=limit) - exp = DataFrame({'key': keys, 'val': _exp_vals}) - assert_frame_equal(result, exp) - @pytest.mark.parametrize("agg_func", ['any', 'all']) @pytest.mark.parametrize("skipna", [True, False]) @pytest.mark.parametrize("vals", [ diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py index b418bb0c5fea6..bce38b8cf9eed 100644 --- a/pandas/tests/groupby/test_transform.py +++ b/pandas/tests/groupby/test_transform.py @@ -636,3 +636,90 @@ def test_transform_numeric_ret(self, cols, exp, comp_func, agg_func): exp = exp.astype('float') comp_func(result, exp) + + @pytest.mark.parametrize("mix_groupings", [True, False]) + @pytest.mark.parametrize("as_series", [True, False]) + @pytest.mark.parametrize("val1,val2", [ + ('foo', 'bar'), (1, 2), (1., 2.)]) + @pytest.mark.parametrize("fill_method,limit,exp_vals", [ + ("ffill", None, + [np.nan, np.nan, 'val1', 'val1', 'val1', 'val2', 'val2', 'val2']), + ("ffill", 1, + [np.nan, np.nan, 'val1', 'val1', np.nan, 'val2', 'val2', np.nan]), + ("bfill", None, + ['val1', 'val1', 'val1', 'val2', 'val2', 'val2', np.nan, np.nan]), + ("bfill", 1, + [np.nan, 'val1', 'val1', np.nan, 'val2', 'val2', np.nan, np.nan]) + ]) + def test_group_fill_methods(self, mix_groupings, as_series, val1, val2, + fill_method, limit, exp_vals): + vals = [np.nan, np.nan, val1, np.nan, np.nan, val2, np.nan, np.nan] + _exp_vals = list(exp_vals) + # Overwrite placeholder values + for index, exp_val in enumerate(_exp_vals): + if exp_val == 'val1': + _exp_vals[index] = val1 + elif exp_val == 'val2': + _exp_vals[index] = val2 + + # Need to modify values and 
expectations depending on the + # Series / DataFrame that we ultimately want to generate + if mix_groupings: # ['a', 'b', 'a, 'b', ...] + keys = ['a', 'b'] * len(vals) + + def interweave(list_obj): + temp = list() + for x in list_obj: + temp.extend([x, x]) + + return temp + + _exp_vals = interweave(_exp_vals) + vals = interweave(vals) + else: # ['a', 'a', 'a', ... 'b', 'b', 'b'] + keys = ['a'] * len(vals) + ['b'] * len(vals) + _exp_vals = _exp_vals * 2 + vals = vals * 2 + + df = DataFrame({'key': keys, 'val': vals}) + if as_series: + result = getattr( + df.groupby('key')['val'], fill_method)(limit=limit) + exp = Series(_exp_vals, name='val') + assert_series_equal(result, exp) + else: + result = getattr(df.groupby('key'), fill_method)(limit=limit) + exp = DataFrame({'key': keys, 'val': _exp_vals}) + assert_frame_equal(result, exp) + + @pytest.mark.parametrize("test_series", [True, False]) + @pytest.mark.parametrize("periods,fill_method,limit", [ + (1, 'ffill', None), (1, 'ffill', 1), + (1, 'bfill', None), (1, 'bfill', 1), + (-1, 'ffill', None), (-1, 'ffill', 1), + (-1, 'bfill', None), (-1, 'bfill', 1)]) + def test_pct_change(self, test_series, periods, fill_method, limit): + vals = [np.nan, np.nan, 1, 2, 4, 10, np.nan, np.nan] + exp_vals = Series(vals).pct_change(periods=periods, + fill_method=fill_method, + limit=limit).tolist() + + df = DataFrame({'key': ['a'] * len(vals) + ['b'] * len(vals), + 'vals': vals * 2}) + grp = df.groupby('key') + + def get_result(grp_obj): + return grp_obj.pct_change(periods=periods, + fill_method=fill_method, + limit=limit) + + if test_series: + exp = pd.Series(exp_vals * 2) + exp.name = 'vals' + grp = grp['vals'] + result = get_result(grp) + tm.assert_series_equal(result, exp) + else: + exp = DataFrame({'vals': exp_vals * 2}) + result = get_result(grp) + tm.assert_frame_equal(result, exp) From 4131149fa9ac1d6d4a405589a1c6e63187db0662 Mon Sep 17 00:00:00 2001 From: Stijn Van Hoey Date: Sat, 10 Mar 2018 10:15:41 +0100 Subject: [PATCH 
15/18] DOC: Extend docstring pandas core index to_frame method (#20036) --- pandas/core/indexes/base.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 69a07a91838e1..7e6ae88a26e7c 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1148,7 +1148,26 @@ def to_frame(self, index=True): Returns ------- - DataFrame : a DataFrame containing the original Index data. + DataFrame + DataFrame containing the original Index data. + + Examples + -------- + >>> idx = pd.Index(['Ant', 'Bear', 'Cow'], name='animal') + >>> idx.to_frame() + animal + animal + Ant Ant + Bear Bear + Cow Cow + + By default, the original Index is reused. To enforce a new Index: + + >>> idx.to_frame(index=False) + animal + 0 Ant + 1 Bear + 2 Cow """ from pandas import DataFrame From 2e6b4b186ea7c73067f84771a89e81128a8d177c Mon Sep 17 00:00:00 2001 From: Iva Koevska Date: Fri, 9 Mar 2018 00:19:00 +0200 Subject: [PATCH 16/18] Reworked doc string for pandas.cut --- pandas/core/reshape/tile.py | 71 +++++++++++++++++++++---------------- 1 file changed, 40 insertions(+), 31 deletions(-) diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 30132ddc05c40..077bb75a6f7df 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -26,53 +26,62 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, include_lowest=False): """ - Return indices of half-open bins to which each value of `x` belongs. - + Return indices of half-open `bins` to which each value of `x` belongs. + Parameters ---------- x : array-like Input array to be binned. It has to be 1-dimensional. - bins : int, sequence of scalars, or IntervalIndex - If `bins` is an int, it defines the number of equal-width bins in the - range of `x`. However, in this case, the range of `x` is extended - by .1% on each side to include the min or max values of `x`. 
If - `bins` is a sequence it defines the bin edges allowing for - non-uniform bin width. No extension of the range of `x` is done in - this case. - right : bool, optional - Indicates whether the bins include the rightmost edge or not. If - right == True (the default), then the bins [1,2,3,4] indicate + bins : int, sequence of scalars, or pandas.IntervalIndex + If `bins` is an int, defines the number of equal-width bins in the + range of `x`. The range of `x` is extended by .1% on each side to + include the min or max values of `x`. + If `bins` is a sequence, defines the bin edges allowing for + non-uniform bin width. No extension of the range of `x` is done. + right : bool, optional, default 'True' + Indicates whether the `bins` include the rightmost edge or not. If + `right == True` (the default), then the `bins` [1,2,3,4] indicate (1,2], (2,3], (3,4]. - labels : array or boolean, default None - Used as labels for the resulting bins. Must be of the same length as - the resulting bins. If False, return only integer indicators of the - bins. - retbins : bool, optional - Whether to return the bins or not. Can be useful if bins is given + labels : array or bool, optional + Used as labels for the resulting `bins`. Must be of the same length as + the resulting `bins`. If False, returns only integer indicators of the + `bins`. + retbins : bool, optional, default 'False' + Whether to return the `bins` or not. Useful when `bins` is provided as a scalar. - precision : int, optional - The precision at which to store and display the bins labels - include_lowest : bool, optional + precision : int, optional, default '3' + The precision at which to store and display the `bins` labels. + include_lowest : bool, optional, default 'False' Whether the first interval should be left-inclusive or not. 
Returns ------- - out : Categorical or Series or array of integers if labels is False - The return type (Categorical or Series) depends on the input: a Series - of type category if input is a Series else Categorical. Bins are - represented as categories when categorical data is returned. - bins : ndarray of floats - Returned only if `retbins` is True. + out : pandas.Categorical or Series, or array of integers if `labels` is 'False' + The return type depends on the input. If the input is a Series, a Series + of type category is returned. Else - pandas.Categorical is returned. + `Bins` are represented as categories when categorical data is returned. + bins : numpy.ndarray of floats + Returned only if `retbins` is 'True'. + + See Also + -------- + qcut : Discretize variable into equal-sized buckets based on rank + or based on sample quantiles. + pandas.Categorical : Represents a categorical variable in + classic R / S-plus fashion. + Series : One-dimensional ndarray with axis labels (including time series). + pandas.IntervalIndex : Immutable Index implementing an ordered, sliceable set. + IntervalIndex represents an Index of intervals that are all closed on the + same side. Notes ----- - The `cut` function can be useful for going from a continuous variable to + The `cut` function is useful for going from a continuous variable to a categorical variable. For example, `cut` could convert ages to groups of age ranges. - Any NA values will be NA in the result. Out of bounds values will be NA in - the resulting Categorical object - + Any NA values will be NA in the result. Out of bounds values will be NA in + the resulting pandas.Categorical object. 
Examples -------- From 1d392e4c58a3beba09878a6657f0d4254fcf482b Mon Sep 17 00:00:00 2001 From: Iva Koevska Date: Fri, 9 Mar 2018 00:52:50 +0200 Subject: [PATCH 17/18] Fixed example and extended descr --- pandas/core/reshape/tile.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 077bb75a6f7df..d9fd2e6f71bc5 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -27,6 +27,11 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, include_lowest=False): """ Return indices of half-open `bins` to which each value of `x` belongs. + + Use `cut` when you need to segment and sort data values into bins or + buckets of data. This function is also useful for going from a continuous + variable to a categorical variable. For example, `cut` could convert ages + to groups of age ranges. Parameters ---------- @@ -76,10 +81,6 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, Notes ----- - The `cut` function is useful for going from a continuous variable to - a categorical variable. For example, `cut` could convert ages to groups - of age ranges. - Any NA values will be NA in the result. Out of bounds values will be NA in the resulting pandas.Categorical object. 
@@ -97,7 +98,7 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, Categories (3, object): [good < medium < bad] >>> pd.cut(np.ones(5), 4, labels=False) - array([1, 1, 1, 1, 1]) + array([1, 1, 1, 1, 1], dtype=int64) """ # NOTE: this binning code is changed a bit from histogram for var(x) == 0 From 2387be974816f0efb3c957b9672442ae701d88c0 Mon Sep 17 00:00:00 2001 From: Iva Koevska Date: Fri, 9 Mar 2018 07:45:44 +0200 Subject: [PATCH 18/18] DOC: Fixed issues with panda.cut after flake8 --- pandas/core/reshape/tile.py | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index d9fd2e6f71bc5..6d55e2a3a2fb1 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -28,19 +28,19 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, """ Return indices of half-open `bins` to which each value of `x` belongs. - Use `cut` when you need to segment and sort data values into bins or + Use `cut` when you need to segment and sort data values into bins or buckets of data. This function is also useful for going from a continuous variable to a categorical variable. For example, `cut` could convert ages to groups of age ranges. - + Parameters ---------- x : array-like Input array to be binned. It has to be 1-dimensional. bins : int, sequence of scalars, or pandas.IntervalIndex If `bins` is an int, defines the number of equal-width bins in the - range of `x`. The range of `x` is extended by .1% on each side to - include the min or max values of `x`. + range of `x`. The range of `x` is extended by .1% on each side to + include the min or max values of `x`. If `bins` is a sequence, defines the bin edges allowing for non-uniform bin width. No extension of the range of `x` is done. 
right : bool, optional, default 'True' @@ -61,23 +61,24 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, Returns ------- - out : pandas.Categorical or Series, or array of integers if `labels` is 'False' - The return type depends on the input. If the input is a Series, a Series - of type category is returned. Else - pandas.Categorical is returned. - `Bins` are represented as categories when categorical data is returned. + out : pandas.Categorical or Series, or array of int if `labels` is 'False' + The return type depends on the input. + If the input is a Series, a Series of type category is returned. + Else - pandas.Categorical is returned. `Bins` are represented as + categories when categorical data is returned. bins : numpy.ndarray of floats Returned only if `retbins` is 'True'. - + See Also -------- - qcut : Discretize variable into equal-sized buckets based on rank + qcut : Discretize variable into equal-sized buckets based on rank or based on sample quantiles. - pandas.Categorical : Represents a categorical variable in + pandas.Categorical : Represents a categorical variable in classic R / S-plus fashion. Series : One-dimensional ndarray with axis labels (including time series). - pandas.IntervalIndex : Immutable Index implementing an ordered, sliceable set. - IntervalIndex represents an Index of intervals that are all closed on the - same side. + pandas.IntervalIndex : Immutable Index implementing an ordered, + sliceable set. IntervalIndex represents an Index of intervals that + are all closed on the same side. Notes -----