From 937614454f81319a72d505609837c74af6ea5e6b Mon Sep 17 00:00:00 2001 From: pedrooa Date: Mon, 25 May 2020 22:44:02 -0300 Subject: [PATCH 1/7] BUG: Fixes Issue#34224 --- pandas/core/frame.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 2d181e826c2a9..4f6100d510e89 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -7219,8 +7219,12 @@ def aggregate(self, func, axis=0, *args, **kwargs): result = None try: result, how = self._aggregate(func, axis=axis, *args, **kwargs) - except TypeError: - pass + except TypeError as err: + exc = TypeError( + "DataFrame constructor called with " + f"incompatible data and dtype: {err}" + ) + raise exc from err if result is None: return self.apply(func, axis=axis, args=args, **kwargs) return result From 89d7384df8440a0f628b1d04dc6b3c3e51416f34 Mon Sep 17 00:00:00 2001 From: pedrooa Date: Wed, 27 May 2020 20:31:29 -0300 Subject: [PATCH 2/7] Test added --- pandas/tests/series/test_apply.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/pandas/tests/series/test_apply.py b/pandas/tests/series/test_apply.py index 589f8933efa96..f50399289b32c 100644 --- a/pandas/tests/series/test_apply.py +++ b/pandas/tests/series/test_apply.py @@ -42,7 +42,7 @@ def test_apply(self, datetime_series): def test_apply_same_length_inference_bug(self): s = Series([1, 2]) - f = lambda x: (x, x + 1) + def f(x): return (x, x + 1) result = s.apply(f) expected = s.map(f) @@ -56,7 +56,7 @@ def test_apply_same_length_inference_bug(self): def test_apply_dont_convert_dtype(self): s = Series(np.random.randn(10)) - f = lambda x: x if x > 0 else np.nan + def f(x): return x if x > 0 else np.nan result = s.apply(f, convert_dtype=False) assert result.dtype == object @@ -459,6 +459,12 @@ def test_agg_cython_table_raises(self, series, func, expected): # e.g. 
Series('a b'.split()).cumprod() will raise series.agg(func) + def test_transform_none_to_type(self): + df = pd.DataFrame({"a": [None]}) + + with pytest.raises(TypeError): + df.transform({"a": int}) + class TestSeriesMap: def test_map(self, datetime_series): From f7572aad811e6f712ec44d6e747de4e6a6e071fe Mon Sep 17 00:00:00 2001 From: pedrooa Date: Wed, 27 May 2020 20:39:32 -0300 Subject: [PATCH 3/7] TST: fix code formatting --- pandas/tests/series/test_apply.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pandas/tests/series/test_apply.py b/pandas/tests/series/test_apply.py index f50399289b32c..9653c3af75ef1 100644 --- a/pandas/tests/series/test_apply.py +++ b/pandas/tests/series/test_apply.py @@ -42,7 +42,9 @@ def test_apply(self, datetime_series): def test_apply_same_length_inference_bug(self): s = Series([1, 2]) - def f(x): return (x, x + 1) + + def f(x): + return (x, x + 1) result = s.apply(f) expected = s.map(f) @@ -56,7 +58,9 @@ def f(x): return (x, x + 1) def test_apply_dont_convert_dtype(self): s = Series(np.random.randn(10)) - def f(x): return x if x > 0 else np.nan + def f(x): + return x if x > 0 else np.nan + result = s.apply(f, convert_dtype=False) assert result.dtype == object From 22f2859ff03eebda6ce8c5c5d2d2fe9c1879fbf0 Mon Sep 17 00:00:00 2001 From: pedrooa Date: Wed, 27 May 2020 23:44:31 -0300 Subject: [PATCH 4/7] TST: including issue number and matching error message --- pandas/tests/series/test_apply.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/tests/series/test_apply.py b/pandas/tests/series/test_apply.py index 9653c3af75ef1..e6f86dda05893 100644 --- a/pandas/tests/series/test_apply.py +++ b/pandas/tests/series/test_apply.py @@ -464,9 +464,11 @@ def test_agg_cython_table_raises(self, series, func, expected): series.agg(func) def test_transform_none_to_type(self): + # GH34377 df = pd.DataFrame({"a": [None]}) - with pytest.raises(TypeError): + msg = "DataFrame constructor called with 
incompatible data and dtype" + with pytest.raises(TypeError, match=msg): df.transform({"a": int}) From a25a993026c74c536c7e4acd7666003c03706e51 Mon Sep 17 00:00:00 2001 From: pedrooa Date: Mon, 1 Jun 2020 18:40:50 -0300 Subject: [PATCH 5/7] whats new note --- doc/source/whatsnew/v1.1.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index a3499f857d158..19223c4364b90 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -851,6 +851,7 @@ Reshaping - Bug in :func:`cut` raised an error when non-unique labels (:issue:`33141`) - Bug in :meth:`DataFrame.replace` casts columns to ``object`` dtype if items in ``to_replace`` not in values (:issue:`32988`) - Ensure only named functions can be used in :func:`eval()` (:issue:`32460`) +- Bug in :func:`aggregate` was causing recursive loop in some cases (:issue:`34224`) Sparse ^^^^^^ From d3493af0a1e94d5c23a54dafdad7aa3290040446 Mon Sep 17 00:00:00 2001 From: pedrooa Date: Tue, 2 Jun 2020 20:21:50 -0300 Subject: [PATCH 6/7] fix whats new note --- doc/source/whatsnew/v1.1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 19223c4364b90..0a916683e69b7 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -851,7 +851,7 @@ Reshaping - Bug in :func:`cut` raised an error when non-unique labels (:issue:`33141`) - Bug in :meth:`DataFrame.replace` casts columns to ``object`` dtype if items in ``to_replace`` not in values (:issue:`32988`) - Ensure only named functions can be used in :func:`eval()` (:issue:`32460`) -- Bug in :func:`aggregate` was causing recursive loop in some cases (:issue:`34224`) +- Bug in :func:`Dataframe.aggregate` and :func:`Series.aggregate` was causing recursive loop in some cases (:issue:`34224`) Sparse ^^^^^^ From c27f2beb34ad184ba47c62a63b6ad14470ef11e1 Mon Sep 17 00:00:00 2001 From: pedrooa 
Date: Tue, 2 Jun 2020 20:30:37 -0300 Subject: [PATCH 7/7] fix whats new note --- doc/source/whatsnew/v1.1.0.rst | 91 +++++++++++++++++++++++++++++++++- 1 file changed, 89 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 0a916683e69b7..80c1041c4bd06 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -13,6 +13,24 @@ including other versions of pandas. Enhancements ~~~~~~~~~~~~ +.. _whatsnew_110.astype_string: + +All dtypes can now be converted to ``StringDtype`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Previously, declaring or converting to :class:`StringDtype` was in general only possible if the data was already only ``str`` or nan-like (:issue:`31204`). +:class:`StringDtype` now works in all situations where ``astype(str)`` or ``dtype=str`` work: + +For example, the below now works: + +.. ipython:: python + + ser = pd.Series([1, "abc", np.nan], dtype="string") + ser + ser[0] + pd.Series([1, 2, np.nan], dtype="Int64").astype("string") + + .. _whatsnew_110.period_index_partial_string_slicing: Nonmonotonic PeriodIndex Partial String Slicing @@ -209,7 +227,7 @@ Other enhancements - :class:`Series.str` now has a `fullmatch` method that matches a regular expression against the entire string in each row of the series, similar to `re.fullmatch` (:issue:`32806`). - :meth:`DataFrame.sample` will now also allow array-like and BitGenerator objects to be passed to ``random_state`` as seeds (:issue:`32503`) - :meth:`MultiIndex.union` will now raise `RuntimeWarning` if the object inside are unsortable, pass `sort=False` to suppress this warning (:issue:`33015`) -- :class:`Series.dt` and :class:`DatatimeIndex` now have an `isocalendar` method that returns a :class:`DataFrame` with year, week, and day calculated according to the ISO 8601 calendar (:issue:`33206`). 
+- :class:`Series.dt` and :class:`DatetimeIndex` now have an `isocalendar` method that returns a :class:`DataFrame` with year, week, and day calculated according to the ISO 8601 calendar (:issue:`33206`, :issue:`34392`). - The :meth:`DataFrame.to_feather` method now supports additional keyword arguments (e.g. to set the compression) that are added in pyarrow 0.17 (:issue:`33422`). @@ -236,6 +254,7 @@ Other enhancements and :class:`~pandas.io.stata.StataWriterUTF8` (:issue:`26599`). - :meth:`HDFStore.put` now accepts `track_times` parameter. Parameter is passed to ``create_table`` method of ``PyTables`` (:issue:`32682`). - Make :class:`pandas.core.window.Rolling` and :class:`pandas.core.window.Expanding` iterable(:issue:`11704`) +- Make ``option_context`` a :class:`contextlib.ContextDecorator`, which allows it to be used as a decorator over an entire function (:issue:`34253`). .. --------------------------------------------------------------------------- @@ -341,6 +360,7 @@ Backwards incompatible API changes will now result in a float column instead of an object dtyped column (:issue:`33607`) - :meth:`Series.to_timestamp` now raises a ``TypeError`` if the axis is not a :class:`PeriodIndex`. Previously an ``AttributeError`` was raised (:issue:`33327`) - :meth:`Series.to_period` now raises a ``TypeError`` if the axis is not a :class:`DatetimeIndex`. Previously an ``AttributeError`` was raised (:issue:`33327`) +- :func:`pandas.api.types.is_string_dtype` no longer incorrectly identifies categorical series as string. ``MultiIndex.get_indexer`` interprets `method` argument differently ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -563,6 +583,53 @@ Assignment to multiple columns of a :class:`DataFrame` when some of the columns df[['a', 'c']] = 1 df +.. 
_whatsnew_110.api_breaking.groupby_consistency: + +Consistency across groupby reductions +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Using :meth:`DataFrame.groupby` with ``as_index=True`` and the aggregation ``nunique`` would include the grouping column(s) in the columns of the result. Now the grouping column(s) only appear in the index, consistent with other reductions. (:issue:`32579`) + +.. ipython:: python + + df = pd.DataFrame({"a": ["x", "x", "y", "y"], "b": [1, 1, 2, 3]}) + df + +*Previous behavior*: + +.. code-block:: ipython + + In [3]: df.groupby("a", as_index=True).nunique() + Out[4]: + a b + a + x 1 1 + y 1 2 + +*New behavior*: + +.. ipython:: python + + df.groupby("a", as_index=True).nunique() + +Using :meth:`DataFrame.groupby` with ``as_index=False`` and the function ``idxmax``, ``idxmin``, ``mad``, ``nunique``, ``sem``, ``skew``, or ``std`` would modify the grouping column. Now the grouping column remains unchanged, consistent with other reductions. (:issue:`21090`, :issue:`10355`) + +*Previous behavior*: + +.. code-block:: ipython + + In [3]: df.groupby("a", as_index=False).nunique() + Out[4]: + a b + 0 1 1 + 1 1 2 + +*New behavior*: + +.. ipython:: python + + df.groupby("a", as_index=False).nunique() + .. 
_whatsnew_110.deprecations: Deprecations @@ -588,7 +655,14 @@ Deprecations - :func:`pandas.api.types.is_categorical` is deprecated and will be removed in a future version; use `:func:pandas.api.types.is_categorical_dtype` instead (:issue:`33385`) - :meth:`Index.get_value` is deprecated and will be removed in a future version (:issue:`19728`) +- :meth:`Series.dt.week` and `Series.dt.weekofyear` are deprecated and will be removed in a future version, use :meth:`Series.dt.isocalendar().week` instead (:issue:`33595`) +- :meth:`DatetimeIndex.week` and `DatetimeIndex.weekofyear` are deprecated and will be removed in a future version, use :meth:`DatetimeIndex.isocalendar().week` instead (:issue:`33595`) +- :meth:`DatetimeArray.week` and `DatetimeArray.weekofyear` are deprecated and will be removed in a future version, use :meth:`DatetimeArray.isocalendar().week` instead (:issue:`33595`) - :meth:`DateOffset.__call__` is deprecated and will be removed in a future version, use ``offset + other`` instead (:issue:`34171`) +- Indexing an :class:`Index` object with a float key is deprecated, and will + raise an ``IndexError`` in the future. You can manually convert to an integer key + instead (:issue:`34191`). +- The ``squeeze`` keyword in the ``groupby`` function is deprecated and will be removed in a future version (:issue:`32380`) .. --------------------------------------------------------------------------- @@ -614,6 +688,8 @@ Performance improvements - Performance improvement in :func:`factorize` for nullable (integer and boolean) dtypes (:issue:`33064`). - Performance improvement in reductions (sum, prod, min, max) for nullable (integer and boolean) dtypes (:issue:`30982`, :issue:`33261`, :issue:`33442`). 
- Performance improvement in arithmetic operations between two :class:`DataFrame` objects (:issue:`32779`) +- Performance improvement in :class:`pandas.core.groupby.RollingGroupby` (:issue:`34052`) +- Performance improvement in arithmetic operations (sub, add, mul, div) for MultiIndex (:issue:`34297`) .. --------------------------------------------------------------------------- @@ -726,6 +802,7 @@ Indexing - Bug in :meth:`DataFrame.iloc.__setitem__` creating a new array instead of overwriting ``Categorical`` values in-place (:issue:`32831`) - Bug in :class:`Interval` where a :class:`Timedelta` could not be added or subtracted from a :class:`Timestamp` interval (:issue:`32023`) - Bug in :meth:`DataFrame.copy` _item_cache not invalidated after copy causes post-copy value updates to not be reflected (:issue:`31784`) +- Fixed regression in :meth:`DataFrame.loc` and :meth:`Series.loc` throwing an error when a ``datetime64[ns, tz]`` value is provided (:issue:`32395`) - Bug in `Series.__getitem__` with an integer key and a :class:`MultiIndex` with leading integer level failing to raise ``KeyError`` if the key is not present in the first level (:issue:`33355`) - Bug in :meth:`DataFrame.iloc` when slicing a single column-:class:`DataFrame`` with ``ExtensionDtype`` (e.g. 
``df.iloc[:, :1]``) returning an invalid result (:issue:`32957`) - Bug in :meth:`DatetimeIndex.insert` and :meth:`TimedeltaIndex.insert` causing index ``freq`` to be lost when setting an element into an empty :class:`Series` (:issue:33573`) @@ -733,6 +810,7 @@ Indexing - Bug in :meth:`Series.__getitem__` allowing missing labels with ``np.ndarray``, :class:`Index`, :class:`Series` indexers but not ``list``, these now all raise ``KeyError`` (:issue:`33646`) - Bug in :meth:`DataFrame.truncate` and :meth:`Series.truncate` where index was assumed to be monotone increasing (:issue:`33756`) - Indexing with a list of strings representing datetimes failed on :class:`DatetimeIndex` or :class:`PeriodIndex`(:issue:`11278`) +- Bug in :meth:`Series.at` when used with a :class:`MultiIndex` would raise an exception on valid inputs (:issue:`26989`) Missing ^^^^^^^ @@ -773,6 +851,7 @@ I/O timestamps with ``version="2.0"`` (:issue:`31652`). - Bug in :meth:`read_csv` was raising `TypeError` when `sep=None` was used in combination with `comment` keyword (:issue:`31396`) - Bug in :class:`HDFStore` that caused it to set to ``int64`` the dtype of a ``datetime64`` column when reading a DataFrame in Python 3 from fixed format written in Python 2 (:issue:`31750`) +- :func:`read_sas()` now handles dates and datetimes larger than :attr:`Timestamp.max` returning them as :class:`datetime.datetime` objects (:issue:`20927`) - Bug in :meth:`DataFrame.to_json` where ``Timedelta`` objects would not be serialized correctly with ``date_format="iso"`` (:issue:`28256`) - :func:`read_csv` will raise a ``ValueError`` when the column names passed in `parse_dates` are missing in the Dataframe (:issue:`31251`) - Bug in :meth:`read_excel` where a UTF-8 string with a high surrogate would cause a segmentation violation (:issue:`23809`) @@ -792,6 +871,8 @@ I/O - Bug in :meth:`~DataFrame.read_feather` was raising an `ArrowIOError` when reading an s3 or http file path (:issue:`29055`) - Bug in :meth:`read_parquet` 
was raising a ``FileNotFoundError`` when passed an s3 directory path. (:issue:`26388`) - Bug in :meth:`~DataFrame.to_parquet` was throwing an ``AttributeError`` when writing a partitioned parquet file to s3 (:issue:`27596`) +- Bug in :meth:`~DataFrame.to_excel` could not handle the column name `render` and was raising a ``KeyError`` (:issue:`34331`) +- Bug in :meth:`~SQLDatabase.execute` was raising a ``ProgrammingError`` for some DB-API drivers when the SQL statement contained the `%` character and no parameters were present (:issue:`34211`) Plotting ^^^^^^^^ @@ -817,11 +898,15 @@ Groupby/resample/rolling - Bug in :meth:`DataFrame.resample` where an ``AmbiguousTimeError`` would be raised when the resulting timezone aware :class:`DatetimeIndex` had a DST transition at midnight (:issue:`25758`) - Bug in :meth:`DataFrame.groupby` where a ``ValueError`` would be raised when grouping by a categorical column with read-only categories and ``sort=False`` (:issue:`33410`) - Bug in :meth:`GroupBy.first` and :meth:`GroupBy.last` where None is not preserved in object dtype (:issue:`32800`) +- Bug in :meth:`GroupBy.quantile` causes the quantiles to be shifted when the ``by`` axis contains ``NaN`` (:issue:`33200`, :issue:`33569`) - Bug in :meth:`Rolling.min` and :meth:`Rolling.max`: Growing memory usage after multiple calls when using a fixed window (:issue:`30726`) - Bug in :meth:`Series.groupby` would raise ``ValueError`` when grouping by :class:`PeriodIndex` level (:issue:`34010`) - Bug in :meth:`GroupBy.agg`, :meth:`GroupBy.transform`, and :meth:`GroupBy.resample` where subclasses are not preserved (:issue:`28330`) - Bug in :meth:`GroupBy.rolling.apply` ignores args and kwargs parameters (:issue:`33433`) -- Bug in :meth:`DataFrameGroupby.std` and :meth:`DataFrameGroupby.sem` would modify grouped-by columns when ``as_index=False`` (:issue:`10355`) +- Bug in :meth:`core.groupby.DataFrameGroupBy.apply` where the output index shape for functions returning a DataFrame which is 
equally indexed + to the input DataFrame is inconsistent. An internal heuristic to detect index mutation would behave differently for equal but not identical + indices. In particular, the result index shape might change if a copy of the input would be returned. + The behaviour now is consistent, independent of internal heuristics. (:issue:`31612`, :issue:`14927`, :issue:`13056`) Reshaping ^^^^^^^^^ @@ -852,12 +937,14 @@ Reshaping - Bug in :meth:`DataFrame.replace` casts columns to ``object`` dtype if items in ``to_replace`` not in values (:issue:`32988`) - Ensure only named functions can be used in :func:`eval()` (:issue:`32460`) - Bug in :func:`Dataframe.aggregate` and :func:`Series.aggregate` was causing recursive loop in some cases (:issue:`34224`) +- Fixed bug in :func:`melt` where melting MultiIndex columns with ``col_level`` > 0 would raise a ``KeyError`` on ``id_vars`` (:issue:`34129`) Sparse ^^^^^^ - Creating a :class:`SparseArray` from timezone-aware dtype will issue a warning before dropping timezone information, instead of doing so silently (:issue:`32501`) - Bug in :meth:`arrays.SparseArray.from_spmatrix` wrongly read scipy sparse matrix (:issue:`31991`) - Bug in :meth:`Series.sum` with ``SparseArray`` raises ``TypeError`` (:issue:`25777`) +- The repr of :class:`SparseDtype` now includes the repr of its ``fill_value`` attribute. Previously it used ``fill_value``'s string representation (:issue:`34352`) ExtensionArray ^^^^^^^^^^^^^^