From c2d75928b75507d79723899e8717c50ede42c02e Mon Sep 17 00:00:00 2001 From: datajanko Date: Wed, 18 Sep 2019 21:28:09 +0200 Subject: [PATCH 01/59] define accumulation interface for ExtensionArrays --- pandas/core/arrays/base.py | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 0778b6726d104..5946472f8031a 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -65,6 +65,7 @@ class ExtensionArray: take unique view + _accumulate _concat_same_type _formatter _from_factorized @@ -114,8 +115,9 @@ class ExtensionArray: as they only compose abstract methods. Still, a more efficient implementation may be available, and these methods can be overridden. - One can implement methods to handle array reductions. + One can implement methods to handle array accumulations or reductions. + * _accumulate * _reduce One can implement methods to handle parsing from strings that will be used @@ -407,6 +409,7 @@ def isna(self) -> ArrayLike: * ``na_values._is_boolean`` should be True * `na_values` should implement :func:`ExtensionArray._reduce` + * `na_values` should implement :func:`ExtensionArray._accumulate` * ``na_values.any`` and ``na_values.all`` should be implemented """ raise AbstractMethodError(self) @@ -992,6 +995,35 @@ def _ndarray_values(self) -> np.ndarray: """ return np.array(self) + def _accumulate(self, name, skipna=True, **kwargs): + """ + Return an array result of performing the accumulation operation. + + Parameters + ---------- + name : str + Name of the function, supported values are: + { cummin, cummax, cumsum, cumprod }. + skipna : bool, default True + If True, skip NaN values. + **kwargs + Additional keyword arguments passed to the accumulation function. + Currently, no is the only supported kwarg. 
+ + Returns + ------- + array + + Raises + ------ + TypeError : subclass does not define accumulations + """ + raise TypeError( + "cannot perform {name} with type {dtype}".format( + name=name, dtype=self.dtype + ) + ) + def _reduce(self, name, skipna=True, **kwargs): """ Return a scalar result of performing the reduction operation. From 2c149c0693d19930fb09fc8da5f1cf759892283a Mon Sep 17 00:00:00 2001 From: datajanko Date: Thu, 19 Sep 2019 20:38:42 +0200 Subject: [PATCH 02/59] reformulate doc string --- pandas/core/arrays/base.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 5946472f8031a..994c9584fe68c 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -409,7 +409,6 @@ def isna(self) -> ArrayLike: * ``na_values._is_boolean`` should be True * `na_values` should implement :func:`ExtensionArray._reduce` - * `na_values` should implement :func:`ExtensionArray._accumulate` * ``na_values.any`` and ``na_values.all`` should be implemented """ raise AbstractMethodError(self) @@ -995,20 +994,27 @@ def _ndarray_values(self) -> np.ndarray: """ return np.array(self) - def _accumulate(self, name, skipna=True, **kwargs): + def _accumulate(self, name, skipna=True, **kwargs) -> ABCExtensionArray: """ - Return an array result of performing the accumulation operation. + Return an ExtensionArray performing the accumulation operation. + The underlying data type might change + # TODO Clarify Parameters ---------- name : str Name of the function, supported values are: - { cummin, cummax, cumsum, cumprod }. + # TODO Add function signatures + - cummin + - cummax + - cumsum + - cumprod skipna : bool, default True - If True, skip NaN values. + If True, skip NA values. **kwargs Additional keyword arguments passed to the accumulation function. - Currently, no is the only supported kwarg. + # TODO check if kwargs are needed + Currently, there is no supported kwarg. 
Returns ------- From 79cea1138c229f57a45727219b41948971715dbf Mon Sep 17 00:00:00 2001 From: datajanko Date: Thu, 19 Sep 2019 20:50:46 +0200 Subject: [PATCH 03/59] creates baseExtension tests for accumulate --- pandas/tests/extension/base/accumulate.py | 60 +++++++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 pandas/tests/extension/base/accumulate.py diff --git a/pandas/tests/extension/base/accumulate.py b/pandas/tests/extension/base/accumulate.py new file mode 100644 index 0000000000000..79a5d93b37469 --- /dev/null +++ b/pandas/tests/extension/base/accumulate.py @@ -0,0 +1,60 @@ +import warnings + +import pytest + +import pandas as pd +import pandas.util.testing as tm + +from .base import BaseExtensionTests + + +class BaseAccumulateTests(BaseExtensionTests): + """ + Accumulation specific tests. Generally these only + make sense for numeric/boolean operations. + """ + + def check_accumulate(self, s, op_name, skipna): + result = getattr(s, op_name)(skipna=skipna) + expected = getattr(s.astype("float64"), op_name)(skipna=skipna) + tm.assert_almost_equal(result, expected) + + +class BaseNoAccumulateTests(BaseAccumulateTests): + """ we don't define any accumulation """ + + @pytest.mark.parametrize("skipna", [True, False]) + def test_accumulate_series_numeric(self, data, all_numeric_accumulations, skipna): + op_name = all_numeric_accumulations + s = pd.Series(data) + + with pytest.raises(TypeError): + getattr(s, op_name)(skipna=skipna) + + @pytest.mark.parametrize("skipna", [True, False]) + def test_accumulate_series_boolean(self, data, all_boolean_accumulations, skipna): + op_name = all_boolean_accumulations + s = pd.Series(data) + + with pytest.raises(TypeError): + getattr(s, op_name)(skipna=skipna) + + +class BaseNumericAccumulateTests(BaseAccumulateTests): + @pytest.mark.parametrize("skipna", [True, False]) + def test_accumulate_series(self, data, all_numeric_accumulations, skipna): + op_name = all_numeric_accumulations + s = pd.Series(data) + 
+ # min/max with empty produce numpy warnings + with warnings.catch_warnings(): + warnings.simplefilter("ignore", RuntimeWarning) + self.check_accumulate(s, op_name, skipna) + + +class BaseBooleanAccumulateTests(BaseAccumulateTests): + @pytest.mark.parametrize("skipna", [True, False]) + def test_accumulate_series(self, data, all_boolean_accumulations, skipna): + op_name = all_boolean_accumulations + s = pd.Series(data) + self.check_accumulate(s, op_name, skipna) From 12a5ca30f000a137c6986e8798bba21ef64870a7 Mon Sep 17 00:00:00 2001 From: datajanko Date: Fri, 4 Oct 2019 22:05:05 +0200 Subject: [PATCH 04/59] adds fixtures for numeric_accumulations --- pandas/conftest.py | 11 +++++++++++ pandas/tests/extension/base/__init__.py | 1 + pandas/tests/extension/base/accumulate.py | 16 ---------------- 3 files changed, 12 insertions(+), 16 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index b032e14d8f7e1..07228949daf8f 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -229,6 +229,17 @@ def all_boolean_reductions(request): return request.param +_all_numeric_accumulations = ["cumsum", "cumprod", "cummin", "cummax"] + + +@pytest.fixture(params=_all_numeric_accumulations) +def all_numeric_accumulations(request): + """ + Fixture for numeric reduction names + """ + return request.param + + _cython_table = pd.core.base.SelectionMixin._cython_table.items() diff --git a/pandas/tests/extension/base/__init__.py b/pandas/tests/extension/base/__init__.py index 090df35bd94c9..a27352e856e73 100644 --- a/pandas/tests/extension/base/__init__.py +++ b/pandas/tests/extension/base/__init__.py @@ -40,6 +40,7 @@ class TestMyDtype(BaseDtypeTests): ``assert_series_equal`` on your base test class. 
""" +# from .accumulate import BaseNoAccumulateTests, BaseNumericAccumulateTests from .casting import BaseCastingTests # noqa from .constructors import BaseConstructorsTests # noqa from .dtype import BaseDtypeTests # noqa diff --git a/pandas/tests/extension/base/accumulate.py b/pandas/tests/extension/base/accumulate.py index 79a5d93b37469..e523e3e2c38ec 100644 --- a/pandas/tests/extension/base/accumulate.py +++ b/pandas/tests/extension/base/accumulate.py @@ -31,14 +31,6 @@ def test_accumulate_series_numeric(self, data, all_numeric_accumulations, skipna with pytest.raises(TypeError): getattr(s, op_name)(skipna=skipna) - @pytest.mark.parametrize("skipna", [True, False]) - def test_accumulate_series_boolean(self, data, all_boolean_accumulations, skipna): - op_name = all_boolean_accumulations - s = pd.Series(data) - - with pytest.raises(TypeError): - getattr(s, op_name)(skipna=skipna) - class BaseNumericAccumulateTests(BaseAccumulateTests): @pytest.mark.parametrize("skipna", [True, False]) @@ -50,11 +42,3 @@ def test_accumulate_series(self, data, all_numeric_accumulations, skipna): with warnings.catch_warnings(): warnings.simplefilter("ignore", RuntimeWarning) self.check_accumulate(s, op_name, skipna) - - -class BaseBooleanAccumulateTests(BaseAccumulateTests): - @pytest.mark.parametrize("skipna", [True, False]) - def test_accumulate_series(self, data, all_boolean_accumulations, skipna): - op_name = all_boolean_accumulations - s = pd.Series(data) - self.check_accumulate(s, op_name, skipna) From dc959f45050c8e550dc88fdc7c7c5f17d797b823 Mon Sep 17 00:00:00 2001 From: datajanko Date: Wed, 13 Nov 2019 21:45:43 +0100 Subject: [PATCH 05/59] fixes typos --- pandas/conftest.py | 2 +- pandas/tests/extension/base/accumulate.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 07228949daf8f..a63f444bc30ef 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -235,7 +235,7 @@ def all_boolean_reductions(request): 
@pytest.fixture(params=_all_numeric_accumulations) def all_numeric_accumulations(request): """ - Fixture for numeric reduction names + Fixture for numeric accumulation names """ return request.param diff --git a/pandas/tests/extension/base/accumulate.py b/pandas/tests/extension/base/accumulate.py index e523e3e2c38ec..e35aa5b198a09 100644 --- a/pandas/tests/extension/base/accumulate.py +++ b/pandas/tests/extension/base/accumulate.py @@ -21,7 +21,7 @@ def check_accumulate(self, s, op_name, skipna): class BaseNoAccumulateTests(BaseAccumulateTests): - """ we don't define any accumulation """ + """ we don't define any accumulations """ @pytest.mark.parametrize("skipna", [True, False]) def test_accumulate_series_numeric(self, data, all_numeric_accumulations, skipna): From bcfb8a835fd319adfa1d2525434ca8e67ad374f0 Mon Sep 17 00:00:00 2001 From: datajanko Date: Tue, 10 Dec 2019 22:21:19 +0100 Subject: [PATCH 06/59] adds accumulate tests for integer arrays --- pandas/tests/extension/base/__init__.py | 2 +- pandas/tests/extension/base/accumulate.py | 1 - pandas/tests/extension/test_integer.py | 4 ++++ 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/tests/extension/base/__init__.py b/pandas/tests/extension/base/__init__.py index a27352e856e73..0497b631f693e 100644 --- a/pandas/tests/extension/base/__init__.py +++ b/pandas/tests/extension/base/__init__.py @@ -40,7 +40,7 @@ class TestMyDtype(BaseDtypeTests): ``assert_series_equal`` on your base test class. 
""" -# from .accumulate import BaseNoAccumulateTests, BaseNumericAccumulateTests +from .accumulate import BaseNoAccumulateTests, BaseNumericAccumulateTests # noqa from .casting import BaseCastingTests # noqa from .constructors import BaseConstructorsTests # noqa from .dtype import BaseDtypeTests # noqa diff --git a/pandas/tests/extension/base/accumulate.py b/pandas/tests/extension/base/accumulate.py index e35aa5b198a09..0f386ee9f06eb 100644 --- a/pandas/tests/extension/base/accumulate.py +++ b/pandas/tests/extension/base/accumulate.py @@ -38,7 +38,6 @@ def test_accumulate_series(self, data, all_numeric_accumulations, skipna): op_name = all_numeric_accumulations s = pd.Series(data) - # min/max with empty produce numpy warnings with warnings.catch_warnings(): warnings.simplefilter("ignore", RuntimeWarning) self.check_accumulate(s, op_name, skipna) diff --git a/pandas/tests/extension/test_integer.py b/pandas/tests/extension/test_integer.py index d051345fdd12d..8b0229bcac19a 100644 --- a/pandas/tests/extension/test_integer.py +++ b/pandas/tests/extension/test_integer.py @@ -229,6 +229,10 @@ class TestBooleanReduce(base.BaseBooleanReduceTests): pass +class TestNumeriAccumulation(base.BaseNumericAccumulateTests): + pass + + class TestPrinting(base.BasePrintingTests): pass From 9a8f4ec98e2481f2f917da4268faa19c3dfabd3f Mon Sep 17 00:00:00 2001 From: datajanko Date: Thu, 12 Dec 2019 18:32:00 +0100 Subject: [PATCH 07/59] fixes typo --- pandas/tests/extension/test_integer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/extension/test_integer.py b/pandas/tests/extension/test_integer.py index 8b0229bcac19a..9ffe1241fe208 100644 --- a/pandas/tests/extension/test_integer.py +++ b/pandas/tests/extension/test_integer.py @@ -229,7 +229,7 @@ class TestBooleanReduce(base.BaseBooleanReduceTests): pass -class TestNumeriAccumulation(base.BaseNumericAccumulateTests): +class TestNumericAccumulation(base.BaseNumericAccumulateTests): pass From 
5d837d9def42a72d2fed3232e18f9c70fb9c8261 Mon Sep 17 00:00:00 2001 From: datajanko Date: Thu, 9 Jan 2020 21:22:58 +0100 Subject: [PATCH 08/59] first implementation of cumsum --- pandas/core/arrays/integer.py | 50 +++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 7b03bf35faf25..007a02a0f8e8b 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -642,6 +642,56 @@ def cmp_method(self, other): name = "__{name}__".format(name=op.__name__) return set_function_name(cmp_method, name, cls) + def _accumulate(self, name, skipna=True, **kwargs): + data = self._data + mask = self._mask + + from ..nanops import _get_values + + if name == "cumsum": + fill_value = 0 + if name == "cumprod": + fill_value = 1 + if name == "cummax": + fill_value = data.min() + if name == "cummin": + fill_value = data.max() + + values, mask, dtype, dtype_max, fill_value = _get_values( + data, skipna=True, fill_value=fill_value, mask=mask, + ) + + if name == "cumsum": + return IntegerArray(values.cumsum(dtype=dtype_max), mask) + # # cumsum impute with 0 just add, afterwards replace again if needed + # # cumprod replace nan by 1, cumprod the, maybe float here necessary? + # # cummax, impute by min value np.maximum accumulate. Replace again + # # cummin, impute by max value, np.minimum.accumulate. 
replace again afterwards + # # coerce to a nan-aware float if needed + # if mask.any(): + # data = self._data.astype("float64") + # data[mask] = self._na_value + + # from ..nanops import _get_values + + # data[mask] = 0 + # self._data = data.cumsum() + + # values, mask, dtype, dtype_max, _ = _ get_values( + # data, skipna, fill_value=0, mask=mask + # ) + # the_cumsum = values.cumsum(axis=0, dtype=dtype_max) + # the_cumsum =_maybe_null_out(the_cumsum, axis=0, mask=mask, + # values=values.shape, min_count=min_count) + + # # TODO: check in nanops: + # # - _get_values + # # - _maybe_null_out + # # - _wrap_restuls + # # - _maybe_get_mask + + # return self + def _reduce(self, name, skipna=True, **kwargs): data = self._data mask = self._mask From 73363bfdde30eda50a90b26688f788eb9207de49 Mon Sep 17 00:00:00 2001 From: datajanko Date: Sun, 15 Mar 2020 08:51:14 +0100 Subject: [PATCH 09/59] stashed merge conflict --- pandas/core/generic.py | 19 +++++++++++++++++++ pandas/tests/extension/test_categorical.py | 2 ++ 2 files changed, 21 insertions(+) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 8d56311331d4d..bec901a3d69f6 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -100,6 +100,7 @@ from pandas.core.internals import BlockManager from pandas.core.missing import find_valid_index from pandas.core.ops import _align_method_FRAME +from pandas.core.dtypes.base import ExtensionDtype from pandas.io.formats import format as fmt from pandas.io.formats.format import DataFrameFormatter, format_percentiles @@ -11197,6 +11198,9 @@ def cum_func(self, axis=None, skipna=True, *args, **kwargs): axis = self._stat_axis_number else: axis = self._get_axis_number(axis) + + if issubclass(self.dtype, ExtensionDtype): + return self._accumulate(name, skipna=skipna) if axis == 1: return cum_func(self.T, axis=0, skipna=skipna, *args, **kwargs).T @@ -11211,6 +11215,21 @@ def block_accum_func(blk_values): result = self._data.apply(block_accum_func) + # y = 
com.values_from_object(self).copy() + + # if skipna and issubclass(y.dtype.type, (np.datetime64, np.timedelta64)): + # result = accum_func(y, axis) + # mask = isna(self) + # np.putmask(result, mask, iNaT) + # elif skipna and not issubclass(y.dtype.type, (np.integer, np.bool_)): + # mask = isna(self) + # np.putmask(y, mask, mask_a) + # result = accum_func(y, axis) + # np.putmask(result, mask, mask_b) + # # TODO: probably here, we need to call self._accumulate if the proper subclass is available + # else: + # result = accum_func(y, axis) + d = self._construct_axes_dict() d["copy"] = False return self._constructor(result, **d).__finalize__(self) diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index 059d3453995bd..1d5083739431f 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -161,6 +161,8 @@ def test_fillna_limit_backfill(self, data_missing): class TestReduce(base.BaseNoReduceTests): pass +class TestAccumulate(base.BaseNoAccumulateTests): + pass class TestMethods(base.BaseMethodsTests): @pytest.mark.skip(reason="Unobserved categories included") From 0d9a3d582f315cc8398939d6d5f1651684e4001a Mon Sep 17 00:00:00 2001 From: datajanko Date: Sun, 15 Mar 2020 08:55:50 +0100 Subject: [PATCH 10/59] fixes formatting --- pandas/core/generic.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index bec901a3d69f6..ba68718059db7 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -56,6 +56,7 @@ validate_percentile, ) +from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.common import ( ensure_int64, ensure_object, @@ -100,7 +101,6 @@ from pandas.core.internals import BlockManager from pandas.core.missing import find_valid_index from pandas.core.ops import _align_method_FRAME -from pandas.core.dtypes.base import ExtensionDtype from pandas.io.formats import format 
as fmt from pandas.io.formats.format import DataFrameFormatter, format_percentiles @@ -11198,7 +11198,7 @@ def cum_func(self, axis=None, skipna=True, *args, **kwargs): axis = self._stat_axis_number else: axis = self._get_axis_number(axis) - + if issubclass(self.dtype, ExtensionDtype): return self._accumulate(name, skipna=skipna) @@ -11226,10 +11226,11 @@ def block_accum_func(blk_values): # np.putmask(y, mask, mask_a) # result = accum_func(y, axis) # np.putmask(result, mask, mask_b) - # # TODO: probably here, we need to call self._accumulate if the proper subclass is available + # # TODO: probably here, we need to call + # self._accumulate if the proper subclass is available # else: # result = accum_func(y, axis) - + d = self._construct_axes_dict() d["copy"] = False return self._constructor(result, **d).__finalize__(self) From 84a7d81236a7b3eb348ac39db6cc807928fea4a2 Mon Sep 17 00:00:00 2001 From: datajanko Date: Mon, 23 Mar 2020 22:29:50 +0100 Subject: [PATCH 11/59] first green test for integer extension arrays and cumsum --- pandas/core/generic.py | 6 ++++-- pandas/tests/extension/test_categorical.py | 2 ++ pandas/tests/extension/test_integer.py | 7 ++++++- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index ba68718059db7..150a01ae45cf5 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -11199,8 +11199,10 @@ def cum_func(self, axis=None, skipna=True, *args, **kwargs): else: axis = self._get_axis_number(axis) - if issubclass(self.dtype, ExtensionDtype): - return self._accumulate(name, skipna=skipna) + # mimicking from series._reduce, which delegates + delegate = self._values + if isinstance(delegate.dtype, ExtensionDtype): + return delegate._accumulate(name, skipna=skipna, **kwargs) if axis == 1: return cum_func(self.T, axis=0, skipna=skipna, *args, **kwargs).T diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index 
1d5083739431f..0f01b5677cd85 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -161,9 +161,11 @@ def test_fillna_limit_backfill(self, data_missing): class TestReduce(base.BaseNoReduceTests): pass + class TestAccumulate(base.BaseNoAccumulateTests): pass + class TestMethods(base.BaseMethodsTests): @pytest.mark.skip(reason="Unobserved categories included") def test_value_counts(self, all_data, dropna): diff --git a/pandas/tests/extension/test_integer.py b/pandas/tests/extension/test_integer.py index 80ffae1b9f596..adb62dc27fc03 100644 --- a/pandas/tests/extension/test_integer.py +++ b/pandas/tests/extension/test_integer.py @@ -249,7 +249,12 @@ class TestBooleanReduce(base.BaseBooleanReduceTests): class TestNumericAccumulation(base.BaseNumericAccumulateTests): - pass + def check_accumulate(self, s, op_name, skipna): + # overwrite to ensure pd.NA is tested instead of np.nan + # https://github.com/pandas-dev/pandas/issues/30958 + result = getattr(s, op_name)(skipna=skipna) + expected = integer_array(getattr(s.astype("float64"), op_name)(skipna=skipna)) + tm.assert_extension_array_equal(result, expected) class TestPrinting(base.BasePrintingTests): From ce6869df4d549a53fc88eaa9705c1dee168298ea Mon Sep 17 00:00:00 2001 From: datajanko Date: Thu, 2 Apr 2020 21:15:47 +0200 Subject: [PATCH 12/59] first passing tests for cummin and cummax --- pandas/core/arrays/integer.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 0cb4bfb2c1539..425a011af96b9 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -589,6 +589,13 @@ def _accumulate(self, name: str, skipna: bool = True, **kwargs): if name == "cumsum": return IntegerArray(values.cumsum(dtype=dtype_max), mask) + elif name == "cumprod": + return IntegerArray(values.cumprod(dtype=dtype_max), mask) + elif name == "cummax": + return 
np.maximum.accumulate(IntegerArray(values, mask)) + elif name == "cummin": + return np.minimum.accumulate(IntegerArray(values, mask)) + # # cumsum impute with 0 just add, afterwards replace again if needed # # cumprod replace nan by 1, cumprod the, maybe float here necessary? # # cummax, impute by min value np.maximum accumulate. Replace again From 3b5d1d8b3a0c0763076ef8f3a51cc1d1277fe1eb Mon Sep 17 00:00:00 2001 From: datajanko Date: Sun, 5 Apr 2020 21:26:15 +0200 Subject: [PATCH 13/59] utilizes na_accum_func --- pandas/core/arrays/integer.py | 45 ++++++++++++++++++++--------------- pandas/core/generic.py | 1 + pandas/core/nanops.py | 27 +++++++++++++++++++++ 3 files changed, 54 insertions(+), 19 deletions(-) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 425a011af96b9..3c70efa391aea 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -569,32 +569,39 @@ def cmp_method(self, other): return set_function_name(cmp_method, name, cls) def _accumulate(self, name: str, skipna: bool = True, **kwargs): - data = self._data - mask = self._mask + # data = self._data + # mask = self._mask - from ..nanops import _get_values + # from ..nanops import _get_values - if name == "cumsum": - fill_value = 0 - if name == "cumprod": - fill_value = 1 - if name == "cummax": - fill_value = data.min() - if name == "cummin": - fill_value = data.max() - - values, mask, dtype, dtype_max, fill_value = _get_values( - data, skipna=True, fill_value=fill_value, mask=mask, - ) + # if name == "cumsum": + # fill_value = 0 + # if name == "cumprod": + # fill_value = 1 + # if name == "cummax": + # fill_value = data.min() + # if name == "cummin": + # fill_value = data.max() + + # values, mask, dtype, dtype_max, fill_value = _get_values( + # data, skipna=True, fill_value=fill_value, mask=mask, + # ) + from ..nanops import na_accum_func if name == "cumsum": - return IntegerArray(values.cumsum(dtype=dtype_max), mask) + # return 
IntegerArray(values.cumsum(dtype=dtype_max), mask) + return na_accum_func(self, np.cumsum, skipna=skipna) elif name == "cumprod": - return IntegerArray(values.cumprod(dtype=dtype_max), mask) + # return IntegerArray(values.cumprod(dtype=dtype_max), mask) + return na_accum_func(self, np.cumprod, skipna=skipna) + elif name == "cummax": - return np.maximum.accumulate(IntegerArray(values, mask)) + # return np.maximum.accumulate(IntegerArray(values, mask)) + return na_accum_func(self, np.maximum.accumulate, skipna=skipna) + elif name == "cummin": - return np.minimum.accumulate(IntegerArray(values, mask)) + # return np.minimum.accumulate(IntegerArray(values, mask)) + return na_accum_func(self, np.minimum.accumulate, skipna=skipna) # # cumsum impute with 0 just add, afterwards replace again if needed # # cumprod replace nan by 1, cumprod the, maybe float here necessary? diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 150a01ae45cf5..aa25fc79caf49 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -11200,6 +11200,7 @@ def cum_func(self, axis=None, skipna=True, *args, **kwargs): axis = self._get_axis_number(axis) # mimicking from series._reduce, which delegates + # using na_accum_func_now delegate = self._values if isinstance(delegate.dtype, ExtensionDtype): return delegate._accumulate(name, skipna=skipna, **kwargs) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index a5e70bd279d21..9435953506363 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -1558,10 +1558,37 @@ def na_accum_func(values: ArrayLike, accum_func, skipna: bool) -> ArrayLike: if isinstance(values, np.ndarray): result = result.view(orig_dtype) + else: # DatetimeArray result = type(values)._from_sequence(result, dtype=orig_dtype) + from pandas.core.arrays import IntegerArray + + if isinstance(values, IntegerArray): + data = values._data + mask = values._mask + + fill_value = { + np.cumprod: 1, + np.maximum.accumulate: data.min(), + np.cumsum: 0, + 
np.minimum.accumulate: data.max(), + }[accum_func] + + values, mask, dtype, dtype_max, fill_value = _get_values( + data, skipna=skipna, fill_value=fill_value, mask=mask + ) + + if not skipna: + mask = np.maximum.accumulate(mask) + + vals = accum_func(values) + + from pandas import Series + + result = Series(IntegerArray(vals, mask)) + elif skipna and not issubclass(values.dtype.type, (np.integer, np.bool_)): vals = values.copy() mask = isna(vals) From 0337cb0405816fd2bb27d5698e86023f8d664c4a Mon Sep 17 00:00:00 2001 From: datajanko Date: Sun, 5 Apr 2020 21:30:59 +0200 Subject: [PATCH 14/59] removes delegation leftover --- pandas/core/generic.py | 6 +++--- pandas/core/nanops.py | 5 +---- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index aa25fc79caf49..e2682d340a5a8 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -11201,9 +11201,9 @@ def cum_func(self, axis=None, skipna=True, *args, **kwargs): # mimicking from series._reduce, which delegates # using na_accum_func_now - delegate = self._values - if isinstance(delegate.dtype, ExtensionDtype): - return delegate._accumulate(name, skipna=skipna, **kwargs) + # delegate = self._values + # if isinstance(delegate.dtype, ExtensionDtype): + # return delegate._accumulate(name, skipna=skipna, **kwargs) if axis == 1: return cum_func(self.T, axis=0, skipna=skipna, *args, **kwargs).T diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 9435953506363..7e9b7896c9b65 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -1584,10 +1584,7 @@ def na_accum_func(values: ArrayLike, accum_func, skipna: bool) -> ArrayLike: mask = np.maximum.accumulate(mask) vals = accum_func(values) - - from pandas import Series - - result = Series(IntegerArray(vals, mask)) + result = IntegerArray(vals, mask) elif skipna and not issubclass(values.dtype.type, (np.integer, np.bool_)): vals = values.copy() From f0722f5c42f6b0dfe25371184f27f0651d8659ab Mon 
Sep 17 00:00:00 2001 From: datajanko Date: Thu, 9 Apr 2020 21:42:33 +0200 Subject: [PATCH 15/59] creates running tests --- pandas/tests/extension/test_integer.py | 42 ++++++++++++++++++++++++-- 1 file changed, 39 insertions(+), 3 deletions(-) diff --git a/pandas/tests/extension/test_integer.py b/pandas/tests/extension/test_integer.py index adb62dc27fc03..9b5c55196bf4c 100644 --- a/pandas/tests/extension/test_integer.py +++ b/pandas/tests/extension/test_integer.py @@ -252,9 +252,45 @@ class TestNumericAccumulation(base.BaseNumericAccumulateTests): def check_accumulate(self, s, op_name, skipna): # overwrite to ensure pd.NA is tested instead of np.nan # https://github.com/pandas-dev/pandas/issues/30958 - result = getattr(s, op_name)(skipna=skipna) - expected = integer_array(getattr(s.astype("float64"), op_name)(skipna=skipna)) - tm.assert_extension_array_equal(result, expected) + if op_name == "cumsum": + if s.dtype.name.startswith("U"): + expected_dtype = "UInt64" + else: + expected_dtype = "Int64" + result = getattr(s, op_name)(skipna=skipna) + expected = pd.Series( + integer_array( + getattr(s.astype("float64"), op_name)(skipna=skipna), + dtype=expected_dtype, + ) + ) + tm.assert_series_equal(result, expected) + elif op_name in ["cummax", "cummin"]: + expected_dtype = s.dtype + result = getattr(s, op_name)(skipna=skipna) + expected = pd.Series( + integer_array( + getattr(s.astype("float64"), op_name)(skipna=skipna), + dtype=expected_dtype, + ) + ) + tm.assert_series_equal(result, expected) + elif op_name == "cumprod": + if s.dtype.name.startswith("U"): + expected_dtype = "UInt64" + else: + expected_dtype = "Int64" + result = getattr(s[:20], op_name)(skipna=skipna) + expected = pd.Series( + integer_array( + getattr(s[:20].astype("float64"), op_name)(skipna=skipna), + dtype=expected_dtype, + ) + ) + tm.assert_series_equal(result, expected) + + else: + raise class TestPrinting(base.BasePrintingTests): From fa35b141e076a9cbac3f72adfd6a2f7e190d8c22 Mon Sep 17 00:00:00 
2001 From: datajanko Date: Thu, 9 Apr 2020 22:10:48 +0200 Subject: [PATCH 16/59] removes ABCExtensionArray Type hint --- pandas/core/arrays/base.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index d11e78e3cd696..bc462aa14c2de 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1025,8 +1025,7 @@ def _concat_same_type( # of objects _can_hold_na = True - - def _accumulate(self, name, skipna=True, **kwargs) -> ABCExtensionArray: + def _accumulate(self, name, skipna=True, **kwargs): """ Return an ExtensionArray performing the accumulation operation. The underlying data type might change @@ -1062,7 +1061,6 @@ def _accumulate(self, name, skipna=True, **kwargs) -> ABCExtensionArray: ) ) - def _reduce(self, name, skipna=True, **kwargs): """ Return a scalar result of performing the reduction operation. From 185510b68cbc865a0696e35d4529b2f37b7aa36c Mon Sep 17 00:00:00 2001 From: datajanko Date: Fri, 10 Apr 2020 08:19:16 +0200 Subject: [PATCH 17/59] removes clutter from generic.py --- pandas/core/generic.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index c2c642b9c68ed..7c22a47801dd0 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -56,7 +56,6 @@ validate_percentile, ) -from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.common import ( ensure_int64, ensure_object, @@ -11181,12 +11180,6 @@ def cum_func(self, axis=None, skipna=True, *args, **kwargs): else: axis = self._get_axis_number(axis) - # mimicking from series._reduce, which delegates - # using na_accum_func_now - # delegate = self._values - # if isinstance(delegate.dtype, ExtensionDtype): - # return delegate._accumulate(name, skipna=skipna, **kwargs) - if axis == 1: return cum_func(self.T, axis=0, skipna=skipna, *args, **kwargs).T From 2ef9ebbd7a8bac636672e4e0f206c0c23a3d1241 Mon Sep 17 00:00:00 2001 From: 
datajanko Date: Fri, 10 Apr 2020 08:27:16 +0200 Subject: [PATCH 18/59] removes clutter in _accumulate --- pandas/core/arrays/integer.py | 52 ----------------------------------- 1 file changed, 52 deletions(-) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 5a0e48c8f236a..636a923e791f6 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -559,69 +559,17 @@ def cmp_method(self, other): return set_function_name(cmp_method, name, cls) def _accumulate(self, name: str, skipna: bool = True, **kwargs): - # data = self._data - # mask = self._mask - - # from ..nanops import _get_values - - # if name == "cumsum": - # fill_value = 0 - # if name == "cumprod": - # fill_value = 1 - # if name == "cummax": - # fill_value = data.min() - # if name == "cummin": - # fill_value = data.max() - - # values, mask, dtype, dtype_max, fill_value = _get_values( - # data, skipna=True, fill_value=fill_value, mask=mask, - # ) from ..nanops import na_accum_func if name == "cumsum": - # return IntegerArray(values.cumsum(dtype=dtype_max), mask) return na_accum_func(self, np.cumsum, skipna=skipna) elif name == "cumprod": - # return IntegerArray(values.cumprod(dtype=dtype_max), mask) return na_accum_func(self, np.cumprod, skipna=skipna) - elif name == "cummax": - # return np.maximum.accumulate(IntegerArray(values, mask)) return na_accum_func(self, np.maximum.accumulate, skipna=skipna) - elif name == "cummin": - # return np.minimum.accumulate(IntegerArray(values, mask)) return na_accum_func(self, np.minimum.accumulate, skipna=skipna) - # # cumsum impute with 0 just add, afterwards replace again if needed - # # cumprod replace nan by 1, cumprod the, maybe float here necessary? - # # cummax, impute by min value np.maximum accumulate. Replace again - # # cummin, impute by max value, np.minimum.accumulate. 
replace again afterwards - # # coerce to a nan-aware float if needed - # if mask.any(): - # data = self._data.astype("float64") - # data[mask] = self._na_value - - # from ..nanops import _get_values - - # data[mask] = 0 - # self._data = data.cumsum() - - # values, mask, dtype, dtype_max, _ = _ get_values( - # data, skipna, fill_value=0, mask=mask - # ) - # the_cumsum = values.cumsum(axis=0, dtype=dtype_max) - # the_cumsum =_maybe_null_out(the_cumsum, axis=0, mask=mask, - # values=values.shape, min_count=min_count) - - # # TODO: check in nanops: - # # - _get_values - # # - _maybe_null_out - # # - _wrap_restuls - # # - _maybe_get_mask - - # return self - def _reduce(self, name: str, skipna: bool = True, **kwargs): data = self._data mask = self._mask From 7d898bd50352a7381a5d23c60e524f3e8902b167 Mon Sep 17 00:00:00 2001 From: datajanko Date: Fri, 10 Apr 2020 15:52:54 +0200 Subject: [PATCH 19/59] adds typehints for ExtensionArray and IntegerArray --- pandas/core/arrays/base.py | 2 +- pandas/core/arrays/integer.py | 11 ++++++----- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index bc462aa14c2de..cd6b30ce120f4 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1025,7 +1025,7 @@ def _concat_same_type( # of objects _can_hold_na = True - def _accumulate(self, name, skipna=True, **kwargs): + def _accumulate(self, name, skipna=True, **kwargs) -> "ExtensionArray": """ Return an ExtensionArray performing the accumulation operation. 
The underlying data type might change diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 636a923e791f6..ae8447637e367 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -558,17 +558,18 @@ def cmp_method(self, other): name = f"__{op.__name__}__" return set_function_name(cmp_method, name, cls) - def _accumulate(self, name: str, skipna: bool = True, **kwargs): + def _accumulate(self, name: str, skipna: bool = True, **kwargs) -> "IntegerArray": from ..nanops import na_accum_func if name == "cumsum": - return na_accum_func(self, np.cumsum, skipna=skipna) + result = na_accum_func(self, np.cumsum, skipna=skipna) elif name == "cumprod": - return na_accum_func(self, np.cumprod, skipna=skipna) + resut = na_accum_func(self, np.cumprod, skipna=skipna) elif name == "cummax": - return na_accum_func(self, np.maximum.accumulate, skipna=skipna) + result = na_accum_func(self, np.maximum.accumulate, skipna=skipna) elif name == "cummin": - return na_accum_func(self, np.minimum.accumulate, skipna=skipna) + result = na_accum_func(self, np.minimum.accumulate, skipna=skipna) + return result def _reduce(self, name: str, skipna: bool = True, **kwargs): data = self._data From 09b42be676640d1fb66cb2cd1872e7f2d55472cc Mon Sep 17 00:00:00 2001 From: datajanko Date: Fri, 10 Apr 2020 21:25:56 +0200 Subject: [PATCH 20/59] delegates the accumulate calls to extension arrays --- pandas/core/arrays/integer.py | 30 ++++++++++++++++++++---------- pandas/core/generic.py | 6 +++++- pandas/core/nanops.py | 23 ----------------------- 3 files changed, 25 insertions(+), 34 deletions(-) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index ae8447637e367..d078bcd38f5cf 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -559,16 +559,26 @@ def cmp_method(self, other): return set_function_name(cmp_method, name, cls) def _accumulate(self, name: str, skipna: bool = True, **kwargs) -> "IntegerArray": 
- from ..nanops import na_accum_func - - if name == "cumsum": - result = na_accum_func(self, np.cumsum, skipna=skipna) - elif name == "cumprod": - resut = na_accum_func(self, np.cumprod, skipna=skipna) - elif name == "cummax": - result = na_accum_func(self, np.maximum.accumulate, skipna=skipna) - elif name == "cummin": - result = na_accum_func(self, np.minimum.accumulate, skipna=skipna) + data = self._data + mask = self._mask + + cum_function, fill_value = { + "cumprod": (np.cumprod, 1), + "cummax": (np.maximum.accumulate, data.min()), + "cumsum": (np.cumsum, 0), + "cummin": (np.minimum.accumulate, data.max()), + }[name] + from ..nanops import _get_values + + values, mask, dtype, dtype_max, fill_value = _get_values( + data, skipna=skipna, fill_value=fill_value, mask=mask + ) + + if not skipna: + mask = np.maximum.accumulate(mask) + + vals = cum_function(values) + result = IntegerArray(vals, mask) return result def _reduce(self, name: str, skipna: bool = True, **kwargs): diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 7c22a47801dd0..aa62a38dfa67b 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -84,6 +84,7 @@ import pandas as pd from pandas.core import missing, nanops import pandas.core.algorithms as algos +from pandas.core.arrays import ExtensionArray from pandas.core.base import PandasObject, SelectionMixin import pandas.core.common as com from pandas.core.construction import create_series_with_explicit_dtype @@ -11191,7 +11192,10 @@ def block_accum_func(blk_values): result = result.T if hasattr(result, "T") else result return result - result = self._mgr.apply(block_accum_func) + if isinstance(self.values, ExtensionArray): + result = self.values._accumulate(name, skipna, **kwargs) + else: + result = self._mgr.apply(block_accum_func) # y = com.values_from_object(self).copy() diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index a7dbfb51d4f23..0e68c24dc54d7 100644 --- a/pandas/core/nanops.py +++ 
b/pandas/core/nanops.py @@ -1593,29 +1593,6 @@ def na_accum_func(values: ArrayLike, accum_func, skipna: bool) -> ArrayLike: # DatetimeArray result = type(values)._from_sequence(result, dtype=orig_dtype) - from pandas.core.arrays import IntegerArray - - if isinstance(values, IntegerArray): - data = values._data - mask = values._mask - - fill_value = { - np.cumprod: 1, - np.maximum.accumulate: data.min(), - np.cumsum: 0, - np.minimum.accumulate: data.max(), - }[accum_func] - - values, mask, dtype, dtype_max, fill_value = _get_values( - data, skipna=skipna, fill_value=fill_value, mask=mask - ) - - if not skipna: - mask = np.maximum.accumulate(mask) - - vals = accum_func(values) - result = IntegerArray(vals, mask) - elif skipna and not issubclass(values.dtype.type, (np.integer, np.bool_)): vals = values.copy() mask = isna(vals) From af0dd24627d25702bc6578d54adaa459a9faab62 Mon Sep 17 00:00:00 2001 From: datajanko Date: Fri, 10 Apr 2020 21:52:39 +0200 Subject: [PATCH 21/59] removes diff in nanops --- pandas/core/nanops.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 0e68c24dc54d7..9494248a423a8 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -1588,7 +1588,6 @@ def na_accum_func(values: ArrayLike, accum_func, skipna: bool) -> ArrayLike: if isinstance(values, np.ndarray): result = result.view(orig_dtype) - else: # DatetimeArray result = type(values)._from_sequence(result, dtype=orig_dtype) From bc9a36ad5ec1a02eca3310a372a91bdc14c49554 Mon Sep 17 00:00:00 2001 From: datajanko Date: Fri, 10 Apr 2020 21:58:12 +0200 Subject: [PATCH 22/59] removes unwanted pattern --- pandas/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 9799771ca7854..680d28358f7f8 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -746,7 +746,7 @@ def all_logical_operators(request): """ return request.param - + _all_numeric_accumulations = ["cumsum", 
"cumprod", "cummin", "cummax"] From 38454a3ea4f46dccef213ae5b24aa02136947a65 Mon Sep 17 00:00:00 2001 From: datajanko Date: Sun, 12 Apr 2020 20:37:37 +0200 Subject: [PATCH 23/59] makes output types for sum and prod explicit --- pandas/core/arrays/integer.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index d078bcd38f5cf..05fb7cd2a5e30 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -577,7 +577,16 @@ def _accumulate(self, name: str, skipna: bool = True, **kwargs) -> "IntegerArray if not skipna: mask = np.maximum.accumulate(mask) - vals = cum_function(values) + # makes target dtypes explicit since CI showed optimal UInt32 + # dtype on test data occasionally. This was different across systems + dtype_out = dtype + if name in ["cumsum", "cumprod"]: + if dtype.name.lower().startswith("u"): + dtype_out = "UInt64" + else: + dtype_out = "Int64" + + vals = cum_function(values, dtype=dtype_out) result = IntegerArray(vals, mask) return result From 5ecfa516f550b7d64f8b4bd2726eb89bdfcb62c4 Mon Sep 17 00:00:00 2001 From: datajanko Date: Mon, 13 Apr 2020 08:52:17 +0200 Subject: [PATCH 24/59] makes the base accumulate test more general by not comparing types --- pandas/tests/extension/base/accumulate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/extension/base/accumulate.py b/pandas/tests/extension/base/accumulate.py index 0f386ee9f06eb..4cf28480fd3cb 100644 --- a/pandas/tests/extension/base/accumulate.py +++ b/pandas/tests/extension/base/accumulate.py @@ -17,7 +17,7 @@ class BaseAccumulateTests(BaseExtensionTests): def check_accumulate(self, s, op_name, skipna): result = getattr(s, op_name)(skipna=skipna) expected = getattr(s.astype("float64"), op_name)(skipna=skipna) - tm.assert_almost_equal(result, expected) + tm.assert_almost_equal(result, expected, check_dtype=False) class 
BaseNoAccumulateTests(BaseAccumulateTests): From 8d625943f0228583539068d9e8bec48100cf7958 Mon Sep 17 00:00:00 2001 From: datajanko Date: Mon, 13 Apr 2020 08:59:16 +0200 Subject: [PATCH 25/59] implements accumulation for boolean arrays --- pandas/core/arrays/boolean.py | 29 ++++++++++++++++++++++++++ pandas/tests/extension/test_boolean.py | 4 ++++ 2 files changed, 33 insertions(+) diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index e85534def6b97..03afc1c9c758c 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -688,6 +688,35 @@ def cmp_method(self, other): name = f"__{op.__name__}" return set_function_name(cmp_method, name, cls) + def _accumulate( + self, name: str, skipna: bool = True, **kwargs + ): # TODO Type hints not working propery here due to circular imports + data = self._data + mask = self._mask + + cum_function, fill_value = { + "cumprod": (np.cumprod, 1), + "cummax": (np.maximum.accumulate, False), + "cumsum": (np.cumsum, 0), + "cummin": (np.minimum.accumulate, True), + }[name] + from ..nanops import _get_values + + values, mask, dtype, dtype_max, fill_value = _get_values( + data, skipna=skipna, fill_value=fill_value, mask=mask + ) + + if not skipna: + mask = np.maximum.accumulate(mask) + + if name in ["cumsum", "cumprod"]: + from pandas.core.arrays import IntegerArray + + result = IntegerArray(cum_function(values, dtype="UInt64"), mask) + else: + result = BooleanArray(cum_function(values, dtype=bool), mask) + return result + def _reduce(self, name: str, skipna: bool = True, **kwargs): if name in {"any", "all"}: diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py index e2331b69916fb..4955bcc982def 100644 --- a/pandas/tests/extension/test_boolean.py +++ b/pandas/tests/extension/test_boolean.py @@ -346,6 +346,10 @@ class TestUnaryOps(base.BaseUnaryOpsTests): pass +class TestNumericAccumulation(base.BaseNumericAccumulateTests): + pass + + # TODO parsing not 
yet supported # class TestParsing(base.BaseParsingTests): # pass From 5f3b624149fc0f344db280b11702ec04833465d6 Mon Sep 17 00:00:00 2001 From: datajanko Date: Sun, 26 Apr 2020 08:30:28 +0200 Subject: [PATCH 26/59] uses f-string in base.py --- pandas/core/arrays/base.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index cd6b30ce120f4..8303c853cb1ca 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1055,11 +1055,7 @@ def _accumulate(self, name, skipna=True, **kwargs) -> "ExtensionArray": ------ TypeError : subclass does not define accumulations """ - raise TypeError( - "cannot perform {name} with type {dtype}".format( - name=name, dtype=self.dtype - ) - ) + raise TypeError(f"cannot perform {name} with type {self.dtype}") def _reduce(self, name, skipna=True, **kwargs): """ From 06d12860b41085e1f341264dcc42daefe94d8008 Mon Sep 17 00:00:00 2001 From: datajanko Date: Sat, 2 May 2020 08:29:40 +0200 Subject: [PATCH 27/59] uses blockmanager also for extension arrays --- pandas/core/generic.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index aa62a38dfa67b..8a18c9cf494c3 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -11187,15 +11187,15 @@ def cum_func(self, axis=None, skipna=True, *args, **kwargs): def block_accum_func(blk_values): values = blk_values.T if hasattr(blk_values, "T") else blk_values - result = nanops.na_accum_func(values, accum_func, skipna=skipna) + if is_extension_array_dtype(values.dtype): + result = values._accumulate(name, skipna, **kwargs) + else: + result = nanops.na_accum_func(values, accum_func, skipna=skipna) result = result.T if hasattr(result, "T") else result return result - if isinstance(self.values, ExtensionArray): - result = self.values._accumulate(name, skipna, **kwargs) - else: - result = self._mgr.apply(block_accum_func) + result = 
self._mgr.apply(block_accum_func) # y = com.values_from_object(self).copy() From f7e3f4fab7243b0ddc0237e69691f52514444290 Mon Sep 17 00:00:00 2001 From: datajanko Date: Sun, 3 May 2020 12:07:18 +0200 Subject: [PATCH 28/59] fixes flake8 issues --- pandas/core/arrays/boolean.py | 2 +- pandas/core/generic.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index e278c1d23eeb7..2f8fbd0d015b5 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -680,7 +680,7 @@ def cmp_method(self, other): def _accumulate( self, name: str, skipna: bool = True, **kwargs - ): # TODO Type hints not working propery here due to circular imports + ): # TODO Type hints not working propery here due to circular imports data = self._data mask = self._mask diff --git a/pandas/core/generic.py b/pandas/core/generic.py index de40599fd63ab..c0b21abeb2895 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -84,7 +84,6 @@ import pandas as pd from pandas.core import missing, nanops import pandas.core.algorithms as algos -from pandas.core.arrays import ExtensionArray from pandas.core.base import PandasObject, SelectionMixin import pandas.core.common as com from pandas.core.construction import create_series_with_explicit_dtype From b3ae86475aa6a5a5eed59e61fd0c41065cbd81a5 Mon Sep 17 00:00:00 2001 From: datajanko Date: Wed, 17 Jun 2020 21:46:27 +0200 Subject: [PATCH 29/59] removes uncommented code --- pandas/core/generic.py | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index b0af05736bbe1..590032e353aa4 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -11534,27 +11534,6 @@ def block_accum_func(blk_values): result = self._mgr.apply(block_accum_func) - # y = com.values_from_object(self).copy() - - # if skipna and issubclass(y.dtype.type, (np.datetime64, np.timedelta64)): - # result = 
accum_func(y, axis) - # mask = isna(self) - # np.putmask(result, mask, iNaT) - # elif skipna and not issubclass(y.dtype.type, (np.integer, np.bool_)): - # mask = isna(self) - # np.putmask(y, mask, mask_a) - # result = accum_func(y, axis) - # np.putmask(result, mask, mask_b) - # # TODO: probably here, we need to call - # self._accumulate if the proper subclass is available - # else: - # result = accum_func(y, axis) - - # TODO: check later if these 3 commands are necessary -> check for failing tests - # d = self._construct_axes_dict() - # d["copy"] = False - # return self._constructor(result, **d).__finalize__(self, method=name) - return self._constructor(result).__finalize__(self, method=name) return set_function_name(cum_func, name, cls) From 52e6486bfd82f75fc8d672f86db65767051c12ec Mon Sep 17 00:00:00 2001 From: datajanko Date: Wed, 17 Jun 2020 21:47:12 +0200 Subject: [PATCH 30/59] adds todo for runtime warning --- pandas/tests/extension/base/accumulate.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/tests/extension/base/accumulate.py b/pandas/tests/extension/base/accumulate.py index 4cf28480fd3cb..8edad7236f3dc 100644 --- a/pandas/tests/extension/base/accumulate.py +++ b/pandas/tests/extension/base/accumulate.py @@ -38,6 +38,8 @@ def test_accumulate_series(self, data, all_numeric_accumulations, skipna): op_name = all_numeric_accumulations s = pd.Series(data) + # TODO: check if needed, copied from reduce + # min/max with empty produce numpy warnings with warnings.catch_warnings(): warnings.simplefilter("ignore", RuntimeWarning) self.check_accumulate(s, op_name, skipna) From 99fb664d0b08ddba18dc65a7211783241c85aa8f Mon Sep 17 00:00:00 2001 From: datajanko Date: Mon, 22 Jun 2020 21:54:20 +0200 Subject: [PATCH 31/59] reuses integer array to accumulate for booleans --- pandas/core/arrays/boolean.py | 32 +++++--------------------------- 1 file changed, 5 insertions(+), 27 deletions(-) diff --git a/pandas/core/arrays/boolean.py 
b/pandas/core/arrays/boolean.py index d7ffbee7a4775..85cdfa44c50a2 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -653,34 +653,12 @@ def cmp_method(self, other): name = f"__{op.__name__}" return set_function_name(cmp_method, name, cls) - def _accumulate( - self, name: str, skipna: bool = True, **kwargs - ): # TODO Type hints not working propery here due to circular imports - data = self._data - mask = self._mask - - cum_function, fill_value = { - "cumprod": (np.cumprod, 1), - "cummax": (np.maximum.accumulate, False), - "cumsum": (np.cumsum, 0), - "cummin": (np.minimum.accumulate, True), - }[name] - from ..nanops import _get_values - - values, mask, dtype, dtype_max, fill_value = _get_values( - data, skipna=skipna, fill_value=fill_value, mask=mask - ) - - if not skipna: - mask = np.maximum.accumulate(mask) - - if name in ["cumsum", "cumprod"]: - from pandas.core.arrays import IntegerArray + def _accumulate(self, name: str, skipna: bool = True, **kwargs): + from pandas.arrays import IntegerArray - result = IntegerArray(cum_function(values, dtype="UInt64"), mask) - else: - result = BooleanArray(cum_function(values, dtype=bool), mask) - return result + return IntegerArray(self._data.astype("int8"), self._mask)._accumulate( + name, skipna, **kwargs + ) def _reduce(self, name: str, skipna: bool = True, **kwargs): From d339250f84d211e9a972d6b1eeb2e56c7f0daffe Mon Sep 17 00:00:00 2001 From: datajanko Date: Mon, 22 Jun 2020 22:06:05 +0200 Subject: [PATCH 32/59] removes runtimewarning catching --- pandas/tests/extension/base/accumulate.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/pandas/tests/extension/base/accumulate.py b/pandas/tests/extension/base/accumulate.py index 8edad7236f3dc..5e05b9bdbc297 100644 --- a/pandas/tests/extension/base/accumulate.py +++ b/pandas/tests/extension/base/accumulate.py @@ -1,5 +1,3 @@ -import warnings - import pytest import pandas as pd @@ -37,9 +35,4 @@ class 
BaseNumericAccumulateTests(BaseAccumulateTests): def test_accumulate_series(self, data, all_numeric_accumulations, skipna): op_name = all_numeric_accumulations s = pd.Series(data) - - # TODO: check if needed, copied from reduce - # min/max with empty produce numpy warnings - with warnings.catch_warnings(): - warnings.simplefilter("ignore", RuntimeWarning) - self.check_accumulate(s, op_name, skipna) + self.check_accumulate(s, op_name, skipna) From be6f9743d4d7475c022b45911c25d3d8d7556489 Mon Sep 17 00:00:00 2001 From: datajanko Date: Tue, 23 Jun 2020 21:38:31 +0200 Subject: [PATCH 33/59] removes TODOs --- pandas/core/arrays/base.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 0e47156d2bd73..e6a8f98d2980f 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1090,15 +1090,13 @@ def _concat_same_type( def _accumulate(self, name, skipna=True, **kwargs) -> "ExtensionArray": """ - Return an ExtensionArray performing the accumulation operation. + Return an ExtensionArray performing an accumulation operation. The underlying data type might change - # TODO Clarify Parameters ---------- name : str Name of the function, supported values are: - # TODO Add function signatures - cummin - cummax - cumsum @@ -1107,7 +1105,6 @@ def _accumulate(self, name, skipna=True, **kwargs) -> "ExtensionArray": If True, skip NA values. **kwargs Additional keyword arguments passed to the accumulation function. - # TODO check if kwargs are needed Currently, there is no supported kwarg. 
Returns From a902f4ed4e0ef944fe1d49c354de1244630acf3e Mon Sep 17 00:00:00 2001 From: datajanko Date: Tue, 23 Jun 2020 21:40:01 +0200 Subject: [PATCH 34/59] adds accumulate to autosummary --- doc/source/reference/extensions.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/reference/extensions.rst b/doc/source/reference/extensions.rst index fe4113d100abf..050b867cc8aa6 100644 --- a/doc/source/reference/extensions.rst +++ b/doc/source/reference/extensions.rst @@ -32,6 +32,7 @@ objects. .. autosummary:: :toctree: api/ + api.extensions.ExtensionArray._accumulate api.extensions.ExtensionArray._concat_same_type api.extensions.ExtensionArray._formatter api.extensions.ExtensionArray._from_factorized From 64afb5b2c147b85a48949e31d6604f19d5b70996 Mon Sep 17 00:00:00 2001 From: datajanko Date: Wed, 24 Jun 2020 20:36:13 +0200 Subject: [PATCH 35/59] excludes datetime from propagating to _accumulate --- pandas/core/generic.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 590032e353aa4..ff3c54d995435 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -11524,7 +11524,10 @@ def cum_func(self, axis=None, skipna=True, *args, **kwargs): def block_accum_func(blk_values): values = blk_values.T if hasattr(blk_values, "T") else blk_values - if is_extension_array_dtype(values.dtype): + if is_extension_array_dtype(values.dtype) and values.dtype.kind not in [ + "m", + "M", + ]: result = values._accumulate(name, skipna, **kwargs) else: result = nanops.na_accum_func(values, accum_func, skipna=skipna) From 1e5d77b217c1ff5b0ab681674a5a9cd24c4030bf Mon Sep 17 00:00:00 2001 From: datajanko Date: Mon, 29 Jun 2020 22:00:00 +0200 Subject: [PATCH 36/59] uses pandas.testing instead of pandas.util.testing in accumulate --- pandas/tests/extension/base/accumulate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/extension/base/accumulate.py 
b/pandas/tests/extension/base/accumulate.py index 5e05b9bdbc297..22cae345d8ea7 100644 --- a/pandas/tests/extension/base/accumulate.py +++ b/pandas/tests/extension/base/accumulate.py @@ -1,7 +1,7 @@ import pytest import pandas as pd -import pandas.util.testing as tm +import pandas.testing as tm from .base import BaseExtensionTests From c95b490b7167ff787a8beeec891367f987ca953c Mon Sep 17 00:00:00 2001 From: datajanko Date: Tue, 30 Jun 2020 08:20:46 +0200 Subject: [PATCH 37/59] replaces assert_almost_equal with assert_series_equal --- pandas/tests/extension/base/accumulate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/extension/base/accumulate.py b/pandas/tests/extension/base/accumulate.py index 22cae345d8ea7..b508ab56426a3 100644 --- a/pandas/tests/extension/base/accumulate.py +++ b/pandas/tests/extension/base/accumulate.py @@ -15,7 +15,7 @@ class BaseAccumulateTests(BaseExtensionTests): def check_accumulate(self, s, op_name, skipna): result = getattr(s, op_name)(skipna=skipna) expected = getattr(s.astype("float64"), op_name)(skipna=skipna) - tm.assert_almost_equal(result, expected, check_dtype=False) + tm.assert_series_equal(result, expected, check_dtype=False) class BaseNoAccumulateTests(BaseAccumulateTests): From dc669ded93599c30a64bc7bc3d8f917db6651599 Mon Sep 17 00:00:00 2001 From: datajanko Date: Tue, 30 Jun 2020 20:48:35 +0200 Subject: [PATCH 38/59] dtypes to lowercase --- pandas/tests/extension/test_integer.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/tests/extension/test_integer.py b/pandas/tests/extension/test_integer.py index 244197ecaa308..a321261e6b167 100644 --- a/pandas/tests/extension/test_integer.py +++ b/pandas/tests/extension/test_integer.py @@ -255,9 +255,9 @@ def check_accumulate(self, s, op_name, skipna): # https://github.com/pandas-dev/pandas/issues/30958 if op_name == "cumsum": if s.dtype.name.startswith("U"): - expected_dtype = "UInt64" + expected_dtype = "uint64" 
else: - expected_dtype = "Int64" + expected_dtype = "int64" result = getattr(s, op_name)(skipna=skipna) expected = pd.Series( integer_array( @@ -278,9 +278,9 @@ def check_accumulate(self, s, op_name, skipna): tm.assert_series_equal(result, expected) elif op_name == "cumprod": if s.dtype.name.startswith("U"): - expected_dtype = "UInt64" + expected_dtype = "uint64" else: - expected_dtype = "Int64" + expected_dtype = "int64" result = getattr(s[:20], op_name)(skipna=skipna) expected = pd.Series( integer_array( From 08475a426896ba765179d42122c5b4844dbd2e40 Mon Sep 17 00:00:00 2001 From: datajanko Date: Tue, 30 Jun 2020 21:42:50 +0200 Subject: [PATCH 39/59] lowercase of uint and int64 dtype in _accumulate --- pandas/core/arrays/integer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 13e1003274a78..2e2daf8e38481 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -565,9 +565,9 @@ def _accumulate(self, name: str, skipna: bool = True, **kwargs) -> "IntegerArray dtype_out = dtype if name in ["cumsum", "cumprod"]: if dtype.name.lower().startswith("u"): - dtype_out = "UInt64" + dtype_out = "uint64" else: - dtype_out = "Int64" + dtype_out = "int64" vals = cum_function(values, dtype=dtype_out) result = IntegerArray(vals, mask) From 67fa99ace33e21750a9f38c4192cd65f93a7f6cb Mon Sep 17 00:00:00 2001 From: datajanko Date: Tue, 21 Jul 2020 21:02:27 +0200 Subject: [PATCH 40/59] uses hint of @simonjayhawkins concerning assert series equals --- pandas/tests/extension/base/accumulate.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/tests/extension/base/accumulate.py b/pandas/tests/extension/base/accumulate.py index b508ab56426a3..3670e89f12ad8 100644 --- a/pandas/tests/extension/base/accumulate.py +++ b/pandas/tests/extension/base/accumulate.py @@ -1,7 +1,6 @@ import pytest import pandas as pd -import pandas.testing as tm from .base 
import BaseExtensionTests @@ -15,7 +14,7 @@ class BaseAccumulateTests(BaseExtensionTests): def check_accumulate(self, s, op_name, skipna): result = getattr(s, op_name)(skipna=skipna) expected = getattr(s.astype("float64"), op_name)(skipna=skipna) - tm.assert_series_equal(result, expected, check_dtype=False) + self.assert_series_equal(result, expected, check_dtype=False) class BaseNoAccumulateTests(BaseAccumulateTests): From b3d3c812629e0b15ff687d955731ef4334ee1040 Mon Sep 17 00:00:00 2001 From: datajanko Date: Sat, 25 Jul 2020 07:36:15 +0200 Subject: [PATCH 41/59] adds whatsnew entry --- doc/source/whatsnew/v1.1.0.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 43d1244c15d8a..c51c2a102a205 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -340,6 +340,8 @@ Other enhancements - :meth:`DataFrame.agg` and :meth:`Series.agg` now accept named aggregation for renaming the output columns/indexes. (:issue:`26513`) - ``compute.use_numba`` now exists as a configuration option that utilizes the numba engine when available (:issue:`33966`) - :meth:`Series.plot` now supports asymmetric error bars. Previously, if :meth:`Series.plot` received a "2xN" array with error values for `yerr` and/or `xerr`, the left/lower values (first row) were mirrored, while the right/upper values (second row) were ignored. Now, the first row represents the left/lower error values and the second row the right/upper error values. (:issue:`9536`) +- Added :meth:`api.extensionExtensionArray._accumulate` to the extension array interface. Implements this interface for :class: `IntegerArray` and :class: `BooleanArray` such that type coercion to `object` is avoided (:issue:`28385`) + .. 
--------------------------------------------------------------------------- From 8cb66f98446c0e2cbb3659d0015cbe54ed8119b8 Mon Sep 17 00:00:00 2001 From: datajanko Date: Mon, 10 Aug 2020 21:58:25 +0200 Subject: [PATCH 42/59] moves changes to 1.2.0 --- doc/source/whatsnew/v1.1.0.rst | 1 - doc/source/whatsnew/v1.2.0.rst | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index abe9f7413f3ce..f752aac1aece1 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -341,7 +341,6 @@ Other enhancements - :meth:`DataFrame.agg` and :meth:`Series.agg` now accept named aggregation for renaming the output columns/indexes. (:issue:`26513`) - ``compute.use_numba`` now exists as a configuration option that utilizes the numba engine when available (:issue:`33966`, :issue:`35374`) - :meth:`Series.plot` now supports asymmetric error bars. Previously, if :meth:`Series.plot` received a "2xN" array with error values for `yerr` and/or `xerr`, the left/lower values (first row) were mirrored, while the right/upper values (second row) were ignored. Now, the first row represents the left/lower error values and the second row the right/upper error values. (:issue:`9536`) -- Added :meth:`api.extensionExtensionArray._accumulate` to the extension array interface. Implements this interface for :class: `IntegerArray` and :class: `BooleanArray` such that type coercion to `object` is avoided (:issue:`28385`) .. --------------------------------------------------------------------------- diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 94bb265c32e4c..ab1c8f1f19657 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -188,6 +188,7 @@ Sparse ExtensionArray ^^^^^^^^^^^^^^ +- Added :meth:`api.extensionExtensionArray._accumulate` to the extension array interface. 
Implements this interface for :class: `IntegerArray` and :class: `BooleanArray` such that type coercion to `object` is avoided (:issue:`28385`) - - From 6ba3ca948470e229455900fbb69fccff23a6c30e Mon Sep 17 00:00:00 2001 From: datajanko Date: Thu, 5 Nov 2020 21:47:35 +0100 Subject: [PATCH 43/59] uses na_accum_func --- pandas/core/arrays/integer.py | 41 ++++++++++------------------------- pandas/core/nanops.py | 22 +++++++++++++++++++ 2 files changed, 34 insertions(+), 29 deletions(-) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 5e69bc0b7272e..6250ccf0c671c 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -604,35 +604,18 @@ def _arith_method(self, other, op): return self._maybe_mask_result(result, mask, other, op_name) def _accumulate(self, name: str, skipna: bool = True, **kwargs) -> "IntegerArray": - data = self._data - mask = self._mask - - cum_function, fill_value = { - "cumprod": (np.cumprod, 1), - "cummax": (np.maximum.accumulate, data.min()), - "cumsum": (np.cumsum, 0), - "cummin": (np.minimum.accumulate, data.max()), - }[name] - from ..nanops import _get_values - - values, mask, dtype, dtype_max, fill_value = _get_values( - data, skipna=skipna, fill_value=fill_value, mask=mask - ) - - if not skipna: - mask = np.maximum.accumulate(mask) - - # makes target dtypes explicit since CI showed optimal UInt32 - # dtype on test data occasionally. 
This was different across systems - dtype_out = dtype - if name in ["cumsum", "cumprod"]: - if dtype.name.lower().startswith("u"): - dtype_out = "uint64" - else: - dtype_out = "int64" - - vals = cum_function(values, dtype=dtype_out) - result = IntegerArray(vals, mask) + cum_function = { + "cumprod": np.cumprod, + "cummax": np.maximum.accumulate, + "cumsum": np.cumsum, + "cummin": np.minimum.accumulate, + }.get(name) + if not cum_function: + raise ValueError(f"{name} is not defined for IntegerArrays") + + from pandas.core.nanops import na_accum_func + + result = na_accum_func(self, cum_function, skipna=skipna) return result def sum(self, skipna=True, min_count=0, **kwargs): diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 46ff4a0e2f612..530301fc3e38a 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -1690,6 +1690,28 @@ def na_accum_func(values: ArrayLike, accum_func, skipna: bool) -> ArrayLike: result = type(values)._simple_new( # type: ignore[attr-defined] result, dtype=orig_dtype ) + from pandas.core.arrays import IntegerArray + + if isinstance(values, IntegerArray): + data = values._data + mask = values._mask + + fill_value = { + np.cumprod: 1, + np.maximum.accumulate: data.min(), + np.cumsum: 0, + np.minimum.accumulate: data.max(), + }[accum_func] + + values, mask, dtype, dtype_max, fill_value = _get_values( + data, skipna=skipna, fill_value=fill_value, mask=mask + ) + + if not skipna: + mask = np.maximum.accumulate(mask) + + vals = accum_func(values) + result = IntegerArray(vals, mask) elif skipna and not issubclass(values.dtype.type, (np.integer, np.bool_)): vals = values.copy() From 55de384d0336a9265f897e466339c1463c0d2145 Mon Sep 17 00:00:00 2001 From: datajanko Date: Sat, 16 Jan 2021 08:21:13 +0100 Subject: [PATCH 44/59] delegate to EAs _accumulate function in block mgr --- pandas/core/generic.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 
ce1e962614c58..82b527e4e701a 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -10429,7 +10429,10 @@ def _accum_func(self, name: str, func, axis=None, skipna=True, *args, **kwargs): def block_accum_func(blk_values): values = blk_values.T if hasattr(blk_values, "T") else blk_values - result = nanops.na_accum_func(values, func, skipna=skipna) + if isinstance(values, ExtensionArray): + result = values._accumulate(name, skipna=skipna, **kwargs) + else: + result = nanops.na_accum_func(values, func, skipna=skipna) result = result.T if hasattr(result, "T") else result return result From 6a5b7f8dee1e28ec720411f879a759a35d66b058 Mon Sep 17 00:00:00 2001 From: datajanko Date: Tue, 19 Jan 2021 22:06:27 +0100 Subject: [PATCH 45/59] moves implementation from nanops to masked_accumulations --- .../core/array_algos/masked_accumulations.py | 69 +++++++++++++++++++ pandas/core/arrays/boolean.py | 7 -- pandas/core/arrays/integer.py | 15 ---- pandas/core/arrays/masked.py | 23 ++++++- pandas/core/nanops.py | 22 ------ 5 files changed, 91 insertions(+), 45 deletions(-) create mode 100644 pandas/core/array_algos/masked_accumulations.py diff --git a/pandas/core/array_algos/masked_accumulations.py b/pandas/core/array_algos/masked_accumulations.py new file mode 100644 index 0000000000000..a737ddd990600 --- /dev/null +++ b/pandas/core/array_algos/masked_accumulations.py @@ -0,0 +1,69 @@ +from typing import Callable + +import numpy as np + +from pandas.core import nanops as no + +""" +masked_accumulations.py is for accumulation algorithms using a mask-based approach +for missing values. +""" + + +def _cum_func( + func: Callable, + values: np.ndarray, + mask: np.ndarray, + *, + skipna: bool = True, + min_count: int = 0, +): + """ + Accumulations for 1D masked array. + + Parameters + ---------- + func : np.cumsum, np.cumprod, np.maximum.accumulate, np.minimum.accumulate + values : np.ndarray + Numpy array with the values (can be of any dtype that support the + operation). 
+ mask : np.ndarray + Boolean numpy array (True values indicate missing values). + skipna : bool, default True + Whether to skip NA. + """ + try: + fill_value = { + np.cumprod: 1, + np.maximum.accumulate: values.min(), + np.cumsum: 0, + np.minimum.accumulate: values.max(), + }[func] + except KeyError: + raise ValueError(f"No accumulation for {func} implemented on BaseMaskedArray") + + values, mask, dtype, dtype_max, fill_value = no._get_values( + values, skipna=skipna, fill_value=fill_value, mask=mask + ) + + if not skipna: + mask = np.maximum.accumulate(mask) + + values = func(values) + return values, mask + + +def cumsum(values: np.ndarray, mask: np.ndarray, *, skipna: bool = True): + return _cum_func(np.cumsum, values, mask, skipna=skipna) + + +def cumprod(values: np.ndarray, mask: np.ndarray, *, skipna: bool = True): + return _cum_func(np.cumsum, values, mask, skipna=skipna) + + +def cummin(values: np.ndarray, mask: np.ndarray, *, skipna: bool = True): + return _cum_func(np.minimum.accumulate, values, mask, skipna=skipna) + + +def cummax(values: np.ndarray, mask: np.ndarray, *, skipna: bool = True): + return _cum_func(np.maximum.accumulate, values, mask, skipna=skipna) diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index c624115036486..2bc908186f7f4 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -682,13 +682,6 @@ def _arith_method(self, other, op): return self._maybe_mask_result(result, mask, other, op_name) - def _accumulate(self, name: str, skipna: bool = True, **kwargs): - from pandas.arrays import IntegerArray - - return IntegerArray(self._data.astype("int8"), self._mask)._accumulate( - name, skipna, **kwargs - ) - def _reduce(self, name: str, *, skipna: bool = True, **kwargs): if name in {"any", "all"}: diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 815c84779d30f..f8378fb7d1500 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ 
-470,21 +470,6 @@ def _cmp_method(self, other, op): return BooleanArray(result, mask) - def _accumulate(self, name: str, skipna: bool = True, **kwargs) -> "IntegerArray": - cum_function = { - "cumprod": np.cumprod, - "cummax": np.maximum.accumulate, - "cumsum": np.cumsum, - "cummin": np.minimum.accumulate, - }.get(name) - if not cum_function: - raise ValueError(f"{name} is not defined for IntegerArrays") - - from pandas.core.nanops import na_accum_func - - result = na_accum_func(self, cum_function, skipna=skipna) - return result - def sum(self, *, skipna=True, min_count=0, **kwargs): nv.validate_sum((), kwargs) return super()._reduce("sum", skipna=skipna, min_count=min_count) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index e4a98a54ee94c..f83186a05a9c1 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -22,7 +22,7 @@ from pandas.core import nanops from pandas.core.algorithms import factorize_array, take -from pandas.core.array_algos import masked_reductions +from pandas.core.array_algos import masked_accumulations, masked_reductions from pandas.core.arraylike import OpsMixin from pandas.core.arrays import ExtensionArray from pandas.core.indexers import check_array_indexer @@ -413,3 +413,24 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs): return libmissing.NA return result + + def _accumulate( + self, name: str, *, skipna: bool = True, **kwargs + ) -> BaseMaskedArrayT: + data = self._data + mask = self._mask + + if name in {"cumsum", "cumprod", "cummin", "cummax"}: + op = getattr(masked_accumulations, name) + data, mask = op(data, mask, skipna=skipna, **kwargs) + + from pandas.core.arrays import BooleanArray, IntegerArray + + if isinstance(self, BooleanArray): + return IntegerArray(data, mask, copy=False) + + return type(self)(data, mask, copy=False) + + raise NotImplementedError( + "Accumlation {name} not implemented for BaseMaskedArray" + ) diff --git a/pandas/core/nanops.py 
b/pandas/core/nanops.py index e6e2f96f04ab9..fb9b20bd43d7c 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -1734,28 +1734,6 @@ def na_accum_func(values: ArrayLike, accum_func, *, skipna: bool) -> ArrayLike: result = type(values)._simple_new( # type: ignore[attr-defined] result, dtype=orig_dtype ) - from pandas.core.arrays import IntegerArray - - if isinstance(values, IntegerArray): - data = values._data - mask = values._mask - - fill_value = { - np.cumprod: 1, - np.maximum.accumulate: data.min(), - np.cumsum: 0, - np.minimum.accumulate: data.max(), - }[accum_func] - - values, mask, dtype, dtype_max, fill_value = _get_values( - data, skipna=skipna, fill_value=fill_value, mask=mask - ) - - if not skipna: - mask = np.maximum.accumulate(mask) - - vals = accum_func(values) - result = IntegerArray(vals, mask) elif skipna and not issubclass(values.dtype.type, (np.integer, np.bool_)): vals = values.copy() From 9c63c64c07bd5a4effbe5dac1e787349144ade9a Mon Sep 17 00:00:00 2001 From: datajanko Date: Thu, 21 Jan 2021 19:04:42 +0100 Subject: [PATCH 46/59] fixes typing annotations in base and masked --- pandas/core/arrays/base.py | 4 +++- pandas/core/arrays/masked.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 3eec621768e75..2e5bb66ac3ffb 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1203,7 +1203,9 @@ def _concat_same_type( # of objects _can_hold_na = True - def _accumulate(self, name, skipna=True, **kwargs) -> "ExtensionArray": + def _accumulate( + self: ExtensionArray, name: str, *, skipna=True, **kwargs + ) -> ExtensionArray: """ Return an ExtensionArray performing an accumulation operation. 
The underlying data type might change diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index f83186a05a9c1..fc5a4a4d13955 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -416,7 +416,7 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs): def _accumulate( self, name: str, *, skipna: bool = True, **kwargs - ) -> BaseMaskedArrayT: + ) -> BaseMaskedArray: data = self._data mask = self._mask From 2f23499eaf0061d9b855586d6044b1e284059387 Mon Sep 17 00:00:00 2001 From: datajanko Date: Fri, 22 Jan 2021 07:49:22 +0100 Subject: [PATCH 47/59] fixes merge error --- pandas/core/arrays/masked.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 2ea79ec863534..7fcc649bdb8ca 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -21,7 +21,7 @@ from pandas.core.dtypes.missing import isna, notna from pandas.core import nanops -from pandas.core.algorithms import factorize_array, take +from pandas.core.algorithms import factorize_array, isin, take from pandas.core.array_algos import masked_accumulations, masked_reductions from pandas.core.arraylike import OpsMixin from pandas.core.arrays import ExtensionArray From a5b30e635fc2dc999a466663ae38589483469224 Mon Sep 17 00:00:00 2001 From: datajanko Date: Fri, 22 Jan 2021 08:26:15 +0100 Subject: [PATCH 48/59] fills na values without nanops --- pandas/core/array_algos/masked_accumulations.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/pandas/core/array_algos/masked_accumulations.py b/pandas/core/array_algos/masked_accumulations.py index a737ddd990600..83e42f4d47250 100644 --- a/pandas/core/array_algos/masked_accumulations.py +++ b/pandas/core/array_algos/masked_accumulations.py @@ -2,8 +2,6 @@ import numpy as np -from pandas.core import nanops as no - """ masked_accumulations.py is for accumulation algorithms using a mask-based approach for 
missing values. @@ -42,9 +40,7 @@ def _cum_func( except KeyError: raise ValueError(f"No accumulation for {func} implemented on BaseMaskedArray") - values, mask, dtype, dtype_max, fill_value = no._get_values( - values, skipna=skipna, fill_value=fill_value, mask=mask - ) + values[mask] = fill_value if not skipna: mask = np.maximum.accumulate(mask) From d22c8a02dc4ac7edeb419b52b2d6fccb72996198 Mon Sep 17 00:00:00 2001 From: datajanko Date: Mon, 25 Jan 2021 21:21:52 +0100 Subject: [PATCH 49/59] fixes incorrect call to cumsum and changes to cumprod --- pandas/core/array_algos/masked_accumulations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/array_algos/masked_accumulations.py b/pandas/core/array_algos/masked_accumulations.py index 83e42f4d47250..d388ebc4b9b0d 100644 --- a/pandas/core/array_algos/masked_accumulations.py +++ b/pandas/core/array_algos/masked_accumulations.py @@ -54,7 +54,7 @@ def cumsum(values: np.ndarray, mask: np.ndarray, *, skipna: bool = True): def cumprod(values: np.ndarray, mask: np.ndarray, *, skipna: bool = True): - return _cum_func(np.cumsum, values, mask, skipna=skipna) + return _cum_func(np.cumprod, values, mask, skipna=skipna) def cummin(values: np.ndarray, mask: np.ndarray, *, skipna: bool = True): From a5866c7ecb05faefd940da45931c97eae7e23924 Mon Sep 17 00:00:00 2001 From: datajanko Date: Mon, 25 Jan 2021 21:23:16 +0100 Subject: [PATCH 50/59] add _accumulate to boolean --- pandas/core/arrays/boolean.py | 9 +++++++++ pandas/core/arrays/masked.py | 5 ----- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index dd281a39907fd..0bdb255c340c8 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -691,6 +691,15 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs): return super()._reduce(name, skipna=skipna, **kwargs) + def _accumulate( + self, name: str, *, skipna: bool = True, **kwargs + ) -> 
BaseMaskedArray: + from pandas.core.arrays import IntegerArray + + data = self._data.astype(int) + mask = self._mask + return IntegerArray(data, mask)._accumulate(name, skipna=skipna, **kwargs) + def _maybe_mask_result(self, result, mask, other, op_name: str): """ Parameters diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 7fcc649bdb8ca..2ba995b9b45ec 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -438,11 +438,6 @@ def _accumulate( op = getattr(masked_accumulations, name) data, mask = op(data, mask, skipna=skipna, **kwargs) - from pandas.core.arrays import BooleanArray, IntegerArray - - if isinstance(self, BooleanArray): - return IntegerArray(data, mask, copy=False) - return type(self)(data, mask, copy=False) raise NotImplementedError( From 8255457076b1c46828a46abda17651eabd937dd5 Mon Sep 17 00:00:00 2001 From: datajanko Date: Mon, 25 Jan 2021 21:23:48 +0100 Subject: [PATCH 51/59] makes tests a lot easier - cumprod tests still fail --- pandas/tests/extension/test_boolean.py | 5 +- pandas/tests/extension/test_integer.py | 85 +++++++++++++------------- 2 files changed, 48 insertions(+), 42 deletions(-) diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py index e9993297919a8..55e5727c58d3d 100644 --- a/pandas/tests/extension/test_boolean.py +++ b/pandas/tests/extension/test_boolean.py @@ -406,7 +406,10 @@ class TestUnaryOps(base.BaseUnaryOpsTests): class TestNumericAccumulation(base.BaseNumericAccumulateTests): - pass + def check_accumulate(self, s, op_name, skipna): + result = getattr(s, op_name)(skipna=skipna) + expected = getattr(pd.Series(s.astype("float64")), op_name)(skipna=skipna) + tm.assert_series_equal(result, expected, check_dtype=False) # TODO parsing not yet supported diff --git a/pandas/tests/extension/test_integer.py b/pandas/tests/extension/test_integer.py index cfb070e391019..13b80c1eea754 100644 --- a/pandas/tests/extension/test_integer.py +++ 
b/pandas/tests/extension/test_integer.py @@ -250,47 +250,50 @@ class TestBooleanReduce(base.BaseBooleanReduceTests): class TestNumericAccumulation(base.BaseNumericAccumulateTests): def check_accumulate(self, s, op_name, skipna): - # overwrite to ensure pd.NA is tested instead of np.nan - # https://github.com/pandas-dev/pandas/issues/30958 - if op_name == "cumsum": - if s.dtype.name.startswith("U"): - expected_dtype = "uint64" - else: - expected_dtype = "int64" - result = getattr(s, op_name)(skipna=skipna) - expected = pd.Series( - integer_array( - getattr(s.astype("float64"), op_name)(skipna=skipna), - dtype=expected_dtype, - ) - ) - tm.assert_series_equal(result, expected) - elif op_name in ["cummax", "cummin"]: - expected_dtype = s.dtype - result = getattr(s, op_name)(skipna=skipna) - expected = pd.Series( - integer_array( - getattr(s.astype("float64"), op_name)(skipna=skipna), - dtype=expected_dtype, - ) - ) - tm.assert_series_equal(result, expected) - elif op_name == "cumprod": - if s.dtype.name.startswith("U"): - expected_dtype = "uint64" - else: - expected_dtype = "int64" - result = getattr(s[:20], op_name)(skipna=skipna) - expected = pd.Series( - integer_array( - getattr(s[:20].astype("float64"), op_name)(skipna=skipna), - dtype=expected_dtype, - ) - ) - tm.assert_series_equal(result, expected) - - else: - raise + result = getattr(s, op_name)(skipna=skipna) + expected = getattr(pd.Series(s.astype("float64")), op_name)(skipna=skipna) + tm.assert_series_equal(result, expected, check_dtype=False) + # # overwrite to ensure pd.NA is tested instead of np.nan + # # https://github.com/pandas-dev/pandas/issues/30958 + # if op_name == "cumsum": + # if s.dtype.name.startswith("U"): + # expected_dtype = "uint64" + # else: + # expected_dtype = "int64" + # result = getattr(s, op_name)(skipna=skipna) + # expected = pd.Series( + # integer_array( + # getattr(s.astype("float64"), op_name)(skipna=skipna), + # dtype=expected_dtype, + # ) + # ) + # tm.assert_series_equal(result, 
expected) + # elif op_name in ["cummax", "cummin"]: + # expected_dtype = s.dtype + # result = getattr(s, op_name)(skipna=skipna) + # expected = pd.Series( + # integer_array( + # getattr(s.astype("float64"), op_name)(skipna=skipna), + # dtype=expected_dtype, + # ) + # ) + # tm.assert_series_equal(result, expected) + # elif op_name == "cumprod": + # if s.dtype.name.startswith("U"): + # expected_dtype = "uint64" + # else: + # expected_dtype = "int64" + # result = getattr(s[:20], op_name)(skipna=skipna) + # expected = pd.Series( + # integer_array( + # getattr(s[:20].astype("float64"), op_name)(skipna=skipna), + # dtype=expected_dtype, + # ) + # ) + # tm.assert_series_equal(result, expected) + + # else: + # raise class TestPrinting(base.BasePrintingTests): From 483b6088ab453a9072cbac720701996eb9720939 Mon Sep 17 00:00:00 2001 From: datajanko Date: Tue, 26 Jan 2021 20:41:26 +0100 Subject: [PATCH 52/59] adds BaseNumericAccumulation for floating masked array --- pandas/tests/extension/test_floating.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/tests/extension/test_floating.py b/pandas/tests/extension/test_floating.py index 440d7391c558f..8889bf896727d 100644 --- a/pandas/tests/extension/test_floating.py +++ b/pandas/tests/extension/test_floating.py @@ -222,3 +222,7 @@ class TestPrinting(base.BasePrintingTests): class TestParsing(base.BaseParsingTests): pass + + +class TestNumericAccumulation(base.BaseNumericAccumulateTests): + pass From 150fd3b63130fd669615644def565beff66f321d Mon Sep 17 00:00:00 2001 From: datajanko Date: Tue, 26 Jan 2021 21:05:43 +0100 Subject: [PATCH 53/59] tests no numeric accumulations according to _accumulate interface --- pandas/tests/extension/test_sparse.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index 86f9080571459..3d331d8652397 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -444,3 
+444,7 @@ def test_EA_types(self, engine, data): expected_msg = r".*must implement _from_sequence_of_strings.*" with pytest.raises(NotImplementedError, match=expected_msg): super().test_EA_types(engine, data) + + +class TestNoNumericAccumulations(base.BaseNoAccumulateTests): + pass From 80e2dc62045f22132ff86c74da174dcb2a827e9c Mon Sep 17 00:00:00 2001 From: datajanko Date: Thu, 28 Jan 2021 19:04:20 +0100 Subject: [PATCH 54/59] uses NotImplementedError in base accumulate function --- pandas/core/arrays/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 30ca10376a0c5..19774f24253d2 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1261,9 +1261,9 @@ def _accumulate( Raises ------ - TypeError : subclass does not define accumulations + NotImplementedError : subclass does not define accumulations """ - raise TypeError(f"cannot perform {name} with type {self.dtype}") + raise NotImplementedError(f"cannot perform {name} with type {self.dtype}") def _reduce(self, name: str, *, skipna: bool = True, **kwargs): """ Return a scalar result of performing the reduction operation. From dceab99d70485630d67fff2c6c35ee2cac9918a3 Mon Sep 17 00:00:00 2001 From: datajanko Date: Tue, 16 Feb 2021 20:50:39 +0100 Subject: [PATCH 55/59] ensures the fill values are data independent additionally, remove min_count as irrelevant --- pandas/core/array_algos/masked_accumulations.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/pandas/core/array_algos/masked_accumulations.py b/pandas/core/array_algos/masked_accumulations.py index d388ebc4b9b0d..df26b4f6a958a 100644 --- a/pandas/core/array_algos/masked_accumulations.py +++ b/pandas/core/array_algos/masked_accumulations.py @@ -2,6 +2,8 @@ import numpy as np +from pandas.core.dtypes.common import is_float_dtype, is_integer_dtype + """ masked_accumulations.py is for accumulation algorithms using a mask-based approach for missing values. 
@@ -14,7 +16,6 @@ def _cum_func( mask: np.ndarray, *, skipna: bool = True, - min_count: int = 0, ): """ Accumulations for 1D masked array. @@ -30,12 +31,21 @@ def _cum_func( skipna : bool, default True Whether to skip NA. """ + dtype_info = None + if is_float_dtype(values): + dtype_info = np.finfo(values.dtype.type) + elif is_integer_dtype(values): + dtype_info = np.iinfo(values.dtype.type) + else: + raise NotImplementedError( + f"No masked accumulation defined for dtype {values.dtype.type}" + ) try: fill_value = { np.cumprod: 1, - np.maximum.accumulate: values.min(), + np.maximum.accumulate: dtype_info.min, np.cumsum: 0, - np.minimum.accumulate: values.max(), + np.minimum.accumulate: dtype_info.max, }[func] except KeyError: raise ValueError(f"No accumulation for {func} implemented on BaseMaskedArray") From 1c14f18bd45ee7eaf61688a14aca7c4ac7e2ff8f Mon Sep 17 00:00:00 2001 From: datajanko Date: Tue, 16 Feb 2021 20:54:17 +0100 Subject: [PATCH 56/59] adds accumulation for datetimelikes in generic.py ensure that datetimelikes are wrapped create a twin of masked_accumulations for datetimelikes timedeltas also allow cumsum and cumprod, theoretically --- pandas/core/arrays/datetimelike.py | 67 ++++++++++++++++++++++++++++++ pandas/core/arrays/timedeltas.py | 19 +++++++++ pandas/core/generic.py | 4 ++ 3 files changed, 90 insertions(+) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index b2629e606f8f5..320e8783b58d6 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -64,6 +64,7 @@ from pandas.core import nanops, ops from pandas.core.algorithms import checked_add_with_arr, isin, unique1d, value_counts +from pandas.core.array_algos import datetimelike_accumulations from pandas.core.arraylike import OpsMixin from pandas.core.arrays._mixins import NDArrayBackedExtensionArray, ravel_compat import pandas.core.common as com @@ -1187,6 +1188,72 @@ def _time_shift(self, periods, freq=None): # to be passed 
explicitly. return self._generate_range(start=start, end=end, periods=None, freq=self.freq) + def _accumulate( + self, name: str, *, skipna: bool = True, **kwargs + ) -> DatetimeLikeArrayT: + + data = self._data.copy() + + if name in {"cummin", "cummax"}: + op = getattr(datetimelike_accumulations, name) + data = op(data, skipna=skipna, **kwargs) + + return type(self)._simple_new(data, freq=self.freq, dtype=self.dtype) + + raise NotImplementedError( + f"Accumlation {name} not implemented for {type(self)}" + ) + + # func_map = {'cumprod' : np.cumprod, 'cummax':np.maximum.accumulate, + # 'cumsum': np.cumsum, 'cummim':np.minimum.accumulate } + # accum_func = func_map[name] + + # freq = self._freq + + # mask_a, mask_b = { + # np.cumprod: (1.0, np.nan), + # np.maximum.accumulate: (-np.inf, np.nan), + # np.cumsum: (0.0, np.nan), + # np.minimum.accumulate: (np.inf, np.nan), + # }[accum_func] + + # values = self._data + # # GH#30460, GH#29058 + # # numpy 1.18 started sorting NaTs at the end instead of beginning, + # # so we need to work around to maintain backwards-consistency. + # #orig_dtype = values.dtype + + # # We need to define mask before masking NaTs + # mask = isna(values) + + # if accum_func == np.minimum.accumulate: + # # Note: the accum_func comparison fails as an "is" comparison + # y = values.view("i8") + # y[mask] = np.iinfo(np.int64).max + # changed = True + # else: + # y = values + # changed = False + + # result = accum_func(y.view("i8"), axis=0) + # if skipna: + # result[mask] = iNaT + # elif accum_func == np.minimum.accumulate: + # # Restore NaTs that we masked previously + # nz = (~np.asarray(mask)).nonzero()[0] + # if len(nz): + # # everything up to the first non-na entry stays NaT + # result[: nz[0]] = iNaT + + # if changed: + # # restore NaT elements + # y[mask] = iNaT # TODO: could try/finally for this? 
+ + # # DatetimeArray + # result = type(self)._simple_new( # type: ignore[attr-defined] + # result, + # ) + @unpack_zerodim_and_defer("__add__") def __add__(self, other): other_dtype = getattr(other, "dtype", None) diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index e9160c92435a4..ee72829b575d9 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -47,6 +47,7 @@ from pandas.core import nanops from pandas.core.algorithms import checked_add_with_arr +from pandas.core.array_algos import datetimelike_accumulations from pandas.core.arrays import IntegerArray, datetimelike as dtl from pandas.core.arrays._ranges import generate_regular_range import pandas.core.common as com @@ -390,6 +391,24 @@ def std( return self._box_func(result) return self._from_backing_data(result) + # ---------------------------------------------------------------- + # Accumulations + + def _accumulate( + self, name: str, *, skipna: bool = True, **kwargs + ) -> TimedeltaArray: + + data = self._data.copy() + + if name in {"cumsum", "cumsum"}: + op = getattr(datetimelike_accumulations, name) + data = op(data, skipna=skipna, **kwargs) + + return type(self)._simple_new(data, freq=None, dtype=self.dtype) + + else: + return super()._accumulate(name, skipna=skipna, **kwargs) + # ---------------------------------------------------------------- # Rendering Methods diff --git a/pandas/core/generic.py b/pandas/core/generic.py index e9549eead0a42..735427da10373 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -10305,6 +10305,10 @@ def _accum_func(self, name: str, func, axis=None, skipna=True, *args, **kwargs): def block_accum_func(blk_values): values = blk_values.T if hasattr(blk_values, "T") else blk_values + from pandas.core.construction import ensure_wrapped_if_datetimelike + + values = ensure_wrapped_if_datetimelike(values) + if isinstance(values, ExtensionArray): result = values._accumulate(name, skipna=skipna, 
**kwargs) else: From 597e9780f3542478789d0dab1d1b2275a50b1b71 Mon Sep 17 00:00:00 2001 From: datajanko Date: Tue, 16 Feb 2021 21:14:03 +0100 Subject: [PATCH 57/59] actually adds datetimelike accumulation algos --- .../array_algos/datetimelike_accumulations.py | 69 +++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 pandas/core/array_algos/datetimelike_accumulations.py diff --git a/pandas/core/array_algos/datetimelike_accumulations.py b/pandas/core/array_algos/datetimelike_accumulations.py new file mode 100644 index 0000000000000..f2e0bb0ba3ff6 --- /dev/null +++ b/pandas/core/array_algos/datetimelike_accumulations.py @@ -0,0 +1,69 @@ +from typing import Callable + +import numpy as np + +from pandas._libs import iNaT + +from pandas.core.dtypes.missing import isna + +""" +datetimelike_accumulations.py is for accumulations of datetimelike extension arrays +""" + + +def _cum_func( + func: Callable, + values: np.ndarray, + *, + skipna: bool = True, +): + """ + Accumulations for 1D datetimelike arrays. + + Parameters + ---------- + func : np.cumsum, np.cumprod, np.maximum.accumulate, np.minimum.accumulate + values : np.ndarray + Numpy array with the values (can be of any dtype that supports the + operation). + skipna : bool, default True + Whether to skip NA.
+ """ + try: + fill_value = { + np.cumprod: 1, + np.maximum.accumulate: np.iinfo(np.int64).min, + np.cumsum: 0, + np.minimum.accumulate: np.iinfo(np.int64).max, + }[func] + except KeyError: + raise ValueError(f"No accumulation for {func} implemented on BaseMaskedArray") + + mask = isna(values) + y = values.view("i8") + y[mask] = fill_value + + if not skipna: + # This is different compared to the recent implementation for datetimelikes + # but is the same as the implementation for masked arrays + mask = np.maximum.accumulate(mask) + + result = func(y) + result[mask] = iNaT + return result + + +def cumsum(values: np.ndarray, *, skipna: bool = True): + return _cum_func(np.cumsum, values, skipna=skipna) + + +def cumprod(values: np.ndarray, *, skipna: bool = True): + return _cum_func(np.cumprod, values, skipna=skipna) + + +def cummin(values: np.ndarray, *, skipna: bool = True): + return _cum_func(np.minimum.accumulate, values, skipna=skipna) + + +def cummax(values: np.ndarray, *, skipna: bool = True): + return _cum_func(np.maximum.accumulate, values, skipna=skipna) From 5ebe8eab1cf1dfa5f9b62e1b63e389b1b65af6f4 Mon Sep 17 00:00:00 2001 From: datajanko Date: Tue, 16 Feb 2021 21:42:51 +0100 Subject: [PATCH 58/59] fixes absolute imports --- pandas/core/array_algos/masked_accumulations.py | 5 ++++- pandas/tests/extension/base/accumulate.py | 3 +-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/core/array_algos/masked_accumulations.py b/pandas/core/array_algos/masked_accumulations.py index df26b4f6a958a..fee283227df57 100644 --- a/pandas/core/array_algos/masked_accumulations.py +++ b/pandas/core/array_algos/masked_accumulations.py @@ -2,7 +2,10 @@ import numpy as np -from pandas.core.dtypes.common import is_float_dtype, is_integer_dtype +from pandas.core.dtypes.common import ( + is_float_dtype, + is_integer_dtype, +) """ masked_accumulations.py is for accumulation algorithms using a mask-based approach diff --git 
a/pandas/tests/extension/base/accumulate.py b/pandas/tests/extension/base/accumulate.py index 3670e89f12ad8..632198b47099d 100644 --- a/pandas/tests/extension/base/accumulate.py +++ b/pandas/tests/extension/base/accumulate.py @@ -1,8 +1,7 @@ import pytest import pandas as pd - -from .base import BaseExtensionTests +from pandas.tests.extension.base.base import BaseExtensionTests class BaseAccumulateTests(BaseExtensionTests): From 32367c07f72b37385492ce0794face39a21d39b1 Mon Sep 17 00:00:00 2001 From: datajanko Date: Sat, 20 Feb 2021 08:19:48 +0100 Subject: [PATCH 59/59] changes error to catch to adhere to changed implementation --- pandas/tests/extension/base/accumulate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/extension/base/accumulate.py b/pandas/tests/extension/base/accumulate.py index 632198b47099d..882f96572791c 100644 --- a/pandas/tests/extension/base/accumulate.py +++ b/pandas/tests/extension/base/accumulate.py @@ -24,7 +24,7 @@ def test_accumulate_series_numeric(self, data, all_numeric_accumulations, skipna op_name = all_numeric_accumulations s = pd.Series(data) - with pytest.raises(TypeError): + with pytest.raises(NotImplementedError): getattr(s, op_name)(skipna=skipna)