From 4ddd0b005be35413d80015e86a08a9d5dc464300 Mon Sep 17 00:00:00 2001 From: Yuanhao Geng <41546976+GYHHAHA@users.noreply.github.com> Date: Wed, 20 Jul 2022 19:46:58 -0500 Subject: [PATCH 1/6] Update array.py --- pandas/core/arrays/sparse/array.py | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 5653d87a4570b..3b714d21ecd09 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -821,7 +821,7 @@ def shift(self: SparseArrayT, periods: int = 1, fill_value=None) -> SparseArrayT def _first_fill_value_loc(self): """ - Get the location of the first missing value. + Get the location of the first fill value. Returns ------- @@ -834,14 +834,24 @@ def _first_fill_value_loc(self): if not len(indices) or indices[0] > 0: return 0 - diff = indices[1:] - indices[:-1] - return np.searchsorted(diff, 2) + 1 + # a number larger than 1 should be appended to + # the last in case of fill value only appears + # in the tail of array + diff = np.r_[indices[1:] - indices[:-1], 2] + return indices[(diff > 1).argmax()] + 1 def unique(self: SparseArrayT) -> SparseArrayT: uniques = algos.unique(self.sp_values) - fill_loc = self._first_fill_value_loc() - if fill_loc >= 0: - uniques = np.insert(uniques, fill_loc, self.fill_value) + if len(self.sp_values) != len(self): + fill_loc = self._first_fill_value_loc() + # In order to align with the behavior of pd.unique or + # pd.Series.unique, we should keep the original + # order; here we use unique again to find the + # insertion place. Since the length of sp_values + # is not large, a minor performance hit + # is a worthwhile price for correctness. 
+ insert_loc = len(algos.unique(self.sp_values[:fill_loc])) + uniques = np.insert(uniques, insert_loc, self.fill_value) return type(self)._from_sequence(uniques, dtype=self.dtype) def _values_for_factorize(self): From 7ed76db5ce13cd9788009f5b991450ebdad4ac2d Mon Sep 17 00:00:00 2001 From: Yuanhao Geng <41546976+GYHHAHA@users.noreply.github.com> Date: Wed, 20 Jul 2022 19:48:21 -0500 Subject: [PATCH 2/6] Update test_array.py --- pandas/tests/arrays/sparse/test_array.py | 33 +++++++++++++++++------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py index 492427b2be213..815417cfd93e4 100644 --- a/pandas/tests/arrays/sparse/test_array.py +++ b/pandas/tests/arrays/sparse/test_array.py @@ -391,23 +391,36 @@ def test_setting_fill_value_updates(): @pytest.mark.parametrize( - "arr, loc", + "arr,fill_value,loc", [ - ([None, 1, 2], 0), - ([0, None, 2], 1), - ([0, 1, None], 2), - ([0, 1, 1, None, None], 3), - ([1, 1, 1, 2], -1), - ([], -1), + ([None, 1, 2], None, 0), + ([0, None, 2], None, 1), + ([0, 1, None], None, 2), + ([0, 1, 1, None, None], None, 3), + ([1, 1, 1, 2], None, -1), + ([], None, -1), + ([None, 1, 0, 0, None, 2], None, 0), + ([None, 1, 0, 0, None, 2], 1, 1), + ([None, 1, 0, 0, None, 2], 2, 5), + ([None, 1, 0, 0, None, 2], 3, -1), + ([None, 0, 0, 1, 2, 1], 0, 1), + ([None, 0, 0, 1, 2, 1], 1, 3), ], ) -def test_first_fill_value_loc(arr, loc): - result = SparseArray(arr)._first_fill_value_loc() +def test_first_fill_value_loc(arr, fill_value, loc): + result = SparseArray(arr, fill_value=fill_value)._first_fill_value_loc() assert result == loc @pytest.mark.parametrize( - "arr", [[1, 2, np.nan, np.nan], [1, np.nan, 2, np.nan], [1, 2, np.nan]] + "arr", + [ + [1, 2, np.nan, np.nan], + [1, np.nan, 2, np.nan], + [1, 2, np.nan], + [np.nan, 1, 0, 0, np.nan, 2], + [np.nan, 0, 0, 1, 2, 1], + ] ) @pytest.mark.parametrize("fill_value", [np.nan, 0, 1]) def test_unique_na_fill(arr, 
fill_value): From a1b438a5dbfd2d17e6e9df53a43676857c1f8444 Mon Sep 17 00:00:00 2001 From: Yuanhao Geng <41546976+GYHHAHA@users.noreply.github.com> Date: Wed, 20 Jul 2022 19:52:55 -0500 Subject: [PATCH 3/6] Update array.py --- pandas/core/arrays/sparse/array.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 3b714d21ecd09..b547446603853 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -837,7 +837,7 @@ def _first_fill_value_loc(self): # a number larger than 1 should be appended to # the last in case of fill value only appears # in the tail of array - diff = np.r_[indices[1:] - indices[:-1], 2] + diff = np.r_[np.diff(indices), 2] return indices[(diff > 1).argmax()] + 1 def unique(self: SparseArrayT) -> SparseArrayT: From b2463a4164fbdf1d293b134d8e8603ce6098ab1b Mon Sep 17 00:00:00 2001 From: Yuanhao Geng <41546976+GYHHAHA@users.noreply.github.com> Date: Wed, 20 Jul 2022 20:05:13 -0500 Subject: [PATCH 4/6] fix format --- pandas/tests/arrays/sparse/test_array.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py index 815417cfd93e4..9b78eb345e188 100644 --- a/pandas/tests/arrays/sparse/test_array.py +++ b/pandas/tests/arrays/sparse/test_array.py @@ -420,7 +420,7 @@ def test_first_fill_value_loc(arr, fill_value, loc): [1, 2, np.nan], [np.nan, 1, 0, 0, np.nan, 2], [np.nan, 0, 0, 1, 2, 1], - ] + ], ) @pytest.mark.parametrize("fill_value", [np.nan, 0, 1]) def test_unique_na_fill(arr, fill_value): From d2403c0ba3b87d980f187e0532cc70f190630e8b Mon Sep 17 00:00:00 2001 From: Yuanhao Geng <41546976+GYHHAHA@users.noreply.github.com> Date: Thu, 21 Jul 2022 15:09:06 -0500 Subject: [PATCH 5/6] Update v1.5.0.rst --- doc/source/whatsnew/v1.5.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.5.0.rst 
b/doc/source/whatsnew/v1.5.0.rst index 090fea57872c5..5ca9dd96d0bf5 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -1026,6 +1026,7 @@ Reshaping Sparse ^^^^^^ - Bug in :meth:`Series.where` and :meth:`DataFrame.where` with ``SparseDtype`` failing to retain the array's ``fill_value`` (:issue:`45691`) +- Bug in :meth:`SparseArray.unique` failing to keep the original order of elements (:issue:`45691`) - ExtensionArray From cce7f0ee2bd12e1280234cfc2d71c7bccf977733 Mon Sep 17 00:00:00 2001 From: Yuanhao Geng <41546976+GYHHAHA@users.noreply.github.com> Date: Thu, 21 Jul 2022 15:13:16 -0500 Subject: [PATCH 6/6] fix number --- doc/source/whatsnew/v1.5.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 5ca9dd96d0bf5..acd7cec480c39 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -1026,7 +1026,7 @@ Reshaping Sparse ^^^^^^ - Bug in :meth:`Series.where` and :meth:`DataFrame.where` with ``SparseDtype`` failing to retain the array's ``fill_value`` (:issue:`45691`) -- Bug in :meth:`SparseArray.unique` failing to keep the original order of elements (:issue:`45691`) +- Bug in :meth:`SparseArray.unique` failing to keep the original order of elements (:issue:`47809`) - ExtensionArray