From 3ebf8c0ad1fa3bddf9e7132617e64b280bf8fd9f Mon Sep 17 00:00:00 2001 From: Greg Siano Date: Tue, 31 Aug 2021 18:47:10 -0400 Subject: [PATCH 01/16] ENH: rolling rank rolling rank and percentile rank using skiplist --- pandas/_libs/src/skiplist.h | 5 +- pandas/_libs/window/aggregations.pyi | 7 +++ pandas/_libs/window/aggregations.pyx | 73 +++++++++++++++++++++++++++- pandas/core/window/rolling.py | 8 +++ pandas/tests/window/test_rolling.py | 18 +++++++ 5 files changed, 107 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/src/skiplist.h b/pandas/_libs/src/skiplist.h index 1679ced174f29..287d70af76b43 100644 --- a/pandas/_libs/src/skiplist.h +++ b/pandas/_libs/src/skiplist.h @@ -183,7 +183,7 @@ PANDAS_INLINE double skiplist_get(skiplist_t *skp, int i, int *ret) { PANDAS_INLINE int skiplist_insert(skiplist_t *skp, double value) { node_t *node, *prevnode, *newnode, *next_at_level; int *steps_at_level; - int size, steps, level; + int size, steps, level, rank = 0; node_t **chain; chain = skp->tmp_chain; @@ -197,6 +197,7 @@ PANDAS_INLINE int skiplist_insert(skiplist_t *skp, double value) { next_at_level = node->next[level]; while (_node_cmp(next_at_level, value) >= 0) { steps_at_level[level] += node->width[level]; + rank += node->width[level]; node = next_at_level; next_at_level = node->next[level]; } @@ -230,7 +231,7 @@ PANDAS_INLINE int skiplist_insert(skiplist_t *skp, double value) { ++(skp->size); - return 1; + return rank; } PANDAS_INLINE int skiplist_remove(skiplist_t *skp, double value) { diff --git a/pandas/_libs/window/aggregations.pyi b/pandas/_libs/window/aggregations.pyi index fe083fe415e4b..bd67d2bfc036a 100644 --- a/pandas/_libs/window/aggregations.pyi +++ b/pandas/_libs/window/aggregations.pyi @@ -63,6 +63,13 @@ def roll_quantile( quantile: float, # float64_t interpolation: Literal["linear", "lower", "higher", "nearest", "midpoint"], ) -> np.ndarray: ... # np.ndarray[float] +def roll_rank( + values: np.ndarray, + start: np.ndarray, + end: np.ndarray, + minp: int, + bint percentile, +) -> np.ndarray: ... # np.ndarray[float] def roll_apply( obj: object, start: np.ndarray, # np.ndarray[np.int64] diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index f792b653eb07b..15140afb3867a 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -795,7 +795,7 @@ def roll_median_c(const float64_t[:] values, ndarray[int64_t] start, val = values[j] if notnan(val): nobs += 1 - err = skiplist_insert(sl, val) != 1 + err = skiplist_insert(sl, val) == -1 if err: break @@ -806,7 +806,7 @@ def roll_median_c(const float64_t[:] values, ndarray[int64_t] start, val = values[j] if notnan(val): nobs += 1 - err = skiplist_insert(sl, val) != 1 + err = skiplist_insert(sl, val) == -1 if err: break @@ -1138,6 +1138,75 @@ def roll_quantile(const float64_t[:] values, ndarray[int64_t] start, return output +def roll_rank(const float64_t[:] values, ndarray[int64_t] start, + ndarray[int64_t] end, int64_t minp, bint percentile) -> np.ndarray: + """ + O(N log(window)) implementation using skip list + """ + cdef: + Py_ssize_t i, j, s, e, N = len(values), idx + int ret = 0, rank = 0 + int64_t nobs = 0, win + float64_t val + float64_t vlow, vhigh + skiplist_t *skiplist + ndarray[float64_t] output + + is_monotonic_increasing_bounds = is_monotonic_increasing_start_end_bounds( + start, end + ) + # we use the Fixed/Variable Indexer here as the + # actual skiplist ops outweigh any window computation costs + output = np.empty(N, dtype=np.float64) + + win = (end - start).max() + if win == 0: + output[:] = NaN + return output + skiplist = skiplist_init(win) + if skiplist == NULL: + raise MemoryError("skiplist_init failed") + + with nogil: + for i in range(0, N): + s = start[i] + e = end[i] + + if i == 0 or not is_monotonic_increasing_bounds: + if not is_monotonic_increasing_bounds: + nobs = 0 + skiplist = skiplist_init(win) + + # setup + for j in range(s, e): + val = values[j] + if notnan(val): + nobs += 1 + rank = skiplist_insert(skiplist, val) + + else: + # calculate deletes + for j in range(start[i - 1], s): + val = values[j] + if notnan(val): + skiplist_remove(skiplist, val) + nobs -= 1 + + # calculate adds + for j in range(end[i - 1], e): + val = values[j] + if notnan(val): + nobs += 1 + rank = skiplist_insert(skiplist, val) + if percentile: + output[i] = (rank + 1) / nobs if rank != -1 else NaN + else: + output[i] = rank + 1 if rank != -1 else NaN + + skiplist_destroy(skiplist) + + return output + def roll_apply(object obj, ndarray[int64_t] start, ndarray[int64_t] end, diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 66ffc2600e88e..b8cf86a77cd44 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -1409,6 +1409,14 @@ def quantile(self, quantile: float, interpolation: str = "linear", **kwargs): return self._apply(window_func, name="quantile", **kwargs) + def rank(self, pct: bool = False, **kwargs): + window_func = partial( + window_aggregations.roll_rank, + percentile=pct, + ) + + return self._apply(window_func, name="rank", **kwargs) + def cov( self, other: DataFrame | Series | None = None, diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 2edf22d96a9ba..d1f0199d783ae 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -1500,3 +1500,21 @@ def test_rolling_numeric_dtypes(): dtype="float64", ) tm.assert_frame_equal(result, expected) + +def test_rank(): + window = 3 + ser = Series(data=np.random.rand(50)) + + result = Series(data=[ser[max(0, i - window + 1):i + 1].rank().iloc[-1] for i in range(50)]) + expected = ser.rolling(window).rank() + + tm.assert_series_equal(result, expected) + +def test_percentile_rank(): + window = 3 + ser = Series(data=np.random.rand(50)) + + result = Series(data=[ser[max(0, i - window + 1):i + 1].rank(pct=True).iloc[-1] for i in range(50)]) + expected = ser.rolling(window).rank(pct=True) + + tm.assert_series_equal(result, expected) From ce754f745133fc25c7895cb2b92cdf70adcc25e2 Mon Sep 17 00:00:00 2001 From: Greg Siano Date: Tue, 31 Aug 2021 18:47:10 -0400 Subject: [PATCH 02/16] ENH: rolling rank rolling rank and percentile rank using skiplist --- pandas/_libs/src/skiplist.h | 5 +- pandas/_libs/window/aggregations.pyi | 7 +++ pandas/_libs/window/aggregations.pyx | 73 +++++++++++++++++++++++++++- pandas/core/window/rolling.py | 8 +++ pandas/tests/window/test_rolling.py | 20 ++++++++ 5 files changed, 109 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/src/skiplist.h b/pandas/_libs/src/skiplist.h index 1679ced174f29..287d70af76b43 100644 --- a/pandas/_libs/src/skiplist.h +++ b/pandas/_libs/src/skiplist.h @@ -183,7 +183,7 @@ PANDAS_INLINE double skiplist_get(skiplist_t *skp, int i, int *ret) { PANDAS_INLINE int skiplist_insert(skiplist_t *skp, double value) { node_t *node, *prevnode, *newnode, *next_at_level; int *steps_at_level; - int size, steps, level; + int size, steps, level, rank = 0; node_t **chain; chain = skp->tmp_chain; @@ -197,6 +197,7 @@ PANDAS_INLINE int skiplist_insert(skiplist_t *skp, double value) { next_at_level = node->next[level]; while (_node_cmp(next_at_level, value) >= 0) { steps_at_level[level] += node->width[level]; + rank += node->width[level]; node = next_at_level; next_at_level = node->next[level]; } @@ -230,7 +231,7 @@ PANDAS_INLINE int skiplist_insert(skiplist_t *skp, double value) { ++(skp->size); - return 1; + return rank; } PANDAS_INLINE int skiplist_remove(skiplist_t *skp, double value) { diff --git a/pandas/_libs/window/aggregations.pyi b/pandas/_libs/window/aggregations.pyi index fe083fe415e4b..bd67d2bfc036a 100644 --- a/pandas/_libs/window/aggregations.pyi +++ b/pandas/_libs/window/aggregations.pyi @@ -63,6 +63,13 @@ def roll_quantile( quantile: float, # float64_t interpolation: Literal["linear", "lower", "higher", "nearest", "midpoint"], ) -> np.ndarray: ... # np.ndarray[float] +def roll_rank( + values: np.ndarray, + start: np.ndarray, + end: np.ndarray, + minp: int, + bint percentile, +) -> np.ndarray: ... # np.ndarray[float] def roll_apply( obj: object, start: np.ndarray, # np.ndarray[np.int64] diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index f792b653eb07b..15140afb3867a 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -795,7 +795,7 @@ def roll_median_c(const float64_t[:] values, ndarray[int64_t] start, val = values[j] if notnan(val): nobs += 1 - err = skiplist_insert(sl, val) != 1 + err = skiplist_insert(sl, val) == -1 if err: break @@ -806,7 +806,7 @@ def roll_median_c(const float64_t[:] values, ndarray[int64_t] start, val = values[j] if notnan(val): nobs += 1 - err = skiplist_insert(sl, val) != 1 + err = skiplist_insert(sl, val) == -1 if err: break @@ -1138,6 +1138,75 @@ def roll_quantile(const float64_t[:] values, ndarray[int64_t] start, return output +def roll_rank(const float64_t[:] values, ndarray[int64_t] start, + ndarray[int64_t] end, int64_t minp, bint percentile) -> np.ndarray: + """ + O(N log(window)) implementation using skip list + """ + cdef: + Py_ssize_t i, j, s, e, N = len(values), idx + int ret = 0, rank = 0 + int64_t nobs = 0, win + float64_t val + float64_t vlow, vhigh + skiplist_t *skiplist + ndarray[float64_t] output + + is_monotonic_increasing_bounds = is_monotonic_increasing_start_end_bounds( + start, end + ) + # we use the Fixed/Variable Indexer here as the + # actual skiplist ops outweigh any window computation costs + output = np.empty(N, dtype=np.float64) + + win = (end - start).max() + if win == 0: + output[:] = NaN + return output + skiplist = skiplist_init(win) + if skiplist == NULL: + raise MemoryError("skiplist_init failed") + + with nogil: + for i in range(0, N): + s = start[i] + e = end[i] + + if i == 0 or not is_monotonic_increasing_bounds: + if not is_monotonic_increasing_bounds: + nobs = 0 + skiplist = skiplist_init(win) + + # setup + for j in range(s, e): + val = values[j] + if notnan(val): + nobs += 1 + rank = skiplist_insert(skiplist, val) + + else: + # calculate deletes + for j in range(start[i - 1], s): + val = values[j] + if notnan(val): + skiplist_remove(skiplist, val) + nobs -= 1 + + # calculate adds + for j in range(end[i - 1], e): + val = values[j] + if notnan(val): + nobs += 1 + rank = skiplist_insert(skiplist, val) + if percentile: + output[i] = (rank + 1) / nobs if rank != -1 else NaN + else: + output[i] = rank + 1 if rank != -1 else NaN + + skiplist_destroy(skiplist) + + return output + def roll_apply(object obj, ndarray[int64_t] start, ndarray[int64_t] end, diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 66ffc2600e88e..b8cf86a77cd44 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -1409,6 +1409,14 @@ def quantile(self, quantile: float, interpolation: str = "linear", **kwargs): return self._apply(window_func, name="quantile", **kwargs) + def rank(self, pct: bool = False, **kwargs): + window_func = partial( + window_aggregations.roll_rank, + percentile=pct, + ) + + return self._apply(window_func, name="rank", **kwargs) + def cov( self, other: DataFrame | Series | None = None, diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 2edf22d96a9ba..63eef1872b7dd 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -1500,3 +1500,23 @@ def test_rolling_numeric_dtypes(): dtype="float64", ) tm.assert_frame_equal(result, expected) + +@pytest.mark.parametrize("window", [1, 3, 10, 1000]) +def test_rank(window): + length = 1000 + ser = Series(data=np.random.rand(length)) + + result = Series(data=[ser[max(0, i - window + 1):i + 1].rank().iloc[-1] for i in range(length)]) + expected = ser.rolling(window).rank() + + tm.assert_series_equal(result, expected) + +@pytest.mark.parametrize("window", [1, 3, 10, 1000]) +def test_percentile_rank(window): + length = 1000 + ser = Series(data=np.random.rand(length)) + + result = Series(data=[ser[max(0, i - window + 1):i + 1].rank(pct=True).iloc[-1] for i in range(length)]) + expected = ser.rolling(window).rank(pct=True) + + tm.assert_series_equal(result, expected) From 874c980ca1368966d026ce4af18eaac3678e3282 Mon Sep 17 00:00:00 2001 From: Greg Siano Date: Tue, 31 Aug 2021 18:58:38 -0400 Subject: [PATCH 03/16] ENH: rolling rank remove unused variables, fix indentation, add comment to roll_rank() --- pandas/_libs/window/aggregations.pyx | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index 15140afb3867a..75fd1b5b0df0c 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -1139,16 +1139,17 @@ def roll_quantile(const float64_t[:] values, ndarray[int64_t] start, return output def roll_rank(const float64_t[:] values, ndarray[int64_t] start, - ndarray[int64_t] end, int64_t minp, bint percentile) -> np.ndarray: + ndarray[int64_t] end, int64_t minp, bint percentile) -> np.ndarray: """ O(N log(window)) implementation using skip list + + derived from roll_quantile """ cdef: Py_ssize_t i, j, s, e, N = len(values), idx - int ret = 0, rank = 0 + int rank = 0 int64_t nobs = 0, win float64_t val - float64_t vlow, vhigh skiplist_t *skiplist ndarray[float64_t] output From 4d06ba33e3f99876110312eb36912ca39e0ed910 Mon Sep 17 00:00:00 2001 From: Greg Siano Date: Tue, 31 Aug 2021 19:05:21 -0400 Subject: [PATCH 04/16] ENH: rolling rank don't fill output when nobs < minp --- pandas/_libs/window/aggregations.pyx | 9 ++++++--- pandas/tests/window/test_rolling.py | 4 ++-- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index 75fd1b5b0df0c..87c999725abf6 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -1199,10 +1199,13 @@ def roll_rank(const float64_t[:] values, ndarray[int64_t] start, if notnan(val): nobs += 1 rank = skiplist_insert(skiplist, val) - if percentile: - output[i] = (rank + 1) / nobs if rank != -1 else NaN + if nobs >= minp: + if percentile: + output[i] = (rank + 1) / nobs if rank != -1 else NaN + else: + output[i] = rank + 1 if rank != -1 else NaN else: - output[i] = rank + 1 if rank != -1 else NaN + output[i] = NaN skiplist_destroy(skiplist) diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 63eef1872b7dd..8b99a0cc93930 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -1506,7 +1506,7 @@ def test_rank(window): length = 1000 ser = Series(data=np.random.rand(length)) - result = Series(data=[ser[max(0, i - window + 1):i + 1].rank().iloc[-1] for i in range(length)]) + result = Series(data=[ser[i - window + 1:i + 1].rank().iloc[-1] if i - window + 1 >= 0 else np.NaN for i in range(length)]) expected = ser.rolling(window).rank() tm.assert_series_equal(result, expected) @@ -1516,7 +1516,7 @@ def test_percentile_rank(window): length = 1000 ser = Series(data=np.random.rand(length)) - result = Series(data=[ser[max(0, i - window + 1):i + 1].rank(pct=True).iloc[-1] for i in range(length)]) + result = Series(data=[ser[i - window + 1:i + 1].rank(pct=True).iloc[-1] if i - window + 1 >= 0 else np.NaN for i in range(length)]) expected = ser.rolling(window).rank(pct=True) tm.assert_series_equal(result, expected) From 1308208b8c387ff307d8a6bd98726ac3a5253573 Mon Sep 17 00:00:00 2001 From: Greg Siano Date: Tue, 31 Aug 2021 19:28:16 -0400 Subject: [PATCH 05/16] ENH: rolling rank address lint warnings --- pandas/tests/window/test_rolling.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 8b99a0cc93930..13cadbba828fa 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -1501,22 +1501,30 @@ def test_rolling_numeric_dtypes(): ) tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("window", [1, 3, 10, 1000]) def test_rank(window): length = 1000 ser = Series(data=np.random.rand(length)) - result = Series(data=[ser[i - window + 1:i + 1].rank().iloc[-1] if i - window + 1 >= 0 else np.NaN for i in range(length)]) + result = Series([ + ser[i - window:i].rank().iloc[-1] if i - window >= 0 else np.NaN + for i in range(1, length + 1) + ]) expected = ser.rolling(window).rank() tm.assert_series_equal(result, expected) + @pytest.mark.parametrize("window", [1, 3, 10, 1000]) def test_percentile_rank(window): length = 1000 ser = Series(data=np.random.rand(length)) - result = Series(data=[ser[i - window + 1:i + 1].rank(pct=True).iloc[-1] if i - window + 1 >= 0 else np.NaN for i in range(length)]) + result = Series([ + ser[i - window:i].rank(pct=True).iloc[-1] if i - window >= 0 else np.NaN + for i in range(1, length + 1) + ]) expected = ser.rolling(window).rank(pct=True) tm.assert_series_equal(result, expected) From 4caa51b1790d3b1c03835e919fc9f753fbd817b3 Mon Sep 17 00:00:00 2001 From: Greg Siano Date: Tue, 31 Aug 2021 23:46:08 -0400 Subject: [PATCH 06/16] ENH: rolling rank raise MemoryError on skiplist_insert failure, destroy skiplist before re-init, and other minor changes --- asv_bench/benchmarks/rolling.py | 18 ++++++++++++++++++ pandas/_libs/window/aggregations.pyi | 2 +- pandas/_libs/window/aggregations.pyx | 16 +++++++++------- pandas/tests/window/test_rolling.py | 14 ++++---------- 4 files changed, 32 insertions(+), 18 deletions(-) diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py index 97294fc02834b..786e4b6566ad5 100644 --- a/asv_bench/benchmarks/rolling.py +++ b/asv_bench/benchmarks/rolling.py @@ -180,6 +180,24 @@ def time_quantile(self, constructor, window, dtype, percentile, interpolation): self.roll.quantile(percentile, interpolation=interpolation) +class Rank: + params = ( + ["DataFrame", "Series"], + [10, 1000], + ["int", "float"], + [True, False], + ) + param_names = ["constructor", "window", "dtype", "percentile"] + + def setup(self, constructor, window, dtype, percentile): + N = 10 ** 5 + arr = np.random.random(N).astype(dtype) + self.roll = getattr(pd, constructor)(arr).rolling(window) + + def time_rank(self, constructor, window, dtype, percentile): + self.roll.rank(percentile) + + class PeakMemFixedWindowMinMax: params = ["min", "max"] diff --git a/pandas/_libs/window/aggregations.pyi b/pandas/_libs/window/aggregations.pyi index bd67d2bfc036a..72fb028e1edd2 100644 --- a/pandas/_libs/window/aggregations.pyi +++ b/pandas/_libs/window/aggregations.pyi @@ -68,7 +68,7 @@ def roll_rank( start: np.ndarray, end: np.ndarray, minp: int, - bint percentile, + percentile: bool, ) -> np.ndarray: ... # np.ndarray[float] def roll_apply( obj: object, diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index 87c999725abf6..b79ef6a5c3014 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -1151,7 +1151,7 @@ def roll_rank(const float64_t[:] values, ndarray[int64_t] start, int64_t nobs = 0, win float64_t val skiplist_t *skiplist - ndarray[float64_t] output + float64_t[::1] output = None is_monotonic_increasing_bounds = is_monotonic_increasing_start_end_bounds( start, end @@ -1169,13 +1169,14 @@ def roll_rank(const float64_t[:] values, ndarray[int64_t] start, raise MemoryError("skiplist_init failed") with nogil: - for i in range(0, N): + for i in range(N): s = start[i] e = end[i] if i == 0 or not is_monotonic_increasing_bounds: if not is_monotonic_increasing_bounds: nobs = 0 + skiplist_destroy(skiplist) skiplist = skiplist_init(win) # setup @@ -1184,6 +1185,8 @@ def roll_rank(const float64_t[:] values, ndarray[int64_t] start, if notnan(val): nobs += 1 rank = skiplist_insert(skiplist, val) + if rank == -1: + raise MemoryError("skiplist_insert failed") else: # calculate deletes @@ -1199,17 +1202,16 @@ def roll_rank(const float64_t[:] values, ndarray[int64_t] start, if notnan(val): nobs += 1 rank = skiplist_insert(skiplist, val) + if rank == -1: + raise MemoryError("skiplist_insert failed") if nobs >= minp: - if percentile: - output[i] = (rank + 1) / nobs if rank != -1 else NaN - else: - output[i] = rank + 1 if rank != -1 else NaN + output[i] = (rank + 1) / nobs if percentile else rank + 1 else: output[i] = NaN skiplist_destroy(skiplist) - return output + return np.asarray(output) def roll_apply(object obj, diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 13cadbba828fa..f829ae4be0f0d 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -1507,11 +1507,8 @@ def test_rank(window): length = 1000 ser = Series(data=np.random.rand(length)) - result = Series([ - ser[i - window:i].rank().iloc[-1] if i - window >= 0 else np.NaN - for i in range(1, length + 1) - ]) - expected = ser.rolling(window).rank() + expected = ser.rolling(window).apply(lambda x: x.rank().iloc[-1]) + result = ser.rolling(window).rank() tm.assert_series_equal(result, expected) @@ -1521,10 +1518,7 @@ def test_percentile_rank(window): length = 1000 ser = Series(data=np.random.rand(length)) - result = Series([ - ser[i - window:i].rank(pct=True).iloc[-1] if i - window >= 0 else np.NaN - for i in range(1, length + 1) - ]) - expected = ser.rolling(window).rank(pct=True) + expected = ser.rolling(window).apply(lambda x: x.rank(pct=True).iloc[-1]) + result = ser.rolling(window).rank(pct=True) tm.assert_series_equal(result, expected) From f2ee5b27be7179e7e2e011a1801898c9a67269c6 Mon Sep 17 00:00:00 2001 From: Greg Siano Date: Wed, 1 Sep 2021 12:27:36 -0400 Subject: [PATCH 07/16] ENH: rolling rank - rank methods implement min, max, and average rank methods --- pandas/_libs/src/skiplist.h | 35 +++++++++++++++++++++- pandas/_libs/window/aggregations.pyi | 1 + pandas/_libs/window/aggregations.pyx | 45 +++++++++++++++++++++++++--- pandas/core/window/rolling.py | 3 +- pandas/tests/window/test_rolling.py | 27 +++++++---------- 5 files changed, 89 insertions(+), 22 deletions(-) diff --git a/pandas/_libs/src/skiplist.h b/pandas/_libs/src/skiplist.h index 287d70af76b43..1a7a7b83dcf40 100644 --- a/pandas/_libs/src/skiplist.h +++ b/pandas/_libs/src/skiplist.h @@ -180,6 +180,39 @@ PANDAS_INLINE double skiplist_get(skiplist_t *skp, int i, int *ret) { return node->value; } +PANDAS_INLINE int skiplist_min_rank(skiplist_t *skp, double value) { + node_t *node; + int level, rank = 0; + + node = skp->head; + for (level = skp->maxlevels - 1; level >= 0; --level) { + while (_node_cmp(node->next[level], value) > 0) { + rank += node->width[level]; + node = node->next[level]; + } + } + + return rank + 1; +} + +/*PANDAS_INLINE int skiplist_max_rank(skiplist_t *skp, double value) { + node_t *node; + int level, rank = 0; + + node = skp->head; + for (level = skp->maxlevels - 1; level >= 0; --level) { + while (_node_cmp(node->next[level], value) >= 0) { + rank += node->width[level]; + node = node->next[level]; + } + } + + return rank; +}*/ + +// Returns the rank of the inserted element. When there are duplicates, `rank` is the highest of +// the group, i.e. the 'max' method of +// https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.rank.html PANDAS_INLINE int skiplist_insert(skiplist_t *skp, double value) { node_t *node, *prevnode, *newnode, *next_at_level; int *steps_at_level; @@ -231,7 +264,7 @@ PANDAS_INLINE int skiplist_insert(skiplist_t *skp, double value) { ++(skp->size); - return rank; + return rank + 1; } PANDAS_INLINE int skiplist_remove(skiplist_t *skp, double value) { diff --git a/pandas/_libs/window/aggregations.pyi b/pandas/_libs/window/aggregations.pyi index 72fb028e1edd2..6a176eeef316d 100644 --- a/pandas/_libs/window/aggregations.pyi +++ b/pandas/_libs/window/aggregations.pyi @@ -69,6 +69,7 @@ def roll_rank( end: np.ndarray, minp: int, percentile: bool, + method: Literal["average", "min", "max", "first", "dense"], ) -> np.ndarray: ... # np.ndarray[float] def roll_apply( obj: object, diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index b79ef6a5c3014..a579323e63e32 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -50,6 +50,9 @@ cdef extern from "../src/skiplist.h": double skiplist_get(skiplist_t*, int, int*) nogil int skiplist_insert(skiplist_t*, double) nogil int skiplist_remove(skiplist_t*, double) nogil + int skiplist_rank(skiplist_t*, double) nogil + int skiplist_min_rank(skiplist_t*, double) nogil + int skiplist_max_rank(skiplist_t*, double) nogil cdef: float32_t MINfloat32 = np.NINF @@ -1138,8 +1141,26 @@ def roll_quantile(const float64_t[:] values, ndarray[int64_t] start, return output + +cdef enum RankType: + AVERAGE, + MIN, + MAX, + FIRST, + DENSE + + +rank_types = { + 'average': AVERAGE, + 'min': MIN, + 'max': MAX, + 'first': FIRST, + 'dense': DENSE, +} + + def roll_rank(const float64_t[:] values, ndarray[int64_t] start, - ndarray[int64_t] end, int64_t minp, bint percentile) -> np.ndarray: + ndarray[int64_t] end, int64_t minp, bint percentile, str method) -> np.ndarray: """ O(N log(window)) implementation using skip list @@ -1147,11 +1168,17 @@ def roll_rank(const float64_t[:] values, ndarray[int64_t] start, """ cdef: Py_ssize_t i, j, s, e, N = len(values), idx - int rank = 0 + float64_t rank_min = 0, rank = 0 int64_t nobs = 0, win float64_t val skiplist_t *skiplist float64_t[::1] output = None + RankType rank_type + + try: + rank_type = rank_types[method] + except KeyError: + raise ValueError(f"Method '{method}' is not supported") is_monotonic_increasing_bounds = is_monotonic_increasing_start_end_bounds( start, end @@ -1163,7 +1190,7 @@ def roll_rank(const float64_t[:] values, ndarray[int64_t] start, win = (end - start).max() if win == 0: output[:] = NaN - return output + return np.asarray(output) skiplist = skiplist_init(win) if skiplist == NULL: raise MemoryError("skiplist_init failed") @@ -1187,6 +1214,11 @@ def roll_rank(const float64_t[:] values, ndarray[int64_t] start, rank = skiplist_insert(skiplist, val) if rank == -1: raise MemoryError("skiplist_insert failed") + if rank_type == AVERAGE: + rank_min = skiplist_min_rank(skiplist, val) + rank = ((rank * (rank + 1) / 2) - ((rank_min - 1) * rank_min / 2)) / (rank - rank_min + 1) + elif rank_type == MIN: + rank = skiplist_min_rank(skiplist, val) else: # calculate deletes @@ -1204,8 +1236,13 @@ def roll_rank(const float64_t[:] values, ndarray[int64_t] start, rank = skiplist_insert(skiplist, val) if rank == -1: raise MemoryError("skiplist_insert failed") + if rank_type == AVERAGE: + rank_min = skiplist_min_rank(skiplist, val) + rank = ((rank * (rank + 1) / 2) - ((rank_min - 1) * rank_min / 2)) / (rank - rank_min + 1) + elif rank_type == MIN: + rank = skiplist_min_rank(skiplist, val) if nobs >= minp: - output[i] = (rank + 1) / nobs if percentile else rank + 1 + output[i] = (rank) / nobs if percentile else rank else: output[i] = NaN diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index b8cf86a77cd44..9c63af1d0e8cf 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -1409,10 +1409,11 @@ def quantile(self, quantile: float, interpolation: str = "linear", **kwargs): return self._apply(window_func, name="quantile", **kwargs) - def rank(self, pct: bool = False, **kwargs): + def rank(self, pct: bool = False, method: str = "average", **kwargs): window_func = partial( window_aggregations.roll_rank, percentile=pct, + method=method, ) return self._apply(window_func, name="rank", **kwargs) diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index f829ae4be0f0d..70c2c75d8dde9 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -1502,23 +1502,18 @@ def test_rolling_numeric_dtypes(): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("window", [1, 3, 10, 1000]) -def test_rank(window): +@pytest.mark.parametrize("window", [1, 3, 10, 50, 1000]) +@pytest.mark.parametrize("method", ["min", "max", "average"]) +@pytest.mark.parametrize("pct", [True, False]) +@pytest.mark.parametrize("dups", [True, False]) +def test_rank(window, method, pct, dups): length = 1000 - ser = Series(data=np.random.rand(length)) - - expected = ser.rolling(window).apply(lambda x: x.rank().iloc[-1]) - result = ser.rolling(window).rank() - - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize("window", [1, 3, 10, 1000]) -def test_percentile_rank(window): - length = 1000 - ser = Series(data=np.random.rand(length)) + if dups: + ser = Series(data=np.random.choice(3, length)) + else: + ser = Series(data=np.random.rand(length)) - expected = ser.rolling(window).apply(lambda x: x.rank(pct=True).iloc[-1]) - result = ser.rolling(window).rank(pct=True) + expected = ser.rolling(window).apply(lambda x: x.rank(method=method, pct=pct).iloc[-1]) + result = ser.rolling(window).rank(method=method, pct=pct) tm.assert_series_equal(result, expected) From b135f1e0b1e8ef3e5aecf937bf050c4b35f3f96a Mon Sep 17 00:00:00 2001 From: Greg Siano Date: Wed, 1 Sep 2021 18:09:53 -0400 Subject: [PATCH 08/16] ENH: rolling rank - `ascending` flag added the `ascending` flag, various cleanups, expanded tests and asv benchmark --- asv_bench/benchmarks/rolling.py | 10 ++++++---- pandas/_libs/src/skiplist.h | 15 --------------- pandas/_libs/window/aggregations.pyi | 3 ++- pandas/_libs/window/aggregations.pyx | 11 ++++------- pandas/core/window/rolling.py | 3 ++- pandas/tests/window/test_rolling.py | 17 ++++++++++------- 6 files changed, 24 insertions(+), 35 deletions(-) diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py index 786e4b6566ad5..b1c7e8c0c6548 100644 --- a/asv_bench/benchmarks/rolling.py +++ b/asv_bench/benchmarks/rolling.py @@ -186,16 +186,18 @@ class Rank: [10, 1000], ["int", "float"], [True, False], + [True, False], + ["min", "max", "average"], ) - param_names = ["constructor", "window", "dtype", "percentile"] + param_names = ["constructor", "window", "dtype", "percentile", "ascending", "method"] - def setup(self, constructor, window, dtype, percentile): + def setup(self, constructor, window, dtype, percentile, ascending, method): N = 10 ** 5 arr = np.random.random(N).astype(dtype) self.roll = getattr(pd, constructor)(arr).rolling(window) - def time_rank(self, constructor, window, dtype, percentile): - self.roll.rank(percentile) + def time_rank(self, constructor, window, dtype, percentile, ascending, method): + self.roll.rank(pct=percentile, ascending=ascending, method=method) class PeakMemFixedWindowMinMax: diff --git a/pandas/_libs/src/skiplist.h b/pandas/_libs/src/skiplist.h index 1a7a7b83dcf40..c00d3c4ddef20 100644 --- a/pandas/_libs/src/skiplist.h +++ b/pandas/_libs/src/skiplist.h @@ -195,21 +195,6 @@ PANDAS_INLINE int skiplist_min_rank(skiplist_t *skp, double value) { return rank + 1; } -/*PANDAS_INLINE int skiplist_max_rank(skiplist_t *skp, double value) { - node_t *node; - int level, rank = 0; - - node = skp->head; - for (level = skp->maxlevels - 1; level >= 0; --level) { - while (_node_cmp(node->next[level], value) >= 0) { - rank += node->width[level]; - node = node->next[level]; - } - } - - return rank; -}*/ - // Returns the rank of the inserted element. When there are duplicates, `rank` is the highest of // the group, i.e. the 'max' method of // https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.rank.html diff --git a/pandas/_libs/window/aggregations.pyi b/pandas/_libs/window/aggregations.pyi index 6a176eeef316d..faf704aa968c3 100644 --- a/pandas/_libs/window/aggregations.pyi +++ b/pandas/_libs/window/aggregations.pyi @@ -69,7 +69,8 @@ def roll_rank( end: np.ndarray, minp: int, percentile: bool, - method: Literal["average", "min", "max", "first", "dense"], + method: Literal["average", "min", "max"], + ascending: bool, ) -> np.ndarray: ... # np.ndarray[float] def roll_apply( obj: object, diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index a579323e63e32..126f15cca723f 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -52,7 +52,6 @@ cdef extern from "../src/skiplist.h": int skiplist_remove(skiplist_t*, double) nogil int skiplist_rank(skiplist_t*, double) nogil int skiplist_min_rank(skiplist_t*, double) nogil - int skiplist_max_rank(skiplist_t*, double) nogil cdef: float32_t MINfloat32 = np.NINF @@ -1154,13 +1153,11 @@ rank_types = { 'average': AVERAGE, 'min': MIN, 'max': MAX, - 'first': FIRST, - 'dense': DENSE, } def roll_rank(const float64_t[:] values, ndarray[int64_t] start, - ndarray[int64_t] end, int64_t minp, bint percentile, str method) -> np.ndarray: + ndarray[int64_t] end, int64_t minp, bint percentile, str method, bint ascending) -> np.ndarray: """ O(N log(window)) implementation using skip list @@ -1208,7 +1205,7 @@ def roll_rank(const float64_t[:] values, ndarray[int64_t] start, # setup for j in range(s, e): - val = values[j] + val = values[j] if ascending else -values[j] if notnan(val): nobs += 1 rank = skiplist_insert(skiplist, val) @@ -1223,14 +1220,14 @@ def roll_rank(const float64_t[:] values, ndarray[int64_t] start, else: # calculate deletes for j in range(start[i - 1], s): - val = values[j] + val = values[j] if ascending else -values[j] if notnan(val): skiplist_remove(skiplist, val) nobs -= 1 # calculate adds for j in range(end[i - 1], e): - val = values[j] + val = values[j] if ascending else -values[j] if notnan(val): nobs += 1 rank = skiplist_insert(skiplist, val) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 9c63af1d0e8cf..4ddcb045228ae 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -1409,11 +1409,12 @@ def quantile(self, quantile: float, interpolation: str = "linear", **kwargs): return self._apply(window_func, name="quantile", **kwargs) - def rank(self, pct: bool = False, method: str = "average", **kwargs): + def rank(self, pct: bool = False, method: str = "average", ascending: bool = True, **kwargs): window_func = partial( window_aggregations.roll_rank, percentile=pct, method=method, + ascending=ascending, ) return self._apply(window_func, name="rank", **kwargs) diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 70c2c75d8dde9..6af7c13b75e2a 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -1505,15 +1505,18 @@ def test_rolling_numeric_dtypes(): @pytest.mark.parametrize("window", [1, 3, 10, 50, 1000]) @pytest.mark.parametrize("method", ["min", "max", "average"]) @pytest.mark.parametrize("pct", [True, False]) -@pytest.mark.parametrize("dups", [True, False]) -def test_rank(window, method, pct, dups): +@pytest.mark.parametrize("ascending", [True, False]) +@pytest.mark.parametrize("test_data", ["default", "duplicates", "nans"]) +def test_rank(window, method, pct, ascending, test_data): length = 1000 - if dups: - ser = Series(data=np.random.choice(3, length)) - else: + if test_data == "default": ser = Series(data=np.random.rand(length)) + elif test_data == "duplicates": + ser = Series(data=np.random.choice(3, length)) + elif test_data == "nans": + ser = Series(data=np.random.choice([1.0, 0.25, 0.75, np.nan], length)) - expected = ser.rolling(window).apply(lambda x: x.rank(method=method, pct=pct).iloc[-1]) - result = ser.rolling(window).rank(method=method, pct=pct) + expected = ser.rolling(window).apply(lambda x: x.rank(method=method, pct=pct, ascending=ascending).iloc[-1]) + result = ser.rolling(window).rank(method=method, pct=pct, ascending=ascending) tm.assert_series_equal(result, expected) From fda85b429ee697b9269fab81fd6090da3fcd7c6f Mon Sep 17 00:00:00 2001 From: Greg Siano Date: Wed, 1 Sep 2021 18:12:38 -0400 Subject: [PATCH 09/16] ENH: rolling rank remove unimplemented rank types --- pandas/_libs/window/aggregations.pyx | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index 126f15cca723f..ee424f2d75167 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -1145,8 +1145,6 @@ cdef enum RankType: AVERAGE, MIN, MAX, - FIRST, - DENSE rank_types = { From e692ce326bb50be47eee98ff8e22ae999d8c8a56 Mon Sep 17 00:00:00 2001 From: Greg Siano Date: Wed, 1 Sep 2021 20:30:17 -0400 Subject: [PATCH 10/16] ENH: rolling rank - reorder parameter list reorder parameter list to match DataFrame.rank --- pandas/core/window/rolling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 4ddcb045228ae..3f9725ba8033a 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -1409,7 +1409,7 @@ def quantile(self, quantile: float, interpolation: str = "linear", **kwargs): return self._apply(window_func, name="quantile", **kwargs) - def rank(self, pct: bool = False, method: str = "average", ascending: bool = True, **kwargs): + def rank(self, method: str = "average", ascending: bool = True, pct: bool = False, **kwargs): window_func = partial( window_aggregations.roll_rank, percentile=pct, From 6b23fc019da517e22fa3f742b822f245e00ed31a Mon Sep 17 00:00:00 2001 From: Greg Siano Date: Thu, 2 Sep 2021 12:25:17 -0400 Subject: [PATCH 11/16] ENH: rolling rank - address pre-commit errors --- asv_bench/benchmarks/rolling.py | 9 ++++++++- pandas/_libs/src/skiplist.h | 4 ++-- pandas/_libs/window/aggregations.pyx | 11 ++++++++--- pandas/core/window/rolling.py | 10 ++++++++-- pandas/tests/window/test_rolling.py | 4 +++- 5 files changed, 29 insertions(+), 9 deletions(-) diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py index b1c7e8c0c6548..3d2273b6d7324 100644 --- a/asv_bench/benchmarks/rolling.py +++ b/asv_bench/benchmarks/rolling.py @@ -189,7 +189,14 @@ class Rank: [True, False], ["min", "max", "average"], ) - param_names = ["constructor", "window", "dtype", "percentile", "ascending", "method"] + param_names = [ + "constructor", + "window", + "dtype", + "percentile", + "ascending", + "method", + ] def setup(self, constructor, window, dtype, percentile, ascending, method): N = 10 ** 5 diff --git a/pandas/_libs/src/skiplist.h b/pandas/_libs/src/skiplist.h index c00d3c4ddef20..27ef4c98307b2 100644 --- a/pandas/_libs/src/skiplist.h +++ b/pandas/_libs/src/skiplist.h @@ -195,8 +195,8 @@ PANDAS_INLINE int skiplist_min_rank(skiplist_t *skp, double value) { return rank + 1; } -// Returns the rank of the inserted element. When there are duplicates, `rank` is the highest of -// the group, i.e. the 'max' method of +// Returns the rank of the inserted element. When there are duplicates, +// `rank` is the highest of the group, i.e. the 'max' method of // https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.rank.html PANDAS_INLINE int skiplist_insert(skiplist_t *skp, double value) { node_t *node, *prevnode, *newnode, *next_at_level; diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index ee424f2d75167..0f9ac43b1f224 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -1155,7 +1155,8 @@ rank_types = { def roll_rank(const float64_t[:] values, ndarray[int64_t] start, - ndarray[int64_t] end, int64_t minp, bint percentile, str method, bint ascending) -> np.ndarray: + ndarray[int64_t] end, int64_t minp, bint percentile, + str method, bint ascending) -> np.ndarray: """ O(N log(window)) implementation using skip list @@ -1211,7 +1212,9 @@ def roll_rank(const float64_t[:] values, ndarray[int64_t] start, raise MemoryError("skiplist_insert failed") if rank_type == AVERAGE: rank_min = skiplist_min_rank(skiplist, val) - rank = ((rank * (rank + 1) / 2) - ((rank_min - 1) * rank_min / 2)) / (rank - rank_min + 1) + rank = ((rank * (rank + 1) / 2) + - ((rank_min - 1) * rank_min / 2)) \ + / (rank - rank_min + 1) elif rank_type == MIN: rank = skiplist_min_rank(skiplist, val) @@ -1233,7 +1236,9 @@ def roll_rank(const float64_t[:] values, ndarray[int64_t] start, raise MemoryError("skiplist_insert failed") if rank_type == AVERAGE: rank_min = skiplist_min_rank(skiplist, val) - rank = ((rank * (rank + 1) / 2) - ((rank_min - 1) * rank_min / 2)) / (rank - rank_min + 1) + rank = ((rank * (rank + 1) / 2) + - ((rank_min - 1) * rank_min / 2)) \ + / (rank - rank_min + 1) elif rank_type == MIN: rank = skiplist_min_rank(skiplist, val) if nobs >= minp: diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 3f9725ba8033a..edaafd2d3b57b 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -1409,12 +1409,18 @@ def quantile(self, quantile: float, interpolation: str = "linear", **kwargs): return self._apply(window_func, name="quantile", **kwargs) - def rank(self, method: str = "average", ascending: bool = True, pct: bool = False, **kwargs): + def rank( + self, + method: str = "average", + ascending: bool = True, + pct: bool = False, + **kwargs, + ): window_func = partial( window_aggregations.roll_rank, - percentile=pct, method=method, ascending=ascending, + percentile=pct, ) return self._apply(window_func, name="rank", **kwargs) diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 6af7c13b75e2a..d54996c27c6e8 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -1516,7 +1516,9 @@ def test_rank(window, method, pct, ascending, test_data): elif test_data == "nans": ser = Series(data=np.random.choice([1.0, 0.25, 0.75, np.nan], length)) - expected = ser.rolling(window).apply(lambda x: x.rank(method=method, pct=pct, ascending=ascending).iloc[-1]) + expected = ser.rolling(window).apply( + lambda x: x.rank(method=method, pct=pct, ascending=ascending).iloc[-1] + ) result = ser.rolling(window).rank(method=method, pct=pct, ascending=ascending) tm.assert_series_equal(result, expected) From 5f7d319d508f013b9a9acebae36d8569e3d59409 Mon Sep 17 00:00:00 2001 From: Greg Siano Date: Sat, 4 Sep 2021 10:54:18 -0400 Subject: [PATCH 12/16] ENH: rolling rank added tests for `Expanding`. added doc strings and whatsnew note --- doc/source/reference/window.rst | 2 + doc/source/whatsnew/v1.4.0.rst | 47 +++++++++++++++++ pandas/_libs/window/aggregations.pyx | 16 +++--- pandas/core/window/expanding.py | 75 +++++++++++++++++++++++++++ pandas/core/window/rolling.py | 75 +++++++++++++++++++++++++++ pandas/tests/window/test_expanding.py | 22 ++++++++ 6 files changed, 231 insertions(+), 6 deletions(-) diff --git a/doc/source/reference/window.rst b/doc/source/reference/window.rst index a255b3ae8081e..5e230a533625f 100644 --- a/doc/source/reference/window.rst +++ b/doc/source/reference/window.rst @@ -35,6 +35,7 @@ Rolling window functions Rolling.aggregate Rolling.quantile Rolling.sem + Rolling.rank .. _api.functions_window: @@ -75,6 +76,7 @@ Expanding window functions Expanding.aggregate Expanding.quantile Expanding.sem + Expanding.rank .. _api.functions_ewm: diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index c3b27e7988d4a..1ee697d965ca5 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -87,6 +87,53 @@ Multithreaded CSV reading with a new CSV Engine based on pyarrow :func:`pandas.read_csv` now accepts ``engine="pyarrow"`` (requires at least ``pyarrow`` 0.17.0) as an argument, allowing for faster csv parsing on multicore machines with pyarrow installed. See the :doc:`I/O docs ` for more info. (:issue:`23697`) +.. _whatsnew_140.enhancements.window_rank: + +Rank function for rolling and expanding windows +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Added ``rank`` function to :class:`Rolling` and :class:`Expanding`. The new function supports the ``method``, ``ascending``, and ``pct`` flags of :meth:`DataFrame.rank`. The ``method`` argument supports ``min``, ``max``, and ``average`` ranking methods. +Example: + +.. ipython:: python + + >>> s = pd.Series([1,4,2,3,5,3]) + >>> s.rolling(3).rank() + 0 NaN + 1 NaN + 2 2.0 + 3 2.0 + 4 3.0 + 5 1.5 + dtype: float64 + + >>> s.rolling(3).rank(method="max") + 0 NaN + 1 NaN + 2 2.0 + 3 2.0 + 4 3.0 + 5 2.0 + dtype: float64 + + >>> s.expanding().rank() + 0 1.0 + 1 2.0 + 2 2.0 + 3 3.0 + 4 5.0 + 5 3.5 + dtype: float64 + + >>> s.expanding().rank(method="max") + 0 1.0 + 1 2.0 + 2 2.0 + 3 3.0 + 4 5.0 + 5 4.0 + dtype: float64 + .. _whatsnew_140.enhancements.other: Other enhancements diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index 0f9ac43b1f224..100294529e91d 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -1212,11 +1212,13 @@ def roll_rank(const float64_t[:] values, ndarray[int64_t] start, raise MemoryError("skiplist_insert failed") if rank_type == AVERAGE: rank_min = skiplist_min_rank(skiplist, val) - rank = ((rank * (rank + 1) / 2) - - ((rank_min - 1) * rank_min / 2)) \ - / (rank - rank_min + 1) + rank = (((rank * (rank + 1) / 2) + - ((rank_min - 1) * rank_min / 2)) + / (rank - rank_min + 1)) elif rank_type == MIN: rank = skiplist_min_rank(skiplist, val) + else: + rank = NaN else: # calculate deletes @@ -1236,11 +1238,13 @@ def roll_rank(const float64_t[:] values, ndarray[int64_t] start, raise MemoryError("skiplist_insert failed") if rank_type == AVERAGE: rank_min = skiplist_min_rank(skiplist, val) - rank = ((rank * (rank + 1) / 2) - - ((rank_min - 1) * rank_min / 2)) \ - / (rank - rank_min + 1) + rank = (((rank * (rank + 1) / 2) + - ((rank_min - 1) * rank_min / 2)) + / (rank - rank_min + 1)) elif rank_type == MIN: rank = skiplist_min_rank(skiplist, val) + else: + rank = NaN if nobs >= minp: output[i] = (rank) / nobs if percentile else rank else: diff --git a/pandas/core/window/expanding.py b/pandas/core/window/expanding.py index 36a6399df7dbc..73fac29bb2def 100644 --- a/pandas/core/window/expanding.py +++ b/pandas/core/window/expanding.py @@ -564,6 +564,81 @@ def quantile( **kwargs, ) + @doc( + template_header, + ".. versionadded:: 1.4.0 \n\n", + create_section_header("Parameters"), + dedent( + """ + method : {{'average', 'min', 'max'}}, default 'average' + How to rank the group of records that have the same value (i.e. ties): + + * average: average rank of the group + * min: lowest rank in the group + * max: highest rank in the group + + ascending : bool, default True + Whether or not the elements should be ranked in ascending order. + pct : bool, default False + Whether or not to display the returned rankings in percentile + form. + """ + ).replace("\n", "", 1), + kwargs_compat, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + template_see_also, + create_section_header("Examples"), + dedent( + """ + >>> s = pd.Series([1,4,2,3,5,3]) + >>> s.expanding().rank() + 0 1.0 + 1 2.0 + 2 2.0 + 3 3.0 + 4 5.0 + 5 3.5 + dtype: float64 + + >>> s.expanding().rank(method="max") + 0 1.0 + 1 2.0 + 2 2.0 + 3 3.0 + 4 5.0 + 5 4.0 + dtype: float64 + + >>> s.expanding().rank(method="min") + 0 1.0 + 1 2.0 + 2 2.0 + 3 3.0 + 4 5.0 + 5 3.0 + dtype: float64 + """ + ).replace("\n", "", 1), + window_method="expanding", + aggregation_description="rank", + agg_method="rank", + ) + def rank( + self, + method: str = "average", + ascending: bool = True, + pct: bool = False, + **kwargs, + ): + return super().rank( + method=method, + ascending=ascending, + pct=pct, + **kwargs, + ) + @doc( template_header, create_section_header("Parameters"), diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index edaafd2d3b57b..756ac3596658f 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -2176,6 +2176,81 @@ def quantile(self, quantile: float, interpolation: str = "linear", **kwargs): **kwargs, ) + @doc( + template_header, + ".. versionadded:: 1.4.0 \n\n", + create_section_header("Parameters"), + dedent( + """ + method : {{'average', 'min', 'max'}}, default 'average' + How to rank the group of records that have the same value (i.e. ties): + + * average: average rank of the group + * min: lowest rank in the group + * max: highest rank in the group + + ascending : bool, default True + Whether or not the elements should be ranked in ascending order. + pct : bool, default False + Whether or not to display the returned rankings in percentile + form. + """ + ).replace("\n", "", 1), + kwargs_compat, + create_section_header("Returns"), + template_returns, + create_section_header("See Also"), + template_see_also, + create_section_header("Examples"), + dedent( + """ + >>> s = pd.Series([1,4,2,3,5,3]) + >>> s.rolling(3).rank() + 0 NaN + 1 NaN + 2 2.0 + 3 2.0 + 4 3.0 + 5 1.5 + dtype: float64 + + >>> s.rolling(3).rank(method="max") + 0 NaN + 1 NaN + 2 2.0 + 3 2.0 + 4 3.0 + 5 2.0 + dtype: float64 + + >>> s.rolling(3).rank(method="min") + 0 NaN + 1 NaN + 2 2.0 + 3 2.0 + 4 3.0 + 5 1.0 + dtype: float64 + """ + ).replace("\n", "", 1), + window_method="rolling", + aggregation_description="rank", + agg_method="rank", + ) + def rank( + self, + method: str = "average", + ascending: bool = True, + pct: bool = False, + **kwargs, + ): + return super().rank( + method=method, + ascending=ascending, + pct=pct, + **kwargs, + ) + @doc( template_header, create_section_header("Parameters"), diff --git a/pandas/tests/window/test_expanding.py b/pandas/tests/window/test_expanding.py index 1b9259fd8240e..bd25fab0f6c9b 100644 --- a/pandas/tests/window/test_expanding.py +++ b/pandas/tests/window/test_expanding.py @@ -264,3 +264,25 @@ def test_expanding_skew_kurt_numerical_stability(method): s = s + 5000 result = getattr(s.expanding(3), method)() tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("window", [1, 3, 10, 50, 1000]) +@pytest.mark.parametrize("method", ["min", "max", "average"]) +@pytest.mark.parametrize("pct", [True, False]) +@pytest.mark.parametrize("ascending", [True, False]) +@pytest.mark.parametrize("test_data", ["default", "duplicates", "nans"]) +def test_rank(window, method, pct, ascending, test_data): + length = 1000 + if test_data == "default": + ser = Series(data=np.random.rand(length)) + elif test_data == "duplicates": + ser = Series(data=np.random.choice(3, length)) + elif test_data == "nans": + ser = Series(data=np.random.choice([1.0, 0.25, 0.75, np.nan], length)) + + expected = ser.expanding(window).apply( + lambda x: x.rank(method=method, pct=pct, ascending=ascending).iloc[-1] + ) + result = ser.expanding(window).rank(method=method, pct=pct, ascending=ascending) + + tm.assert_series_equal(result, expected) From 63d37c503badf2eef509eea5dcf0cd77e574d055 Mon Sep 17 00:00:00 2001 From: Greg Siano Date: Sat, 4 Sep 2021 11:09:34 -0400 Subject: [PATCH 13/16] ENH: rolling rank - fix pre-commit errors --- doc/source/whatsnew/v1.4.0.rst | 2 +- pandas/core/window/expanding.py | 2 +- pandas/core/window/rolling.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 1ee697d965ca5..c6b69b508d65a 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -97,7 +97,7 @@ Example: .. ipython:: python - >>> s = pd.Series([1,4,2,3,5,3]) + >>> s = pd.Series([1, 4, 2, 3, 5, 3]) >>> s.rolling(3).rank() 0 NaN 1 NaN diff --git a/pandas/core/window/expanding.py b/pandas/core/window/expanding.py index 73fac29bb2def..0aaacd9e37f05 100644 --- a/pandas/core/window/expanding.py +++ b/pandas/core/window/expanding.py @@ -592,7 +592,7 @@ def quantile( create_section_header("Examples"), dedent( """ - >>> s = pd.Series([1,4,2,3,5,3]) + >>> s = pd.Series([1, 4, 2, 3, 5, 3]) >>> s.expanding().rank() 0 1.0 1 2.0 diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 756ac3596658f..5d618de8fc0c4 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -2204,7 +2204,7 @@ def quantile(self, quantile: float, interpolation: str = "linear", **kwargs): create_section_header("Examples"), dedent( """ - >>> s = pd.Series([1,4,2,3,5,3]) + >>> s = pd.Series([1, 4, 2, 3, 5, 3]) >>> s.rolling(3).rank() 0 NaN 1 NaN From ba468c6eb19b422a151719fc931b4d06ba00dae2 Mon Sep 17 00:00:00 2001 From: Greg Siano Date: Sun, 12 Sep 2021 22:34:25 -0400 Subject: [PATCH 14/16] ENH: rolling rank addressing comments --- doc/source/whatsnew/v1.4.0.rst | 32 ------------------- pandas/_libs/algos.pxd | 16 ++++++++++ pandas/_libs/algos.pyx | 16 ---------- pandas/_libs/src/skiplist.h | 2 ++ pandas/_libs/window/aggregations.pyi | 4 ++- pandas/_libs/window/aggregations.pyx | 46 +++++++++++++++------------ pandas/_typing.py | 3 ++ pandas/core/window/expanding.py | 3 +- pandas/core/window/rolling.py | 5 +-- pandas/tests/window/test_expanding.py | 8 +++-- pandas/tests/window/test_rolling.py | 8 +++-- 11 files changed, 64 insertions(+), 79 deletions(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index c6b69b508d65a..2499e8a2b600f 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -99,40 +99,8 @@ Example: >>> s = pd.Series([1, 4, 2, 3, 5, 3]) >>> s.rolling(3).rank() - 0 NaN - 1 NaN - 2 2.0 - 3 2.0 - 4 3.0 - 5 1.5 - dtype: float64 >>> s.rolling(3).rank(method="max") - 0 NaN - 1 NaN - 2 2.0 - 3 2.0 - 4 3.0 - 5 2.0 - dtype: float64 - - >>> s.expanding().rank() - 0 1.0 - 1 2.0 - 2 2.0 - 3 3.0 - 4 5.0 - 5 3.5 - dtype: float64 - - >>> s.expanding().rank(method="max") - 0 1.0 - 1 2.0 - 2 2.0 - 3 3.0 - 4 5.0 - 5 4.0 - dtype: float64 .. _whatsnew_140.enhancements.other: diff --git a/pandas/_libs/algos.pxd b/pandas/_libs/algos.pxd index 7e87f4767c86d..785985d42f520 100644 --- a/pandas/_libs/algos.pxd +++ b/pandas/_libs/algos.pxd @@ -2,3 +2,19 @@ from pandas._libs.util cimport numeric cdef numeric kth_smallest_c(numeric* arr, Py_ssize_t k, Py_ssize_t n) nogil + +cdef enum TiebreakEnumType: + TIEBREAK_AVERAGE + TIEBREAK_MIN, + TIEBREAK_MAX + TIEBREAK_FIRST + TIEBREAK_FIRST_DESCENDING + TIEBREAK_DENSE + +tiebreakers = { + "average": TIEBREAK_AVERAGE, + "min": TIEBREAK_MIN, + "max": TIEBREAK_MAX, + "first": TIEBREAK_FIRST, + "dense": TIEBREAK_DENSE, +} diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 2353c66f3378f..c06f25a0bb76d 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -64,22 +64,6 @@ cdef: float64_t NaN = np.NaN int64_t NPY_NAT = get_nat() -cdef enum TiebreakEnumType: - TIEBREAK_AVERAGE - TIEBREAK_MIN, - TIEBREAK_MAX - TIEBREAK_FIRST - TIEBREAK_FIRST_DESCENDING - TIEBREAK_DENSE - -tiebreakers = { - "average": TIEBREAK_AVERAGE, - "min": TIEBREAK_MIN, - "max": TIEBREAK_MAX, - "first": TIEBREAK_FIRST, - "dense": TIEBREAK_DENSE, -} - cdef inline bint are_diff(object left, object right): try: diff --git a/pandas/_libs/src/skiplist.h b/pandas/_libs/src/skiplist.h index 27ef4c98307b2..5d0b144a1fe61 100644 --- a/pandas/_libs/src/skiplist.h +++ b/pandas/_libs/src/skiplist.h @@ -180,6 +180,8 @@ PANDAS_INLINE double skiplist_get(skiplist_t *skp, int i, int *ret) { return node->value; } +// Returns the lowest rank of all elements with value `value`, as opposed to the +// highest rank returned by `skiplist_insert`. PANDAS_INLINE int skiplist_min_rank(skiplist_t *skp, double value) { node_t *node; int level, rank = 0; diff --git a/pandas/_libs/window/aggregations.pyi b/pandas/_libs/window/aggregations.pyi index faf704aa968c3..879809a259266 100644 --- a/pandas/_libs/window/aggregations.pyi +++ b/pandas/_libs/window/aggregations.pyi @@ -6,6 +6,8 @@ from typing import ( import numpy as np +from pandas._typing import WindowingRankType + def roll_sum( values: np.ndarray, # const float64_t[:] start: np.ndarray, # np.ndarray[np.int64] @@ -69,7 +71,7 @@ def roll_rank( end: np.ndarray, minp: int, percentile: bool, - method: Literal["average", "min", "max"], + method: WindowingRankType, ascending: bool, ) -> np.ndarray: ... # np.ndarray[float] def roll_apply( diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index 100294529e91d..f1b9eab4ef009 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -5,6 +5,11 @@ import cython from libc.math cimport round from libcpp.deque cimport deque +from pandas._libs.algos cimport ( + TiebreakEnumType, + tiebreakers, +) + import numpy as np cimport numpy as cnp @@ -1141,19 +1146,6 @@ def roll_quantile(const float64_t[:] values, ndarray[int64_t] start, return output -cdef enum RankType: - AVERAGE, - MIN, - MAX, - - -rank_types = { - 'average': AVERAGE, - 'min': MIN, - 'max': MAX, -} - - def roll_rank(const float64_t[:] values, ndarray[int64_t] start, ndarray[int64_t] end, int64_t minp, bint percentile, str method, bint ascending) -> np.ndarray: @@ -1168,13 +1160,17 @@ def roll_rank(const float64_t[:] values, ndarray[int64_t] start, int64_t nobs = 0, win float64_t val skiplist_t *skiplist - float64_t[::1] output = None - RankType rank_type + float64_t[::1] output + TiebreakEnumType rank_type try: - rank_type = rank_types[method] + rank_type = tiebreakers[method] except KeyError: raise ValueError(f"Method '{method}' is not supported") + if rank_type not in (TiebreakEnumType.TIEBREAK_AVERAGE, + TiebreakEnumType.TIEBREAK_MIN, + TiebreakEnumType.TIEBREAK_MAX): + raise ValueError(f"Method '{method}' is not supported") is_monotonic_increasing_bounds = is_monotonic_increasing_start_end_bounds( start, end @@ -1210,12 +1206,20 @@ def roll_rank(const float64_t[:] values, ndarray[int64_t] start, rank = skiplist_insert(skiplist, val) if rank == -1: raise MemoryError("skiplist_insert failed") - if rank_type == AVERAGE: + if rank_type == TiebreakEnumType.TIEBREAK_AVERAGE: + # The average rank of `val` is the sum of the ranks of all + # instances of `val` in the skip list divided by the number + # of instances. The sum of consecutive integers from 1 to N + # is N * (N + 1) / 2. + # The sum of the ranks is the sum of integers from the + # lowest rank to the highest rank, which is the sum of + # integers from 1 to the highest rank minus the sum of + # integers from 1 to one less than the lowest rank. rank_min = skiplist_min_rank(skiplist, val) rank = (((rank * (rank + 1) / 2) - ((rank_min - 1) * rank_min / 2)) / (rank - rank_min + 1)) - elif rank_type == MIN: + elif rank_type == TiebreakEnumType.TIEBREAK_MIN: rank = skiplist_min_rank(skiplist, val) else: rank = NaN @@ -1236,17 +1240,17 @@ def roll_rank(const float64_t[:] values, ndarray[int64_t] start, rank = skiplist_insert(skiplist, val) if rank == -1: raise MemoryError("skiplist_insert failed") - if rank_type == AVERAGE: + if rank_type == TiebreakEnumType.TIEBREAK_AVERAGE: rank_min = skiplist_min_rank(skiplist, val) rank = (((rank * (rank + 1) / 2) - ((rank_min - 1) * rank_min / 2)) / (rank - rank_min + 1)) - elif rank_type == MIN: + elif rank_type == TiebreakEnumType.TIEBREAK_MIN: rank = skiplist_min_rank(skiplist, val) else: rank = NaN if nobs >= minp: - output[i] = (rank) / nobs if percentile else rank + output[i] = rank / nobs if percentile else rank else: output[i] = NaN diff --git a/pandas/_typing.py b/pandas/_typing.py index ef9f38bbf5168..eadf987c20524 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -208,3 +208,6 @@ PositionalIndexer2D = Union[ PositionalIndexer, Tuple[PositionalIndexer, PositionalIndexer] ] + +# Windowing rank methods +WindowingRankType = Literal["average", "min", "max"] diff --git a/pandas/core/window/expanding.py b/pandas/core/window/expanding.py index 0aaacd9e37f05..b5b2363072ba1 100644 --- a/pandas/core/window/expanding.py +++ b/pandas/core/window/expanding.py @@ -10,6 +10,7 @@ from pandas._typing import ( Axis, FrameOrSeries, + WindowingRankType, ) if TYPE_CHECKING: @@ -627,7 +628,7 @@ def quantile( ) def rank( self, - method: str = "average", + method: WindowingRankType = "average", ascending: bool = True, pct: bool = False, **kwargs, diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 5d618de8fc0c4..07842cdf588e2 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -28,6 +28,7 @@ ArrayLike, Axis, FrameOrSeries, + WindowingRankType, ) from pandas.compat._optional import import_optional_dependency from pandas.compat.numpy import function as nv @@ -1411,7 +1412,7 @@ def quantile(self, quantile: float, interpolation: str = "linear", **kwargs): def rank( self, - method: str = "average", + method: WindowingRankType = "average", ascending: bool = True, pct: bool = False, **kwargs, @@ -2239,7 +2240,7 @@ def quantile(self, quantile: float, interpolation: str = "linear", **kwargs): ) def rank( self, - method: str = "average", + method: WindowingRankType = "average", ascending: bool = True, pct: bool = False, **kwargs, diff --git a/pandas/tests/window/test_expanding.py b/pandas/tests/window/test_expanding.py index bd25fab0f6c9b..680ac3654222a 100644 --- a/pandas/tests/window/test_expanding.py +++ b/pandas/tests/window/test_expanding.py @@ -266,19 +266,21 @@ def test_expanding_skew_kurt_numerical_stability(method): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize("window", [1, 3, 10, 50, 1000]) +@pytest.mark.parametrize("window", [1, 3, 10, 20]) @pytest.mark.parametrize("method", ["min", "max", "average"]) @pytest.mark.parametrize("pct", [True, False]) @pytest.mark.parametrize("ascending", [True, False]) @pytest.mark.parametrize("test_data", ["default", "duplicates", "nans"]) def test_rank(window, method, pct, ascending, test_data): - length = 1000 + length = 20 if test_data == "default": ser = Series(data=np.random.rand(length)) elif test_data == "duplicates": ser = Series(data=np.random.choice(3, length)) elif test_data == "nans": - ser = Series(data=np.random.choice([1.0, 0.25, 0.75, np.nan], length)) + ser = Series( + data=np.random.choice([1.0, 0.25, 0.75, np.nan, np.inf, -np.inf], length) + ) expected = ser.expanding(window).apply( lambda x: x.rank(method=method, pct=pct, ascending=ascending).iloc[-1] diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index d54996c27c6e8..ed1039223e831 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -1502,19 +1502,21 @@ def test_rolling_numeric_dtypes(): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("window", [1, 3, 10, 50, 1000]) +@pytest.mark.parametrize("window", [1, 3, 10, 20]) @pytest.mark.parametrize("method", ["min", "max", "average"]) @pytest.mark.parametrize("pct", [True, False]) @pytest.mark.parametrize("ascending", [True, False]) @pytest.mark.parametrize("test_data", ["default", "duplicates", "nans"]) def test_rank(window, method, pct, ascending, test_data): - length = 1000 + length = 20 if test_data == "default": ser = Series(data=np.random.rand(length)) elif test_data == "duplicates": ser = Series(data=np.random.choice(3, length)) elif test_data == "nans": - ser = Series(data=np.random.choice([1.0, 0.25, 0.75, np.nan], length)) + ser = Series( + data=np.random.choice([1.0, 0.25, 0.75, np.nan, np.inf, -np.inf], length) + ) expected = ser.rolling(window).apply( lambda x: x.rank(method=method, pct=pct, ascending=ascending).iloc[-1] From bb7005ff762cb2be2f37148985abeeddcaeef068 Mon Sep 17 00:00:00 2001 From: Greg Siano Date: Mon, 13 Sep 2021 13:17:30 -0400 Subject: [PATCH 15/16] ENH: rolling rank add rolling rank tiebreakers dict --- pandas/_libs/algos.pxd | 8 -------- pandas/_libs/algos.pyx | 9 +++++++++ pandas/_libs/window/aggregations.pyx | 18 +++++++++--------- pandas/_typing.py | 2 +- 4 files changed, 19 insertions(+), 18 deletions(-) diff --git a/pandas/_libs/algos.pxd b/pandas/_libs/algos.pxd index 785985d42f520..4f7cc9345ed30 100644 --- a/pandas/_libs/algos.pxd +++ b/pandas/_libs/algos.pxd @@ -10,11 +10,3 @@ cdef enum TiebreakEnumType: TIEBREAK_FIRST TIEBREAK_FIRST_DESCENDING TIEBREAK_DENSE - -tiebreakers = { - "average": TIEBREAK_AVERAGE, - "min": TIEBREAK_MIN, - "max": TIEBREAK_MAX, - "first": TIEBREAK_FIRST, - "dense": TIEBREAK_DENSE, -} diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 069d72d951178..a18cfc41d1e2e 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -67,6 +67,15 @@ cdef: int64_t NPY_NAT = get_nat() +tiebreakers = { + "average": TIEBREAK_AVERAGE, + "min": TIEBREAK_MIN, + "max": TIEBREAK_MAX, + "first": TIEBREAK_FIRST, + "dense": TIEBREAK_DENSE, +} + + cdef inline bint are_diff(object left, object right): try: return fabs(left - right) > FP_ERR diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index f1b9eab4ef009..ea52bd24a3689 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -5,10 +5,7 @@ import cython from libc.math cimport round from libcpp.deque cimport deque -from pandas._libs.algos cimport ( - TiebreakEnumType, - tiebreakers, -) +from pandas._libs.algos cimport TiebreakEnumType import numpy as np @@ -1146,6 +1143,13 @@ def roll_quantile(const float64_t[:] values, ndarray[int64_t] start, return output +rolling_rank_tiebreakers = { + "average": TiebreakEnumType.TIEBREAK_AVERAGE, + "min": TiebreakEnumType.TIEBREAK_MIN, + "max": TiebreakEnumType.TIEBREAK_MAX, +} + + def roll_rank(const float64_t[:] values, ndarray[int64_t] start, ndarray[int64_t] end, int64_t minp, bint percentile, str method, bint ascending) -> np.ndarray: @@ -1164,13 +1168,9 @@ def roll_rank(const float64_t[:] values, ndarray[int64_t] start, TiebreakEnumType rank_type try: - rank_type = tiebreakers[method] + rank_type = rolling_rank_tiebreakers[method] except KeyError: raise ValueError(f"Method '{method}' is not supported") - if rank_type not in (TiebreakEnumType.TIEBREAK_AVERAGE, - TiebreakEnumType.TIEBREAK_MIN, - TiebreakEnumType.TIEBREAK_MAX): - raise ValueError(f"Method '{method}' is not supported") is_monotonic_increasing_bounds = is_monotonic_increasing_start_end_bounds( start, end diff --git a/pandas/_typing.py b/pandas/_typing.py index 40cd38b8ce5c9..9ed31dc3738f3 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -221,4 +221,4 @@ PositionalIndexer2D = Union[PositionalIndexer, PositionalIndexerTuple] # Windowing rank methods -WindowingRankType = Literal["average", "min", "max"] \ No newline at end of file +WindowingRankType = Literal["average", "min", "max"] From 1470c7b587cda8a0c32bc93b76ea5ee111d7eb93 Mon Sep 17 00:00:00 2001 From: Greg Siano Date: Mon, 13 Sep 2021 14:28:45 -0400 Subject: [PATCH 16/16] ENH: rolling rank fix docs warnings - remove '>>>' from code block --- doc/source/whatsnew/v1.4.0.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index e1eddc71d8064..6046d4bf6ec4b 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -104,10 +104,10 @@ Example: .. ipython:: python - >>> s = pd.Series([1, 4, 2, 3, 5, 3]) - >>> s.rolling(3).rank() + s = pd.Series([1, 4, 2, 3, 5, 3]) + s.rolling(3).rank() - >>> s.rolling(3).rank(method="max") + s.rolling(3).rank(method="max") .. _whatsnew_140.enhancements.other: