From 90a6a0c0f969377eef22e2ac16d0a03d10cb6a5b Mon Sep 17 00:00:00 2001 From: sweeneyde Date: Tue, 6 Jul 2021 16:29:53 -0400 Subject: [PATCH 01/19] Don't force inlining --- Objects/stringlib/fastsearch.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index 7b8be5d6492157..043cc42faed617 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -305,7 +305,7 @@ typedef struct STRINGLIB(_pre) { } STRINGLIB(prework); -Py_LOCAL_INLINE(void) +static void STRINGLIB(_preprocess)(const STRINGLIB_CHAR *needle, Py_ssize_t len_needle, STRINGLIB(prework) *p) { @@ -339,7 +339,7 @@ STRINGLIB(_preprocess)(const STRINGLIB_CHAR *needle, Py_ssize_t len_needle, } } -Py_LOCAL_INLINE(Py_ssize_t) +static void STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack, STRINGLIB(prework) *p) { @@ -465,7 +465,7 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack, return -1; } -Py_LOCAL_INLINE(Py_ssize_t) +static Py_ssize_t STRINGLIB(_two_way_find)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack, const STRINGLIB_CHAR *needle, @@ -477,7 +477,7 @@ STRINGLIB(_two_way_find)(const STRINGLIB_CHAR *haystack, return STRINGLIB(_two_way)(haystack, len_haystack, &p); } -Py_LOCAL_INLINE(Py_ssize_t) +static Py_ssize_t STRINGLIB(_two_way_count)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack, const STRINGLIB_CHAR *needle, From 19fa6616d843c2c44417f5fce80281f4bea2e91d Mon Sep 17 00:00:00 2001 From: sweeneyde Date: Wed, 7 Jul 2021 18:17:13 -0400 Subject: [PATCH 02/19] Use a Boyer-Moore skip table --- Objects/stringlib/fastsearch.h | 190 +++++++++++++++++---------------- 1 file changed, 100 insertions(+), 90 deletions(-) diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index 043cc42faed617..9e7cc35ed5e965 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -170,10 +170,16 @@ STRINGLIB(rfind_char)(const STRINGLIB_CHAR* s, Py_ssize_t n, STRINGLIB_CHAR ch) /* Change to a 1 to see logging comments walk through the algorithm. */ #if 0 && STRINGLIB_SIZEOF_CHAR == 1 # define LOG(...) printf(__VA_ARGS__) -# define LOG_STRING(s, n) printf("\"%.*s\"", n, s) +# define LOG_STRING(s, n) printf("\"%.*s\"", (int)(n), s) +# define LOG_LINEUP() do { \ + LOG("> "); LOG_STRING(haystack, len_haystack); LOG("\n> "); \ + LOG("%*s",(int)(window_last + 1 - len_needle - haystack), ""); \ + LOG_STRING(needle, len_needle); LOG("\n"); \ +} while(0) #else # define LOG(...) # define LOG_STRING(s, n) +# define LOG_LINEUP() #endif Py_LOCAL_INLINE(Py_ssize_t) @@ -288,8 +294,7 @@ STRINGLIB(_factorize)(const STRINGLIB_CHAR *needle, } #define SHIFT_TYPE uint8_t -#define NOT_FOUND ((1U<<(8*sizeof(SHIFT_TYPE))) - 1U) -#define SHIFT_OVERFLOW (NOT_FOUND - 1U) +#define MAX_SHIFT ((1U<<(8*sizeof(SHIFT_TYPE))) - 1U) #define TABLE_SIZE_BITS 6 #define TABLE_SIZE (1U << TABLE_SIZE_BITS) @@ -300,11 +305,11 @@ typedef struct STRINGLIB(_pre) { Py_ssize_t len_needle; Py_ssize_t cut; Py_ssize_t period; + Py_ssize_t gap; int is_periodic; SHIFT_TYPE table[TABLE_SIZE]; } STRINGLIB(prework); - static void STRINGLIB(_preprocess)(const STRINGLIB_CHAR *needle, Py_ssize_t len_needle, STRINGLIB(prework) *p) @@ -319,145 +324,149 @@ STRINGLIB(_preprocess)(const STRINGLIB_CHAR *needle, Py_ssize_t len_needle, if (p->is_periodic) { assert(p->cut <= len_needle/2); assert(p->cut < p->period); + p->gap = 0; // unused } else { // A lower bound on the period p->period = Py_MAX(p->cut, len_needle - p->cut) + 1; - } - // Now fill up a table - memset(&(p->table[0]), 0xff, TABLE_SIZE*sizeof(SHIFT_TYPE)); - assert(p->table[0] == NOT_FOUND); - assert(p->table[TABLE_MASK] == NOT_FOUND); - for (Py_ssize_t i = 0; i < len_needle; i++) { - Py_ssize_t shift = len_needle - i; - if (shift > SHIFT_OVERFLOW) { - shift = SHIFT_OVERFLOW; + // The gap between the last character and the previous + // occurrence of an equivalent character (modulo TABLE_SIZE) + p->gap = len_needle; + STRINGLIB_CHAR last = needle[len_needle - 1] & TABLE_MASK; + for (Py_ssize_t i = len_needle - 2; i >= 0; i--) { + if ((needle[i] & TABLE_MASK) == last) { + p->gap = len_needle - 1 - i; + break; + } } - p->table[needle[i] & TABLE_MASK] = Py_SAFE_DOWNCAST(shift, - Py_ssize_t, - SHIFT_TYPE); + } + + // Fill up a compressed Boyer-Moore "Bad Character" table + Py_ssize_t not_found_shift = Py_MIN(len_needle, MAX_SHIFT); + for (Py_ssize_t i = 0; i < TABLE_SIZE; i++) { + p->table[i] = Py_SAFE_DOWNCAST(not_found_shift, + Py_ssize_t, SHIFT_TYPE); + } + for (Py_ssize_t i = len_needle - not_found_shift; i < len_needle; i++) { + SHIFT_TYPE shift = Py_SAFE_DOWNCAST(len_needle - 1 - i, + Py_ssize_t, SHIFT_TYPE); + p->table[needle[i] & TABLE_MASK] = shift; } } -static void +static Py_ssize_t STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack, STRINGLIB(prework) *p) { // Crochemore and Perrin's (1991) Two-Way algorithm. // See http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260 - Py_ssize_t len_needle = p->len_needle; - Py_ssize_t cut = p->cut; - Py_ssize_t period = p->period; - const STRINGLIB_CHAR *needle = p->needle; - const STRINGLIB_CHAR *window = haystack; - const STRINGLIB_CHAR *last_window = haystack + len_haystack - len_needle; + const Py_ssize_t len_needle = p->len_needle; + const Py_ssize_t cut = p->cut; + const Py_ssize_t period = p->period; + const STRINGLIB_CHAR *const needle = p->needle; + const STRINGLIB_CHAR *window_last = haystack + len_needle - 1; + const STRINGLIB_CHAR *const haystack_end = haystack + len_haystack; SHIFT_TYPE *table = p->table; LOG("===== Two-way: \"%s\" in \"%s\". =====\n", needle, haystack); + // static int stats[3]; + if (p->is_periodic) { LOG("Needle is periodic.\n"); Py_ssize_t memory = 0; periodicwindowloop: - while (window <= last_window) { - Py_ssize_t i = Py_MAX(cut, memory); - - // Visualize the line-up: - LOG("> "); LOG_STRING(haystack, len_haystack); - LOG("\n> "); LOG("%*s", window - haystack, ""); - LOG_STRING(needle, len_needle); - LOG("\n> "); LOG("%*s", window - haystack + i, ""); - LOG(" ^ <-- cut\n"); - - if (window[i] != needle[i]) { - // Sunday's trick: if we're going to jump, we might - // as well jump to line up the character *after* the - // current window. - STRINGLIB_CHAR first_outside = window[len_needle]; - SHIFT_TYPE shift = table[first_outside & TABLE_MASK]; - if (shift == NOT_FOUND) { - LOG("\"%c\" not found. Skipping entirely.\n", - first_outside); - window += len_needle + 1; - } - else { - LOG("Shifting to line up \"%c\".\n", first_outside); - Py_ssize_t memory_shift = i - cut + 1; - window += Py_MAX(shift, memory_shift); - } + while (window_last < haystack_end) { + LOG_LINEUP(); + Py_ssize_t shift = table[(*window_last) & TABLE_MASK]; + if (shift > 0 && memory > 0) { + // A mismatch has been identified to the right of + // where i starts, so we can jump at least as far as + // if the mismatch occurred on the first comparison. + Py_ssize_t memory_shift = Py_MAX(cut, memory) - cut + 1; + LOG("Skip with Memory.\n"); + window_last += Py_MAX(shift, memory_shift); + LOG_LINEUP(); memory = 0; - goto periodicwindowloop; + shift = table[(*window_last) & TABLE_MASK]; + } + while (shift > 0 && window_last < haystack_end) { + LOG("Fast Horspool skip.\n"); + window_last += shift; + shift = table[(*window_last) & TABLE_MASK]; + LOG_LINEUP(); } - for (i = i + 1; i < len_needle; i++) { + if (window_last >= haystack_end) { + break; + } + const STRINGLIB_CHAR *const window = window_last - len_needle + 1; + Py_ssize_t i = Py_MAX(cut, memory); + for (; i < len_needle; i++) { if (needle[i] != window[i]) { - LOG("Right half does not match. Jump ahead by %d.\n", + LOG("Right half does not match. Jump ahead by %zd.\n", i - cut + 1); - window += i - cut + 1; + window_last += i - cut + 1; memory = 0; goto periodicwindowloop; } } for (i = memory; i < cut; i++) { if (needle[i] != window[i]) { - LOG("Left half does not match. Jump ahead by period %d.\n", + LOG("Left half does not match. Jump ahead by period %zd.\n", period); - window += period; + window_last += period; memory = len_needle - period; goto periodicwindowloop; } } - LOG("Left half matches. Returning %d.\n", + LOG("Left half matches. Returning %zd.\n", window - haystack); return window - haystack; } } else { + Py_ssize_t gap = p->gap; LOG("Needle is not periodic.\n"); assert(cut < len_needle); STRINGLIB_CHAR needle_cut = needle[cut]; windowloop: - while (window <= last_window) { - - // Visualize the line-up: - LOG("> "); LOG_STRING(haystack, len_haystack); - LOG("\n> "); LOG("%*s", window - haystack, ""); - LOG_STRING(needle, len_needle); - LOG("\n> "); LOG("%*s", window - haystack + cut, ""); - LOG(" ^ <-- cut\n"); - - if (window[cut] != needle_cut) { - // Sunday's trick: if we're going to jump, we might - // as well jump to line up the character *after* the - // current window. - STRINGLIB_CHAR first_outside = window[len_needle]; - SHIFT_TYPE shift = table[first_outside & TABLE_MASK]; - if (shift == NOT_FOUND) { - LOG("\"%c\" not found. Skipping entirely.\n", - first_outside); - window += len_needle + 1; - } - else { - LOG("Shifting to line up \"%c\".\n", first_outside); - window += shift; - } - goto windowloop; + while (window_last < haystack_end) { + LOG_LINEUP(); + Py_ssize_t shift = table[(*window_last) & TABLE_MASK]; + // In most cases, this "Horspool" loop is the hot loop. + while (shift > 0 && window_last < haystack_end) { + LOG("Fast Horspool skip.\n"); + window_last += shift; + LOG_LINEUP(); + shift = table[(*window_last) & TABLE_MASK]; + } + if (window_last >= haystack_end) { + break; } - for (Py_ssize_t i = cut + 1; i < len_needle; i++) { + const STRINGLIB_CHAR *window = window_last - len_needle + 1; + for (Py_ssize_t i = cut; i < len_needle; i++) { if (needle[i] != window[i]) { - LOG("Right half does not match. Advance by %d.\n", - i - cut + 1); - window += i - cut + 1; + Py_ssize_t two_way_shift = i - cut + 1; + LOG("Right half does not match. two-way: %zd, gap: %zd " + "--> Advance by %zd\n", + two_way_shift, gap, Py_MAX(two_way_shift, gap)); + // stats[(two_way_shift <= gap) + (two_way_shift < gap)]++; + // if ((stats[0] + stats[1] + stats[2]) % 1000 == 0) { + // printf("gap wins: %d, tie: %d, two-way wins: %d\n"); + // } + window_last += Py_MAX(two_way_shift, gap); goto windowloop; } } for (Py_ssize_t i = 0; i < cut; i++) { if (needle[i] != window[i]) { - LOG("Left half does not match. Advance by period %d.\n", - period); - window += period; + LOG("Left half does not match. period: %zd, gap: %zd)" + "--> Advance by %zd\n", + period, gap, Py_MAX(period, gap)); + window_last += Py_MAX(period, gap); goto windowloop; } } - LOG("Left half matches. Returning %d.\n", window - haystack); + LOG("Left half matches. Returning %zd.\n", window - haystack); return window - haystack; } } @@ -513,6 +522,7 @@ STRINGLIB(_two_way_count)(const STRINGLIB_CHAR *haystack, #undef LOG #undef LOG_STRING +#undef LOG_LINEUP Py_LOCAL_INLINE(Py_ssize_t) FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, @@ -553,7 +563,7 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, mask = 0; if (mode != FAST_RSEARCH) { - if (m >= 100 && w >= 2000 && w / m >= 5) { + if (1 || m >= 100 && w >= 2000 && w / m >= 5) { /* For larger problems where the needle isn't a huge percentage of the size of the haystack, the relatively expensive O(m) startup cost of the two-way algorithm From d2219d5131fbbeb981619ef484f977f1316b66f5 Mon Sep 17 00:00:00 2001 From: sweeneyde Date: Thu, 8 Jul 2021 02:28:58 -0400 Subject: [PATCH 03/19] Switch to two-way in more cases --- Objects/stringlib/fastsearch.h | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index 9e7cc35ed5e965..87a51297aac1be 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -294,7 +294,7 @@ STRINGLIB(_factorize)(const STRINGLIB_CHAR *needle, } #define SHIFT_TYPE uint8_t -#define MAX_SHIFT ((1U<<(8*sizeof(SHIFT_TYPE))) - 1U) +#define MAX_SHIFT 255 #define TABLE_SIZE_BITS 6 #define TABLE_SIZE (1U << TABLE_SIZE_BITS) @@ -369,8 +369,6 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack, SHIFT_TYPE *table = p->table; LOG("===== Two-way: \"%s\" in \"%s\". =====\n", needle, haystack); - // static int stats[3]; - if (p->is_periodic) { LOG("Needle is periodic.\n"); Py_ssize_t memory = 0; @@ -389,6 +387,7 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack, memory = 0; shift = table[(*window_last) & TABLE_MASK]; } + // In most cases, this "Horspool" loop is the hot loop. while (shift > 0 && window_last < haystack_end) { LOG("Fast Horspool skip.\n"); window_last += shift; @@ -436,8 +435,8 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack, while (shift > 0 && window_last < haystack_end) { LOG("Fast Horspool skip.\n"); window_last += shift; - LOG_LINEUP(); shift = table[(*window_last) & TABLE_MASK]; + LOG_LINEUP(); } if (window_last >= haystack_end) { break; @@ -449,10 +448,6 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack, LOG("Right half does not match. two-way: %zd, gap: %zd " "--> Advance by %zd\n", two_way_shift, gap, Py_MAX(two_way_shift, gap)); - // stats[(two_way_shift <= gap) + (two_way_shift < gap)]++; - // if ((stats[0] + stats[1] + stats[2]) % 1000 == 0) { - // printf("gap wins: %d, tie: %d, two-way wins: %d\n"); - // } window_last += Py_MAX(two_way_shift, gap); goto windowloop; } @@ -563,7 +558,7 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, mask = 0; if (mode != FAST_RSEARCH) { - if (1 || m >= 100 && w >= 2000 && w / m >= 5) { + if (m > w / 2 && ((m >= 100 && w >= 1000) || (m >= 6 && w >= 30000))) { /* For larger problems where the needle isn't a huge percentage of the size of the haystack, the relatively expensive O(m) startup cost of the two-way algorithm @@ -575,6 +570,7 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, return STRINGLIB(_two_way_count)(s, n, p, m, maxcount); } } + const STRINGLIB_CHAR *ss = s + m - 1; const STRINGLIB_CHAR *pp = p + m - 1; @@ -590,7 +586,7 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, /* process pattern[-1] outside the loop */ STRINGLIB_BLOOM_ADD(mask, p[mlast]); - if (m >= 100 && w >= 8000) { + if (m >= 100 && w >= 1000) { /* To ensure that we have good worst-case behavior, here's an adaptive version of the algorithm, where if we match O(m) characters without any matches of the @@ -625,10 +621,9 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, i = i + skip; } hits += j + 1; - if (hits >= m / 4 && i < w - 1000) { - /* We've done O(m) fruitless comparisons - anyway, so spend the O(m) cost on the - setup for the two-way algorithm. */ + if (hits >= i / 8) { + /* Too much partial matching, so spend the O(m) + startup cost for the two-way algorithm. */ Py_ssize_t res; if (mode == FAST_COUNT) { res = STRINGLIB(_two_way_count)( @@ -656,6 +651,7 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, } return count; } + /* The standard, non-adaptive version of the algorithm. */ for (i = 0; i <= w; i++) { /* note: using mlast in the skip path slows things down on x86 */ From 3b8760f8d1b07f94e2c6f0bad1d1587a5259c112 Mon Sep 17 00:00:00 2001 From: sweeneyde Date: Thu, 8 Jul 2021 21:46:27 -0400 Subject: [PATCH 04/19] Tighter comments --- Objects/stringlib/fastsearch.h | 30 +++++++++++------------------- 1 file changed, 11 insertions(+), 19 deletions(-) diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index 87a51297aac1be..4512961955cb4d 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -362,7 +362,7 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack, // See http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260 const Py_ssize_t len_needle = p->len_needle; const Py_ssize_t cut = p->cut; - const Py_ssize_t period = p->period; + Py_ssize_t period = p->period; const STRINGLIB_CHAR *const needle = p->needle; const STRINGLIB_CHAR *window_last = haystack + len_needle - 1; const STRINGLIB_CHAR *const haystack_end = haystack + len_haystack; @@ -387,9 +387,8 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack, memory = 0; shift = table[(*window_last) & TABLE_MASK]; } - // In most cases, this "Horspool" loop is the hot loop. while (shift > 0 && window_last < haystack_end) { - LOG("Fast Horspool skip.\n"); + LOG("Horspool skip.\n"); window_last += shift; shift = table[(*window_last) & TABLE_MASK]; LOG_LINEUP(); @@ -401,8 +400,7 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack, Py_ssize_t i = Py_MAX(cut, memory); for (; i < len_needle; i++) { if (needle[i] != window[i]) { - LOG("Right half does not match. Jump ahead by %zd.\n", - i - cut + 1); + LOG("Right half does not match.\n"); window_last += i - cut + 1; memory = 0; goto periodicwindowloop; @@ -410,20 +408,19 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack, } for (i = memory; i < cut; i++) { if (needle[i] != window[i]) { - LOG("Left half does not match. Jump ahead by period %zd.\n", - period); + LOG("Left half does not match.\n"); window_last += period; memory = len_needle - period; goto periodicwindowloop; } } - LOG("Left half matches. Returning %zd.\n", - window - haystack); + LOG("Found a match!\n"); return window - haystack; } } else { Py_ssize_t gap = p->gap; + period = Py_MAX(gap, period); LOG("Needle is not periodic.\n"); assert(cut < len_needle); STRINGLIB_CHAR needle_cut = needle[cut]; @@ -431,9 +428,8 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack, while (window_last < haystack_end) { LOG_LINEUP(); Py_ssize_t shift = table[(*window_last) & TABLE_MASK]; - // In most cases, this "Horspool" loop is the hot loop. while (shift > 0 && window_last < haystack_end) { - LOG("Fast Horspool skip.\n"); + LOG("Horspool skip.\n"); window_last += shift; shift = table[(*window_last) & TABLE_MASK]; LOG_LINEUP(); @@ -445,23 +441,19 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack, for (Py_ssize_t i = cut; i < len_needle; i++) { if (needle[i] != window[i]) { Py_ssize_t two_way_shift = i - cut + 1; - LOG("Right half does not match. two-way: %zd, gap: %zd " - "--> Advance by %zd\n", - two_way_shift, gap, Py_MAX(two_way_shift, gap)); + LOG("Right half does not match.\n"); window_last += Py_MAX(two_way_shift, gap); goto windowloop; } } for (Py_ssize_t i = 0; i < cut; i++) { if (needle[i] != window[i]) { - LOG("Left half does not match. period: %zd, gap: %zd)" - "--> Advance by %zd\n", - period, gap, Py_MAX(period, gap)); - window_last += Py_MAX(period, gap); + LOG("Left half does not match.\n"); + window_last += period; goto windowloop; } } - LOG("Left half matches. Returning %zd.\n", window - haystack); + LOG("Found a match!\n"); return window - haystack; } } From 5d86b8d418dcad97ee27c850339e55d0c8f20ccb Mon Sep 17 00:00:00 2001 From: sweeneyde Date: Fri, 9 Jul 2021 22:51:15 -0400 Subject: [PATCH 05/19] Fix the direction of the cutoff inequality :) --- Objects/stringlib/fastsearch.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index 4512961955cb4d..8e97f169a66d73 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -550,7 +550,7 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, mask = 0; if (mode != FAST_RSEARCH) { - if (m > w / 2 && ((m >= 100 && w >= 1000) || (m >= 6 && w >= 30000))) { + if (m < w / 4 && ((m >= 100 && w >= 1000) || (m >= 6 && w >= 30000))) { /* For larger problems where the needle isn't a huge percentage of the size of the haystack, the relatively expensive O(m) startup cost of the two-way algorithm From 2e9a9b1520494c78104b8a42640567f3db9b5200 Mon Sep 17 00:00:00 2001 From: sweeneyde Date: Sat, 10 Jul 2021 03:26:21 -0400 Subject: [PATCH 06/19] Separate a gap loop --- Objects/stringlib/fastsearch.h | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index 8e97f169a66d73..4f5ee005d29ede 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -340,7 +340,6 @@ STRINGLIB(_preprocess)(const STRINGLIB_CHAR *needle, Py_ssize_t len_needle, } } } - // Fill up a compressed Boyer-Moore "Bad Character" table Py_ssize_t not_found_shift = Py_MIN(len_needle, MAX_SHIFT); for (Py_ssize_t i = 0; i < TABLE_SIZE; i++) { @@ -424,6 +423,7 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack, LOG("Needle is not periodic.\n"); assert(cut < len_needle); STRINGLIB_CHAR needle_cut = needle[cut]; + Py_ssize_t gap_jump_end = Py_MIN(len_needle, cut + gap); windowloop: while (window_last < haystack_end) { LOG_LINEUP(); @@ -438,11 +438,19 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack, break; } const STRINGLIB_CHAR *window = window_last - len_needle + 1; - for (Py_ssize_t i = cut; i < len_needle; i++) { + for (Py_ssize_t i = cut; i < gap_jump_end; i++) { if (needle[i] != window[i]) { - Py_ssize_t two_way_shift = i - cut + 1; - LOG("Right half does not match.\n"); - window_last += Py_MAX(two_way_shift, gap); + LOG("Early right half mismatch: jump by gap.\n"); + assert(gap >= i - cut + 1); + window_last += gap; + goto windowloop; + } + } + for (Py_ssize_t i = gap_jump_end; i < len_needle; i++) { + if (needle[i] != window[i]) { + LOG("Late right half mismatch.\n"); + assert(i - cut + 1 > gap); + window_last += gap; goto windowloop; } } @@ -550,7 +558,7 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, mask = 0; if (mode != FAST_RSEARCH) { - if (m < w / 4 && ((m >= 100 && w >= 1000) || (m >= 6 && w >= 30000))) { + if (m < w / 8 && ((m >= 100 && w >= 3000) || (m >= 6 && w >= 30000))) { /* For larger problems where the needle isn't a huge percentage of the size of the haystack, the relatively expensive O(m) startup cost of the two-way algorithm @@ -578,7 +586,7 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, /* process pattern[-1] outside the loop */ STRINGLIB_BLOOM_ADD(mask, p[mlast]); - if (m >= 100 && w >= 1000) { + if (m >= 100 && w >= 3000) { /* To ensure that we have good worst-case behavior, here's an adaptive version of the algorithm, where if we match O(m) characters without any matches of the From c84314c44539f610fd4d16837f0c42ebeb1b03d5 Mon Sep 17 00:00:00 2001 From: sweeneyde Date: Sat, 10 Jul 2021 05:37:04 -0400 Subject: [PATCH 07/19] update cutoffs --- Objects/stringlib/fastsearch.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index 4f5ee005d29ede..0ff14a82a730da 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -558,7 +558,7 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, mask = 0; if (mode != FAST_RSEARCH) { - if (m < w / 8 && ((m >= 100 && w >= 3000) || (m >= 6 && w >= 30000))) { + if (m < w / 4 && ((m >= 100 && w >= 2000) || (m >= 6 && w >= 30000))) { /* For larger problems where the needle isn't a huge percentage of the size of the haystack, the relatively expensive O(m) startup cost of the two-way algorithm @@ -586,7 +586,7 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, /* process pattern[-1] outside the loop */ STRINGLIB_BLOOM_ADD(mask, p[mlast]); - if (m >= 100 && w >= 3000) { + if (m >= 100 && w >= 4000) { /* To ensure that we have good worst-case behavior, here's an adaptive version of the algorithm, where if we match O(m) characters without any matches of the @@ -621,7 +621,7 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, i = i + skip; } hits += j + 1; - if (hits >= i / 8) { + if (hits >= m) { /* Too much partial matching, so spend the O(m) startup cost for the two-way algorithm. */ Py_ssize_t res; From 7517c6076dc595f5376a5a1e745aef77d5eb4d17 Mon Sep 17 00:00:00 2001 From: sweeneyde Date: Sat, 10 Jul 2021 15:53:09 -0400 Subject: [PATCH 08/19] tweak cutoffs, fix i - cut + 1 jump --- Objects/stringlib/fastsearch.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index 0ff14a82a730da..04324c4854889f 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -294,7 +294,7 @@ STRINGLIB(_factorize)(const STRINGLIB_CHAR *needle, } #define SHIFT_TYPE uint8_t -#define MAX_SHIFT 255 +#define MAX_SHIFT UINT8_MAX #define TABLE_SIZE_BITS 6 #define TABLE_SIZE (1U << TABLE_SIZE_BITS) @@ -450,7 +450,7 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack, if (needle[i] != window[i]) { LOG("Late right half mismatch.\n"); assert(i - cut + 1 > gap); - window_last += gap; + window_last += i - cut + 1; goto windowloop; } } @@ -586,7 +586,7 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, /* process pattern[-1] outside the loop */ STRINGLIB_BLOOM_ADD(mask, p[mlast]); - if (m >= 100 && w >= 4000) { + if (m >= 100 && w >= 8000) { /* To ensure that we have good worst-case behavior, here's an adaptive version of the algorithm, where if we match O(m) characters without any matches of the From e5ef33bd9642cf5baf5fd759c5720a2fabac2aa6 Mon Sep 17 00:00:00 2001 From: sweeneyde Date: Sun, 11 Jul 2021 02:48:50 -0400 Subject: [PATCH 09/19] Refactor into smaller functions --- Objects/stringlib/fastsearch.h | 409 ++++++++++++++++++--------------- 1 file changed, 224 insertions(+), 185 deletions(-) diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index 7b8be5d6492157..8946cebf8251c8 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -176,7 +176,7 @@ STRINGLIB(rfind_char)(const STRINGLIB_CHAR* s, Py_ssize_t n, STRINGLIB_CHAR ch) # define LOG_STRING(s, n) #endif -Py_LOCAL_INLINE(Py_ssize_t) +static inline Py_ssize_t STRINGLIB(_lex_search)(const STRINGLIB_CHAR *needle, Py_ssize_t len_needle, Py_ssize_t *return_period, int invert_alphabet) { @@ -228,7 +228,7 @@ STRINGLIB(_lex_search)(const STRINGLIB_CHAR *needle, Py_ssize_t len_needle, return max_suffix; } -Py_LOCAL_INLINE(Py_ssize_t) +static Py_ssize_t STRINGLIB(_factorize)(const STRINGLIB_CHAR *needle, Py_ssize_t len_needle, Py_ssize_t *return_period) @@ -305,7 +305,7 @@ typedef struct STRINGLIB(_pre) { } STRINGLIB(prework); -Py_LOCAL_INLINE(void) +static void STRINGLIB(_preprocess)(const STRINGLIB_CHAR *needle, Py_ssize_t len_needle, STRINGLIB(prework) *p) { @@ -339,7 +339,7 @@ STRINGLIB(_preprocess)(const STRINGLIB_CHAR *needle, Py_ssize_t len_needle, } } -Py_LOCAL_INLINE(Py_ssize_t) +static Py_ssize_t STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack, STRINGLIB(prework) *p) { @@ -465,7 +465,8 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack, return -1; } -Py_LOCAL_INLINE(Py_ssize_t) + +static Py_ssize_t STRINGLIB(_two_way_find)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack, const STRINGLIB_CHAR *needle, @@ -477,7 +478,8 @@ STRINGLIB(_two_way_find)(const STRINGLIB_CHAR *haystack, return STRINGLIB(_two_way)(haystack, len_haystack, &p); } -Py_LOCAL_INLINE(Py_ssize_t) + +static Py_ssize_t STRINGLIB(_two_way_count)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack, const STRINGLIB_CHAR *needle, @@ -514,46 +516,235 @@ STRINGLIB(_two_way_count)(const STRINGLIB_CHAR *haystack, #undef LOG #undef LOG_STRING -Py_LOCAL_INLINE(Py_ssize_t) +static inline Py_ssize_t +STRINGLIB(default_find)(const STRINGLIB_CHAR* s, Py_ssize_t n, + const STRINGLIB_CHAR* p, Py_ssize_t m, + Py_ssize_t maxcount, int mode) +{ + const Py_ssize_t w = n - m; + Py_ssize_t mlast = m - 1, count = 0; + Py_ssize_t gap = mlast; + const STRINGLIB_CHAR last = p[mlast]; + const STRINGLIB_CHAR *const ss = &s[mlast]; + + unsigned long mask = 0; + for (Py_ssize_t i = 0; i < mlast; i++) { + STRINGLIB_BLOOM_ADD(mask, p[i]); + if (p[i] == last) { + gap = mlast - i; + } + } + STRINGLIB_BLOOM_ADD(mask, last); + + for (Py_ssize_t i = 0; i <= w; i++) { + if (ss[i] == last) { + /* candidate match */ + Py_ssize_t j; + for (j = 0; j < mlast; j++) { + if (s[i+j] != p[j]) { + break; + } + } + if (j == mlast) { + /* got a match! */ + if (mode != FAST_COUNT) { + return i; + } + count++; + if (count == maxcount) { + return maxcount; + } + i = i + mlast; + continue; + } + /* miss: check if next character is part of pattern */ + if (!STRINGLIB_BLOOM(mask, ss[i+1])) { + i = i + m; + } + else { + i = i + gap; + } + } + else { + /* skip: check if next character is part of pattern */ + if (!STRINGLIB_BLOOM(mask, ss[i+1])) { + i = i + m; + } + } + } + return mode == FAST_COUNT ? count : -1; +} + + +static Py_ssize_t +STRINGLIB(adaptive_find)(const STRINGLIB_CHAR* s, Py_ssize_t n, + const STRINGLIB_CHAR* p, Py_ssize_t m, + Py_ssize_t maxcount, int mode) +{ + const Py_ssize_t w = n - m; + Py_ssize_t mlast = m - 1, count = 0; + Py_ssize_t gap = mlast; + Py_ssize_t hits = 0, res; + const STRINGLIB_CHAR last = p[mlast]; + const STRINGLIB_CHAR *const ss = &s[mlast]; + + unsigned long mask = 0; + for (Py_ssize_t i = 0; i < mlast; i++) { + STRINGLIB_BLOOM_ADD(mask, p[i]); + if (p[i] == last) { + gap = mlast - i; + } + } + STRINGLIB_BLOOM_ADD(mask, last); + + for (Py_ssize_t i = 0; i <= w; i++) { + if (ss[i] == last) { + /* candidate match */ + Py_ssize_t j; + for (j = 0; j < mlast; j++) { + if (s[i+j] != p[j]) { + break; + } + } + if (j == mlast) { + /* got a match! */ + if (mode != FAST_COUNT) { + return i; + } + count++; + if (count == maxcount) { + return maxcount; + } + i = i + mlast; + continue; + } + hits += j + 1; + if (hits > m / 4 && w - i > 2000) { + if (mode == FAST_SEARCH) { + res = STRINGLIB(_two_way_find)(s + i, n - i, p, m); + return res == -1 ? -1 : res + i; + } + else { + res = STRINGLIB(_two_way_count)(s + i, n - i, p, m, + maxcount - count); + return res + count; + } + } + /* miss: check if next character is part of pattern */ + if (!STRINGLIB_BLOOM(mask, ss[i+1])) { + i = i + m; + } + else { + i = i + gap; + } + } + else { + /* skip: check if next character is part of pattern */ + if (!STRINGLIB_BLOOM(mask, ss[i+1])) { + i = i + m; + } + } + } + return mode == FAST_COUNT ? count : -1; +} + + +static Py_ssize_t +STRINGLIB(default_rfind)(const STRINGLIB_CHAR* s, Py_ssize_t n, + const STRINGLIB_CHAR* p, Py_ssize_t m, + Py_ssize_t maxcount, int mode) +{ + /* create compressed boyer-moore delta 1 table */ + unsigned long mask = 0; + Py_ssize_t i, j, mlast = m - 1, skip = m - 1, w = n - m; + + /* process pattern[0] outside the loop */ + STRINGLIB_BLOOM_ADD(mask, p[0]); + /* process pattern[:0:-1] */ + for (i = mlast; i > 0; i--) { + STRINGLIB_BLOOM_ADD(mask, p[i]); + if (p[i] == p[0]) { + skip = i - 1; + } + } + + for (i = w; i >= 0; i--) { + if (s[i] == p[0]) { + /* candidate match */ + for (j = mlast; j > 0; j--) { + if (s[i+j] != p[j]) { + break; + } + } + if (j == 0) { + /* got a match! */ + return i; + } + /* miss: check if previous character is part of pattern */ + if (i > 0 && !STRINGLIB_BLOOM(mask, s[i-1])) { + i = i - m; + } + else { + i = i - skip; + } + } + else { + /* skip: check if previous character is part of pattern */ + if (i > 0 && !STRINGLIB_BLOOM(mask, s[i-1])) { + i = i - m; + } + } + } + return -1; +} + + +static inline Py_ssize_t +STRINGLIB(count_char)(const STRINGLIB_CHAR *s, Py_ssize_t n, + const STRINGLIB_CHAR p0, Py_ssize_t maxcount) +{ + Py_ssize_t i, count = 0; + for (i = 0; i < n; i++) { + if (s[i] == p0) { + count++; + if (count == maxcount) { + return maxcount; + } + } + } + return count; +} + + +static inline Py_ssize_t FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, const STRINGLIB_CHAR* p, Py_ssize_t m, Py_ssize_t maxcount, int mode) { - unsigned long mask; - Py_ssize_t skip, count = 0; - Py_ssize_t i, j, mlast, w; - - w = n - m; - - if (w < 0 || (mode == FAST_COUNT && maxcount == 0)) + if (n < m || (mode == FAST_COUNT && maxcount == 0)) { return -1; + } /* look for special cases */ if (m <= 1) { - if (m <= 0) + if (m <= 0) { return -1; + } /* use special case for 1-character strings */ if (mode == FAST_SEARCH) return STRINGLIB(find_char)(s, n, p[0]); else if (mode == FAST_RSEARCH) return STRINGLIB(rfind_char)(s, n, p[0]); - else { /* FAST_COUNT */ - for (i = 0; i < n; i++) - if (s[i] == p[0]) { - count++; - if (count == maxcount) - return maxcount; - } - return count; + else { + return STRINGLIB(count_char)(s, n, p[0], maxcount); } } - mlast = m - 1; - skip = mlast; - mask = 0; - if (mode != FAST_RSEARCH) { - if (m >= 100 && w >= 2000 && w / m >= 5) { + if (n < 2000 || m < 6 || (m < 100 && n < 30000)) { + return STRINGLIB(default_find)(s, n, p, m, maxcount, mode); + } + else if (5 * m < n) { /* For larger problems where the needle isn't a huge percentage of the size of the haystack, the relatively expensive O(m) startup cost of the two-way algorithm @@ -565,170 +756,18 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, return STRINGLIB(_two_way_count)(s, n, p, m, maxcount); } } - const STRINGLIB_CHAR *ss = s + m - 1; - const STRINGLIB_CHAR *pp = p + m - 1; - - /* create compressed boyer-moore delta 1 table */ - - /* process pattern[:-1] */ - for (i = 0; i < mlast; i++) { - STRINGLIB_BLOOM_ADD(mask, p[i]); - if (p[i] == p[mlast]) { - skip = mlast - i - 1; - } - } - /* process pattern[-1] outside the loop */ - STRINGLIB_BLOOM_ADD(mask, p[mlast]); - - if (m >= 100 && w >= 8000) { + else { /* To ensure that we have good worst-case behavior, here's an adaptive version of the algorithm, where if we match O(m) characters without any matches of the entire needle, then we predict that the startup cost of the two-way algorithm will probably be worth it. */ - Py_ssize_t hits = 0; - for (i = 0; i <= w; i++) { - if (ss[i] == pp[0]) { - /* candidate match */ - for (j = 0; j < mlast; j++) { - if (s[i+j] != p[j]) { - break; - } - } - if (j == mlast) { - /* got a match! */ - if (mode != FAST_COUNT) { - return i; - } - count++; - if (count == maxcount) { - return maxcount; - } - i = i + mlast; - continue; - } - /* miss: check if next character is part of pattern */ - if (!STRINGLIB_BLOOM(mask, ss[i+1])) { - i = i + m; - } - else { - i = i + skip; - } - hits += j + 1; - if (hits >= m / 4 && i < w - 1000) { - /* We've done O(m) fruitless comparisons - anyway, so spend the O(m) cost on the - setup for the two-way algorithm. */ - Py_ssize_t res; - if (mode == FAST_COUNT) { - res = STRINGLIB(_two_way_count)( - s+i, n-i, p, m, maxcount-count); - return count + res; - } - else { - res = STRINGLIB(_two_way_find)(s+i, n-i, p, m); - if (res == -1) { - return -1; - } - return i + res; - } - } - } - else { - /* skip: check if next character is part of pattern */ - if (!STRINGLIB_BLOOM(mask, ss[i+1])) { - i = i + m; - } - } - } - if (mode != FAST_COUNT) { - return -1; - } - return count; - } - /* The standard, non-adaptive version of the algorithm. */ - for (i = 0; i <= w; i++) { - /* note: using mlast in the skip path slows things down on x86 */ - if (ss[i] == pp[0]) { - /* candidate match */ - for (j = 0; j < mlast; j++) { - if (s[i+j] != p[j]) { - break; - } - } - if (j == mlast) { - /* got a match! */ - if (mode != FAST_COUNT) { - return i; - } - count++; - if (count == maxcount) { - return maxcount; - } - i = i + mlast; - continue; - } - /* miss: check if next character is part of pattern */ - if (!STRINGLIB_BLOOM(mask, ss[i+1])) { - i = i + m; - } - else { - i = i + skip; - } - } - else { - /* skip: check if next character is part of pattern */ - if (!STRINGLIB_BLOOM(mask, ss[i+1])) { - i = i + m; - } - } + return STRINGLIB(adaptive_find)(s, n, p, m, maxcount, mode); } } - else { /* FAST_RSEARCH */ - - /* create compressed boyer-moore delta 1 table */ - - /* process pattern[0] outside the loop */ - STRINGLIB_BLOOM_ADD(mask, p[0]); - /* process pattern[:0:-1] */ - for (i = mlast; i > 0; i--) { - STRINGLIB_BLOOM_ADD(mask, p[i]); - if (p[i] == p[0]) { - skip = i - 1; - } - } - - for (i = w; i >= 0; i--) { - if (s[i] == p[0]) { - /* candidate match */ - for (j = mlast; j > 0; j--) { - if (s[i+j] != p[j]) { - break; - } - } - if (j == 0) { - /* got a match! */ - return i; - } - /* miss: check if previous character is part of pattern */ - if (i > 0 && !STRINGLIB_BLOOM(mask, s[i-1])) { - i = i - m; - } - else { - i = i - skip; - } - } - else { - /* skip: check if previous character is part of pattern */ - if (i > 0 && !STRINGLIB_BLOOM(mask, s[i-1])) { - i = i - m; - } - } - } + else { + /* FAST_RSEARCH */ + return STRINGLIB(default_rfind)(s, n, p, m, maxcount, mode); } - - if (mode != FAST_COUNT) - return -1; - return count; } From 6be0c32379205935cb907182c33b6f651b2eaf76 Mon Sep 17 00:00:00 2001 From: sweeneyde Date: Sun, 11 Jul 2021 19:22:29 -0400 Subject: [PATCH 10/19] tweak cutoffs --- Objects/stringlib/fastsearch.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index ab615a2ee7e69a..2881c64439d8ef 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -748,10 +748,10 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, } if (mode != FAST_RSEARCH) { - if (n < 2000 || (m < 100 && n < 30000) || m < 6) { + if (n < 3000 || (m < 100 && n < 30000) || m < 6) { return STRINGLIB(default_find)(s, n, p, m, maxcount, mode); } - else if (5 * m < n) { + else if (6 * m < n) { /* For larger problems where the needle isn't a huge percentage of the size of the haystack, the relatively expensive O(m) startup cost of the two-way algorithm From f9fe8073b449deca8b3c796d51c716d554ec84a3 Mon Sep 17 00:00:00 2001 From: "blurb-it[bot]" <43283697+blurb-it[bot]@users.noreply.github.com> Date: Mon, 12 Jul 2021 04:06:59 +0000 Subject: [PATCH 11/19] =?UTF-8?q?=F0=9F=93=9C=F0=9F=A4=96=20Added=20by=20b?= =?UTF-8?q?lurb=5Fit.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../Core and Builtins/2021-07-12-04-06-57.bpo-41972.nDX5k_.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 Misc/NEWS.d/next/Core and Builtins/2021-07-12-04-06-57.bpo-41972.nDX5k_.rst diff --git a/Misc/NEWS.d/next/Core and Builtins/2021-07-12-04-06-57.bpo-41972.nDX5k_.rst b/Misc/NEWS.d/next/Core and Builtins/2021-07-12-04-06-57.bpo-41972.nDX5k_.rst new file mode 100644 index 00000000000000..3daffb9c0e1dff --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2021-07-12-04-06-57.bpo-41972.nDX5k_.rst @@ -0,0 +1 @@ +Tuned the string-searching algorithm of fastsearch.h to have a shorter inner loop for most cases. \ No newline at end of file From 25f9b5692c931bc4bf9c0b1cdc73456690efffc4 Mon Sep 17 00:00:00 2001 From: sweeneyde Date: Mon, 12 Jul 2021 02:01:17 -0400 Subject: [PATCH 12/19] Fix sanitizer warnings --- Objects/stringlib/fastsearch.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index 2881c64439d8ef..791ba4fd4b5627 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -173,7 +173,7 @@ STRINGLIB(rfind_char)(const STRINGLIB_CHAR* s, Py_ssize_t n, STRINGLIB_CHAR ch) # define LOG_STRING(s, n) printf("\"%.*s\"", (int)(n), s) # define LOG_LINEUP() do { \ LOG("> "); LOG_STRING(haystack, len_haystack); LOG("\n> "); \ - LOG("%*s",(int)(window_last + 1 - len_needle - haystack), ""); \ + LOG("%*s",(int)(window_last - haystack + 1 - len_needle), ""); \ LOG_STRING(needle, len_needle); LOG("\n"); \ } while(0) #else @@ -297,7 +297,7 @@ STRINGLIB(_factorize)(const STRINGLIB_CHAR *needle, #define SHIFT_TYPE uint8_t #define MAX_SHIFT UINT8_MAX -#define TABLE_SIZE_BITS 6 +#define TABLE_SIZE_BITS 6u #define TABLE_SIZE (1U << TABLE_SIZE_BITS) #define TABLE_MASK (TABLE_SIZE - 1U) @@ -336,7 +336,8 @@ STRINGLIB(_preprocess)(const STRINGLIB_CHAR *needle, Py_ssize_t len_needle, p->gap = len_needle; STRINGLIB_CHAR last = needle[len_needle - 1] & TABLE_MASK; for (Py_ssize_t i = len_needle - 2; i >= 0; i--) { - if ((needle[i] & TABLE_MASK) == last) { + STRINGLIB_CHAR x = needle[i] & TABLE_MASK; + if (x == last) { p->gap = len_needle - 1 - i; break; } @@ -423,8 +424,6 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack, Py_ssize_t gap = p->gap; period = Py_MAX(gap, period); LOG("Needle is not periodic.\n"); - assert(cut < len_needle); - STRINGLIB_CHAR needle_cut = needle[cut]; Py_ssize_t gap_jump_end = Py_MIN(len_needle, cut + gap); windowloop: while (window_last < haystack_end) { From 4379ea689c268d40e4b6ea83a025469b1a8d8866 Mon Sep 17 00:00:00 2001 From: sweeneyde Date: Mon, 12 Jul 2021 03:31:17 -0400 Subject: [PATCH 13/19] Refactor for sanitizer --- Objects/stringlib/fastsearch.h | 56 ++++++++++++++++++---------------- 1 file changed, 29 insertions(+), 27 deletions(-) diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index 791ba4fd4b5627..51da9878006afe 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -376,29 +376,18 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack, Py_ssize_t memory = 0; periodicwindowloop: while (window_last < haystack_end) { + assert(memory == 0); LOG_LINEUP(); Py_ssize_t shift = table[(*window_last) & TABLE_MASK]; - if (shift > 0 && memory > 0) { - // A mismatch has been identified to the right of - // where i starts, so we can jump at least as far as - // if the mismatch occurred on the first comparison. - Py_ssize_t memory_shift = Py_MAX(cut, memory) - cut + 1; - LOG("Skip with Memory.\n"); - window_last += Py_MAX(shift, memory_shift); - LOG_LINEUP(); - memory = 0; - shift = table[(*window_last) & TABLE_MASK]; - } - while (shift > 0 && window_last < haystack_end) { + window_last += shift; + if (shift) { LOG("Horspool skip.\n"); - window_last += shift; - shift = table[(*window_last) & TABLE_MASK]; - LOG_LINEUP(); - } - if (window_last >= haystack_end) { - break; + continue; } + no_shift: const STRINGLIB_CHAR *const window = window_last - len_needle + 1; + assert((window[len_needle - 1] & TABLE_MASK) == + (needle[len_needle - 1] & TABLE_MASK)); Py_ssize_t i = Py_MAX(cut, memory); for (; i < len_needle; i++) { if (needle[i] != window[i]) { @@ -413,7 +402,22 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack, LOG("Left half does not match.\n"); window_last += period; memory = len_needle - period; - goto periodicwindowloop; + if (window_last >= haystack_end) { + return -1; + } + shift = table[(*window_last) & TABLE_MASK]; + if (shift) { + // A mismatch has been identified to the right + // of where i will next start, so we can jump + // at least as far as if the mismatch occurred + // on the first comparison. + Py_ssize_t mem_jump = Py_MAX(cut, memory) - cut + 1; + LOG("Skip with Memory.\n"); + memory = 0; + window_last += Py_MAX(shift, mem_jump); + goto periodicwindowloop; + } + goto no_shift; } } LOG("Found a match!\n"); @@ -429,16 +433,14 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack, while (window_last < haystack_end) { LOG_LINEUP(); Py_ssize_t shift = table[(*window_last) & TABLE_MASK]; - while (shift > 0 && window_last < haystack_end) { + window_last += shift; + if (shift) { LOG("Horspool skip.\n"); - window_last += shift; - shift = table[(*window_last) & TABLE_MASK]; - LOG_LINEUP(); - } - if (window_last >= haystack_end) { - break; + continue; } - const STRINGLIB_CHAR *window = window_last - len_needle + 1; + const STRINGLIB_CHAR *const window = window_last - len_needle + 1; + assert((window[len_needle - 1] & TABLE_MASK) == + (needle[len_needle - 1] & TABLE_MASK)); for (Py_ssize_t i = cut; i < gap_jump_end; i++) { if (needle[i] != window[i]) { LOG("Early right half mismatch: jump by gap.\n"); From e336b7909bc27c25fca6678bc71ae92a57904bd0 Mon Sep 17 00:00:00 2001 From: sweeneyde Date: Mon, 12 Jul 2021 03:57:54 -0400 Subject: [PATCH 14/19] Don't goto declarations --- Objects/stringlib/fastsearch.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index 51da9878006afe..d2225047345df9 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -369,6 +369,7 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack, const STRINGLIB_CHAR *window_last = haystack + len_needle - 1; const STRINGLIB_CHAR *const haystack_end = haystack + len_haystack; SHIFT_TYPE *table = p->table; + const STRINGLIB_CHAR *window; LOG("===== Two-way: \"%s\" in \"%s\". =====\n", needle, haystack); if (p->is_periodic) { @@ -385,7 +386,7 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack, continue; } no_shift: - const STRINGLIB_CHAR *const window = window_last - len_needle + 1; + window = window_last - len_needle + 1; assert((window[len_needle - 1] & TABLE_MASK) == (needle[len_needle - 1] & TABLE_MASK)); Py_ssize_t i = Py_MAX(cut, memory); @@ -438,7 +439,7 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack, LOG("Horspool skip.\n"); continue; } - const STRINGLIB_CHAR *const window = window_last - len_needle + 1; + window = window_last - len_needle + 1; assert((window[len_needle - 1] & TABLE_MASK) == (needle[len_needle - 1] & TABLE_MASK)); for (Py_ssize_t i = cut; i < gap_jump_end; i++) { From b8df3e13585b6a65c828c188ba33457390a60dd0 Mon Sep 17 00:00:00 2001 From: sweeneyde Date: Tue, 13 Jul 2021 12:43:02 -0400 Subject: [PATCH 15/19] Emphasize to the compiler that the loop is a loop --- Objects/stringlib/fastsearch.h | 36 ++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index d2225047345df9..86121fde7df503 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -378,12 +378,17 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack, periodicwindowloop: while (window_last < haystack_end) { assert(memory == 0); - LOG_LINEUP(); - Py_ssize_t shift = table[(*window_last) & TABLE_MASK]; - window_last += shift; - if (shift) { - LOG("Horspool skip.\n"); - continue; + for (;;) { + LOG_LINEUP(); + Py_ssize_t shift = table[(*window_last) & TABLE_MASK]; + window_last += shift; + if (shift == 0) { + break; + } + if (window_last >= haystack_end) { + return -1; + } + LOG("Horspool skip"); } no_shift: window = window_last - len_needle + 1; @@ -406,7 +411,7 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack, if (window_last >= haystack_end) { return -1; } - shift = table[(*window_last) & TABLE_MASK]; + Py_ssize_t shift = table[(*window_last) & TABLE_MASK]; if (shift) { // A mismatch has been identified to the right // of where i will next start, so we can jump @@ -432,12 +437,17 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack, Py_ssize_t gap_jump_end = Py_MIN(len_needle, cut + gap); windowloop: while (window_last < haystack_end) { - LOG_LINEUP(); - Py_ssize_t shift = table[(*window_last) & TABLE_MASK]; - window_last += shift; - if (shift) { - LOG("Horspool skip.\n"); - continue; + for (;;) { + LOG_LINEUP(); + Py_ssize_t shift = table[(*window_last) & TABLE_MASK]; + window_last += shift; + if (shift == 0) { + break; + } + if (window_last >= haystack_end) { + return -1; + } + LOG("Horspool skip"); } window = window_last - len_needle + 1; assert((window[len_needle - 1] & TABLE_MASK) == From 869f7d0079b3845e7d01b15b839c000354f0b899 Mon Sep 17 00:00:00 2001 From: sweeneyde Date: Thu, 15 Jul 2021 14:24:17 -0400 Subject: [PATCH 16/19] Use Py_LOCAL_INLINE --- Objects/stringlib/fastsearch.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index 86121fde7df503..b107ee908ee3b9 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -182,7 +182,7 @@ STRINGLIB(rfind_char)(const STRINGLIB_CHAR* s, Py_ssize_t n, STRINGLIB_CHAR ch) # define LOG_LINEUP() #endif -static inline Py_ssize_t +Py_LOCAL_INLINE(Py_ssize_t) STRINGLIB(_lex_search)(const STRINGLIB_CHAR *needle, Py_ssize_t len_needle, Py_ssize_t *return_period, int invert_alphabet) { @@ -234,7 +234,7 @@ STRINGLIB(_lex_search)(const STRINGLIB_CHAR *needle, Py_ssize_t len_needle, return max_suffix; } -static Py_ssize_t +Py_LOCAL_INLINE(Py_ssize_t) STRINGLIB(_factorize)(const STRINGLIB_CHAR *needle, Py_ssize_t len_needle, Py_ssize_t *return_period) From 747a39a2d1b0558f958137a917a7dbe0a5a5ed1e Mon Sep 17 00:00:00 2001 From: sweeneyde Date: Thu, 15 Jul 2021 14:37:06 -0400 Subject: [PATCH 17/19] Use Py_LOCAL_INLINE again --- Objects/stringlib/fastsearch.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index b107ee908ee3b9..111a15358591f6 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -735,7 +735,7 @@ STRINGLIB(count_char)(const STRINGLIB_CHAR *s, Py_ssize_t n, } -static inline Py_ssize_t +Py_LOCAL_INLINE(Py_ssize_t) FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, const STRINGLIB_CHAR* p, Py_ssize_t m, Py_ssize_t maxcount, int mode) From 72143efaade208f684c1801daa00a9c502f82ceb Mon Sep 17 00:00:00 2001 From: sweeneyde Date: Thu, 15 Jul 2021 23:22:25 -0400 Subject: [PATCH 18/19] Tweak thresholds based on most recent comparison. --- Objects/stringlib/fastsearch.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index 111a15358591f6..c276930b357bd6 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -760,10 +760,11 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, } if (mode != FAST_RSEARCH) { - if (n < 3000 || (m < 100 && n < 30000) || m < 6) { + if (n < 2500 || (m < 100 && n < 30000) || m < 6) { return STRINGLIB(default_find)(s, n, p, m, maxcount, mode); } - else if (6 * m < n) { + else if ((m >> 2) * 3 > (n >> 2)) { + /* 33% threshold, but don't overflow. */ /* For larger problems where the needle isn't a huge percentage of the size of the haystack, the relatively expensive O(m) startup cost of the two-way algorithm From 4d7d1022477a9b2855932ac665c022cc3e814566 Mon Sep 17 00:00:00 2001 From: sweeneyde Date: Thu, 15 Jul 2021 23:31:44 -0400 Subject: [PATCH 19/19] Fix greater/less than sign. --- Objects/stringlib/fastsearch.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index c276930b357bd6..b91082bd523cb6 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -763,7 +763,7 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n, if (n < 2500 || (m < 100 && n < 30000) || m < 6) { return STRINGLIB(default_find)(s, n, p, m, maxcount, mode); } - else if ((m >> 2) * 3 > (n >> 2)) { + else if ((m >> 2) * 3 < (n >> 2)) { /* 33% threshold, but don't overflow. */ /* For larger problems where the needle isn't a huge percentage of the size of the haystack, the relatively