From 652d87cee0958b514d023221317a5179076ba340 Mon Sep 17 00:00:00 2001 From: Ma Lin Date: Wed, 30 Mar 2022 11:32:37 +0800 Subject: [PATCH 1/4] re: fix memory leak when a match is terminated by a signal Count the number of REPEAT when compiling a pattern, and allocate an array in `SRE_STATE`. At any time, each REPEAT has at most one active context, so one `SRE_REPEAT` array with a slot per REPEAT is sufficient. --- Lib/sre_compile.py | 27 ++++--- Lib/sre_constants.py | 2 +- Lib/test/test_re.py | 80 +++++++++---------- .../2022-03-30-15-50-46.bpo-23689.kQj4p0.rst | 2 + Modules/_sre.c | 29 +++++-- Modules/sre.h | 2 + Modules/sre_constants.h | 2 +- Modules/sre_lib.h | 58 +++++++------- 8 files changed, 114 insertions(+), 88 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2022-03-30-15-50-46.bpo-23689.kQj4p0.rst diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py index 0867200a59a230..d8364313d8fd79 100644 --- a/Lib/sre_compile.py +++ b/Lib/sre_compile.py @@ -20,6 +20,7 @@ _SUCCESS_CODES = {SUCCESS, FAILURE} _ASSERT_CODES = {ASSERT, ASSERT_NOT} _UNIT_CODES = _LITERAL_CODES | {ANY, IN} +_REPEAT_COUNT_OFFSET = 5 _REPEATING_CODES = { MIN_REPEAT: (REPEAT, MIN_UNTIL, MIN_REPEAT_ONE), @@ -147,6 +148,8 @@ def _compile(code, pattern, flags): skip = _len(code); emit(0) emit(av[0]) emit(av[1]) + emit(code[_REPEAT_COUNT_OFFSET]) # REPEAT index + code[_REPEAT_COUNT_OFFSET] += 1 # REPEAT count + 1 _compile(code, av[2], flags) emit(SUCCESS) code[skip] = _len(code) - skip @@ -155,6 +158,8 @@ def _compile(code, pattern, flags): skip = _len(code); emit(0) emit(av[0]) emit(av[1]) + emit(code[_REPEAT_COUNT_OFFSET]) # REPEAT index + code[_REPEAT_COUNT_OFFSET] += 1 # REPEAT count + 1 _compile(code, av[2], flags) code[skip] = _len(code) - skip emit(REPEATING_CODES[op][1]) @@ -551,7 +556,8 @@ def _compile_info(code, pattern, flags): if hi > MAXCODE: hi = MAXCODE if lo == 0: - code.extend([INFO, 4, 0, lo, hi]) + # INFO, skip, mask, lo, hi, repeat_count + code.extend([INFO, 5, 0, lo, hi, 0]) return # look for a 
literal prefix prefix = [] @@ -587,6 +593,9 @@ def _compile_info(code, pattern, flags): emit(MAXCODE) prefix = prefix[:MAXCODE] emit(min(hi, MAXCODE)) + # REPEAT count + assert len(code) == _REPEAT_COUNT_OFFSET + emit(0) # add literal prefix if prefix: emit(len(prefix)) # length @@ -721,11 +730,11 @@ def print_2(*args): i += 1 elif op in (REPEAT, REPEAT_ONE, MIN_REPEAT_ONE, POSSESSIVE_REPEAT, POSSESSIVE_REPEAT_ONE): - skip, min, max = code[i: i+3] + skip, min, max, repeat_index = code[i: i+4] if max == MAXREPEAT: max = 'MAXREPEAT' - print_(op, skip, min, max, to=i+skip) - dis_(i+3, i+skip) + print_(op, skip, min, max, repeat_index, to=i+skip) + dis_(i+4, i+skip) i += skip elif op is GROUPREF_EXISTS: arg, skip = code[i: i+2] @@ -742,15 +751,15 @@ def print_2(*args): dis_(i+1, i+skip) i += skip elif op is INFO: - skip, flags, min, max = code[i: i+4] + skip, flags, min, max, repeat_count = code[i: i+5] if max == MAXREPEAT: max = 'MAXREPEAT' - print_(op, skip, bin(flags), min, max, to=i+skip) - start = i+4 + print_(op, skip, bin(flags), min, max, repeat_count, to=i+skip) + start = i+5 if flags & SRE_INFO_PREFIX: - prefix_len, prefix_skip = code[i+4: i+6] + prefix_len, prefix_skip = code[i+5: i+7] print_2(' prefix_skip', prefix_skip) - start = i + 6 + start = i + 7 prefix = code[start: start+prefix_len] print_2(' prefix', '[%s]' % ', '.join('%#02x' % x for x in prefix), diff --git a/Lib/sre_constants.py b/Lib/sre_constants.py index a00b0170607b59..d2f547340209fe 100644 --- a/Lib/sre_constants.py +++ b/Lib/sre_constants.py @@ -13,7 +13,7 @@ # update when constants are added or removed -MAGIC = 20220318 +MAGIC = 20220330 from _sre import MAXREPEAT, MAXGROUPS diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index 85716fbe2a8e8d..36863783fdbd26 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -2264,30 +2264,30 @@ def test_debug_flag(self): LITERAL 58 LITERAL 32 - 0. INFO 8 0b1 2 5 (to 9) + 0. 
INFO 9 0b1 2 5 0 (to 10) prefix_skip 0 prefix [0x2e] ('.') overlap [0] - 9: MARK 0 -11. LITERAL 0x2e ('.') -13. MARK 1 -15. BRANCH 10 (to 26) -17. IN 6 (to 24) -19. LITERAL 0x63 ('c') -21. LITERAL 0x68 ('h') -23. FAILURE -24: JUMP 9 (to 34) -26: branch 7 (to 33) -27. LITERAL 0x70 ('p') -29. LITERAL 0x79 ('y') -31. JUMP 2 (to 34) -33: FAILURE -34: GROUPREF_EXISTS 0 6 (to 41) -37. AT END -39. JUMP 5 (to 45) -41: LITERAL 0x3a (':') -43. LITERAL 0x20 (' ') -45: SUCCESS +10: MARK 0 +12. LITERAL 0x2e ('.') +14. MARK 1 +16. BRANCH 10 (to 27) +18. IN 6 (to 25) +20. LITERAL 0x63 ('c') +22. LITERAL 0x68 ('h') +24. FAILURE +25: JUMP 9 (to 35) +27: branch 7 (to 34) +28. LITERAL 0x70 ('p') +30. LITERAL 0x79 ('y') +32. JUMP 2 (to 35) +34: FAILURE +35: GROUPREF_EXISTS 0 6 (to 42) +38. AT END +40. JUMP 5 (to 46) +42: LITERAL 0x3a (':') +44. LITERAL 0x20 (' ') +46: SUCCESS ''' self.assertEqual(get_debug_out(pat), dump) # Debug output is output again even a second time (bypassing @@ -2298,14 +2298,14 @@ def test_atomic_group(self): self.assertEqual(get_debug_out(r'(?>ab?)'), '''\ ATOMIC_GROUP [(LITERAL, 97), (MAX_REPEAT, (0, 1, [(LITERAL, 98)]))] - 0. INFO 4 0b0 1 2 (to 5) - 5: ATOMIC_GROUP 11 (to 17) - 7. LITERAL 0x61 ('a') - 9. REPEAT_ONE 6 0 1 (to 16) -13. LITERAL 0x62 ('b') -15. SUCCESS -16: SUCCESS -17: SUCCESS + 0. INFO 5 0b0 1 2 1 (to 6) + 6: ATOMIC_GROUP 12 (to 19) + 8. LITERAL 0x61 ('a') +10. REPEAT_ONE 7 0 1 0 (to 18) +15. LITERAL 0x62 ('b') +17. SUCCESS +18: SUCCESS +19: SUCCESS ''') def test_possesive_repeat_one(self): @@ -2313,11 +2313,11 @@ def test_possesive_repeat_one(self): POSSESSIVE_REPEAT 0 1 LITERAL 97 - 0. INFO 4 0b0 0 1 (to 5) - 5: POSSESSIVE_REPEAT_ONE 6 0 1 (to 12) - 9. LITERAL 0x61 ('a') -11. SUCCESS -12: SUCCESS + 0. INFO 5 0b0 0 1 1 (to 6) + 6: POSSESSIVE_REPEAT_ONE 7 0 1 0 (to 14) +11. LITERAL 0x61 ('a') +13. SUCCESS +14: SUCCESS ''') def test_possesive_repeat(self): @@ -2326,12 +2326,12 @@ def test_possesive_repeat(self): LITERAL 97 LITERAL 98 - 0. 
INFO 4 0b0 0 2 (to 5) - 5: POSSESSIVE_REPEAT 7 0 1 (to 13) - 9. LITERAL 0x61 ('a') -11. LITERAL 0x62 ('b') -13: SUCCESS -14. SUCCESS + 0. INFO 5 0b0 0 2 1 (to 6) + 6: POSSESSIVE_REPEAT 8 0 1 0 (to 15) +11. LITERAL 0x61 ('a') +13. LITERAL 0x62 ('b') +15: SUCCESS +16. SUCCESS ''') diff --git a/Misc/NEWS.d/next/Library/2022-03-30-15-50-46.bpo-23689.kQj4p0.rst b/Misc/NEWS.d/next/Library/2022-03-30-15-50-46.bpo-23689.kQj4p0.rst new file mode 100644 index 00000000000000..4acb602f3f7298 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2022-03-30-15-50-46.bpo-23689.kQj4p0.rst @@ -0,0 +1,2 @@ +:mod:`re` module: fix memory leak when a match is terminated by a signal. +Patch by Ma Lin. diff --git a/Modules/_sre.c b/Modules/_sre.c index 48193f82475a42..790c833a1d9d25 100644 --- a/Modules/_sre.c +++ b/Modules/_sre.c @@ -427,6 +427,13 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string, state->lastmark = -1; state->lastindex = -1; + state->repeats_array = PyMem_RawCalloc(pattern->code[5], + sizeof(SRE_REPEAT)); + if (!state->repeats_array) { + PyErr_NoMemory(); + goto err; + } + state->buffer.buf = NULL; ptr = getstring(string, &length, &isbytes, &charsize, &state->buffer); if (!ptr) @@ -476,6 +483,10 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string, safely casted to `void*`, see bpo-39943 for details. */ PyMem_Free((void*) state->mark); state->mark = NULL; + + PyMem_RawFree((void*) state->repeats_array); + state->repeats_array = NULL; + if (state->buffer.buf) PyBuffer_Release(&state->buffer); return NULL; @@ -490,6 +501,7 @@ state_fini(SRE_STATE* state) data_stack_dealloc(state); /* See above PyMem_Del for why we explicitly cast here. 
*/ PyMem_Free((void*) state->mark); + PyMem_RawFree((void*) state->repeats_array); state->mark = NULL; } @@ -1731,7 +1743,7 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups) case SRE_OP_INFO: { /* A minimal info field is - <1=skip> <2=flags> <3=min> <4=max>; + <1=skip> <2=flags> <3=min> <4=max> <5=repeat_count>; If SRE_INFO_PREFIX or SRE_INFO_CHARSET is in the flags, more follows. */ SRE_CODE flags, i; @@ -1739,8 +1751,9 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups) GET_SKIP; newcode = code+skip-1; GET_ARG; flags = arg; - GET_ARG; - GET_ARG; + GET_ARG; // min + GET_ARG; // max + GET_ARG; // repeat count /* Check that only valid flags are present */ if ((flags & ~(SRE_INFO_PREFIX | SRE_INFO_LITERAL | @@ -1821,13 +1834,14 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups) GET_SKIP; GET_ARG; min = arg; GET_ARG; max = arg; + GET_ARG; // repeat index if (min > max) FAIL; if (max > SRE_MAXREPEAT) FAIL; - if (!_validate_inner(code, code+skip-4, groups)) + if (!_validate_inner(code, code+skip-5, groups)) FAIL; - code += skip-4; + code += skip-5; GET_OP; if (op != SRE_OP_SUCCESS) FAIL; @@ -1841,13 +1855,14 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups) GET_SKIP; GET_ARG; min = arg; GET_ARG; max = arg; + GET_ARG; // repeat index if (min > max) FAIL; if (max > SRE_MAXREPEAT) FAIL; - if (!_validate_inner(code, code+skip-3, groups)) + if (!_validate_inner(code, code+skip-4, groups)) FAIL; - code += skip-3; + code += skip-4; GET_OP; if (op1 == SRE_OP_POSSESSIVE_REPEAT) { if (op != SRE_OP_SUCCESS) diff --git a/Modules/sre.h b/Modules/sre.h index 785adbd003e7fd..4d6886d5021943 100644 --- a/Modules/sre.h +++ b/Modules/sre.h @@ -83,6 +83,8 @@ typedef struct { size_t data_stack_base; /* current repeat context */ SRE_REPEAT *repeat; + /* repeat contexts array */ + SRE_REPEAT *repeats_array; } SRE_STATE; typedef struct { diff --git a/Modules/sre_constants.h b/Modules/sre_constants.h index 
8b9125b75b4568..2670e338b5d853 100644 --- a/Modules/sre_constants.h +++ b/Modules/sre_constants.h @@ -11,7 +11,7 @@ * See the _sre.c file for information on usage and redistribution. */ -#define SRE_MAGIC 20220318 +#define SRE_MAGIC 20220330 #define SRE_OP_FAILURE 0 #define SRE_OP_SUCCESS 1 #define SRE_OP_ANY 2 diff --git a/Modules/sre_lib.h b/Modules/sre_lib.h index 8e4e714eada389..a5ceb009e39c45 100644 --- a/Modules/sre_lib.h +++ b/Modules/sre_lib.h @@ -546,7 +546,8 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) if (ctx->pattern[0] == SRE_OP_INFO) { /* optimization info block */ - /* <1=skip> <2=flags> <3=min> ... */ + /* <1=skip> <2=flags> <3=min> <4=max> + <5=repeat_count> ... */ if (ctx->pattern[3] && (uintptr_t)(end - ctx->ptr) < ctx->pattern[3]) { TRACE(("reject (got %zd chars, need %zd)\n", end - ctx->ptr, (Py_ssize_t) ctx->pattern[3])); @@ -806,7 +807,7 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) collecting backtracking points. for other cases, use the MAX_REPEAT operator */ - /* <1=min> <2=max> item tail */ + /* <1=min> <2=max> <3=repeat_index> item tail */ TRACE(("|%p|%p|REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr, ctx->pattern[1], ctx->pattern[2])); @@ -816,7 +817,7 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) state->ptr = ctx->ptr; - ret = SRE(count)(state, ctx->pattern+3, ctx->pattern[2]); + ret = SRE(count)(state, ctx->pattern+4, ctx->pattern[2]); RETURN_ON_ERROR(ret); DATA_LOOKUP_AT(SRE(match_context), ctx, ctx_pos); ctx->count = ret; @@ -905,7 +906,7 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) collecting backtracking points. 
for other cases, use the MIN_REPEAT operator */ - /* <1=min> <2=max> item tail */ + /* <1=min> <2=max> <3=repeat_index> item tail */ TRACE(("|%p|%p|MIN_REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr, ctx->pattern[1], ctx->pattern[2])); @@ -919,7 +920,7 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) ctx->count = 0; else { /* count using pattern min as the maximum */ - ret = SRE(count)(state, ctx->pattern+3, ctx->pattern[1]); + ret = SRE(count)(state, ctx->pattern+4, ctx->pattern[1]); RETURN_ON_ERROR(ret); DATA_LOOKUP_AT(SRE(match_context), ctx, ctx_pos); if (ret < (Py_ssize_t) ctx->pattern[1]) @@ -961,7 +962,7 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) LASTMARK_RESTORE(); state->ptr = ctx->ptr; - ret = SRE(count)(state, ctx->pattern+3, 1); + ret = SRE(count)(state, ctx->pattern+4, 1); RETURN_ON_ERROR(ret); DATA_LOOKUP_AT(SRE(match_context), ctx, ctx_pos); if (ret == 0) @@ -984,7 +985,7 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) collecting backtracking points. for other cases, use the MAX_REPEAT operator */ - /* <1=min> <2=max> item + /* <1=min> <2=max> <3=repeat_index> item tail */ TRACE(("|%p|%p|POSSESSIVE_REPEAT_ONE %d %d\n", ctx->pattern, @@ -996,7 +997,7 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) state->ptr = ctx->ptr; - ret = SRE(count)(state, ctx->pattern + 3, ctx->pattern[2]); + ret = SRE(count)(state, ctx->pattern + 4, ctx->pattern[2]); RETURN_ON_ERROR(ret); DATA_LOOKUP_AT(SRE(match_context), ctx, ctx_pos); ctx->count = ret; @@ -1032,16 +1033,13 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) case SRE_OP_REPEAT: /* create repeat context. 
all the hard work is done by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */ - /* <1=min> <2=max> item tail */ + /* <1=min> <2=max> <3=repeat_index> item tail */ TRACE(("|%p|%p|REPEAT %d %d\n", ctx->pattern, ctx->ptr, ctx->pattern[1], ctx->pattern[2])); - /* install new repeat context */ - ctx->u.rep = (SRE_REPEAT*) PyObject_Malloc(sizeof(*ctx->u.rep)); - if (!ctx->u.rep) { - PyErr_NoMemory(); - RETURN_FAILURE; - } + /* install repeat context */ + ctx->u.rep = &state->repeats_array[ctx->pattern[3]]; + ctx->u.rep->count = -1; ctx->u.rep->pattern = ctx->pattern; ctx->u.rep->prev = state->repeat; @@ -1051,7 +1049,6 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) state->ptr = ctx->ptr; DO_JUMP(JUMP_REPEAT, jump_repeat, ctx->pattern+ctx->pattern[0]); state->repeat = ctx->u.rep->prev; - PyObject_Free(ctx->u.rep); if (ret) { RETURN_ON_ERROR(ret); @@ -1061,7 +1058,7 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) case SRE_OP_MAX_UNTIL: /* maximizing repeat */ - /* <1=min> <2=max> item tail */ + /* <1=min> <2=max> <3=repeat_index> item tail */ /* FIXME: we probably need to deal with zero-width matches in here... 
*/ @@ -1081,7 +1078,7 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) /* not enough matches */ ctx->u.rep->count = ctx->count; DO_JUMP(JUMP_MAX_UNTIL_1, jump_max_until_1, - ctx->u.rep->pattern+3); + ctx->u.rep->pattern+4); if (ret) { RETURN_ON_ERROR(ret); RETURN_SUCCESS; @@ -1103,7 +1100,7 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) DATA_PUSH(&ctx->u.rep->last_ptr); ctx->u.rep->last_ptr = state->ptr; DO_JUMP(JUMP_MAX_UNTIL_2, jump_max_until_2, - ctx->u.rep->pattern+3); + ctx->u.rep->pattern+4); DATA_POP(&ctx->u.rep->last_ptr); if (ret) { MARK_POP_DISCARD(ctx->lastmark); @@ -1128,7 +1125,7 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) case SRE_OP_MIN_UNTIL: /* minimizing repeat */ - /* <1=min> <2=max> item tail */ + /* <1=min> <2=max> <3=repeat_index> item tail */ ctx->u.rep = state->repeat; if (!ctx->u.rep) @@ -1145,7 +1142,7 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) /* not enough matches */ ctx->u.rep->count = ctx->count; DO_JUMP(JUMP_MIN_UNTIL_1, jump_min_until_1, - ctx->u.rep->pattern+3); + ctx->u.rep->pattern+4); if (ret) { RETURN_ON_ERROR(ret); RETURN_SUCCESS; @@ -1188,7 +1185,7 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) DATA_PUSH(&ctx->u.rep->last_ptr); ctx->u.rep->last_ptr = state->ptr; DO_JUMP(JUMP_MIN_UNTIL_3,jump_min_until_3, - ctx->u.rep->pattern+3); + ctx->u.rep->pattern+4); DATA_POP(&ctx->u.rep->last_ptr); if (ret) { RETURN_ON_ERROR(ret); @@ -1200,7 +1197,7 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) case SRE_OP_POSSESSIVE_REPEAT: /* create possessive repeat contexts. 
*/ - /* <1=min> <2=max> pattern + /* <1=min> <2=max> <3=repeat_index> pattern tail */ TRACE(("|%p|%p|POSSESSIVE_REPEAT %d %d\n", ctx->pattern, ctx->ptr, ctx->pattern[1], ctx->pattern[2])); @@ -1216,7 +1213,7 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) while (ctx->count < (Py_ssize_t)ctx->pattern[1]) { /* not enough matches */ DO_JUMP(JUMP_POSS_REPEAT_1, jump_poss_repeat_1, - &ctx->pattern[3]); + &ctx->pattern[4]); if (ret) { RETURN_ON_ERROR(ret); ctx->count++; @@ -1263,7 +1260,7 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) /* We have not reached the maximin matches, so try to match once more. */ DO_JUMP(JUMP_POSS_REPEAT_2, jump_poss_repeat_2, - &ctx->pattern[3]); + &ctx->pattern[4]); /* Check to see if the last attempted match succeeded. */ @@ -1593,7 +1590,8 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern) if (pattern[0] == SRE_OP_INFO) { /* optimization info block */ - /* <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */ + /* <1=skip> <2=flags> <3=min> <4=max> + <5=repeat_count> <6=prefix info> */ flags = pattern[2]; @@ -1613,14 +1611,14 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern) if (flags & SRE_INFO_PREFIX) { /* pattern starts with a known prefix */ /* */ - prefix_len = pattern[5]; - prefix_skip = pattern[6]; - prefix = pattern + 7; + prefix_len = pattern[6]; + prefix_skip = pattern[7]; + prefix = pattern + 8; overlap = prefix + prefix_len - 1; } else if (flags & SRE_INFO_CHARSET) /* pattern starts with a character from a known set */ /* */ - charset = pattern + 5; + charset = pattern + 6; pattern += 1 + pattern[1]; } From d2b505cb192b0ae354833397e6f3b3d5baa61e7d Mon Sep 17 00:00:00 2001 From: Ma Lin Date: Wed, 30 Mar 2022 18:14:03 +0800 Subject: [PATCH 2/4] don't emit repeat_index for REPEAT_ONE, MIN_REPEAT_ONE, POSSESSIVE_REPEAT_ONE --- Lib/sre_compile.py | 12 ++++++++---- Lib/test/test_re.py | 24 ++++++++++++------------ Modules/_sre.c | 5 ++--- Modules/sre_lib.h | 14 +++++++------- 4 files 
changed, 29 insertions(+), 26 deletions(-) diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py index d8364313d8fd79..6c5a2d1ccacbb6 100644 --- a/Lib/sre_compile.py +++ b/Lib/sre_compile.py @@ -148,8 +148,6 @@ def _compile(code, pattern, flags): skip = _len(code); emit(0) emit(av[0]) emit(av[1]) - emit(code[_REPEAT_COUNT_OFFSET]) # REPEAT index - code[_REPEAT_COUNT_OFFSET] += 1 # REPEAT count + 1 _compile(code, av[2], flags) emit(SUCCESS) code[skip] = _len(code) - skip @@ -728,14 +726,20 @@ def print_2(*args): else: print_(FAILURE) i += 1 - elif op in (REPEAT, REPEAT_ONE, MIN_REPEAT_ONE, - POSSESSIVE_REPEAT, POSSESSIVE_REPEAT_ONE): + elif op in (REPEAT, POSSESSIVE_REPEAT): skip, min, max, repeat_index = code[i: i+4] if max == MAXREPEAT: max = 'MAXREPEAT' print_(op, skip, min, max, repeat_index, to=i+skip) dis_(i+4, i+skip) i += skip + elif op in (REPEAT_ONE, MIN_REPEAT_ONE, POSSESSIVE_REPEAT_ONE): + skip, min, max = code[i: i+3] + if max == MAXREPEAT: + max = 'MAXREPEAT' + print_(op, skip, min, max, to=i+skip) + dis_(i+3, i+skip) + i += skip elif op is GROUPREF_EXISTS: arg, skip = code[i: i+2] print_(op, arg, skip, to=i+skip) diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index 36863783fdbd26..1428f83615ab7b 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -2298,14 +2298,14 @@ def test_atomic_group(self): self.assertEqual(get_debug_out(r'(?>ab?)'), '''\ ATOMIC_GROUP [(LITERAL, 97), (MAX_REPEAT, (0, 1, [(LITERAL, 98)]))] - 0. INFO 5 0b0 1 2 1 (to 6) - 6: ATOMIC_GROUP 12 (to 19) + 0. INFO 5 0b0 1 2 0 (to 6) + 6: ATOMIC_GROUP 11 (to 18) 8. LITERAL 0x61 ('a') -10. REPEAT_ONE 7 0 1 0 (to 18) -15. LITERAL 0x62 ('b') -17. SUCCESS -18: SUCCESS -19: SUCCESS +10. REPEAT_ONE 6 0 1 (to 17) +14. LITERAL 0x62 ('b') +16. SUCCESS +17: SUCCESS +18: SUCCESS ''') def test_possesive_repeat_one(self): @@ -2313,11 +2313,11 @@ def test_possesive_repeat_one(self): POSSESSIVE_REPEAT 0 1 LITERAL 97 - 0. 
INFO 5 0b0 0 1 1 (to 6) - 6: POSSESSIVE_REPEAT_ONE 7 0 1 0 (to 14) -11. LITERAL 0x61 ('a') -13. SUCCESS -14: SUCCESS + 0. INFO 5 0b0 0 1 0 (to 6) + 6: POSSESSIVE_REPEAT_ONE 6 0 1 (to 13) +10. LITERAL 0x61 ('a') +12. SUCCESS +13: SUCCESS ''') def test_possesive_repeat(self): diff --git a/Modules/_sre.c b/Modules/_sre.c index 790c833a1d9d25..c8e062763e592f 100644 --- a/Modules/_sre.c +++ b/Modules/_sre.c @@ -1834,14 +1834,13 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups) GET_SKIP; GET_ARG; min = arg; GET_ARG; max = arg; - GET_ARG; // repeat index if (min > max) FAIL; if (max > SRE_MAXREPEAT) FAIL; - if (!_validate_inner(code, code+skip-5, groups)) + if (!_validate_inner(code, code+skip-4, groups)) FAIL; - code += skip-5; + code += skip-4; GET_OP; if (op != SRE_OP_SUCCESS) FAIL; diff --git a/Modules/sre_lib.h b/Modules/sre_lib.h index a5ceb009e39c45..1d2374cb0dcdf1 100644 --- a/Modules/sre_lib.h +++ b/Modules/sre_lib.h @@ -807,7 +807,7 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) collecting backtracking points. for other cases, use the MAX_REPEAT operator */ - /* <1=min> <2=max> <3=repeat_index> item tail */ + /* <1=min> <2=max> item tail */ TRACE(("|%p|%p|REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr, ctx->pattern[1], ctx->pattern[2])); @@ -817,7 +817,7 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) state->ptr = ctx->ptr; - ret = SRE(count)(state, ctx->pattern+4, ctx->pattern[2]); + ret = SRE(count)(state, ctx->pattern+3, ctx->pattern[2]); RETURN_ON_ERROR(ret); DATA_LOOKUP_AT(SRE(match_context), ctx, ctx_pos); ctx->count = ret; @@ -906,7 +906,7 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) collecting backtracking points. 
for other cases, use the MIN_REPEAT operator */ - /* <1=min> <2=max> <3=repeat_index> item tail */ + /* <1=min> <2=max> item tail */ TRACE(("|%p|%p|MIN_REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr, ctx->pattern[1], ctx->pattern[2])); @@ -920,7 +920,7 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) ctx->count = 0; else { /* count using pattern min as the maximum */ - ret = SRE(count)(state, ctx->pattern+4, ctx->pattern[1]); + ret = SRE(count)(state, ctx->pattern+3, ctx->pattern[1]); RETURN_ON_ERROR(ret); DATA_LOOKUP_AT(SRE(match_context), ctx, ctx_pos); if (ret < (Py_ssize_t) ctx->pattern[1]) @@ -962,7 +962,7 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) LASTMARK_RESTORE(); state->ptr = ctx->ptr; - ret = SRE(count)(state, ctx->pattern+4, 1); + ret = SRE(count)(state, ctx->pattern+3, 1); RETURN_ON_ERROR(ret); DATA_LOOKUP_AT(SRE(match_context), ctx, ctx_pos); if (ret == 0) @@ -985,7 +985,7 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) collecting backtracking points. 
for other cases, use the MAX_REPEAT operator */ - /* <1=min> <2=max> <3=repeat_index> item + /* <1=min> <2=max> item tail */ TRACE(("|%p|%p|POSSESSIVE_REPEAT_ONE %d %d\n", ctx->pattern, @@ -997,7 +997,7 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) state->ptr = ctx->ptr; - ret = SRE(count)(state, ctx->pattern + 4, ctx->pattern[2]); + ret = SRE(count)(state, ctx->pattern + 3, ctx->pattern[2]); RETURN_ON_ERROR(ret); DATA_LOOKUP_AT(SRE(match_context), ctx, ctx_pos); ctx->count = ret; From adedb2417895b4ba44391869a18b4dea27e4d311 Mon Sep 17 00:00:00 2001 From: Ma Lin Date: Wed, 30 Mar 2022 18:17:28 +0800 Subject: [PATCH 3/4] print repeat_index in TRACE() --- Modules/sre_lib.h | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/Modules/sre_lib.h b/Modules/sre_lib.h index 1d2374cb0dcdf1..8110760476f935 100644 --- a/Modules/sre_lib.h +++ b/Modules/sre_lib.h @@ -1033,9 +1033,10 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) case SRE_OP_REPEAT: /* create repeat context. all the hard work is done by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */ - /* <1=min> <2=max> <3=repeat_index> item tail */ - TRACE(("|%p|%p|REPEAT %d %d\n", ctx->pattern, ctx->ptr, - ctx->pattern[1], ctx->pattern[2])); + /* <1=min> <2=max> + <3=repeat_index> item tail */ + TRACE(("|%p|%p|REPEAT %d %d %d\n", ctx->pattern, ctx->ptr, + ctx->pattern[1], ctx->pattern[2], ctx->pattern[3])); /* install repeat context */ ctx->u.rep = &state->repeats_array[ctx->pattern[3]]; @@ -1058,7 +1059,8 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) case SRE_OP_MAX_UNTIL: /* maximizing repeat */ - /* <1=min> <2=max> <3=repeat_index> item tail */ + /* <1=min> <2=max> + <3=repeat_index> item tail */ /* FIXME: we probably need to deal with zero-width matches in here... 
*/ @@ -1125,7 +1127,8 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) case SRE_OP_MIN_UNTIL: /* minimizing repeat */ - /* <1=min> <2=max> <3=repeat_index> item tail */ + /* <1=min> <2=max> + <3=repeat_index> item tail */ ctx->u.rep = state->repeat; if (!ctx->u.rep) @@ -1197,10 +1200,10 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) case SRE_OP_POSSESSIVE_REPEAT: /* create possessive repeat contexts. */ - /* <1=min> <2=max> <3=repeat_index> pattern - tail */ - TRACE(("|%p|%p|POSSESSIVE_REPEAT %d %d\n", ctx->pattern, - ctx->ptr, ctx->pattern[1], ctx->pattern[2])); + /* <1=min> <2=max> + <3=repeat_index> pattern tail */ + TRACE(("|%p|%p|POSSESSIVE_REPEAT %d %d %d\n", ctx->pattern, + ctx->ptr, ctx->pattern[1], ctx->pattern[2], ctx->pattern[3])); /* Set the global Input pointer to this context's Input pointer */ From e8567e17efa8e1178790f79edc94d569d92e8994 Mon Sep 17 00:00:00 2001 From: Ma Lin Date: Wed, 30 Mar 2022 18:23:30 +0800 Subject: [PATCH 4/4] use PyMem_New instead of PyMem_RawCalloc Same as the code above here, maybe a bit more cache friendly. 
--- Modules/_sre.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/Modules/_sre.c b/Modules/_sre.c index c8e062763e592f..b17adccee2916d 100644 --- a/Modules/_sre.c +++ b/Modules/_sre.c @@ -427,8 +427,7 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string, state->lastmark = -1; state->lastindex = -1; - state->repeats_array = PyMem_RawCalloc(pattern->code[5], - sizeof(SRE_REPEAT)); + state->repeats_array = PyMem_New(SRE_REPEAT, pattern->code[5]); if (!state->repeats_array) { PyErr_NoMemory(); goto err; @@ -484,7 +483,7 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string, PyMem_Free((void*) state->mark); state->mark = NULL; - PyMem_RawFree((void*) state->repeats_array); + PyMem_Free((void*) state->repeats_array); state->repeats_array = NULL; if (state->buffer.buf) @@ -501,7 +500,7 @@ state_fini(SRE_STATE* state) data_stack_dealloc(state); /* See above PyMem_Del for why we explicitly cast here. */ PyMem_Free((void*) state->mark); - PyMem_RawFree((void*) state->repeats_array); + PyMem_Free((void*) state->repeats_array); state->mark = NULL; }