From 087f13460a6b470314318b14ef966194c39c4b21 Mon Sep 17 00:00:00 2001
From: Thomas Wouters
Date: Mon, 14 Apr 2025 14:20:54 +0200
Subject: [PATCH 1/2] Only disable SLP autovectorization of
 `_PyEval_EvalFrameDefault` on newer GCCs, as the optimization bug seems to
 exist only on GCC 12 and later, and before GCC 9 disabling the optimization
 has a dramatic performance impact.

---
 Python/ceval.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/Python/ceval.c b/Python/ceval.c
index 47d068edac2743..c484bb8dfbbfa5 100644
--- a/Python/ceval.c
+++ b/Python/ceval.c
@@ -948,11 +948,14 @@ _PyObjectArray_Free(PyObject **array, PyObject **scratch)
 #include "generated_cases.c.h"
 #endif
 
-#if (defined(__GNUC__) && !defined(__clang__)) && defined(__x86_64__)
+#if (defined(__GNUC__) && __GNUC__ >= 10 && !defined(__clang__)) && defined(__x86_64__)
 /*
- * gh-129987: The SLP autovectorizer can cause poor code generation for opcode
- * dispatch, negating any benefit we get from vectorization elsewhere in the
- * interpreter loop.
+ * gh-129987: The SLP autovectorizer can cause poor code generation for
+ * opcode dispatch in some GCC versions (observed in GCCs 12 through 15),
+ * negating any benefit we get from vectorization elsewhere in the
+ * interpreter loop. Disabling it significantly affected older GCC versions
+ * (prior to GCC 9, 40% performance drop), so we have to selectively disable
+ * it.
  */
 #define DONT_SLP_VECTORIZE __attribute__((optimize ("no-tree-slp-vectorize")))
 #else

From fc53ffe466974b1f5e2fdf96552d69cf6b1174a2 Mon Sep 17 00:00:00 2001
From: Thomas Wouters
Date: Tue, 15 Apr 2025 11:14:24 +0200
Subject: [PATCH 2/2] Add comment linking to the likely culprit in GCC.

---
 Python/ceval.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/Python/ceval.c b/Python/ceval.c
index c484bb8dfbbfa5..8b6f8bf2e15f3a 100644
--- a/Python/ceval.c
+++ b/Python/ceval.c
@@ -951,7 +951,8 @@ _PyObjectArray_Free(PyObject **array, PyObject **scratch)
 #if (defined(__GNUC__) && __GNUC__ >= 10 && !defined(__clang__)) && defined(__x86_64__)
 /*
  * gh-129987: The SLP autovectorizer can cause poor code generation for
- * opcode dispatch in some GCC versions (observed in GCCs 12 through 15),
+ * opcode dispatch in some GCC versions (observed in GCCs 12 through 15,
+ * probably caused by https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115777),
  * negating any benefit we get from vectorization elsewhere in the
  * interpreter loop. Disabling it significantly affected older GCC versions
  * (prior to GCC 9, 40% performance drop), so we have to selectively disable