From 6651884fd3c77d62f6d3f87bd84791b59acad681 Mon Sep 17 00:00:00 2001 From: Dino Viehland Date: Wed, 10 Jan 2024 09:39:25 -0800 Subject: [PATCH 1/5] Make type cache thread safe with sequence lock --- Include/internal/pycore_lock.h | 1 + Include/internal/pycore_typeobject.h | 5 +- Objects/typeobject.c | 119 +++++++++++++++++++++++++-- Python/lock.c | 2 +- Python/pystate.c | 2 + 5 files changed, 118 insertions(+), 11 deletions(-) diff --git a/Include/internal/pycore_lock.h b/Include/internal/pycore_lock.h index 18a8896d97a548..63c7376f9027bd 100644 --- a/Include/internal/pycore_lock.h +++ b/Include/internal/pycore_lock.h @@ -251,6 +251,7 @@ PyAPI_FUNC(void) _PyRWMutex_RUnlock(_PyRWMutex *rwmutex); PyAPI_FUNC(void) _PyRWMutex_Lock(_PyRWMutex *rwmutex); PyAPI_FUNC(void) _PyRWMutex_Unlock(_PyRWMutex *rwmutex); +void _Py_yield(void); #ifdef __cplusplus } diff --git a/Include/internal/pycore_typeobject.h b/Include/internal/pycore_typeobject.h index c03c3d766bef61..d2f5d4077f33a7 100644 --- a/Include/internal/pycore_typeobject.h +++ b/Include/internal/pycore_typeobject.h @@ -28,6 +28,9 @@ struct _types_runtime_state { // see _PyType_Lookup(). struct type_cache_entry { unsigned int version; // initialized from type->tp_version_tag +#ifdef Py_GIL_DISABLED + int sequence; +#endif PyObject *name; // reference to exactly a str or None PyObject *value; // borrowed reference or NULL }; @@ -74,7 +77,7 @@ struct types_state { extern PyStatus _PyTypes_InitTypes(PyInterpreterState *); extern void _PyTypes_FiniTypes(PyInterpreterState *); extern void _PyTypes_Fini(PyInterpreterState *); - +void _PyTypes_AfterFork(void); /* other API */ diff --git a/Objects/typeobject.c b/Objects/typeobject.c index c65d0ec2acae52..bcdace54cf1f33 100644 --- a/Objects/typeobject.c +++ b/Objects/typeobject.c @@ -4765,6 +4765,82 @@ is_dunder_name(PyObject *name) return 0; } +static void +update_cache(struct type_cache_entry *entry, PyObject *name, unsigned int version_tag, PyObject *value) +{ + entry->version = version_tag; + entry->value = value; /* borrowed */ + assert(_PyASCIIObject_CAST(name)->hash != -1); + OBJECT_STAT_INC_COND(type_cache_collisions, entry->name != Py_None && entry->name != name); + Py_SETREF(entry->name, Py_NewRef(name)); +} + +#if Py_GIL_DISABLED + +#define TYPE_CACHE_IS_UPDATING(sequence) (sequence & 0x01) + +static void +update_cache_gil_disabled(struct type_cache_entry *entry, PyObject *name, + unsigned int version_tag, PyObject *value) +{ + // Similar to linux seqlock: https://en.wikipedia.org/wiki/Seqlock + // We use a sequence number to lock the writer, an even sequence means we're unlocked, an odd + // sequence means we're locked. + // Differs a little bit in that we use CAS on sequence as the lock, instead of a seperate spin lock. + // If our writer detects that another thread has already done the same write we'll just bail + // and restore the previous sequence number without doing any updates. 
+ + // lock the entry by setting by moving to an odd sequence number + int prev = entry->sequence; + while (1) { + if (TYPE_CACHE_IS_UPDATING(prev)) { + // Someone else is currently updating the cache + _Py_yield(); + prev = _Py_atomic_load_int32_relaxed(&entry->sequence); + } else if(_Py_atomic_compare_exchange_int32(&entry->sequence, &prev, prev + 1)) { + // We've locked the cache + break; + } else { + _Py_yield(); + } + } + + // update the entry + if (entry->name == name && + entry->value == value && + entry->version == version_tag) { + // We reaced with another update, bail and restore previous sequence. + _Py_atomic_exchange_int32(&entry->sequence, prev); + return; + } + + update_cache(entry, name, version_tag, value); + + // Then update sequence to the next valid value + int new_sequence = prev + 2; + assert(!TYPE_CACHE_IS_UPDATING(new_sequence)); + _Py_atomic_exchange_int32(&entry->sequence, new_sequence); +} + +#endif + +void _PyTypes_AfterFork() { +#ifdef Py_GIL_DISABLED + struct type_cache *cache = get_type_cache(); + for (int i = 0; i < 1 << MCACHE_SIZE_EXP; i++) { + struct type_cache_entry *entry = &cache->hashtable[i]; + int sequence = _Py_atomic_load_int_acquire(&entry->sequence); + if (TYPE_CACHE_IS_UPDATING(sequence)) { + // Entry was in the process of updating while forking, clear it... + entry->sequence = 0; + entry->value = NULL; + entry->name = NULL; + entry->version = 0; + } + } +#endif +} + /* Internal API to look for a name through the MRO. This returns a borrowed reference, and doesn't set an exception! */ PyObject * @@ -4777,13 +4853,40 @@ _PyType_Lookup(PyTypeObject *type, PyObject *name) unsigned int h = MCACHE_HASH_METHOD(type, name); struct type_cache *cache = get_type_cache(); struct type_cache_entry *entry = &cache->hashtable[h]; +#ifdef Py_GIL_DISABLED + // synchronize-with other writing threads by doing an acquire load on the sequence + while (1) { + int sequence = _Py_atomic_load_int_acquire(&entry->sequence); + if (!TYPE_CACHE_IS_UPDATING(sequence)) { + if (_Py_atomic_load_uint32_relaxed(&entry->version) == type->tp_version_tag && + _Py_atomic_load_ptr_relaxed(&entry->name) == name) { + assert(_PyType_HasFeature(type, Py_TPFLAGS_VALID_VERSION_TAG)); + OBJECT_STAT_INC_COND(type_cache_hits, !is_dunder_name(name)); + OBJECT_STAT_INC_COND(type_cache_dunder_hits, is_dunder_name(name)); + PyObject *value = _Py_atomic_load_ptr_relaxed(&entry->value); + + // Synchronize again and validate that the entry hasn't been updated + // while we were readying the values. 
+ if (_Py_atomic_load_int_acquire(&entry->sequence) == sequence) { + return value; + } + } else { + // Cache miss + break; + } + } + // We are in progress of updating the cache, retry + _Py_yield(); + } +#else if (entry->version == type->tp_version_tag && entry->name == name) { assert(_PyType_HasFeature(type, Py_TPFLAGS_VALID_VERSION_TAG)); OBJECT_STAT_INC_COND(type_cache_hits, !is_dunder_name(name)); OBJECT_STAT_INC_COND(type_cache_dunder_hits, is_dunder_name(name)); - return entry->value; + return entry->value; } +#endif OBJECT_STAT_INC_COND(type_cache_misses, !is_dunder_name(name)); OBJECT_STAT_INC_COND(type_cache_dunder_misses, is_dunder_name(name)); @@ -4808,15 +4911,13 @@ _PyType_Lookup(PyTypeObject *type, PyObject *name) } if (MCACHE_CACHEABLE_NAME(name) && assign_version_tag(interp, type)) { - h = MCACHE_HASH_METHOD(type, name); - struct type_cache_entry *entry = &cache->hashtable[h]; - entry->version = type->tp_version_tag; - entry->value = res; /* borrowed */ - assert(_PyASCIIObject_CAST(name)->hash != -1); - OBJECT_STAT_INC_COND(type_cache_collisions, entry->name != Py_None && entry->name != name); +#if Py_GIL_DISABLED + update_cache_gil_disabled(entry, name, type->tp_version_tag, res); +#else + update_cache(entry, name, type->tp_version_tag, res); +#endif assert(_PyType_HasFeature(type, Py_TPFLAGS_VALID_VERSION_TAG)); - Py_SETREF(entry->name, Py_NewRef(name)); - } +} return res; } diff --git a/Python/lock.c b/Python/lock.c index f0ff1176941da8..9d73ec3fa2217d 100644 --- a/Python/lock.c +++ b/Python/lock.c @@ -36,7 +36,7 @@ struct mutex_entry { int handed_off; }; -static void +void _Py_yield(void) { #ifdef MS_WINDOWS diff --git a/Python/pystate.c b/Python/pystate.c index 08ec586963ce11..580b2f0ae3277f 100644 --- a/Python/pystate.c +++ b/Python/pystate.c @@ -499,6 +499,8 @@ _PyRuntimeState_ReInitThreads(_PyRuntimeState *runtime) _PyMutex_at_fork_reinit(locks[i]); } + _PyTypes_AfterFork(); + /* bpo-42540: id_mutex is freed by _PyInterpreterState_Delete, which does * not force the default allocator. */ if (_PyThread_at_fork_reinit(&runtime->interpreters.main->id_mutex) < 0) { From b961d515c71d2493cc7d129b589cafb213fb91ae Mon Sep 17 00:00:00 2001 From: Dino Viehland Date: Wed, 10 Jan 2024 12:14:55 -0800 Subject: [PATCH 2/5] Make bases/mro accesses thread safe --- Include/internal/pycore_typeobject.h | 2 + Modules/_abc.c | 15 +- Objects/typeobject.c | 323 ++++++++++++++++++++++----- Python/pystate.c | 1 + 4 files changed, 276 insertions(+), 65 deletions(-) diff --git a/Include/internal/pycore_typeobject.h b/Include/internal/pycore_typeobject.h index d2f5d4077f33a7..ce7e08385aa6bd 100644 --- a/Include/internal/pycore_typeobject.h +++ b/Include/internal/pycore_typeobject.h @@ -9,6 +9,7 @@ extern "C" { #endif #include "pycore_moduleobject.h" // PyModuleObject +#include "pycore_lock.h" // PyMutex /* state */ @@ -21,6 +22,7 @@ struct _types_runtime_state { // bpo-42745: next_version_tag remains shared by all interpreters // because of static types. 
unsigned int next_version_tag; + PyMutex type_mutex; }; diff --git a/Modules/_abc.c b/Modules/_abc.c index 9473905243d438..399ecbbd6a2172 100644 --- a/Modules/_abc.c +++ b/Modules/_abc.c @@ -8,7 +8,6 @@ #include "pycore_object.h" // _PyType_GetSubclasses() #include "pycore_runtime.h" // _Py_ID() #include "pycore_setobject.h" // _PySet_NextEntry() -#include "pycore_typeobject.h" // _PyType_GetMRO() #include "pycore_weakref.h" // _PyWeakref_GET_REF() #include "clinic/_abc.c.h" @@ -744,18 +743,12 @@ _abc__abc_subclasscheck_impl(PyObject *module, PyObject *self, Py_DECREF(ok); /* 4. Check if it's a direct subclass. */ - PyObject *mro = _PyType_GetMRO((PyTypeObject *)subclass); - assert(PyTuple_Check(mro)); - for (pos = 0; pos < PyTuple_GET_SIZE(mro); pos++) { - PyObject *mro_item = PyTuple_GET_ITEM(mro, pos); - assert(mro_item != NULL); - if ((PyObject *)self == mro_item) { - if (_add_to_weak_set(&impl->_abc_cache, subclass) < 0) { - goto end; - } - result = Py_True; + if (PyType_IsSubtype((PyTypeObject *)subclass, (PyTypeObject *)self)) { + if (_add_to_weak_set(&impl->_abc_cache, subclass) < 0) { goto end; } + result = Py_True; + goto end; } /* 5. Check if it's a subclass of a registered class (recursive). */ diff --git a/Objects/typeobject.c b/Objects/typeobject.c index bcdace54cf1f33..def1c6da1e8948 100644 --- a/Objects/typeobject.c +++ b/Objects/typeobject.c @@ -52,6 +52,34 @@ class object "PyObject *" "&PyBaseObject_Type" #define NEXT_VERSION_TAG(interp) \ (interp)->types.next_version_tag +#ifdef Py_GIL_DISABLED + +// There's a global lock for mutation of types. This avoids having to take +// additonal locks while doing various subclass processing which may result +// in odd behaviors w.r.t. running with the GIL as the outer type lock could +// be released and reacquired during a subclass update if there's contention +// on the subclass lock. +#define BEGIN_TYPE_LOCK() \ + { \ + _PyCriticalSection _cs; \ + _PyCriticalSection_Begin(&_cs, &_PyRuntime.types.type_mutex); \ + +#define END_TYPE_LOCK() \ + _PyCriticalSection_End(&_cs); \ + } + +#define ASSERT_TYPE_LOCK_HELD() \ + _Py_CRITICAL_SECTION_ASSERT_MUTEX_LOCKED(&_PyRuntime.types.type_mutex) + +#else + +#define BEGIN_TYPE_LOCK() +#define END_TYPE_LOCK() +#define ASSERT_TYPE_LOCK_HELD() + +#endif + + typedef struct PySlot_Offset { short subslot_offset; short slot_offset; @@ -279,8 +307,14 @@ lookup_tp_bases(PyTypeObject *self) PyObject * _PyType_GetBases(PyTypeObject *self) { - /* It returns a borrowed reference. */ - return lookup_tp_bases(self); + PyObject *res; + + BEGIN_TYPE_LOCK(); + res = lookup_tp_bases(self); + Py_INCREF(res); + END_TYPE_LOCK() + + return res; } static inline void @@ -330,14 +364,19 @@ clear_tp_bases(PyTypeObject *self) static inline PyObject * lookup_tp_mro(PyTypeObject *self) { + ASSERT_TYPE_LOCK_HELD(); return self->tp_mro; } PyObject * _PyType_GetMRO(PyTypeObject *self) { - /* It returns a borrowed reference. 
*/ - return lookup_tp_mro(self); + PyObject *mro; + BEGIN_TYPE_LOCK(); + mro = lookup_tp_mro(self); + Py_INCREF(mro); + END_TYPE_LOCK() + return mro; } static inline void @@ -760,8 +799,10 @@ PyType_Watch(int watcher_id, PyObject* obj) return -1; } // ensure we will get a callback on the next modification + BEGIN_TYPE_LOCK() assign_version_tag(interp, type); type->tp_watched |= (1 << watcher_id); + END_TYPE_LOCK() return 0; } @@ -781,8 +822,8 @@ PyType_Unwatch(int watcher_id, PyObject* obj) return 0; } -void -PyType_Modified(PyTypeObject *type) +static void +type_modified_unlocked(PyTypeObject *type) { /* Invalidate any cached data for the specified type and all subclasses. This function is called after the base @@ -848,6 +889,22 @@ PyType_Modified(PyTypeObject *type) } } +void +PyType_Modified(PyTypeObject *type) +{ + // Quick check without the lock held + if (!_PyType_HasFeature(type, Py_TPFLAGS_VALID_VERSION_TAG)) { + return; + } + + BEGIN_TYPE_LOCK() + type_modified_unlocked(type); + END_TYPE_LOCK() +} + +static int +is_subtype_unlocked(PyTypeObject *a, PyTypeObject *b); + static void type_mro_modified(PyTypeObject *type, PyObject *bases) { /* @@ -866,6 +923,7 @@ type_mro_modified(PyTypeObject *type, PyObject *bases) { int custom = !Py_IS_TYPE(type, &PyType_Type); int unbound; + ASSERT_TYPE_LOCK_HELD(); if (custom) { PyObject *mro_meth, *type_mro_meth; mro_meth = lookup_maybe_method( @@ -891,7 +949,7 @@ type_mro_modified(PyTypeObject *type, PyObject *bases) { PyObject *b = PyTuple_GET_ITEM(bases, i); PyTypeObject *cls = _PyType_CAST(b); - if (!PyType_IsSubtype(type, cls)) { + if (!is_subtype_unlocked(type, cls)) { goto clear; } } @@ -913,6 +971,8 @@ type_mro_modified(PyTypeObject *type, PyObject *bases) { static int assign_version_tag(PyInterpreterState *interp, PyTypeObject *type) { + ASSERT_TYPE_LOCK_HELD(); + /* Ensure that the tp_version_tag is valid and set Py_TPFLAGS_VALID_VERSION_TAG. To respect the invariant, this must first be done on all super classes. 
Return 0 if this @@ -961,7 +1021,11 @@ assign_version_tag(PyInterpreterState *interp, PyTypeObject *type) int PyUnstable_Type_AssignVersionTag(PyTypeObject *type) { PyInterpreterState *interp = _PyInterpreterState_GET(); - return assign_version_tag(interp, type); + int assigned; + BEGIN_TYPE_LOCK() + assigned = assign_version_tag(interp, type); + END_TYPE_LOCK() + return assigned; } @@ -1183,21 +1247,28 @@ type_set_abstractmethods(PyTypeObject *type, PyObject *value, void *context) static PyObject * type_get_bases(PyTypeObject *type, void *context) { - PyObject *bases = lookup_tp_bases(type); + PyObject *bases = _PyType_GetBases(type); if (bases == NULL) { Py_RETURN_NONE; } - return Py_NewRef(bases); + return bases; } static PyObject * type_get_mro(PyTypeObject *type, void *context) { - PyObject *mro = lookup_tp_mro(type); + PyObject *mro; + + BEGIN_TYPE_LOCK() + mro = lookup_tp_mro(type); if (mro == NULL) { - Py_RETURN_NONE; + mro = Py_None; + } else { + Py_INCREF(mro); } - return Py_NewRef(mro); + + END_TYPE_LOCK() + return mro; } static PyTypeObject *best_base(PyObject *); @@ -1219,6 +1290,8 @@ static int recurse_down_subclasses(PyTypeObject *type, PyObject *name, static int mro_hierarchy(PyTypeObject *type, PyObject *temp) { + ASSERT_TYPE_LOCK_HELD(); + PyObject *old_mro; int res = mro_internal(type, &old_mro); if (res <= 0) { @@ -1282,7 +1355,7 @@ mro_hierarchy(PyTypeObject *type, PyObject *temp) } static int -type_set_bases(PyTypeObject *type, PyObject *new_bases, void *context) +type_set_bases_unlocked(PyTypeObject *type, PyObject *new_bases, void *context) { // Check arguments if (!check_set_special_type_attr(type, new_bases, "__bases__")) { @@ -1313,7 +1386,7 @@ type_set_bases(PyTypeObject *type, PyObject *new_bases, void *context) } PyTypeObject *base = (PyTypeObject*)ob; - if (PyType_IsSubtype(base, type) || + if (is_subtype_unlocked(base, type) || /* In case of reentering here again through a custom mro() the above check is not enough since it relies on base->tp_mro which would gonna be updated inside @@ -1418,6 +1491,16 @@ type_set_bases(PyTypeObject *type, PyObject *new_bases, void *context) return -1; } +static int +type_set_bases(PyTypeObject *type, PyObject *new_bases, void *context) +{ + int res; + BEGIN_TYPE_LOCK(); + res = type_set_bases_unlocked(type, new_bases, context); + END_TYPE_LOCK(); + return res; +} + static PyObject * type_dict(PyTypeObject *type, void *context) { @@ -2156,11 +2239,12 @@ type_is_subtype_base_chain(PyTypeObject *a, PyTypeObject *b) return (b == &PyBaseObject_Type); } -int -PyType_IsSubtype(PyTypeObject *a, PyTypeObject *b) +static int +is_subtype_unlocked(PyTypeObject *a, PyTypeObject *b) { PyObject *mro; + ASSERT_TYPE_LOCK_HELD(); mro = lookup_tp_mro(a); if (mro != NULL) { /* Deal with multiple inheritance without recursion @@ -2179,6 +2263,16 @@ PyType_IsSubtype(PyTypeObject *a, PyTypeObject *b) return type_is_subtype_base_chain(a, b); } +int +PyType_IsSubtype(PyTypeObject *a, PyTypeObject *b) +{ + int res; + BEGIN_TYPE_LOCK(); + res = is_subtype_unlocked(a, b); + END_TYPE_LOCK() + return res; +} + /* Routines to do a method lookup in the type without looking in the instance dictionary (so we can't use PyObject_GetAttr) but still binding it to the instance. 
@@ -2538,8 +2632,10 @@ pmerge(PyObject *acc, PyObject **to_merge, Py_ssize_t to_merge_size) } static PyObject * -mro_implementation(PyTypeObject *type) +mro_implementation_unlocked(PyTypeObject *type) { + ASSERT_TYPE_LOCK_HELD(); + if (!_PyType_IsReady(type)) { if (PyType_Ready(type) < 0) return NULL; @@ -2619,6 +2715,16 @@ mro_implementation(PyTypeObject *type) return result; } +static PyObject * +mro_implementation(PyTypeObject *type) +{ + PyObject *mro; + BEGIN_TYPE_LOCK() + mro = mro_implementation_unlocked(type); + END_TYPE_LOCK() + return mro; +} + /*[clinic input] type.mro @@ -2657,7 +2763,7 @@ mro_check(PyTypeObject *type, PyObject *mro) } PyTypeObject *base = (PyTypeObject*)obj; - if (!PyType_IsSubtype(solid, solid_base(base))) { + if (!is_subtype_unlocked(solid, solid_base(base))) { PyErr_Format( PyExc_TypeError, "mro() returned base with unsuitable layout ('%.500s')", @@ -2688,6 +2794,9 @@ mro_invoke(PyTypeObject *type) { PyObject *mro_result; PyObject *new_mro; + + ASSERT_TYPE_LOCK_HELD(); + const int custom = !Py_IS_TYPE(type, &PyType_Type); if (custom) { @@ -2700,7 +2809,7 @@ mro_invoke(PyTypeObject *type) Py_DECREF(mro_meth); } else { - mro_result = mro_implementation(type); + mro_result = mro_implementation_unlocked(type); } if (mro_result == NULL) return NULL; @@ -2747,8 +2856,10 @@ mro_invoke(PyTypeObject *type) - Returns -1 in case of an error. */ static int -mro_internal(PyTypeObject *type, PyObject **p_old_mro) +mro_internal_unlocked(PyTypeObject *type, PyObject **p_old_mro) { + ASSERT_TYPE_LOCK_HELD(); + PyObject *new_mro, *old_mro; int reent; @@ -2793,6 +2904,16 @@ mro_internal(PyTypeObject *type, PyObject **p_old_mro) return 1; } +static int +mro_internal(PyTypeObject *type, PyObject **p_old_mro) +{ + int res; + BEGIN_TYPE_LOCK() + res = mro_internal_unlocked(type, p_old_mro); + END_TYPE_LOCK() + return res; +} + /* Calculate the best base amongst multiple base classes. This is the first one that's on the path to the "solid base". 
*/ @@ -4631,6 +4752,9 @@ PyType_GetModuleByDef(PyTypeObject *type, PyModuleDef *def) { assert(PyType_Check(type)); + PyObject *res = NULL; + BEGIN_TYPE_LOCK() + PyObject *mro = lookup_tp_mro(type); // The type must be ready assert(mro != NULL); @@ -4650,15 +4774,19 @@ PyType_GetModuleByDef(PyTypeObject *type, PyModuleDef *def) PyHeapTypeObject *ht = (PyHeapTypeObject*)super; PyObject *module = ht->ht_module; if (module && _PyModule_GetDef(module) == def) { - return module; + res = module; + break; } } + END_TYPE_LOCK() - PyErr_Format( - PyExc_TypeError, - "PyType_GetModuleByDef: No superclass of '%s' has the given module", - type->tp_name); - return NULL; + if (res == NULL) { + PyErr_Format( + PyExc_TypeError, + "PyType_GetModuleByDef: No superclass of '%s' has the given module", + type->tp_name); + } + return res; } void * @@ -4696,6 +4824,8 @@ PyObject_GetItemData(PyObject *obj) static PyObject * find_name_in_mro(PyTypeObject *type, PyObject *name, int *error) { + ASSERT_TYPE_LOCK_HELD(); + Py_hash_t hash; if (!PyUnicode_CheckExact(name) || (hash = _PyASCIIObject_CAST(name)->hash) == -1) @@ -4884,7 +5014,7 @@ _PyType_Lookup(PyTypeObject *type, PyObject *name) assert(_PyType_HasFeature(type, Py_TPFLAGS_VALID_VERSION_TAG)); OBJECT_STAT_INC_COND(type_cache_hits, !is_dunder_name(name)); OBJECT_STAT_INC_COND(type_cache_dunder_hits, is_dunder_name(name)); - return entry->value; + return entry->value; } #endif OBJECT_STAT_INC_COND(type_cache_misses, !is_dunder_name(name)); @@ -4893,7 +5023,19 @@ _PyType_Lookup(PyTypeObject *type, PyObject *name) /* We may end up clearing live exceptions below, so make sure it's ours. */ assert(!PyErr_Occurred()); + // We need to atomically do the lookup and capture the version before + // anyone else can modify our mro or mutate the type. + + int has_version = 0; + int version = 0; + BEGIN_TYPE_LOCK() res = find_name_in_mro(type, name, &error); + if (MCACHE_CACHEABLE_NAME(name)) { + has_version = assign_version_tag(interp, type); + version = type->tp_version_tag; + } + END_TYPE_LOCK() + /* Only put NULL results into cache if there was no error. */ if (error) { /* It's not ideal to clear the error condition, @@ -4910,11 +5052,11 @@ _PyType_Lookup(PyTypeObject *type, PyObject *name) return NULL; } - if (MCACHE_CACHEABLE_NAME(name) && assign_version_tag(interp, type)) { + if (has_version) { #if Py_GIL_DISABLED - update_cache_gil_disabled(entry, name, type->tp_version_tag, res); + update_cache_gil_disabled(entry, name, version, res); #else - update_cache(entry, name, type->tp_version_tag, res); + update_cache(entry, name, version, res); #endif assert(_PyType_HasFeature(type, Py_TPFLAGS_VALID_VERSION_TAG)); } @@ -5079,6 +5221,8 @@ type_setattro(PyObject *self, PyObject *name, PyObject *value) /* Will fail in _PyObject_GenericSetAttrWithDict. */ Py_INCREF(name); } + + BEGIN_TYPE_LOCK() res = _PyObject_GenericSetAttrWithDict((PyObject *)type, name, value, NULL); if (res == 0) { /* Clear the VALID_VERSION flag of 'type' and all its @@ -5086,13 +5230,15 @@ type_setattro(PyObject *self, PyObject *name, PyObject *value) update_subclasses() recursion in update_slot(), but carefully: they each have their own conditions on which to stop recursing into subclasses. 
*/ - PyType_Modified(type); + type_modified_unlocked(type); if (is_dunder_name(name)) { res = update_slot(type, name); } assert(_PyType_CheckConsistency(type)); } + END_TYPE_LOCK() + Py_DECREF(name); return res; } @@ -6897,28 +7043,28 @@ inherit_special(PyTypeObject *type, PyTypeObject *base) #undef COPYVAL /* Setup fast subclass flags */ - if (PyType_IsSubtype(base, (PyTypeObject*)PyExc_BaseException)) { + if (is_subtype_unlocked(base, (PyTypeObject*)PyExc_BaseException)) { type->tp_flags |= Py_TPFLAGS_BASE_EXC_SUBCLASS; } - else if (PyType_IsSubtype(base, &PyType_Type)) { + else if (is_subtype_unlocked(base, &PyType_Type)) { type->tp_flags |= Py_TPFLAGS_TYPE_SUBCLASS; } - else if (PyType_IsSubtype(base, &PyLong_Type)) { + else if (is_subtype_unlocked(base, &PyLong_Type)) { type->tp_flags |= Py_TPFLAGS_LONG_SUBCLASS; } - else if (PyType_IsSubtype(base, &PyBytes_Type)) { + else if (is_subtype_unlocked(base, &PyBytes_Type)) { type->tp_flags |= Py_TPFLAGS_BYTES_SUBCLASS; } - else if (PyType_IsSubtype(base, &PyUnicode_Type)) { + else if (is_subtype_unlocked(base, &PyUnicode_Type)) { type->tp_flags |= Py_TPFLAGS_UNICODE_SUBCLASS; } - else if (PyType_IsSubtype(base, &PyTuple_Type)) { + else if (is_subtype_unlocked(base, &PyTuple_Type)) { type->tp_flags |= Py_TPFLAGS_TUPLE_SUBCLASS; } - else if (PyType_IsSubtype(base, &PyList_Type)) { + else if (is_subtype_unlocked(base, &PyList_Type)) { type->tp_flags |= Py_TPFLAGS_LIST_SUBCLASS; } - else if (PyType_IsSubtype(base, &PyDict_Type)) { + else if (is_subtype_unlocked(base, &PyDict_Type)) { type->tp_flags |= Py_TPFLAGS_DICT_SUBCLASS; } @@ -7357,6 +7503,8 @@ type_ready_preheader(PyTypeObject *type) static int type_ready_mro(PyTypeObject *type) { + ASSERT_TYPE_LOCK_HELD(); + if (type->tp_flags & _Py_TPFLAGS_STATIC_BUILTIN) { if (!_Py_IsMainInterpreter(_PyInterpreterState_GET())) { assert(lookup_tp_mro(type) != NULL); @@ -7366,7 +7514,7 @@ type_ready_mro(PyTypeObject *type) } /* Calculate method resolution order */ - if (mro_internal(type, NULL) < 0) { + if (mro_internal_unlocked(type, NULL) < 0) { return -1; } PyObject *mro = lookup_tp_mro(type); @@ -7430,6 +7578,8 @@ inherit_patma_flags(PyTypeObject *type, PyTypeObject *base) { static int type_ready_inherit(PyTypeObject *type) { + ASSERT_TYPE_LOCK_HELD(); + /* Inherit special flags from dominant base */ PyTypeObject *base = type->tp_base; if (base != NULL) { @@ -7622,6 +7772,8 @@ type_ready_post_checks(PyTypeObject *type) static int type_ready(PyTypeObject *type, int rerunbuiltin) { + ASSERT_TYPE_LOCK_HELD(); + _PyObject_ASSERT((PyObject *)type, !is_readying(type)); start_readying(type); @@ -7709,7 +7861,16 @@ PyType_Ready(PyTypeObject *type) type->tp_flags |= Py_TPFLAGS_IMMUTABLETYPE; } - return type_ready(type, 0); + int res; + BEGIN_TYPE_LOCK() + if (!(type->tp_flags & Py_TPFLAGS_READY)) { + res = type_ready(type, 0); + } else { + res = 0; + assert(_PyType_CheckConsistency(type)); + } + END_TYPE_LOCK() + return res; } int @@ -7739,7 +7900,10 @@ _PyStaticType_InitBuiltin(PyInterpreterState *interp, PyTypeObject *self) static_builtin_state_init(interp, self); - int res = type_ready(self, !ismain); + int res; + BEGIN_TYPE_LOCK(); + res = type_ready(self, !ismain); + END_TYPE_LOCK() if (res < 0) { static_builtin_state_clear(interp, self); } @@ -8141,9 +8305,12 @@ wrap_delitem(PyObject *self, PyObject *args, void *wrapped) https://mail.python.org/pipermail/python-dev/2003-April/034535.html */ static int -hackcheck(PyObject *self, setattrofunc func, const char *what) +hackcheck_unlocked(PyObject *self, 
setattrofunc func, const char *what) { PyTypeObject *type = Py_TYPE(self); + + ASSERT_TYPE_LOCK_HELD(); + PyObject *mro = lookup_tp_mro(type); if (!mro) { /* Probably ok not to check the call in this case. */ @@ -8185,6 +8352,20 @@ hackcheck(PyObject *self, setattrofunc func, const char *what) return 1; } +static int +hackcheck(PyObject *self, setattrofunc func, const char *what) +{ + if (!PyType_Check(self)) { + return 1; + } + + int res; + BEGIN_TYPE_LOCK(); + res = hackcheck_unlocked(self, func, what); + END_TYPE_LOCK() + return res; +} + static PyObject * wrap_setattr(PyObject *self, PyObject *args, void *wrapped) { @@ -9354,7 +9535,7 @@ slot_bf_getbuffer(PyObject *self, Py_buffer *buffer, int flags) } static int -releasebuffer_maybe_call_super(PyObject *self, Py_buffer *buffer) +releasebuffer_maybe_call_super_unlocked(PyObject *self, Py_buffer *buffer, releasebufferproc *base_releasebuffer) { PyTypeObject *self_type = Py_TYPE(self); PyObject *mro = lookup_tp_mro(self_type); @@ -9375,7 +9556,7 @@ releasebuffer_maybe_call_super(PyObject *self, Py_buffer *buffer) if (i >= n) return -1; - releasebufferproc base_releasebuffer = NULL; + *base_releasebuffer = NULL; for (; i < n; i++) { PyObject *obj = PyTuple_GET_ITEM(mro, i); if (!PyType_Check(obj)) { @@ -9385,15 +9566,28 @@ releasebuffer_maybe_call_super(PyObject *self, Py_buffer *buffer) if (base_type->tp_as_buffer != NULL && base_type->tp_as_buffer->bf_releasebuffer != NULL && base_type->tp_as_buffer->bf_releasebuffer != slot_bf_releasebuffer) { - base_releasebuffer = base_type->tp_as_buffer->bf_releasebuffer; + *base_releasebuffer = base_type->tp_as_buffer->bf_releasebuffer; break; } } - if (base_releasebuffer != NULL) { + return 0; +} + +static int +releasebuffer_maybe_call_super(PyObject *self, Py_buffer *buffer) +{ + int res; + releasebufferproc base_releasebuffer; + + BEGIN_TYPE_LOCK(); + res = releasebuffer_maybe_call_super_unlocked(self, buffer, &base_releasebuffer); + END_TYPE_LOCK(); + + if (res == 0) { base_releasebuffer(self, buffer); } - return 0; + return res; } static void @@ -9929,6 +10123,8 @@ resolve_slotdups(PyTypeObject *type, PyObject *name) static pytype_slotdef * update_one_slot(PyTypeObject *type, pytype_slotdef *p) { + ASSERT_TYPE_LOCK_HELD(); + PyObject *descr; PyWrapperDescrObject *d; @@ -9977,7 +10173,7 @@ update_one_slot(PyTypeObject *type, pytype_slotdef *p) d = (PyWrapperDescrObject *)descr; if ((specific == NULL || specific == d->d_wrapped) && d->d_base->wrapper == p->wrapper && - PyType_IsSubtype(type, PyDescr_TYPE(d))) + is_subtype_unlocked(type, PyDescr_TYPE(d))) { specific = d->d_wrapped; } @@ -10042,6 +10238,8 @@ update_one_slot(PyTypeObject *type, pytype_slotdef *p) static int update_slots_callback(PyTypeObject *type, void *data) { + ASSERT_TYPE_LOCK_HELD(); + pytype_slotdef **pp = (pytype_slotdef **)data; for (; *pp; pp++) { update_one_slot(type, *pp); @@ -10058,6 +10256,7 @@ update_slot(PyTypeObject *type, PyObject *name) pytype_slotdef **pp; int offset; + ASSERT_TYPE_LOCK_HELD(); assert(PyUnicode_CheckExact(name)); assert(PyUnicode_CHECK_INTERNED(name)); @@ -10091,10 +10290,17 @@ update_slot(PyTypeObject *type, PyObject *name) static void fixup_slot_dispatchers(PyTypeObject *type) { + // This lock isn't strictly necessary because the type has not been + // exposed to anyone else yet, but update_ont_slot calls find_name_in_mro + // where we'd like to assert that the tyep is locked. 
+ BEGIN_TYPE_LOCK() + assert(!PyErr_Occurred()); for (pytype_slotdef *p = slotdefs; p->name; ) { p = update_one_slot(type, p); } + + END_TYPE_LOCK() } static void @@ -10102,6 +10308,8 @@ update_all_slots(PyTypeObject* type) { pytype_slotdef *p; + ASSERT_TYPE_LOCK_HELD(); + /* Clear the VALID_VERSION flag of 'type' and all its subclasses. */ PyType_Modified(type); @@ -10374,7 +10582,15 @@ _super_lookup_descr(PyTypeObject *su_type, PyTypeObject *su_obj_type, PyObject * PyObject *mro, *res; Py_ssize_t i, n; + BEGIN_TYPE_LOCK(); mro = lookup_tp_mro(su_obj_type); + /* keep a strong reference to mro because su_obj_type->tp_mro can be + replaced during PyDict_GetItemRef(dict, name, &res) and because + another thread can modify it after we end the critical section + below */ + Py_XINCREF(mro); + END_TYPE_LOCK() + if (mro == NULL) return NULL; @@ -10387,12 +10603,11 @@ _super_lookup_descr(PyTypeObject *su_type, PyTypeObject *su_obj_type, PyObject * break; } i++; /* skip su->type (if any) */ - if (i >= n) + if (i >= n) { + Py_DECREF(mro); return NULL; + } - /* keep a strong reference to mro because su_obj_type->tp_mro can be - replaced during PyDict_GetItemRef(dict, name, &res) */ - Py_INCREF(mro); do { PyObject *obj = PyTuple_GET_ITEM(mro, i); PyObject *dict = lookup_tp_dict(_PyType_CAST(obj)); diff --git a/Python/pystate.c b/Python/pystate.c index 580b2f0ae3277f..b1d1a085d629b4 100644 --- a/Python/pystate.c +++ b/Python/pystate.c @@ -395,6 +395,7 @@ _Py_COMP_DIAG_POP &(runtime)->atexit.mutex, \ &(runtime)->audit_hooks.mutex, \ &(runtime)->allocators.mutex, \ + &(runtime)->types.type_mutex, \ } static void From 3799cbebd42ac2a0f7f729a2a647a8eeabeeb213 Mon Sep 17 00:00:00 2001 From: Dino Viehland Date: Tue, 16 Jan 2024 12:02:37 -0800 Subject: [PATCH 3/5] Move seqlock implementation to lock.c --- Include/internal/pycore_lock.h | 34 ++++++++++++- Include/internal/pycore_typeobject.h | 2 +- Objects/typeobject.c | 70 ++++++++------------------- Python/lock.c | 71 +++++++++++++++++++++++++++- 4 files changed, 125 insertions(+), 52 deletions(-) diff --git a/Include/internal/pycore_lock.h b/Include/internal/pycore_lock.h index 63c7376f9027bd..30d0ac0ae8f0be 100644 --- a/Include/internal/pycore_lock.h +++ b/Include/internal/pycore_lock.h @@ -251,7 +251,39 @@ PyAPI_FUNC(void) _PyRWMutex_RUnlock(_PyRWMutex *rwmutex); PyAPI_FUNC(void) _PyRWMutex_Lock(_PyRWMutex *rwmutex); PyAPI_FUNC(void) _PyRWMutex_Unlock(_PyRWMutex *rwmutex); -void _Py_yield(void); +// Similar to linux seqlock: https://en.wikipedia.org/wiki/Seqlock +// We use a sequence number to lock the writer, an even sequence means we're unlocked, an odd +// sequence means we're locked. Readers will read the sequence before attempting to read the +// underlying data and then read the sequence number again after reading the data. If the +// sequence has not changed the data is valid. +// +// Differs a little bit in that we use CAS on sequence as the lock, instead of a seperate spin lock. +// The writer can also detect that the undelering data has not changed and abandon the write +// and restore the previous sequence. +typedef struct { + int sequence; +} _PySeqLock; + +// Lock the sequence lock for the writer +PyAPI_FUNC(void) _PySeqLock_LockWrite(_PySeqLock *seqlock); + +// Unlock the sequence lock and move to the next sequence number. +PyAPI_FUNC(void) _PySeqLock_UnlockWrite(_PySeqLock *seqlock); + +// Abandon the current update indicating that no mutations have occured +// and restore the previous sequence value. 
+PyAPI_FUNC(void) _PySeqLock_AbandonWrite(_PySeqLock *seqlock); + +// Begin a read operation and return the current sequence number. +PyAPI_FUNC(int) _PySeqLock_BeginRead(_PySeqLock *seqlock); + +// End the read operation and confirm that the sequence number has not changed. +// Returns 1 if the read was successful or 0 if the read should be re-tried. +PyAPI_FUNC(int) _PySeqLock_EndRead(_PySeqLock *seqlock, int previous); + +// Check if the lock was held during a fork and clear the lock. Returns 1 +// if the lock was held and any associated datat should be cleared. +PyAPI_FUNC(int) _PySeqLock_AfterFork(_PySeqLock *seqlock); #ifdef __cplusplus } diff --git a/Include/internal/pycore_typeobject.h b/Include/internal/pycore_typeobject.h index ce7e08385aa6bd..115855f2a8e24c 100644 --- a/Include/internal/pycore_typeobject.h +++ b/Include/internal/pycore_typeobject.h @@ -31,7 +31,7 @@ struct _types_runtime_state { struct type_cache_entry { unsigned int version; // initialized from type->tp_version_tag #ifdef Py_GIL_DISABLED - int sequence; + _PySeqLock sequence; #endif PyObject *name; // reference to exactly a str or None PyObject *value; // borrowed reference or NULL diff --git a/Objects/typeobject.c b/Objects/typeobject.c index def1c6da1e8948..abbcf7d2b32f32 100644 --- a/Objects/typeobject.c +++ b/Objects/typeobject.c @@ -6,6 +6,7 @@ #include "pycore_code.h" // CO_FAST_FREE #include "pycore_dict.h" // _PyDict_KeysSize() #include "pycore_frame.h" // _PyInterpreterFrame +#include "pycore_lock.h" // _PySeqLock_* #include "pycore_long.h" // _PyLong_IsNegative() #include "pycore_memoryobject.h" // _PyMemoryView_FromBufferProc() #include "pycore_modsupport.h" // _PyArg_NoKwnames() @@ -4913,43 +4914,21 @@ static void update_cache_gil_disabled(struct type_cache_entry *entry, PyObject *name, unsigned int version_tag, PyObject *value) { - // Similar to linux seqlock: https://en.wikipedia.org/wiki/Seqlock - // We use a sequence number to lock the writer, an even sequence means we're unlocked, an odd - // sequence means we're locked. - // Differs a little bit in that we use CAS on sequence as the lock, instead of a seperate spin lock. - // If our writer detects that another thread has already done the same write we'll just bail - // and restore the previous sequence number without doing any updates. - - // lock the entry by setting by moving to an odd sequence number - int prev = entry->sequence; - while (1) { - if (TYPE_CACHE_IS_UPDATING(prev)) { - // Someone else is currently updating the cache - _Py_yield(); - prev = _Py_atomic_load_int32_relaxed(&entry->sequence); - } else if(_Py_atomic_compare_exchange_int32(&entry->sequence, &prev, prev + 1)) { - // We've locked the cache - break; - } else { - _Py_yield(); - } - } + _PySeqLock_LockWrite(&entry->sequence); // update the entry if (entry->name == name && entry->value == value && entry->version == version_tag) { - // We reaced with another update, bail and restore previous sequence. - _Py_atomic_exchange_int32(&entry->sequence, prev); + // We raced with another update, bail and restore previous sequence. 
+ _PySeqLock_AbandonWrite(&entry->sequence); return; } update_cache(entry, name, version_tag, value); // Then update sequence to the next valid value - int new_sequence = prev + 2; - assert(!TYPE_CACHE_IS_UPDATING(new_sequence)); - _Py_atomic_exchange_int32(&entry->sequence, new_sequence); + _PySeqLock_UnlockWrite(&entry->sequence); } #endif @@ -4959,10 +4938,8 @@ void _PyTypes_AfterFork() { struct type_cache *cache = get_type_cache(); for (int i = 0; i < 1 << MCACHE_SIZE_EXP; i++) { struct type_cache_entry *entry = &cache->hashtable[i]; - int sequence = _Py_atomic_load_int_acquire(&entry->sequence); - if (TYPE_CACHE_IS_UPDATING(sequence)) { + if (_PySeqLock_AfterFork(&entry->sequence)) { // Entry was in the process of updating while forking, clear it... - entry->sequence = 0; entry->value = NULL; entry->name = NULL; entry->version = 0; @@ -4986,27 +4963,22 @@ _PyType_Lookup(PyTypeObject *type, PyObject *name) #ifdef Py_GIL_DISABLED // synchronize-with other writing threads by doing an acquire load on the sequence while (1) { - int sequence = _Py_atomic_load_int_acquire(&entry->sequence); - if (!TYPE_CACHE_IS_UPDATING(sequence)) { - if (_Py_atomic_load_uint32_relaxed(&entry->version) == type->tp_version_tag && - _Py_atomic_load_ptr_relaxed(&entry->name) == name) { - assert(_PyType_HasFeature(type, Py_TPFLAGS_VALID_VERSION_TAG)); - OBJECT_STAT_INC_COND(type_cache_hits, !is_dunder_name(name)); - OBJECT_STAT_INC_COND(type_cache_dunder_hits, is_dunder_name(name)); - PyObject *value = _Py_atomic_load_ptr_relaxed(&entry->value); - - // Synchronize again and validate that the entry hasn't been updated - // while we were readying the values. - if (_Py_atomic_load_int_acquire(&entry->sequence) == sequence) { - return value; - } - } else { - // Cache miss - break; - } + int sequence = _PySeqLock_BeginRead(&entry->sequence); + if (_Py_atomic_load_uint32_relaxed(&entry->version) == type->tp_version_tag && + _Py_atomic_load_ptr_relaxed(&entry->name) == name) { + assert(_PyType_HasFeature(type, Py_TPFLAGS_VALID_VERSION_TAG)); + OBJECT_STAT_INC_COND(type_cache_hits, !is_dunder_name(name)); + OBJECT_STAT_INC_COND(type_cache_dunder_hits, is_dunder_name(name)); + PyObject *value = _Py_atomic_load_ptr_relaxed(&entry->value); + + // If the sequence is still valid then we're done + if (_PySeqLock_EndRead(&entry->sequence, sequence)) { + return value; + } + } else { + // cache miss + break; } - // We are in progress of updating the cache, retry - _Py_yield(); } #else if (entry->version == type->tp_version_tag && diff --git a/Python/lock.c b/Python/lock.c index 9d73ec3fa2217d..87444c75db9c0a 100644 --- a/Python/lock.c +++ b/Python/lock.c @@ -36,7 +36,7 @@ struct mutex_entry { int handed_off; }; -void +static void _Py_yield(void) { #ifdef MS_WINDOWS @@ -459,3 +459,72 @@ _PyRWMutex_Unlock(_PyRWMutex *rwmutex) _PyParkingLot_UnparkAll(&rwmutex->bits); } } + +#define SEQLOCK_IS_UPDATING(sequence) (sequence & 0x01) + +void _PySeqLock_LockWrite(_PySeqLock *seqlock) +{ + // lock the entry by setting by moving to an odd sequence number + int prev = seqlock->sequence; + while (1) { + if (SEQLOCK_IS_UPDATING(prev)) { + // Someone else is currently updating the cache + _Py_yield(); + prev = _Py_atomic_load_int32_relaxed(&seqlock->sequence); + } else if (_Py_atomic_compare_exchange_int32(&seqlock->sequence, &prev, prev + 1)) { + // We've locked the cache + break; + } else { + _Py_yield(); + } + } +} + +void _PySeqLock_AbandonWrite(_PySeqLock *seqlock) +{ + int new_seq = seqlock->sequence - 1; + 
assert(!SEQLOCK_IS_UPDATING(new_seq));
+    _Py_atomic_exchange_int32(&seqlock->sequence, new_seq);
+}
+
+void _PySeqLock_UnlockWrite(_PySeqLock *seqlock)
+{
+    int new_seq = seqlock->sequence + 1;
+    assert(!SEQLOCK_IS_UPDATING(new_seq));
+    _Py_atomic_exchange_int32(&seqlock->sequence, new_seq);
+}
+
+int _PySeqLock_BeginRead(_PySeqLock *seqlock)
+{
+    int sequence = _Py_atomic_load_int_acquire(&seqlock->sequence);
+    while (SEQLOCK_IS_UPDATING(sequence)) {
+        _Py_yield();
+        sequence = _Py_atomic_load_int_acquire(&seqlock->sequence);
+    }
+
+    return sequence;
+}
+
+int _PySeqLock_EndRead(_PySeqLock *seqlock, int previous)
+{
+    // Synchronize again and validate that the entry hasn't been updated
+    // while we were reading the values.
+    if (_Py_atomic_load_int_acquire(&seqlock->sequence) == previous) {
+        return 1;
+    }
+
+    _Py_yield();
+    return 0;
+}
+
+int _PySeqLock_AfterFork(_PySeqLock *seqlock)
+{
+    // Check if the lock was held during a fork; if so, reset it and report
+    // that any associated data should be cleared.
+    if (SEQLOCK_IS_UPDATING(seqlock->sequence)) {
+        seqlock->sequence = 0;
+        return 1;
+    }
+
+    return 0;
+}

From 51a4b93d27fa3b5a57842489de346e7be772ec84 Mon Sep 17 00:00:00 2001
From: Dino Viehland
Date: Tue, 16 Jan 2024 12:34:13 -0800
Subject: [PATCH 4/5] Review feedback

---
 Include/internal/pycore_critical_section.h |  3 +-
 Include/internal/pycore_typeobject.h       |  2 +-
 Objects/typeobject.c                       | 48 +++++++++++-----------
 3 files changed, 28 insertions(+), 25 deletions(-)

diff --git a/Include/internal/pycore_critical_section.h b/Include/internal/pycore_critical_section.h
index 30820a24f5bb64..9163b5cf0f2e8a 100644
--- a/Include/internal/pycore_critical_section.h
+++ b/Include/internal/pycore_critical_section.h
@@ -293,7 +293,8 @@ _PyCriticalSection_SuspendAll(PyThreadState *tstate);
 #ifdef Py_GIL_DISABLED
 
 static inline void
-_PyCriticalSection_AssertHeld(PyMutex *mutex) {
+_PyCriticalSection_AssertHeld(PyMutex *mutex)
+{
 #ifdef Py_DEBUG
     PyThreadState *tstate = _PyThreadState_GET();
     uintptr_t prev = tstate->critical_section;
diff --git a/Include/internal/pycore_typeobject.h b/Include/internal/pycore_typeobject.h
index 115855f2a8e24c..664f6fb212a57d 100644
--- a/Include/internal/pycore_typeobject.h
+++ b/Include/internal/pycore_typeobject.h
@@ -79,7 +79,7 @@ struct types_state {
 extern PyStatus _PyTypes_InitTypes(PyInterpreterState *);
 extern void _PyTypes_FiniTypes(PyInterpreterState *);
 extern void _PyTypes_Fini(PyInterpreterState *);
-void _PyTypes_AfterFork(void);
+extern void _PyTypes_AfterFork(void);
 
 /* other API */
 
diff --git a/Objects/typeobject.c b/Objects/typeobject.c
index abbcf7d2b32f32..892206be189915 100644
--- a/Objects/typeobject.c
+++ b/Objects/typeobject.c
@@ -686,9 +686,15 @@ type_cache_clear(struct type_cache *cache, PyObject *value)
 {
     for (Py_ssize_t i = 0; i < (1 << MCACHE_SIZE_EXP); i++) {
         struct type_cache_entry *entry = &cache->hashtable[i];
+#ifdef Py_GIL_DISABLED
+        _PySeqLock_LockWrite(&entry->sequence);
+#endif
         entry->version = 0;
         Py_XSETREF(entry->name, _Py_XNewRef(value));
         entry->value = NULL;
+#ifdef Py_GIL_DISABLED
+        _PySeqLock_UnlockWrite(&entry->sequence);
+#endif
     }
 }
 
@@ -4903,6 +4909,8 @@ update_cache(struct type_cache_entry *entry, PyObject *name, unsigned int versio
     entry->value = value; /* borrowed */
     assert(_PyASCIIObject_CAST(name)->hash != -1);
     OBJECT_STAT_INC_COND(type_cache_collisions, entry->name != Py_None && entry->name != name);
+    // We're releasing this under the lock for simplicity's sake because it's always an
+    // exact unicode object or Py_None, so it's
safe to do so. Py_SETREF(entry->name, Py_NewRef(name)); } @@ -4933,15 +4941,17 @@ update_cache_gil_disabled(struct type_cache_entry *entry, PyObject *name, #endif -void _PyTypes_AfterFork() { +void +_PyTypes_AfterFork() +{ #ifdef Py_GIL_DISABLED struct type_cache *cache = get_type_cache(); - for (int i = 0; i < 1 << MCACHE_SIZE_EXP; i++) { + for (Py_ssize_t i = 0; i < (1 << MCACHE_SIZE_EXP); i++) { struct type_cache_entry *entry = &cache->hashtable[i]; if (_PySeqLock_AfterFork(&entry->sequence)) { // Entry was in the process of updating while forking, clear it... entry->value = NULL; - entry->name = NULL; + Py_SETREF(entry->name, Py_None); entry->version = 0; } } @@ -5005,6 +5015,7 @@ _PyType_Lookup(PyTypeObject *type, PyObject *name) if (MCACHE_CACHEABLE_NAME(name)) { has_version = assign_version_tag(interp, type); version = type->tp_version_tag; + assert(!has_version || _PyType_HasFeature(type, Py_TPFLAGS_VALID_VERSION_TAG)); } END_TYPE_LOCK() @@ -5030,8 +5041,7 @@ _PyType_Lookup(PyTypeObject *type, PyObject *name) #else update_cache(entry, name, version, res); #endif - assert(_PyType_HasFeature(type, Py_TPFLAGS_VALID_VERSION_TAG)); -} + } return res; } @@ -9506,13 +9516,13 @@ slot_bf_getbuffer(PyObject *self, Py_buffer *buffer, int flags) return -1; } -static int -releasebuffer_maybe_call_super_unlocked(PyObject *self, Py_buffer *buffer, releasebufferproc *base_releasebuffer) +static releasebufferproc +releasebuffer_maybe_call_super_unlocked(PyObject *self, Py_buffer *buffer) { PyTypeObject *self_type = Py_TYPE(self); PyObject *mro = lookup_tp_mro(self_type); if (mro == NULL) { - return -1; + return NULL; } assert(PyTuple_Check(mro)); @@ -9526,9 +9536,8 @@ releasebuffer_maybe_call_super_unlocked(PyObject *self, Py_buffer *buffer, relea } i++; /* skip self_type */ if (i >= n) - return -1; + return NULL; - *base_releasebuffer = NULL; for (; i < n; i++) { PyObject *obj = PyTuple_GET_ITEM(mro, i); if (!PyType_Check(obj)) { @@ -9538,28 +9547,25 @@ releasebuffer_maybe_call_super_unlocked(PyObject *self, Py_buffer *buffer, relea if (base_type->tp_as_buffer != NULL && base_type->tp_as_buffer->bf_releasebuffer != NULL && base_type->tp_as_buffer->bf_releasebuffer != slot_bf_releasebuffer) { - *base_releasebuffer = base_type->tp_as_buffer->bf_releasebuffer; - break; + return base_type->tp_as_buffer->bf_releasebuffer; } } - return 0; + return NULL; } -static int +static void releasebuffer_maybe_call_super(PyObject *self, Py_buffer *buffer) { - int res; releasebufferproc base_releasebuffer; BEGIN_TYPE_LOCK(); - res = releasebuffer_maybe_call_super_unlocked(self, buffer, &base_releasebuffer); + base_releasebuffer = releasebuffer_maybe_call_super_unlocked(self, buffer); END_TYPE_LOCK(); - if (res == 0) { + if (base_releasebuffer != NULL) { base_releasebuffer(self, buffer); } - return res; } static void @@ -9636,11 +9642,7 @@ static void slot_bf_releasebuffer(PyObject *self, Py_buffer *buffer) { releasebuffer_call_python(self, buffer); - if (releasebuffer_maybe_call_super(self, buffer) < 0) { - if (PyErr_Occurred()) { - PyErr_WriteUnraisable(self); - } - } + releasebuffer_maybe_call_super(self, buffer); } static PyObject * From 66ce5f582b1568fd8a4fa86cd422cfd804a800c8 Mon Sep 17 00:00:00 2001 From: Dino Viehland Date: Wed, 17 Jan 2024 14:03:25 -0800 Subject: [PATCH 5/5] Use int32_t for sequence lock Use atomic storage instead of exchange Use relaxed load when starting sequence lock for write Fix some formatting --- Include/cpython/pyatomic.h | 3 +++ Include/cpython/pyatomic_gcc.h | 3 +++ 
Include/cpython/pyatomic_msc.h | 11 +++++++++++ Include/cpython/pyatomic_std.h | 7 +++++++ Include/internal/pycore_lock.h | 8 ++++---- Objects/typeobject.c | 12 ++++++------ Python/lock.c | 30 ++++++++++++++++-------------- 7 files changed, 50 insertions(+), 24 deletions(-) diff --git a/Include/cpython/pyatomic.h b/Include/cpython/pyatomic.h index 5314a70436bfc3..e10d48285367cf 100644 --- a/Include/cpython/pyatomic.h +++ b/Include/cpython/pyatomic.h @@ -469,6 +469,9 @@ _Py_atomic_store_int_release(int *obj, int value); static inline int _Py_atomic_load_int_acquire(const int *obj); +static inline uint32_t +_Py_atomic_load_uint32_acquire(const uint32_t *obj); + // --- _Py_atomic_fence ------------------------------------------------------ diff --git a/Include/cpython/pyatomic_gcc.h b/Include/cpython/pyatomic_gcc.h index 70f2b7e1b5706a..4095e1873d8b07 100644 --- a/Include/cpython/pyatomic_gcc.h +++ b/Include/cpython/pyatomic_gcc.h @@ -495,6 +495,9 @@ static inline int _Py_atomic_load_int_acquire(const int *obj) { return __atomic_load_n(obj, __ATOMIC_ACQUIRE); } +static inline uint32_t +_Py_atomic_load_uint32_acquire(const uint32_t *obj) +{ return __atomic_load_n(obj, __ATOMIC_ACQUIRE); } // --- _Py_atomic_fence ------------------------------------------------------ diff --git a/Include/cpython/pyatomic_msc.h b/Include/cpython/pyatomic_msc.h index 601a0cf65afc1c..b5c1ec94112562 100644 --- a/Include/cpython/pyatomic_msc.h +++ b/Include/cpython/pyatomic_msc.h @@ -938,6 +938,17 @@ _Py_atomic_load_int_acquire(const int *obj) #endif } +static inline uint32_t +_Py_atomic_load_uint32_acquire(const uint32_t *obj) +{ +#if defined(_M_X64) || defined(_M_IX86) + return *(uint32_t volatile *)obj; +#elif defined(_M_ARM64) + return (int)__ldar32((uint32_t volatile *)obj); +#else +# error "no implementation of _Py_atomic_load_uint32_acquire" +#endif +} // --- _Py_atomic_fence ------------------------------------------------------ diff --git a/Include/cpython/pyatomic_std.h b/Include/cpython/pyatomic_std.h index a05bfaec47e89d..6c934a2c5e7b64 100644 --- a/Include/cpython/pyatomic_std.h +++ b/Include/cpython/pyatomic_std.h @@ -870,6 +870,13 @@ _Py_atomic_load_int_acquire(const int *obj) memory_order_acquire); } +static inline uint32_t +_Py_atomic_load_uint32_acquire(const uint32_t *obj) +{ + _Py_USING_STD; + return atomic_load_explicit((const _Atomic(uint32_t)*)obj, + memory_order_acquire); +} // --- _Py_atomic_fence ------------------------------------------------------ diff --git a/Include/internal/pycore_lock.h b/Include/internal/pycore_lock.h index 30d0ac0ae8f0be..674a1d170fec10 100644 --- a/Include/internal/pycore_lock.h +++ b/Include/internal/pycore_lock.h @@ -261,7 +261,7 @@ PyAPI_FUNC(void) _PyRWMutex_Unlock(_PyRWMutex *rwmutex); // The writer can also detect that the undelering data has not changed and abandon the write // and restore the previous sequence. typedef struct { - int sequence; + uint32_t sequence; } _PySeqLock; // Lock the sequence lock for the writer @@ -275,15 +275,15 @@ PyAPI_FUNC(void) _PySeqLock_UnlockWrite(_PySeqLock *seqlock); PyAPI_FUNC(void) _PySeqLock_AbandonWrite(_PySeqLock *seqlock); // Begin a read operation and return the current sequence number. -PyAPI_FUNC(int) _PySeqLock_BeginRead(_PySeqLock *seqlock); +PyAPI_FUNC(uint32_t) _PySeqLock_BeginRead(_PySeqLock *seqlock); // End the read operation and confirm that the sequence number has not changed. // Returns 1 if the read was successful or 0 if the read should be re-tried. 
-PyAPI_FUNC(int) _PySeqLock_EndRead(_PySeqLock *seqlock, int previous);
+PyAPI_FUNC(uint32_t) _PySeqLock_EndRead(_PySeqLock *seqlock, uint32_t previous);
 
 // Check if the lock was held during a fork and clear the lock. Returns 1
 // if the lock was held and any associated data should be cleared.
-PyAPI_FUNC(int) _PySeqLock_AfterFork(_PySeqLock *seqlock);
+PyAPI_FUNC(uint32_t) _PySeqLock_AfterFork(_PySeqLock *seqlock);
 
 #ifdef __cplusplus
 }
diff --git a/Objects/typeobject.c b/Objects/typeobject.c
index 892206be189915..e0711dfe8545b7 100644
--- a/Objects/typeobject.c
+++ b/Objects/typeobject.c
@@ -60,13 +60,13 @@ class object "PyObject *" "&PyBaseObject_Type"
 // in odd behaviors w.r.t. running with the GIL as the outer type lock could
 // be released and reacquired during a subclass update if there's contention
 // on the subclass lock.
-#define BEGIN_TYPE_LOCK() \
-    { \
-        _PyCriticalSection _cs; \
-        _PyCriticalSection_Begin(&_cs, &_PyRuntime.types.type_mutex); \
+#define BEGIN_TYPE_LOCK()                                               \
+    {                                                                   \
+        _PyCriticalSection _cs;                                         \
+        _PyCriticalSection_Begin(&_cs, &_PyRuntime.types.type_mutex);   \
 
-#define END_TYPE_LOCK() \
-    _PyCriticalSection_End(&_cs); \
+#define END_TYPE_LOCK()                                                 \
+    _PyCriticalSection_End(&_cs);                                       \
     }
 
 #define ASSERT_TYPE_LOCK_HELD() \
diff --git a/Python/lock.c b/Python/lock.c
index 87444c75db9c0a..bf0143654bd692 100644
--- a/Python/lock.c
+++ b/Python/lock.c
@@ -465,16 +465,18 @@ _PyRWMutex_Unlock(_PyRWMutex *rwmutex)
 void _PySeqLock_LockWrite(_PySeqLock *seqlock)
 {
     // lock the entry by moving to an odd sequence number
-    int prev = seqlock->sequence;
+    uint32_t prev = _Py_atomic_load_uint32_relaxed(&seqlock->sequence);
     while (1) {
         if (SEQLOCK_IS_UPDATING(prev)) {
             // Someone else is currently updating the cache
             _Py_yield();
-            prev = _Py_atomic_load_int32_relaxed(&seqlock->sequence);
-        } else if (_Py_atomic_compare_exchange_int32(&seqlock->sequence, &prev, prev + 1)) {
+            prev = _Py_atomic_load_uint32_relaxed(&seqlock->sequence);
+        }
+        else if (_Py_atomic_compare_exchange_uint32(&seqlock->sequence, &prev, prev + 1)) {
             // We've locked the cache
             break;
-        } else {
+        }
+        else {
             _Py_yield();
         }
     }
@@ -482,34 +484,34 @@ void _PySeqLock_LockWrite(_PySeqLock *seqlock)
 
 void _PySeqLock_AbandonWrite(_PySeqLock *seqlock)
 {
-    int new_seq = seqlock->sequence - 1;
+    uint32_t new_seq = seqlock->sequence - 1;
     assert(!SEQLOCK_IS_UPDATING(new_seq));
-    _Py_atomic_exchange_int32(&seqlock->sequence, new_seq);
+    _Py_atomic_store_uint32(&seqlock->sequence, new_seq);
 }
 
 void _PySeqLock_UnlockWrite(_PySeqLock *seqlock)
 {
-    int new_seq = seqlock->sequence + 1;
+    uint32_t new_seq = seqlock->sequence + 1;
     assert(!SEQLOCK_IS_UPDATING(new_seq));
-    _Py_atomic_exchange_int32(&seqlock->sequence, new_seq);
+    _Py_atomic_store_uint32(&seqlock->sequence, new_seq);
 }
 
-int _PySeqLock_BeginRead(_PySeqLock *seqlock)
+uint32_t _PySeqLock_BeginRead(_PySeqLock *seqlock)
 {
-    int sequence = _Py_atomic_load_int_acquire(&seqlock->sequence);
+    uint32_t sequence = _Py_atomic_load_uint32_acquire(&seqlock->sequence);
     while (SEQLOCK_IS_UPDATING(sequence)) {
         _Py_yield();
-        sequence = _Py_atomic_load_int_acquire(&seqlock->sequence);
+        sequence = _Py_atomic_load_uint32_acquire(&seqlock->sequence);
     }
 
     return sequence;
 }
 
-int _PySeqLock_EndRead(_PySeqLock *seqlock, int previous)
+uint32_t _PySeqLock_EndRead(_PySeqLock *seqlock, uint32_t previous)
 {
     // Synchronize again and validate that the entry hasn't been updated
     // while we were reading the values.
-    if (_Py_atomic_load_int_acquire(&seqlock->sequence) == previous) {
+    if (_Py_atomic_load_uint32_acquire(&seqlock->sequence) == previous) {
         return 1;
     }
 
@@ -517,7 +519,7 @@ int _PySeqLock_EndRead(_PySeqLock *seqlock, int previous)
     return 0;
 }
 
-int _PySeqLock_AfterFork(_PySeqLock *seqlock)
+uint32_t _PySeqLock_AfterFork(_PySeqLock *seqlock)
 {
     // Check if the lock was held during a fork; if so, reset it and report
    // that any associated data should be cleared.
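
To make the protocol behind _PySeqLock concrete, the following is a minimal
standalone sketch of the same reader/writer scheme. It is illustrative only:
it uses C11 <stdatomic.h> and POSIX sched_yield() in place of CPython's
internal _Py_atomic_* and _Py_yield helpers, and the demo_* names are not
part of the patches above.

    #include <sched.h>       // sched_yield (POSIX)
    #include <stdatomic.h>
    #include <stdio.h>

    /* Illustrative stand-in for _PySeqLock: an even sequence means "no
     * writer", an odd sequence means an update is in progress. */
    typedef struct {
        atomic_uint sequence;
    } demo_seqlock;

    #define DEMO_IS_UPDATING(seq) ((seq) & 0x01)

    static void
    demo_lock_write(demo_seqlock *lock)
    {
        unsigned int prev = atomic_load_explicit(&lock->sequence,
                                                 memory_order_relaxed);
        for (;;) {
            if (DEMO_IS_UPDATING(prev)) {
                sched_yield();  /* another writer is active; spin politely */
                prev = atomic_load_explicit(&lock->sequence,
                                            memory_order_relaxed);
            }
            else if (atomic_compare_exchange_weak(&lock->sequence, &prev,
                                                  prev + 1)) {
                return;  /* sequence is now odd: we own the write side */
            }
        }
    }

    static void
    demo_unlock_write(demo_seqlock *lock)
    {
        /* Move to the next even value, publishing the update to readers. */
        atomic_fetch_add_explicit(&lock->sequence, 1, memory_order_release);
    }

    static unsigned int
    demo_begin_read(demo_seqlock *lock)
    {
        unsigned int seq = atomic_load_explicit(&lock->sequence,
                                                memory_order_acquire);
        while (DEMO_IS_UPDATING(seq)) {
            sched_yield();
            seq = atomic_load_explicit(&lock->sequence, memory_order_acquire);
        }
        return seq;
    }

    static int
    demo_end_read(demo_seqlock *lock, unsigned int previous)
    {
        /* The read is only valid if no writer ran in between. */
        return atomic_load_explicit(&lock->sequence,
                                    memory_order_acquire) == previous;
    }

    int
    main(void)
    {
        demo_seqlock lock = { 0 };
        int shared = 0;

        demo_lock_write(&lock);
        shared = 42;                 /* mutate the protected data */
        demo_unlock_write(&lock);

        int value;
        unsigned int seq;
        do {                         /* readers retry instead of blocking */
            seq = demo_begin_read(&lock);
            value = shared;
        } while (!demo_end_read(&lock, seq));

        printf("read %d at sequence %u\n", value, seq);
        return 0;
    }

As in the patches, readers never block: a torn read is detected by the final
sequence comparison and simply retried, while writers serialize among
themselves through the CAS on the sequence word. The sketch omits the
equivalent of _PySeqLock_AbandonWrite(), which steps the sequence back down
when a writer discovers the cache entry already holds the values it was
about to store.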