Skip to content

Commit 7b47699

Browse files
committed
Issue #18808: Thread.join() now waits for the underlying thread state to be destroyed before returning.
This prevents unpredictable aborts in Py_EndInterpreter() when some non-daemon threads are still running.
1 parent eda7c64 commit 7b47699

File tree

7 files changed

+224
-34
lines changed

7 files changed

+224
-34
lines changed

Include/pystate.h

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,32 @@ typedef struct _ts {
118118
int trash_delete_nesting;
119119
PyObject *trash_delete_later;
120120

121+
/* Called when a thread state is deleted normally, but not when it
122+
* is destroyed after fork().
123+
* Pain: to prevent rare but fatal shutdown errors (issue 18808),
124+
* Thread.join() must wait for the join'ed thread's tstate to be unlinked
125+
* from the tstate chain. That happens at the end of a thread's life,
126+
* in pystate.c.
127+
* The obvious way doesn't quite work: create a lock which the tstate
128+
* unlinking code releases, and have Thread.join() wait to acquire that
129+
* lock. The problem is that we _are_ at the end of the thread's life:
130+
* if the thread holds the last reference to the lock, decref'ing the
131+
* lock will delete the lock, and that may trigger arbitrary Python code
132+
* if there's a weakref, with a callback, to the lock. But by this time
133+
* _PyThreadState_Current is already NULL, so only the simplest of C code
134+
* can be allowed to run (in particular it must not be possible to
135+
* release the GIL).
136+
* So instead of holding the lock directly, the tstate holds a weakref to
137+
* the lock: that's the value of on_delete_data below. Decref'ing a
138+
* weakref is harmless.
139+
* on_delete points to _threadmodule.c's static release_sentinel() function.
140+
* After the tstate is unlinked, release_sentinel is called with the
141+
* weakref-to-lock (on_delete_data) argument, and release_sentinel releases
142+
* the indirectly held lock.
143+
*/
144+
void (*on_delete)(void *);
145+
void *on_delete_data;
146+
121147
/* XXX signal handlers should also be here */
122148

123149
} PyThreadState;

Lib/_dummy_thread.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,10 @@ def stack_size(size=None):
8181
raise error("setting thread stack size not supported")
8282
return 0
8383

84+
def _set_sentinel():
85+
"""Dummy implementation of _thread._set_sentinel()."""
86+
return LockType()
87+
8488
class LockType(object):
8589
"""Class implementing dummy implementation of _thread.LockType.
8690

Lib/test/test_threading.py

Lines changed: 68 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -539,6 +539,40 @@ def f():
539539
self.assertEqual(err, b"")
540540
self.assertEqual(data, "Thread-1\nTrue\nTrue\n")
541541

542+
def test_tstate_lock(self):
543+
# Test an implementation detail of Thread objects.
544+
started = _thread.allocate_lock()
545+
finish = _thread.allocate_lock()
546+
started.acquire()
547+
finish.acquire()
548+
def f():
549+
started.release()
550+
finish.acquire()
551+
time.sleep(0.01)
552+
# The tstate lock is None until the thread is started
553+
t = threading.Thread(target=f)
554+
self.assertIs(t._tstate_lock, None)
555+
t.start()
556+
started.acquire()
557+
self.assertTrue(t.is_alive())
558+
# The tstate lock can't be acquired when the thread is running
559+
# (or suspended).
560+
tstate_lock = t._tstate_lock
561+
self.assertFalse(tstate_lock.acquire(timeout=0), False)
562+
finish.release()
563+
# When the thread ends, the state_lock can be successfully
564+
# acquired.
565+
self.assertTrue(tstate_lock.acquire(timeout=5), False)
566+
# But is_alive() is still True: we hold _tstate_lock now, which
567+
# prevents is_alive() from knowing the thread's end-of-life C code
568+
# is done.
569+
self.assertTrue(t.is_alive())
570+
# Let is_alive() find out the C code is done.
571+
tstate_lock.release()
572+
self.assertFalse(t.is_alive())
573+
# And verify the thread disposed of _tstate_lock.
574+
self.assertTrue(t._tstate_lock is None)
575+
542576

543577
class ThreadJoinOnShutdown(BaseTestCase):
544578

@@ -669,7 +703,7 @@ def worker():
669703
# someone else tries to fix this test case by acquiring this lock
670704
# before forking instead of resetting it, the test case will
671705
# deadlock when it shouldn't.
672-
condition = w._block
706+
condition = w._stopped._cond
673707
orig_acquire = condition.acquire
674708
call_count_lock = threading.Lock()
675709
call_count = 0
@@ -733,7 +767,7 @@ def worker():
733767
# causes the worker to fork. At this point, the problematic waiter
734768
# lock has been acquired once by the waiter and has been put onto
735769
# the waiters list.
736-
condition = w._block
770+
condition = w._stopped._cond
737771
orig_release_save = condition._release_save
738772
def my_release_save():
739773
global start_fork
@@ -867,6 +901,38 @@ def f():
867901
# The thread was joined properly.
868902
self.assertEqual(os.read(r, 1), b"x")
869903

904+
def test_threads_join_2(self):
905+
# Same as above, but a delay gets introduced after the thread's
906+
# Python code returned but before the thread state is deleted.
907+
# To achieve this, we register a thread-local object which sleeps
908+
# a bit when deallocated.
909+
r, w = os.pipe()
910+
self.addCleanup(os.close, r)
911+
self.addCleanup(os.close, w)
912+
code = r"""if 1:
913+
import os
914+
import threading
915+
import time
916+
917+
class Sleeper:
918+
def __del__(self):
919+
time.sleep(0.05)
920+
921+
tls = threading.local()
922+
923+
def f():
924+
# Sleep a bit so that the thread is still running when
925+
# Py_EndInterpreter is called.
926+
time.sleep(0.05)
927+
tls.x = Sleeper()
928+
os.write(%d, b"x")
929+
threading.Thread(target=f).start()
930+
""" % (w,)
931+
ret = _testcapi.run_in_subinterp(code)
932+
self.assertEqual(ret, 0)
933+
# The thread was joined properly.
934+
self.assertEqual(os.read(r, 1), b"x")
935+
870936
def test_daemon_threads_fatal_error(self):
871937
subinterp_code = r"""if 1:
872938
import os

Lib/threading.py

Lines changed: 55 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
# Rename some stuff so "from threading import *" is safe
3434
_start_new_thread = _thread.start_new_thread
3535
_allocate_lock = _thread.allocate_lock
36+
_set_sentinel = _thread._set_sentinel
3637
get_ident = _thread.get_ident
3738
ThreadError = _thread.error
3839
try:
@@ -548,28 +549,33 @@ def __init__(self, group=None, target=None, name=None,
548549
else:
549550
self._daemonic = current_thread().daemon
550551
self._ident = None
552+
self._tstate_lock = None
551553
self._started = Event()
552-
self._stopped = False
553-
self._block = Condition(Lock())
554+
self._stopped = Event()
554555
self._initialized = True
555556
# sys.stderr is not stored in the class like
556557
# sys.exc_info since it can be changed between instances
557558
self._stderr = _sys.stderr
558559
_dangling.add(self)
559560

560-
def _reset_internal_locks(self):
561+
def _reset_internal_locks(self, is_alive):
561562
# private! Called by _after_fork() to reset our internal locks as
562563
# they may be in an invalid state leading to a deadlock or crash.
563-
if hasattr(self, '_block'): # DummyThread deletes _block
564-
self._block.__init__()
565564
self._started._reset_internal_locks()
565+
self._stopped._reset_internal_locks()
566+
if is_alive:
567+
self._set_tstate_lock()
568+
else:
569+
# The thread isn't alive after fork: it doesn't have a tstate
570+
# anymore.
571+
self._tstate_lock = None
566572

567573
def __repr__(self):
568574
assert self._initialized, "Thread.__init__() was not called"
569575
status = "initial"
570576
if self._started.is_set():
571577
status = "started"
572-
if self._stopped:
578+
if self._stopped.is_set():
573579
status = "stopped"
574580
if self._daemonic:
575581
status += " daemon"
@@ -625,9 +631,18 @@ def _bootstrap(self):
625631
def _set_ident(self):
626632
self._ident = get_ident()
627633

634+
def _set_tstate_lock(self):
635+
"""
636+
Set a lock object which will be released by the interpreter when
637+
the underlying thread state (see pystate.h) gets deleted.
638+
"""
639+
self._tstate_lock = _set_sentinel()
640+
self._tstate_lock.acquire()
641+
628642
def _bootstrap_inner(self):
629643
try:
630644
self._set_ident()
645+
self._set_tstate_lock()
631646
self._started.set()
632647
with _active_limbo_lock:
633648
_active[self._ident] = self
@@ -691,10 +706,7 @@ def _bootstrap_inner(self):
691706
pass
692707

693708
def _stop(self):
694-
self._block.acquire()
695-
self._stopped = True
696-
self._block.notify_all()
697-
self._block.release()
709+
self._stopped.set()
698710

699711
def _delete(self):
700712
"Remove current thread from the dict of currently running threads."
@@ -738,21 +750,29 @@ def join(self, timeout=None):
738750
raise RuntimeError("cannot join thread before it is started")
739751
if self is current_thread():
740752
raise RuntimeError("cannot join current thread")
741-
742-
self._block.acquire()
743-
try:
744-
if timeout is None:
745-
while not self._stopped:
746-
self._block.wait()
747-
else:
748-
deadline = _time() + timeout
749-
while not self._stopped:
750-
delay = deadline - _time()
751-
if delay <= 0:
752-
break
753-
self._block.wait(delay)
754-
finally:
755-
self._block.release()
753+
if not self.is_alive():
754+
return
755+
self._stopped.wait(timeout)
756+
if self._stopped.is_set():
757+
self._wait_for_tstate_lock(timeout is None)
758+
759+
def _wait_for_tstate_lock(self, block):
760+
# Issue #18808: wait for the thread state to be gone.
761+
# When self._stopped is set, the Python part of the thread is done,
762+
# but the thread's tstate has not yet been destroyed. The C code
763+
# releases self._tstate_lock when the C part of the thread is done
764+
# (the code at the end of the thread's life to remove all knowledge
765+
# of the thread from the C data structures).
766+
# This method waits to acquire _tstate_lock if `block` is True, or
767+
# sees whether it can be acquired immediately if `block` is False.
768+
# If it does acquire the lock, the C code is done, and _tstate_lock
769+
# is set to None.
770+
lock = self._tstate_lock
771+
if lock is None:
772+
return # already determined that the C code is done
773+
if lock.acquire(block):
774+
lock.release()
775+
self._tstate_lock = None
756776

757777
@property
758778
def name(self):
@@ -771,7 +791,14 @@ def ident(self):
771791

772792
def is_alive(self):
773793
assert self._initialized, "Thread.__init__() not called"
774-
return self._started.is_set() and not self._stopped
794+
if not self._started.is_set():
795+
return False
796+
if not self._stopped.is_set():
797+
return True
798+
# The Python part of the thread is done, but the C part may still be
799+
# waiting to run.
800+
self._wait_for_tstate_lock(False)
801+
return self._tstate_lock is not None
775802

776803
isAlive = is_alive
777804

@@ -854,11 +881,6 @@ class _DummyThread(Thread):
854881
def __init__(self):
855882
Thread.__init__(self, name=_newname("Dummy-%d"), daemon=True)
856883

857-
# Thread._block consumes an OS-level locking primitive, which
858-
# can never be used by a _DummyThread. Since a _DummyThread
859-
# instance is immortal, that's bad, so release this resource.
860-
del self._block
861-
862884
self._started.set()
863885
self._set_ident()
864886
with _active_limbo_lock:
@@ -952,15 +974,16 @@ def _after_fork():
952974
for thread in _enumerate():
953975
# Any lock/condition variable may be currently locked or in an
954976
# invalid state, so we reinitialize them.
955-
thread._reset_internal_locks()
956977
if thread is current:
957978
# There is only one active thread. We reset the ident to
958979
# its new value since it can have changed.
980+
thread._reset_internal_locks(True)
959981
ident = get_ident()
960982
thread._ident = ident
961983
new_active[ident] = thread
962984
else:
963985
# All the others are already stopped.
986+
thread._reset_internal_locks(False)
964987
thread._stop()
965988

966989
_limbo.clear()

Misc/NEWS

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,10 @@ Core and Builtins
5656
Library
5757
-------
5858

59+
- Issue #18808: Thread.join() now waits for the underlying thread state to
60+
be destroyed before returning. This prevents unpredictable aborts in
61+
Py_EndInterpreter() when some non-daemon threads are still running.
62+
5963
- Issue #18458: Prevent crashes with newer versions of libedit. Its readline
6064
emulation has changed from 0-based indexing to 1-based like gnu readline.
6165

Modules/_threadmodule.c

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1172,6 +1172,66 @@ yet finished.\n\
11721172
This function is meant for internal and specialized purposes only.\n\
11731173
In most applications `threading.enumerate()` should be used instead.");
11741174

1175+
static void
1176+
release_sentinel(void *wr)
1177+
{
1178+
/* Tricky: this function is called when the current thread state
1179+
is being deleted. Therefore, only simple C code can safely
1180+
execute here. */
1181+
PyObject *obj = PyWeakref_GET_OBJECT(wr);
1182+
lockobject *lock;
1183+
if (obj != Py_None) {
1184+
assert(Py_TYPE(obj) == &Locktype);
1185+
lock = (lockobject *) obj;
1186+
if (lock->locked) {
1187+
PyThread_release_lock(lock->lock_lock);
1188+
lock->locked = 0;
1189+
}
1190+
}
1191+
/* Deallocating a weakref with a NULL callback only calls
1192+
PyObject_GC_Del(), which can't call any Python code. */
1193+
Py_DECREF(wr);
1194+
}
1195+
1196+
static PyObject *
1197+
thread__set_sentinel(PyObject *self)
1198+
{
1199+
PyObject *wr;
1200+
PyThreadState *tstate = PyThreadState_Get();
1201+
lockobject *lock;
1202+
1203+
if (tstate->on_delete_data != NULL) {
1204+
/* We must support the re-creation of the lock from a
1205+
fork()ed child. */
1206+
assert(tstate->on_delete == &release_sentinel);
1207+
wr = (PyObject *) tstate->on_delete_data;
1208+
tstate->on_delete = NULL;
1209+
tstate->on_delete_data = NULL;
1210+
Py_DECREF(wr);
1211+
}
1212+
lock = newlockobject();
1213+
if (lock == NULL)
1214+
return NULL;
1215+
/* The lock is owned by whoever called _set_sentinel(), but the weakref
1216+
hangs to the thread state. */
1217+
wr = PyWeakref_NewRef((PyObject *) lock, NULL);
1218+
if (wr == NULL) {
1219+
Py_DECREF(lock);
1220+
return NULL;
1221+
}
1222+
tstate->on_delete_data = (void *) wr;
1223+
tstate->on_delete = &release_sentinel;
1224+
return (PyObject *) lock;
1225+
}
1226+
1227+
PyDoc_STRVAR(_set_sentinel_doc,
1228+
"_set_sentinel() -> lock\n\
1229+
\n\
1230+
Set a sentinel lock that will be released when the current thread\n\
1231+
state is finalized (after it is untied from the interpreter).\n\
1232+
\n\
1233+
This is a private API for the threading module.");
1234+
11751235
static PyObject *
11761236
thread_stack_size(PyObject *self, PyObject *args)
11771237
{
@@ -1247,6 +1307,8 @@ static PyMethodDef thread_methods[] = {
12471307
METH_NOARGS, _count_doc},
12481308
{"stack_size", (PyCFunction)thread_stack_size,
12491309
METH_VARARGS, stack_size_doc},
1310+
{"_set_sentinel", (PyCFunction)thread__set_sentinel,
1311+
METH_NOARGS, _set_sentinel_doc},
12501312
{NULL, NULL} /* sentinel */
12511313
};
12521314

0 commit comments

Comments
 (0)