From d37b421cb39646046990dc431fe31ff9efbed6d0 Mon Sep 17 00:00:00 2001 From: vasilito Date: Thu, 2 Jul 2026 10:36:17 +0300 Subject: [PATCH] kernel: fix wakeup_contexts vs steal_work deadlock Two-sided fix for the lock-ordering deadlock discovered by Oracle review (Issue 24): 1. wakeup_contexts (this fn) held IDLE_CONTEXTS while waiting for SchedQueuesLock on its own CPU via SchedQueuesLock::new(&percpu.sched). If another CPU's steal_work was holding that SchedQueuesLock (via a victim SchedQueuesLock) and waiting for IDLE_CONTEXTS, both threads spin forever. Fix: drop idle_contexts immediately after building the wakeups Vec. The Vec is the only data we need; releasing the lock here means steal_work on another CPU can proceed while this CPU acquires its own SchedQueuesLock. 2. steal_work held a victim's SchedQueuesLock (victim_lock) while calling idle_contexts(token.downgrade()).push_back on a context that turned out to be Blocked. This is the matching side of the deadlock: CPU A held IDLE_CONTEXTS and waited for its own SchedQueuesLock; CPU B (steal_work) held CPU A's SchedQueuesLock and waited for IDLE_CONTEXTS. Fix: use idle_contexts_try (try_lock) instead of idle_contexts (blocking lock). If IDLE_CONTEXTS is busy (owned by wakeup_contexts on another CPU), skip the push-back; the context will be re-checked on the next wakeup round because it was not removed from IDLE_CONTEXTS (the Blocked status was set, but it stayed in IDLE_CONTEXTS because we never re-pushed it). The original code at line 429 used idle_contexts (blocking) which is what makes this a real deadlock. try_lock is safe because: - If try_lock succeeds, the context is correctly pushed - If try_lock fails, the context is still in IDLE_CONTEXTS (we never removed it), so the next wakeup_contexts will find it again --- src/context/switch.rs | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/src/context/switch.rs b/src/context/switch.rs index d054734248..09d7e550c4 100644 --- a/src/context/switch.rs +++ b/src/context/switch.rs @@ -426,7 +426,15 @@ fn steal_work( } if matches!(sw, UpdateResult::Blocked) { - idle_contexts(token.downgrade()).push_back(context_ref); + // Use try_lock to avoid deadlocking with + // wakeup_contexts (which holds IDLE_CONTEXTS then + // waits for our victim_lock). If IDLE_CONTEXTS is + // busy (owned by another CPU's wakeup_contexts), + // skip the push-back; the context will be re-checked + // on the next wakeup round. + if let Some(mut idle) = idle_contexts_try(token.downgrade()) { + idle.push_back(context_ref); + } } else { victim_queues[prio].push_back(context_ref); } @@ -584,6 +592,18 @@ fn wakeup_contexts(token: &mut CleanLockToken, percpu: &PercpuBlock, switch_time idle_contexts.push_back(context_ref); } + // Drop IDLE_CONTEXTS before acquiring SchedQueuesLock to avoid a + // deadlock with steal_work on another CPU. The previous code held + // both locks simultaneously: wakeup_contexts (this fn) held + // IDLE_CONTEXTS, then waited for SchedQueuesLock on its own CPU; + // steal_work on another CPU held that same SchedQueuesLock and + // waited for IDLE_CONTEXTS. Classic circular wait. + // + // The wakeups Vec is the only data we need from IDLE_CONTEXTS; + // releasing the lock here means steal_work on another CPU can + // proceed while this CPU is acquiring SchedQueuesLock. + drop(idle_contexts); + if wakeups.is_empty() { return; }