d37b421cb3
Two-sided fix for the lock-ordering deadlock discovered by
Oracle review (Issue 24):
1. wakeup_contexts (this fn) held IDLE_CONTEXTS while
waiting for SchedQueuesLock on its own CPU via
SchedQueuesLock::new(&percpu.sched). If another CPU's
steal_work was holding that SchedQueuesLock (via a victim
SchedQueuesLock) and waiting for IDLE_CONTEXTS, both
threads spin forever.
Fix: drop idle_contexts immediately after building the
wakeups Vec. The Vec is the only data we need; releasing
the lock here means steal_work on another CPU can proceed
while this CPU acquires its own SchedQueuesLock.
2. steal_work held a victim's SchedQueuesLock (victim_lock)
while calling idle_contexts(token.downgrade()).push_back
on a context that turned out to be Blocked. This is the
matching side of the deadlock: CPU A held IDLE_CONTEXTS and
waited for its own SchedQueuesLock; CPU B (steal_work) held
CPU A's SchedQueuesLock and waited for IDLE_CONTEXTS.
Fix: use idle_contexts_try (try_lock) instead of
idle_contexts (blocking lock). If IDLE_CONTEXTS is busy
(owned by wakeup_contexts on another CPU), skip the
push-back; the context will be re-checked on the next
wakeup round because it was not removed from IDLE_CONTEXTS
(the Blocked status was set, but it stayed in IDLE_CONTEXTS
because we never re-pushed it).
The original code at line 429 used idle_contexts (blocking)
which is what makes this a real deadlock. try_lock is safe
because:
- If try_lock succeeds, the context is correctly pushed
- If try_lock fails, the context is still in IDLE_CONTEXTS
(we never removed it), so the next wakeup_contexts will
find it again
1178 lines
39 KiB
Rust
1178 lines
39 KiB
Rust
//! This module provides a context-switching mechanism that utilizes a simple round-robin scheduler.
|
|
//! The scheduler iterates over available contexts, selecting the next context to run, while
|
|
//! handling process states and synchronization.
|
|
|
|
use crate::{
|
|
context::{
|
|
self, arch, idle_contexts, idle_contexts_try, run_contexts, ArcContextLockWriteGuard,
|
|
Context, ContextLock, SchedPolicy, WeakContextRef, RUN_QUEUE_COUNT,
|
|
},
|
|
cpu_set::{LogicalCpuId, LogicalCpuSet},
|
|
cpu_stats::{self, CpuState},
|
|
percpu::{get_percpu_block, PerCpuSched, PercpuBlock},
|
|
sync::{ArcRwLockWriteGuard, CleanLockToken, LockToken, L1, L4},
|
|
};
|
|
use alloc::{sync::Arc, vec::Vec};
|
|
use core::{
|
|
cell::{Cell, RefCell},
|
|
hint, mem,
|
|
sync::atomic::{AtomicUsize, Ordering},
|
|
};
|
|
use syscall::PtraceFlags;
|
|
|
|
/// Outcome of inspecting a queued context while looking for something to run.
enum UpdateResult {
    /// The context is runnable and may be switched to.
    CanSwitch,
    /// The context must be left alone for now: it is already running, or
    /// (for `update_runnable`) its affinity mask excludes the inspecting CPU.
    Skip,
    /// The context is not runnable.
    Blocked,
}
|
|
|
|
// Scheduling weight for each of the 40 static priorities.
//
// The table is a *decreasing* geometric series: value[i - 1] ~= value[i] * 1.25,
// i.e. every step towards index 0 makes a context roughly 25% heavier. Index 20
// holds the reference weight (1024) that the rest of this file normalises against.
const SCHED_PRIO_TO_WEIGHT: [usize; 40] = [
    88761, 71755, 56483, 46273, 36291, 29154, 23254, 18705, 14949, 11916, 9548, 7620, 6100, 4904,
    3906, 3121, 2501, 1991, 1586, 1277, 1024, 820, 655, 526, 423, 335, 272, 215, 172, 137, 110, 87,
    70, 56, 45, 36, 29, 23, 18, 15,
];

// Minimum time between two run-queue balancing passes, in nanoseconds (100 ms).
const LOAD_BALANCE_INTERVAL_NS: u128 = 100_000_000;

// Counts how many contexts `steal_work` has taken from another CPU's run queues.
static SCHED_STEAL_COUNT: AtomicUsize = AtomicUsize::new(0);
|
|
|
|
struct SchedQueuesLock<'a> {
|
|
sched: &'a PerCpuSched,
|
|
}
|
|
|
|
impl<'a> SchedQueuesLock<'a> {
|
|
fn new(sched: &'a PerCpuSched) -> Self {
|
|
sched.take_lock();
|
|
Self { sched }
|
|
}
|
|
|
|
unsafe fn queues_mut(
|
|
&mut self,
|
|
) -> &mut [alloc::collections::VecDeque<WeakContextRef>; RUN_QUEUE_COUNT] {
|
|
unsafe { self.sched.queues_mut() }
|
|
}
|
|
}
|
|
|
|
impl Drop for SchedQueuesLock<'_> {
|
|
fn drop(&mut self) {
|
|
self.sched.release_lock();
|
|
}
|
|
}
|
|
|
|
/// Pins `context` to a single CPU by replacing its affinity mask with one that
/// contains only `cpu_id`.
fn assign_context_to_cpu(context: &mut Context, cpu_id: LogicalCpuId) {
    let affinity = &mut context.sched_affinity;
    // Start from an empty mask so any previously allowed CPUs are forgotten.
    *affinity = LogicalCpuSet::empty();
    affinity.atomic_set(cpu_id);
}
|
|
|
|
unsafe fn update_runnable(
|
|
context: &mut Context,
|
|
cpu_id: LogicalCpuId,
|
|
switch_time: u128,
|
|
) -> UpdateResult {
|
|
if context.running {
|
|
return UpdateResult::Skip;
|
|
}
|
|
if !context.sched_affinity.contains(cpu_id) {
|
|
return UpdateResult::Skip;
|
|
}
|
|
if context.status.is_soft_blocked()
|
|
&& let Some(wake) = context.wake
|
|
&& switch_time >= wake
|
|
{
|
|
context.wake = None;
|
|
context.unblock_no_ipi();
|
|
}
|
|
if context.status.is_runnable() {
|
|
UpdateResult::CanSwitch
|
|
} else {
|
|
UpdateResult::Blocked
|
|
}
|
|
}
|
|
|
|
struct SwitchResultInner {
|
|
_prev_guard: ArcContextLockWriteGuard,
|
|
_next_guard: ArcContextLockWriteGuard,
|
|
}
|
|
|
|
/// Tick function to update PIT ticks and trigger a context switch if necessary.
|
|
///
|
|
/// Called periodically, this function increments a per-CPU tick counter and performs a context
|
|
/// switch if the counter reaches a set threshold (e.g., every 3 ticks).
|
|
///
|
|
/// The function also calls the signal handler after switching contexts.
|
|
pub fn tick(token: &mut CleanLockToken) {
|
|
let percpu = PercpuBlock::current();
|
|
let ticks_cell = &percpu.switch_internals.pit_ticks;
|
|
|
|
let new_ticks = ticks_cell.get() + 1;
|
|
ticks_cell.set(new_ticks);
|
|
|
|
let balance_time = crate::time::monotonic(token);
|
|
maybe_balance_queues(token, percpu, balance_time);
|
|
|
|
// Trigger a context switch after every 3 ticks.
|
|
if new_ticks >= 3 {
|
|
switch(token);
|
|
crate::context::signal::signal_handler(token);
|
|
}
|
|
}
|
|
|
|
/// Finishes the context switch by clearing any temporary data and resetting the lock.
|
|
///
|
|
/// This function is called after a context switch is completed to perform cleanup, including
|
|
/// clearing the switch result data and releasing the context switch lock.
|
|
///
|
|
/// # Safety
|
|
/// This function involves unsafe operations such as resetting state and releasing locks.
|
|
pub unsafe extern "C" fn switch_finish_hook() {
|
|
unsafe {
|
|
match PercpuBlock::current().switch_internals.switch_result.take() {
|
|
Some(switch_result) => {
|
|
drop(switch_result);
|
|
}
|
|
_ => {
|
|
// TODO: unreachable_unchecked()?
|
|
crate::arch::stop::emergency_reset();
|
|
}
|
|
}
|
|
arch::CONTEXT_SWITCH_LOCK.store(false, Ordering::SeqCst);
|
|
crate::percpu::switch_arch_hook();
|
|
}
|
|
}
|
|
|
|
/// What a call to `switch` ended up doing.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum SwitchResult {
    /// A switch took place (also reported when the current context is not
    /// preemptable and `switch` returns early).
    Switched,
    /// No context was selected; the CPU is marked idle.
    AllContextsIdle,
}
|
|
|
|
/// This function performs the context switch, using select_next_context to
|
|
/// actually select the next context to switch to.
|
|
///
|
|
/// # Warning
|
|
/// This is not memory-unsafe to call. But do NOT call this while holding locks!
|
|
///
|
|
/// # Returns
|
|
/// - `SwitchResult::Switched`: Indicates a successful switch to a new context.
|
|
/// - `SwitchResult::AllContextsIdle`: Indicates all contexts are idle, and the CPU will switch
|
|
/// to an idle context.
|
|
pub fn switch(token: &mut CleanLockToken) -> SwitchResult {
|
|
let switch_time = crate::time::monotonic(token);
|
|
|
|
let percpu = PercpuBlock::current();
|
|
cpu_stats::add_context_switch();
|
|
|
|
//set PIT Interrupt counter to 0, giving each process same amount of PIT ticks
|
|
percpu.switch_internals.pit_ticks.set(0);
|
|
|
|
// Acquire the global lock to ensure exclusive access during context switch and avoid
|
|
// issues that would be caused by the unsafe operations below
|
|
// TODO: Better memory orderings?
|
|
while arch::CONTEXT_SWITCH_LOCK
|
|
.compare_exchange_weak(false, true, Ordering::SeqCst, Ordering::Relaxed)
|
|
.is_err()
|
|
{
|
|
hint::spin_loop();
|
|
percpu.maybe_handle_tlb_shootdown();
|
|
}
|
|
|
|
// Lock the previous context.
|
|
let prev_context_lock = crate::context::current();
|
|
// We are careful not to lock this context twice
|
|
let mut prev_context_guard = unsafe { prev_context_lock.write_arc() };
|
|
|
|
if !prev_context_guard.is_preemptable() {
|
|
arch::CONTEXT_SWITCH_LOCK.store(false, Ordering::SeqCst);
|
|
return SwitchResult::Switched;
|
|
}
|
|
|
|
// Alarm (previously in update_runnable)
|
|
wakeup_contexts(token, percpu, switch_time);
|
|
|
|
let cpu_id = crate::cpu_id();
|
|
|
|
// Update per-cpu times
|
|
let percpu_nanos = switch_time.saturating_sub(percpu.switch_internals.switch_time.get()) as u64;
|
|
let percpu_ms = percpu_nanos / 1_000_000;
|
|
let was_idle = percpu.stats.add_time(percpu_ms) == CpuState::Idle as u8;
|
|
percpu.switch_internals.switch_time.set(switch_time);
|
|
|
|
let switch_context_opt = match select_next_context(
|
|
token,
|
|
percpu,
|
|
cpu_id,
|
|
switch_time,
|
|
was_idle,
|
|
&mut prev_context_guard,
|
|
) {
|
|
Ok(opt) => opt,
|
|
Err(early_ret) => return early_ret,
|
|
};
|
|
|
|
// Switch process states, TSS stack pointer, and store new context ID
|
|
match switch_context_opt {
|
|
Some(mut next_context_guard) => {
|
|
// Update context states and prepare for the switch.
|
|
let prev_context = &mut *prev_context_guard;
|
|
let next_context = &mut *next_context_guard;
|
|
|
|
// Set the previous context as "not running"
|
|
prev_context.running = false;
|
|
prev_context.last_cpu = prev_context.cpu_id;
|
|
|
|
// Set the next context as "running"
|
|
next_context.running = true;
|
|
// Set the CPU ID for the next context
|
|
next_context.cpu_id = Some(cpu_id);
|
|
|
|
// Update times
|
|
if !was_idle {
|
|
prev_context.cpu_time += switch_time.saturating_sub(prev_context.switch_time);
|
|
if prev_context.sched_policy == SchedPolicy::Other {
|
|
let actual_ns = switch_time.saturating_sub(prev_context.switch_time);
|
|
let weight =
|
|
SCHED_PRIO_TO_WEIGHT[prev_context.sched_static_prio.min(39)] as u128;
|
|
let default_weight = SCHED_PRIO_TO_WEIGHT[20] as u128;
|
|
let delta = actual_ns.saturating_mul(default_weight) / weight.max(1);
|
|
prev_context.vruntime = prev_context.vruntime.saturating_add(delta);
|
|
}
|
|
}
|
|
next_context.switch_time = switch_time;
|
|
if next_context.userspace {
|
|
percpu.stats.set_state(cpu_stats::CpuState::User);
|
|
} else {
|
|
percpu.stats.set_state(cpu_stats::CpuState::Kernel);
|
|
}
|
|
unsafe {
|
|
percpu.switch_internals.set_current_context(Arc::clone(
|
|
ArcContextLockWriteGuard::rwlock(&next_context_guard),
|
|
));
|
|
}
|
|
|
|
// FIXME set the switch result in arch::switch_to instead
|
|
let prev_context = unsafe {
|
|
mem::transmute::<&'_ mut Context, &'_ mut Context>(&mut *prev_context_guard)
|
|
};
|
|
let next_context = unsafe {
|
|
mem::transmute::<&'_ mut Context, &'_ mut Context>(&mut *next_context_guard)
|
|
};
|
|
|
|
percpu
|
|
.switch_internals
|
|
.switch_result
|
|
.set(Some(SwitchResultInner {
|
|
_prev_guard: prev_context_guard,
|
|
_next_guard: next_context_guard,
|
|
}));
|
|
|
|
/*let (ptrace_session, ptrace_flags) = if let Some((session, bp)) = ptrace::sessions()
|
|
.get(&next_context.pid)
|
|
.map(|s| (Arc::downgrade(s), s.data.lock().breakpoint))
|
|
{
|
|
(Some(session), bp.map_or(PtraceFlags::empty(), |f| f.flags))
|
|
} else {
|
|
(None, PtraceFlags::empty())
|
|
};*/
|
|
let ptrace_flags = PtraceFlags::empty();
|
|
|
|
//*percpu.ptrace_session.borrow_mut() = ptrace_session;
|
|
percpu.ptrace_flags.set(ptrace_flags);
|
|
prev_context.inside_syscall =
|
|
percpu.inside_syscall.replace(next_context.inside_syscall);
|
|
|
|
#[cfg(feature = "syscall_debug")]
|
|
{
|
|
prev_context.syscall_debug_info = percpu
|
|
.syscall_debug_info
|
|
.replace(next_context.syscall_debug_info);
|
|
prev_context.syscall_debug_info.on_switch_from(token);
|
|
next_context.syscall_debug_info.on_switch_to(token);
|
|
}
|
|
|
|
percpu
|
|
.switch_internals
|
|
.being_sigkilled
|
|
.set(next_context.being_sigkilled);
|
|
|
|
unsafe {
|
|
arch::switch_to(prev_context, next_context);
|
|
}
|
|
|
|
// NOTE: After switch_to is called, the return address can even be different from the
|
|
// current return address, meaning that we cannot use local variables here, and that we
|
|
// need to use the `switch_finish_hook` to be able to release the locks. Newly created
|
|
// contexts will return directly to the function pointer passed to context::spawn, and not
|
|
// reach this code until the next context switch back.
|
|
SwitchResult::Switched
|
|
}
|
|
_ => {
|
|
// No target was found, unset global lock and return
|
|
arch::CONTEXT_SWITCH_LOCK.store(false, Ordering::SeqCst);
|
|
|
|
percpu.stats.set_state(cpu_stats::CpuState::Idle);
|
|
|
|
SwitchResult::AllContextsIdle
|
|
}
|
|
}
|
|
}
|
|
|
|
fn queue_previous_context(
|
|
token: &mut CleanLockToken,
|
|
percpu: &PercpuBlock,
|
|
prev_context_lock: &Arc<ContextLock>,
|
|
prev_context_guard: &ArcRwLockWriteGuard<L4, Context>,
|
|
idle_context: &Arc<ContextLock>,
|
|
) {
|
|
if Arc::ptr_eq(prev_context_lock, idle_context) {
|
|
return;
|
|
}
|
|
|
|
let prev_ctx = WeakContextRef(Arc::downgrade(prev_context_lock));
|
|
if prev_context_guard.status.is_runnable() {
|
|
let prio = prev_context_guard.prio;
|
|
let mut sched_lock = SchedQueuesLock::new(&percpu.sched);
|
|
unsafe {
|
|
sched_lock.queues_mut()[prio].push_back(prev_ctx);
|
|
}
|
|
} else {
|
|
idle_contexts(token.downgrade()).push_back(prev_ctx);
|
|
}
|
|
}
|
|
|
|
fn pop_movable_context(
|
|
token: &mut CleanLockToken,
|
|
queues: &mut [alloc::collections::VecDeque<WeakContextRef>; RUN_QUEUE_COUNT],
|
|
target_cpu: LogicalCpuId,
|
|
switch_time: u128,
|
|
idle_context: &Arc<ContextLock>,
|
|
) -> Option<(usize, WeakContextRef)> {
|
|
for prio in 0..RUN_QUEUE_COUNT {
|
|
let len = queues[prio].len();
|
|
for _ in 0..len {
|
|
let Some(context_ref) = queues[prio].pop_front() else {
|
|
break;
|
|
};
|
|
let Some(context_lock) = context_ref.upgrade() else {
|
|
continue;
|
|
};
|
|
if Arc::ptr_eq(&context_lock, idle_context) {
|
|
queues[prio].push_back(context_ref);
|
|
continue;
|
|
}
|
|
|
|
let mut context_guard = unsafe { context_lock.write_arc() };
|
|
let sw = unsafe { update_stealable(&mut context_guard, switch_time) };
|
|
if let UpdateResult::CanSwitch = sw {
|
|
assign_context_to_cpu(&mut context_guard, target_cpu);
|
|
let moved_ref = WeakContextRef(Arc::downgrade(ArcContextLockWriteGuard::rwlock(
|
|
&context_guard,
|
|
)));
|
|
drop(context_guard);
|
|
return Some((prio, moved_ref));
|
|
}
|
|
|
|
if matches!(sw, UpdateResult::Blocked) {
|
|
idle_contexts(token.downgrade()).push_back(context_ref);
|
|
} else {
|
|
queues[prio].push_back(context_ref);
|
|
}
|
|
}
|
|
}
|
|
|
|
None
|
|
}
|
|
|
|
fn steal_work(
|
|
token: &mut CleanLockToken,
|
|
cpu_id: LogicalCpuId,
|
|
switch_time: u128,
|
|
) -> Option<ArcContextLockWriteGuard> {
|
|
let cpu_count = crate::cpu_count();
|
|
if cpu_count <= 1 {
|
|
return None;
|
|
}
|
|
|
|
for offset in 1..cpu_count {
|
|
let victim_id = LogicalCpuId::new((cpu_id.get() + offset) % cpu_count);
|
|
let Some(victim) = get_percpu_block(victim_id) else {
|
|
continue;
|
|
};
|
|
|
|
let victim_idle = victim.switch_internals.idle_context();
|
|
let mut victim_lock = SchedQueuesLock::new(&victim.sched);
|
|
let victim_queues = unsafe { victim_lock.queues_mut() };
|
|
|
|
for prio in 0..RUN_QUEUE_COUNT {
|
|
let len = victim_queues[prio].len();
|
|
for _ in 0..len {
|
|
let Some(context_ref) = victim_queues[prio].pop_front() else {
|
|
break;
|
|
};
|
|
let Some(context_lock) = context_ref.upgrade() else {
|
|
continue;
|
|
};
|
|
if Arc::ptr_eq(&context_lock, &victim_idle) {
|
|
victim_queues[prio].push_back(context_ref);
|
|
continue;
|
|
}
|
|
|
|
let mut context_guard = unsafe { context_lock.write_arc() };
|
|
let sw = unsafe { update_stealable(&mut context_guard, switch_time) };
|
|
if let UpdateResult::CanSwitch = sw {
|
|
assign_context_to_cpu(&mut context_guard, cpu_id);
|
|
SCHED_STEAL_COUNT.fetch_add(1, Ordering::Relaxed);
|
|
return Some(context_guard);
|
|
}
|
|
|
|
if matches!(sw, UpdateResult::Blocked) {
|
|
// Use try_lock to avoid deadlocking with
|
|
// wakeup_contexts (which holds IDLE_CONTEXTS then
|
|
// waits for our victim_lock). If IDLE_CONTEXTS is
|
|
// busy (owned by another CPU's wakeup_contexts),
|
|
// skip the push-back; the context will be re-checked
|
|
// on the next wakeup round.
|
|
if let Some(mut idle) = idle_contexts_try(token.downgrade()) {
|
|
idle.push_back(context_ref);
|
|
}
|
|
} else {
|
|
victim_queues[prio].push_back(context_ref);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
None
|
|
}
|
|
|
|
fn queue_depth(percpu: &PercpuBlock) -> usize {
|
|
let mut sched_lock = SchedQueuesLock::new(&percpu.sched);
|
|
unsafe {
|
|
sched_lock
|
|
.queues_mut()
|
|
.iter()
|
|
.map(|queue| queue.len())
|
|
.sum()
|
|
}
|
|
}
|
|
|
|
fn migrate_one_context(
|
|
token: &mut CleanLockToken,
|
|
source_id: LogicalCpuId,
|
|
target_id: LogicalCpuId,
|
|
switch_time: u128,
|
|
) -> bool {
|
|
let Some(source) = get_percpu_block(source_id) else {
|
|
return false;
|
|
};
|
|
let Some(target) = get_percpu_block(target_id) else {
|
|
return false;
|
|
};
|
|
|
|
let source_idle = source.switch_internals.idle_context();
|
|
let moved = {
|
|
let mut source_lock = SchedQueuesLock::new(&source.sched);
|
|
let source_queues = unsafe { source_lock.queues_mut() };
|
|
pop_movable_context(token, source_queues, target_id, switch_time, &source_idle)
|
|
};
|
|
|
|
let Some((prio, context_ref)) = moved else {
|
|
return false;
|
|
};
|
|
|
|
let mut target_lock = SchedQueuesLock::new(&target.sched);
|
|
unsafe {
|
|
target_lock.queues_mut()[prio].push_back(context_ref);
|
|
}
|
|
true
|
|
}
|
|
|
|
fn maybe_balance_queues(token: &mut CleanLockToken, percpu: &PercpuBlock, balance_time: u128) {
|
|
if crate::cpu_count() <= 1 || percpu.cpu_id != LogicalCpuId::BSP {
|
|
return;
|
|
}
|
|
if balance_time.saturating_sub(percpu.sched.last_balance_time.get()) < LOAD_BALANCE_INTERVAL_NS
|
|
{
|
|
return;
|
|
}
|
|
|
|
percpu.sched.last_balance_time.set(balance_time);
|
|
|
|
let mut depths = Vec::new();
|
|
let mut total_depth = 0usize;
|
|
for raw_id in 0..crate::cpu_count() {
|
|
let cpu_id = LogicalCpuId::new(raw_id);
|
|
let Some(cpu_percpu) = get_percpu_block(cpu_id) else {
|
|
continue;
|
|
};
|
|
let depth = queue_depth(cpu_percpu);
|
|
total_depth += depth;
|
|
depths.push((cpu_id, depth));
|
|
}
|
|
|
|
if depths.len() <= 1 || total_depth == 0 {
|
|
return;
|
|
}
|
|
|
|
let avg_depth = (total_depth + depths.len().saturating_sub(1)) / depths.len();
|
|
|
|
for target_index in 0..depths.len() {
|
|
if depths[target_index].1 != 0 {
|
|
continue;
|
|
}
|
|
|
|
let mut source_index = None;
|
|
let mut source_depth = 0usize;
|
|
for (idx, &(_, depth)) in depths.iter().enumerate() {
|
|
if idx == target_index {
|
|
continue;
|
|
}
|
|
if depth > avg_depth + 1 && depth > source_depth {
|
|
source_index = Some(idx);
|
|
source_depth = depth;
|
|
}
|
|
}
|
|
|
|
let Some(source_index) = source_index else {
|
|
continue;
|
|
};
|
|
|
|
let source_id = depths[source_index].0;
|
|
let target_id = depths[target_index].0;
|
|
if migrate_one_context(token, source_id, target_id, balance_time) {
|
|
depths[source_index].1 = depths[source_index].1.saturating_sub(1);
|
|
depths[target_index].1 += 1;
|
|
}
|
|
}
|
|
}
|
|
|
|
fn wakeup_contexts(token: &mut CleanLockToken, percpu: &PercpuBlock, switch_time: u128) {
|
|
// TODO: Optimise this somehow. Perhaps using a separate timer queue?
|
|
let mut wakeups = Vec::new();
|
|
let current_context = context::current();
|
|
let Some(idle_contexts) = idle_contexts_try(token.downgrade()) else {
|
|
// other cpus may spawning or killing contexts so let's skip wakeups to avoid contention
|
|
return;
|
|
};
|
|
let (mut idle_contexts, mut token) = idle_contexts.into_split();
|
|
let len = idle_contexts.len();
|
|
for _ in 0..len {
|
|
let Some(context_ref) = idle_contexts.pop_front() else {
|
|
break;
|
|
};
|
|
let Some(context) = context_ref.upgrade() else {
|
|
continue;
|
|
};
|
|
if Arc::ptr_eq(&context, ¤t_context) {
|
|
idle_contexts.push_back(context_ref);
|
|
continue;
|
|
}
|
|
let Some(guard) = context.try_read(token.token()) else {
|
|
idle_contexts.push_back(context_ref);
|
|
continue;
|
|
};
|
|
if guard.status.is_soft_blocked()
|
|
&& let Some(wake) = guard.wake
|
|
&& switch_time >= wake
|
|
{
|
|
let prio = guard.prio;
|
|
drop(guard);
|
|
wakeups.push((prio, context_ref));
|
|
continue;
|
|
}
|
|
|
|
if guard.status.is_runnable() && !guard.running {
|
|
let prio = guard.prio;
|
|
drop(guard);
|
|
wakeups.push((prio, context_ref));
|
|
continue;
|
|
}
|
|
|
|
drop(guard);
|
|
idle_contexts.push_back(context_ref);
|
|
}
|
|
|
|
// Drop IDLE_CONTEXTS before acquiring SchedQueuesLock to avoid a
|
|
// deadlock with steal_work on another CPU. The previous code held
|
|
// both locks simultaneously: wakeup_contexts (this fn) held
|
|
// IDLE_CONTEXTS, then waited for SchedQueuesLock on its own CPU;
|
|
// steal_work on another CPU held that same SchedQueuesLock and
|
|
// waited for IDLE_CONTEXTS. Classic circular wait.
|
|
//
|
|
// The wakeups Vec is the only data we need from IDLE_CONTEXTS;
|
|
// releasing the lock here means steal_work on another CPU can
|
|
// proceed while this CPU is acquiring SchedQueuesLock.
|
|
drop(idle_contexts);
|
|
|
|
if wakeups.is_empty() {
|
|
return;
|
|
}
|
|
|
|
let mut sched_lock = SchedQueuesLock::new(&percpu.sched);
|
|
let run_queues = unsafe { sched_lock.queues_mut() };
|
|
for (prio, context_ref) in wakeups {
|
|
if let Some(context_lock) = context_ref.upgrade() {
|
|
let mut context_guard = unsafe { context_lock.write_arc() };
|
|
assign_context_to_cpu(&mut context_guard, percpu.cpu_id);
|
|
}
|
|
run_queues[prio].push_back(context_ref);
|
|
}
|
|
}
|
|
|
|
fn pick_next_from_queues(
|
|
token: &mut CleanLockToken,
|
|
contexts_list: &mut [alloc::collections::VecDeque<WeakContextRef>; RUN_QUEUE_COUNT],
|
|
cpu_id: LogicalCpuId,
|
|
switch_time: u128,
|
|
prev_context_lock: &Arc<ContextLock>,
|
|
idle_context: &Arc<ContextLock>,
|
|
balance: &mut [usize; RUN_QUEUE_COUNT],
|
|
i: &mut usize,
|
|
) -> Option<ArcContextLockWriteGuard> {
|
|
let mut empty_queues = 0;
|
|
let mut total_iters = 0;
|
|
let total_contexts: usize = contexts_list.iter().map(|q| q.len()).sum();
|
|
let mut skipped_contexts = 0;
|
|
|
|
for prio in 0..RUN_QUEUE_COUNT {
|
|
let rt_contexts = contexts_list
|
|
.get_mut(prio)
|
|
.expect("prio should be between [0, 39]");
|
|
let len = rt_contexts.len();
|
|
for _ in 0..len {
|
|
let (rt_ref, rt_lock) = match rt_contexts.pop_front() {
|
|
Some(lock) => match lock.upgrade() {
|
|
Some(l) => (lock, l),
|
|
None => {
|
|
skipped_contexts += 1;
|
|
continue;
|
|
}
|
|
},
|
|
None => break,
|
|
};
|
|
if Arc::ptr_eq(&rt_lock, idle_context) || Arc::ptr_eq(&rt_lock, prev_context_lock) {
|
|
rt_contexts.push_back(rt_ref);
|
|
continue;
|
|
}
|
|
let rt_guard = unsafe { rt_lock.write_arc() };
|
|
if !rt_guard.status.is_runnable()
|
|
|| rt_guard.running
|
|
|| !rt_guard.sched_affinity.contains(cpu_id)
|
|
{
|
|
rt_contexts.push_back(rt_ref);
|
|
continue;
|
|
}
|
|
if rt_guard.sched_policy == SchedPolicy::Fifo
|
|
|| rt_guard.sched_policy == SchedPolicy::RoundRobin
|
|
{
|
|
return Some(rt_guard);
|
|
}
|
|
rt_contexts.push_back(rt_ref);
|
|
}
|
|
}
|
|
|
|
{
|
|
let mut min_vruntime = u128::MAX;
|
|
let mut best: Option<(usize, WeakContextRef)> = None;
|
|
for (prio, queue) in contexts_list.iter().enumerate() {
|
|
for ctx_ref in queue.iter() {
|
|
if let Some(ctx_lock) = ctx_ref.upgrade() {
|
|
if Arc::ptr_eq(&ctx_lock, prev_context_lock)
|
|
|| Arc::ptr_eq(&ctx_lock, idle_context)
|
|
{
|
|
continue;
|
|
}
|
|
if let Some(guard) = ctx_lock.try_read(token.token()) {
|
|
if guard.status.is_runnable()
|
|
&& !guard.running
|
|
&& guard.sched_affinity.contains(cpu_id)
|
|
&& guard.sched_policy == SchedPolicy::Other
|
|
{
|
|
let mut vruntime = guard.vruntime;
|
|
if guard.last_cpu == Some(cpu_id) {
|
|
vruntime = vruntime.saturating_sub(vruntime / 8);
|
|
}
|
|
drop(guard);
|
|
if vruntime < min_vruntime {
|
|
min_vruntime = vruntime;
|
|
best = Some((prio, ctx_ref.clone()));
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
if let Some((best_prio, ctx_ref)) = best {
|
|
contexts_list[best_prio].retain(|r| !WeakContextRef::eq(r, &ctx_ref));
|
|
if let Some(ctx_lock) = ctx_ref.upgrade() {
|
|
let guard = unsafe { ctx_lock.write_arc() };
|
|
if guard.status.is_runnable()
|
|
&& !guard.running
|
|
&& guard.sched_affinity.contains(cpu_id)
|
|
&& guard.sched_policy == SchedPolicy::Other
|
|
{
|
|
return Some(guard);
|
|
}
|
|
|
|
drop(guard);
|
|
contexts_list[best_prio].push_back(ctx_ref);
|
|
}
|
|
}
|
|
}
|
|
|
|
'priority: loop {
|
|
*i = (*i + 1) % RUN_QUEUE_COUNT;
|
|
total_iters += 1;
|
|
|
|
if total_iters >= 5000 {
|
|
break 'priority;
|
|
}
|
|
|
|
if skipped_contexts > total_contexts && total_contexts > 0 {
|
|
break 'priority;
|
|
}
|
|
|
|
let contexts = contexts_list
|
|
.get_mut(*i)
|
|
.expect("i should be between [0, 39]!");
|
|
|
|
if contexts.is_empty() {
|
|
empty_queues += 1;
|
|
if empty_queues >= RUN_QUEUE_COUNT {
|
|
break 'priority;
|
|
}
|
|
continue;
|
|
}
|
|
|
|
empty_queues = 0;
|
|
|
|
if balance[*i] < SCHED_PRIO_TO_WEIGHT[20] {
|
|
balance[*i] += SCHED_PRIO_TO_WEIGHT[*i];
|
|
continue;
|
|
}
|
|
|
|
let len = contexts.len();
|
|
for _ in 0..len {
|
|
let (next_context_ref, next_context_lock) = match contexts.pop_front() {
|
|
Some(lock) => match lock.upgrade() {
|
|
Some(new_lock) => (lock, new_lock),
|
|
None => {
|
|
skipped_contexts += 1;
|
|
continue;
|
|
}
|
|
},
|
|
None => break,
|
|
};
|
|
|
|
if Arc::ptr_eq(&next_context_lock, prev_context_lock)
|
|
|| Arc::ptr_eq(&next_context_lock, idle_context)
|
|
{
|
|
contexts.push_back(next_context_ref);
|
|
continue;
|
|
}
|
|
let mut next_context_guard = unsafe { next_context_lock.write_arc() };
|
|
|
|
let sw = unsafe { update_runnable(&mut next_context_guard, cpu_id, switch_time) };
|
|
if let UpdateResult::CanSwitch = sw {
|
|
balance[*i] -= SCHED_PRIO_TO_WEIGHT[20];
|
|
return Some(next_context_guard);
|
|
}
|
|
|
|
if matches!(sw, UpdateResult::Blocked) {
|
|
idle_contexts(token.downgrade()).push_back(next_context_ref);
|
|
} else {
|
|
contexts.push_back(next_context_ref);
|
|
}
|
|
skipped_contexts += 1;
|
|
|
|
if skipped_contexts >= total_contexts {
|
|
break 'priority;
|
|
}
|
|
}
|
|
}
|
|
|
|
None
|
|
}
|
|
|
|
fn pick_next_from_global_queues(
|
|
token: &mut LockToken<L1>,
|
|
contexts_list: &mut [alloc::collections::VecDeque<WeakContextRef>; RUN_QUEUE_COUNT],
|
|
cpu_id: LogicalCpuId,
|
|
switch_time: u128,
|
|
prev_context_lock: &Arc<ContextLock>,
|
|
idle_context: &Arc<ContextLock>,
|
|
balance: &mut [usize; RUN_QUEUE_COUNT],
|
|
i: &mut usize,
|
|
) -> Option<ArcContextLockWriteGuard> {
|
|
let mut empty_queues = 0;
|
|
let mut total_iters = 0;
|
|
let total_contexts: usize = contexts_list.iter().map(|q| q.len()).sum();
|
|
let mut skipped_contexts = 0;
|
|
|
|
for prio in 0..RUN_QUEUE_COUNT {
|
|
let rt_contexts = contexts_list
|
|
.get_mut(prio)
|
|
.expect("prio should be between [0, 39]");
|
|
let len = rt_contexts.len();
|
|
for _ in 0..len {
|
|
let (rt_ref, rt_lock) = match rt_contexts.pop_front() {
|
|
Some(lock) => match lock.upgrade() {
|
|
Some(l) => (lock, l),
|
|
None => {
|
|
skipped_contexts += 1;
|
|
continue;
|
|
}
|
|
},
|
|
None => break,
|
|
};
|
|
if Arc::ptr_eq(&rt_lock, idle_context) || Arc::ptr_eq(&rt_lock, prev_context_lock) {
|
|
rt_contexts.push_back(rt_ref);
|
|
continue;
|
|
}
|
|
let rt_guard = unsafe { rt_lock.write_arc() };
|
|
if !rt_guard.status.is_runnable()
|
|
|| rt_guard.running
|
|
|| !rt_guard.sched_affinity.contains(cpu_id)
|
|
{
|
|
rt_contexts.push_back(rt_ref);
|
|
continue;
|
|
}
|
|
if rt_guard.sched_policy == SchedPolicy::Fifo
|
|
|| rt_guard.sched_policy == SchedPolicy::RoundRobin
|
|
{
|
|
return Some(rt_guard);
|
|
}
|
|
rt_contexts.push_back(rt_ref);
|
|
}
|
|
}
|
|
|
|
{
|
|
let mut min_vruntime = u128::MAX;
|
|
let mut best: Option<(usize, WeakContextRef)> = None;
|
|
for (prio, queue) in contexts_list.iter().enumerate() {
|
|
for ctx_ref in queue.iter() {
|
|
if let Some(ctx_lock) = ctx_ref.upgrade() {
|
|
if Arc::ptr_eq(&ctx_lock, prev_context_lock)
|
|
|| Arc::ptr_eq(&ctx_lock, idle_context)
|
|
{
|
|
continue;
|
|
}
|
|
if let Some(guard) = ctx_lock.try_read(token.token()) {
|
|
if guard.status.is_runnable()
|
|
&& !guard.running
|
|
&& guard.sched_affinity.contains(cpu_id)
|
|
&& guard.sched_policy == SchedPolicy::Other
|
|
{
|
|
let mut vruntime = guard.vruntime;
|
|
if guard.last_cpu == Some(cpu_id) {
|
|
vruntime = vruntime.saturating_sub(vruntime / 8);
|
|
}
|
|
drop(guard);
|
|
if vruntime < min_vruntime {
|
|
min_vruntime = vruntime;
|
|
best = Some((prio, ctx_ref.clone()));
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
if let Some((best_prio, ctx_ref)) = best {
|
|
contexts_list[best_prio].retain(|r| !WeakContextRef::eq(r, &ctx_ref));
|
|
if let Some(ctx_lock) = ctx_ref.upgrade() {
|
|
let guard = unsafe { ctx_lock.write_arc() };
|
|
if guard.status.is_runnable()
|
|
&& !guard.running
|
|
&& guard.sched_affinity.contains(cpu_id)
|
|
&& guard.sched_policy == SchedPolicy::Other
|
|
{
|
|
return Some(guard);
|
|
}
|
|
|
|
drop(guard);
|
|
contexts_list[best_prio].push_back(ctx_ref);
|
|
}
|
|
}
|
|
}
|
|
|
|
'priority: loop {
|
|
*i = (*i + 1) % RUN_QUEUE_COUNT;
|
|
total_iters += 1;
|
|
|
|
if total_iters >= 5000 {
|
|
break 'priority;
|
|
}
|
|
|
|
if skipped_contexts > total_contexts && total_contexts > 0 {
|
|
break 'priority;
|
|
}
|
|
|
|
let contexts = contexts_list
|
|
.get_mut(*i)
|
|
.expect("i should be between [0, 39]!");
|
|
|
|
if contexts.is_empty() {
|
|
empty_queues += 1;
|
|
if empty_queues >= RUN_QUEUE_COUNT {
|
|
break 'priority;
|
|
}
|
|
continue;
|
|
}
|
|
|
|
empty_queues = 0;
|
|
|
|
if balance[*i] < SCHED_PRIO_TO_WEIGHT[20] {
|
|
balance[*i] += SCHED_PRIO_TO_WEIGHT[*i];
|
|
continue;
|
|
}
|
|
|
|
let len = contexts.len();
|
|
for _ in 0..len {
|
|
let (next_context_ref, next_context_lock) = match contexts.pop_front() {
|
|
Some(lock) => match lock.upgrade() {
|
|
Some(new_lock) => (lock, new_lock),
|
|
None => {
|
|
skipped_contexts += 1;
|
|
continue;
|
|
}
|
|
},
|
|
None => break,
|
|
};
|
|
|
|
if Arc::ptr_eq(&next_context_lock, prev_context_lock)
|
|
|| Arc::ptr_eq(&next_context_lock, idle_context)
|
|
{
|
|
contexts.push_back(next_context_ref);
|
|
continue;
|
|
}
|
|
let mut next_context_guard = unsafe { next_context_lock.write_arc() };
|
|
|
|
let sw = unsafe { update_runnable(&mut next_context_guard, cpu_id, switch_time) };
|
|
if let UpdateResult::CanSwitch = sw {
|
|
balance[*i] -= SCHED_PRIO_TO_WEIGHT[20];
|
|
return Some(next_context_guard);
|
|
}
|
|
|
|
if matches!(sw, UpdateResult::Blocked) {
|
|
idle_contexts(token.token()).push_back(next_context_ref);
|
|
} else {
|
|
contexts.push_back(next_context_ref);
|
|
}
|
|
skipped_contexts += 1;
|
|
|
|
if skipped_contexts >= total_contexts {
|
|
break 'priority;
|
|
}
|
|
}
|
|
}
|
|
|
|
None
|
|
}
|
|
|
|
unsafe fn update_stealable(context: &mut Context, switch_time: u128) -> UpdateResult {
|
|
if context.running {
|
|
return UpdateResult::Skip;
|
|
}
|
|
if context.status.is_soft_blocked()
|
|
&& let Some(wake) = context.wake
|
|
&& switch_time >= wake
|
|
{
|
|
context.wake = None;
|
|
context.unblock_no_ipi();
|
|
}
|
|
if context.status.is_runnable() {
|
|
UpdateResult::CanSwitch
|
|
} else {
|
|
UpdateResult::Blocked
|
|
}
|
|
}
|
|
|
|
/// This is the scheduler function which currently utilises Deficit Weighted Round Robin Scheduler
|
|
fn select_next_context(
|
|
token: &mut CleanLockToken,
|
|
percpu: &PercpuBlock,
|
|
cpu_id: LogicalCpuId,
|
|
switch_time: u128,
|
|
was_idle: bool,
|
|
prev_context_guard: &mut ArcRwLockWriteGuard<L4, Context>,
|
|
) -> Result<Option<ArcContextLockWriteGuard>, SwitchResult> {
|
|
let idle_context = percpu.switch_internals.idle_context();
|
|
let prev_context_lock = crate::context::current();
|
|
|
|
let local_next = {
|
|
let mut sched_lock = SchedQueuesLock::new(&percpu.sched);
|
|
let mut balance = percpu.sched.balance.get();
|
|
let mut last_queue = percpu.sched.last_queue.get() % RUN_QUEUE_COUNT;
|
|
let next = pick_next_from_queues(
|
|
token,
|
|
unsafe { sched_lock.queues_mut() },
|
|
cpu_id,
|
|
switch_time,
|
|
&prev_context_lock,
|
|
&idle_context,
|
|
&mut balance,
|
|
&mut last_queue,
|
|
);
|
|
percpu.sched.balance.set(balance);
|
|
percpu.sched.last_queue.set(last_queue);
|
|
next
|
|
};
|
|
|
|
if let Some(next_context_guard) = local_next {
|
|
queue_previous_context(
|
|
token,
|
|
percpu,
|
|
&prev_context_lock,
|
|
prev_context_guard,
|
|
&idle_context,
|
|
);
|
|
return Ok(Some(next_context_guard));
|
|
}
|
|
|
|
if let Some(next_context_guard) = steal_work(token, cpu_id, switch_time) {
|
|
queue_previous_context(
|
|
token,
|
|
percpu,
|
|
&prev_context_lock,
|
|
prev_context_guard,
|
|
&idle_context,
|
|
);
|
|
return Ok(Some(next_context_guard));
|
|
}
|
|
|
|
let global_next = {
|
|
let contexts_data = run_contexts(token.token());
|
|
let (mut contexts_data, mut contexts_token) = contexts_data.into_split();
|
|
let mut balance = percpu.sched.balance.get();
|
|
let mut last_queue = percpu.sched.last_queue.get() % RUN_QUEUE_COUNT;
|
|
let next = pick_next_from_global_queues(
|
|
&mut contexts_token,
|
|
&mut contexts_data.set,
|
|
cpu_id,
|
|
switch_time,
|
|
&prev_context_lock,
|
|
&idle_context,
|
|
&mut balance,
|
|
&mut last_queue,
|
|
);
|
|
percpu.sched.balance.set(balance);
|
|
percpu.sched.last_queue.set(last_queue);
|
|
next
|
|
};
|
|
|
|
if let Some(next_context_guard) = global_next {
|
|
queue_previous_context(
|
|
token,
|
|
percpu,
|
|
&prev_context_lock,
|
|
prev_context_guard,
|
|
&idle_context,
|
|
);
|
|
return Ok(Some(next_context_guard));
|
|
}
|
|
|
|
queue_previous_context(
|
|
token,
|
|
percpu,
|
|
&prev_context_lock,
|
|
prev_context_guard,
|
|
&idle_context,
|
|
);
|
|
|
|
if !was_idle && !Arc::ptr_eq(&prev_context_lock, &idle_context) {
|
|
Ok(Some(unsafe { idle_context.write_arc() }))
|
|
} else {
|
|
Ok(None)
|
|
}
|
|
}
|
|
|
|
/// Holds per-CPU state necessary for context switching.
|
|
///
|
|
/// This struct contains information such as the idle context, current context, and PIT tick counts,
|
|
/// as well as fields required for managing ptrace sessions and signals.
|
|
pub struct ContextSwitchPercpu {
|
|
switch_result: Cell<Option<SwitchResultInner>>,
|
|
switch_time: Cell<u128>,
|
|
pit_ticks: Cell<usize>,
|
|
|
|
current_ctxt: RefCell<Option<Arc<ContextLock>>>,
|
|
|
|
/// The idle process.
|
|
idle_ctxt: RefCell<Option<Arc<ContextLock>>>,
|
|
pub(crate) being_sigkilled: Cell<bool>,
|
|
}
|
|
|
|
impl ContextSwitchPercpu {
|
|
pub const fn default() -> Self {
|
|
Self {
|
|
switch_result: Cell::new(None),
|
|
switch_time: Cell::new(0),
|
|
pit_ticks: Cell::new(0),
|
|
current_ctxt: RefCell::new(None),
|
|
idle_ctxt: RefCell::new(None),
|
|
being_sigkilled: Cell::new(false),
|
|
}
|
|
}
|
|
|
|
/// Applies a function to the current context, allowing controlled access.
|
|
///
|
|
/// # Parameters
|
|
/// - `f`: A closure that receives a reference to the current context and returns a value.
|
|
///
|
|
/// # Returns
|
|
/// The result of applying `f` to the current context.
|
|
pub fn with_context<T>(&self, f: impl FnOnce(&Arc<ContextLock>) -> T) -> T {
|
|
f(self
|
|
.current_ctxt
|
|
.borrow()
|
|
.as_ref()
|
|
.expect("not inside of context"))
|
|
}
|
|
|
|
/// Applies a function to the current context, allowing controlled access.
|
|
///
|
|
/// # Parameters
|
|
/// - `f`: A closure that receives a reference to the current context and returns a value.
|
|
///
|
|
/// # Returns
|
|
/// The result of applying `f` to the current context if any.
|
|
pub fn try_with_context<T>(&self, f: impl FnOnce(Option<&Arc<ContextLock>>) -> T) -> T {
|
|
f(self.current_ctxt.borrow().as_ref())
|
|
}
|
|
|
|
/// Sets the current context to a new value.
|
|
///
|
|
/// # Safety
|
|
/// This function is unsafe as it modifies the context state directly.
|
|
///
|
|
/// # Parameters
|
|
/// - `new`: The new context to be set as the current context.
|
|
pub unsafe fn set_current_context(&self, new: Arc<ContextLock>) {
|
|
*self.current_ctxt.borrow_mut() = Some(new);
|
|
}
|
|
|
|
/// Sets the idle context to a new value.
|
|
///
|
|
/// # Safety
|
|
/// This function is unsafe as it modifies the idle context state directly.
|
|
///
|
|
/// # Parameters
|
|
/// - `new`: The new context to be set as the idle context.
|
|
pub unsafe fn set_idle_context(&self, new: Arc<ContextLock>) {
|
|
*self.idle_ctxt.borrow_mut() = Some(new);
|
|
}
|
|
|
|
/// Retrieves the current idle context.
|
|
///
|
|
/// # Returns
|
|
/// A reference to the idle context.
|
|
pub fn idle_context(&self) -> Arc<ContextLock> {
|
|
Arc::clone(
|
|
self.idle_ctxt
|
|
.borrow()
|
|
.as_ref()
|
|
.expect("no idle context present"),
|
|
)
|
|
}
|
|
}
|