Files
RedBear-OS/src/context/switch.rs
T
vasilito e812356cf0 fix: per-CPU idle context race condition + nightly-2026-04-11 pin
- Add try_idle_context() to ContextSwitchPercpu (switch.rs)
  Cross-CPU paths (steal_work, migrate_one_context) use try_idle_context()
  instead of idle_context() to avoid panic when APs haven't called
  context::init() yet. Returns Option<context::Arc> instead of panicking.
- Pin rust-toolchain.toml to nightly-2026-04-11
- Remove build artifacts (kernel, kernel.all, kernel.sym) from git tracking
- This fixes the boot panic that occurred during multi-CPU scheduling
2026-07-02 16:53:19 +03:00

1195 lines
40 KiB
Rust

//! This module provides the context-switching mechanism together with the per-CPU scheduler.
//! The scheduler picks the next context to run from the per-CPU run queues, falling back to
//! stealing work from other CPUs and to the global run queues, while handling process states
//! and synchronization.
use crate::{
context::{
self, arch, idle_contexts, idle_contexts_try, run_contexts, ArcContextLockWriteGuard,
Context, ContextLock, SchedPolicy, WeakContextRef, RUN_QUEUE_COUNT,
},
cpu_set::{LogicalCpuId, LogicalCpuSet},
cpu_stats::{self, CpuState},
percpu::{get_percpu_block, PerCpuSched, PercpuBlock},
sync::{ArcRwLockWriteGuard, CleanLockToken, LockToken, L1, L4},
};
use alloc::{sync::Arc, vec::Vec};
use core::{
cell::{Cell, RefCell},
hint, mem,
sync::atomic::{AtomicUsize, Ordering},
};
use syscall::PtraceFlags;
/// Outcome of inspecting a candidate context while looking for something to run.
enum UpdateResult {
    /// The context's status is runnable, so it may be switched to.
    CanSwitch,
    /// The context must be passed over for now (it is already running, or its affinity does
    /// not contain the inspecting CPU); callers put it back into its run queue.
    Skip,
    /// The context is not runnable; callers normally move it to the idle context list.
    Blocked,
}
// Scheduling weight for each of the 40 priority levels. The table is a decreasing geometric
// series where value[i] ~= value[i - 1] / 1.25 (i.e. each step down in priority is ~1.25x
// lighter). Index 20 (weight 1024) is used as the default weight elsewhere in this file.
const SCHED_PRIO_TO_WEIGHT: [usize; 40] = [
    88761, 71755, 56483, 46273, 36291, 29154, 23254, 18705, 14949, 11916, 9548, 7620, 6100, 4904,
    3906, 3121, 2501, 1991, 1586, 1277, 1024, 820, 655, 526, 423, 335, 272, 215, 172, 137, 110, 87,
    70, 56, 45, 36, 29, 23, 18, 15,
];
// Minimum time between two load-balancing passes in `maybe_balance_queues`
// (100_000_000 ns = 100 ms, assuming the monotonic clock counts nanoseconds as the name says).
const LOAD_BALANCE_INTERVAL_NS: u128 = 100_000_000;
// Number of contexts successfully stolen from other CPUs; incremented in `steal_work`.
static SCHED_STEAL_COUNT: AtomicUsize = AtomicUsize::new(0);
/// Guard over the run queues of one per-CPU scheduler.
///
/// The lock is taken in [`SchedQueuesLock::new`] and released when the guard is dropped.
struct SchedQueuesLock<'a> {
    /// The per-CPU scheduler whose lock is held for the lifetime of this guard.
    sched: &'a PerCpuSched,
}
impl<'a> SchedQueuesLock<'a> {
    /// Takes the lock of `sched` and returns a guard that releases it again on drop.
    fn new(sched: &'a PerCpuSched) -> Self {
        sched.take_lock();
        Self { sched }
    }
    /// Returns mutable access to the run queues of the locked scheduler.
    ///
    /// # Safety
    /// NOTE(review): this simply forwards to `PerCpuSched::queues_mut`, whose safety contract
    /// is not visible here. Presumably the caller must not create overlapping mutable
    /// references to the queues — confirm against `PerCpuSched`.
    unsafe fn queues_mut(
        &mut self,
    ) -> &mut [alloc::collections::VecDeque<WeakContextRef>; RUN_QUEUE_COUNT] {
        unsafe { self.sched.queues_mut() }
    }
}
/// Releases the scheduler lock that was taken in `SchedQueuesLock::new`.
impl Drop for SchedQueuesLock<'_> {
    fn drop(&mut self) {
        self.sched.release_lock();
    }
}
/// Pins `context` to `cpu_id` by replacing its scheduling affinity with a set that contains
/// only that CPU. Any previous affinity mask is discarded.
fn assign_context_to_cpu(context: &mut Context, cpu_id: LogicalCpuId) {
    // Start from an empty set so that no other CPU stays allowed.
    context.sched_affinity = LogicalCpuSet::empty();
    context.sched_affinity.atomic_set(cpu_id);
}
/// Decides whether `context` can be run on `cpu_id` right now.
///
/// A soft-blocked context whose wake time has been reached is unblocked (without IPI) as a
/// side effect before its status is evaluated.
///
/// # Returns
/// - `UpdateResult::Skip` if the context is already running or its affinity excludes `cpu_id`.
/// - `UpdateResult::CanSwitch` if the context is runnable.
/// - `UpdateResult::Blocked` otherwise.
///
/// # Safety
/// NOTE(review): nothing in the body is visibly unsafe; the contract is presumably inherited
/// from the scheduler's locking rules — confirm with callers.
unsafe fn update_runnable(
    context: &mut Context,
    cpu_id: LogicalCpuId,
    switch_time: u128,
) -> UpdateResult {
    // Already on a CPU, or not allowed on this one: leave it untouched.
    if context.running || !context.sched_affinity.contains(cpu_id) {
        return UpdateResult::Skip;
    }

    // Wake up sleepers whose deadline has passed.
    let wake_reached = context.status.is_soft_blocked()
        && context.wake.is_some_and(|deadline| switch_time >= deadline);
    if wake_reached {
        context.wake = None;
        context.unblock_no_ipi();
    }

    if !context.status.is_runnable() {
        return UpdateResult::Blocked;
    }
    UpdateResult::CanSwitch
}
/// Write guards of the two contexts taking part in a context switch.
///
/// `switch` stores this in the per-CPU `switch_result` cell so that the guards stay alive
/// across `arch::switch_to`; `switch_finish_hook` takes it out and drops it.
struct SwitchResultInner {
    /// Guard of the context that is being switched away from.
    _prev_guard: ArcContextLockWriteGuard,
    /// Guard of the context that is being switched to.
    _next_guard: ArcContextLockWriteGuard,
}
/// Timer tick entry point of the scheduler.
///
/// Every call bumps the per-CPU tick counter and gives the load balancer a chance to run.
/// Once the counter has reached 3, a context switch is performed (which resets the counter)
/// and the signal handler is invoked afterwards.
pub fn tick(token: &mut CleanLockToken) {
    let percpu = PercpuBlock::current();

    let pit_ticks = &percpu.switch_internals.pit_ticks;
    let elapsed_ticks = pit_ticks.get() + 1;
    pit_ticks.set(elapsed_ticks);

    let now = crate::time::monotonic(token);
    maybe_balance_queues(token, percpu, now);

    // Not enough ticks yet: keep running the current context.
    if elapsed_ticks < 3 {
        return;
    }

    // Trigger a context switch after every 3 ticks.
    switch(token);
    crate::context::signal::signal_handler(token);
}
/// Completes a context switch on the CPU that has just been switched to.
///
/// Drops the context guards that `switch` parked in the per-CPU `switch_result` cell,
/// releases the global context switch lock and finally runs the architecture hook. If no
/// switch result is present, the machine is reset.
///
/// # Safety
/// This function involves unsafe operations such as resetting state and releasing locks.
pub unsafe extern "C" fn switch_finish_hook() {
    unsafe {
        let pending = PercpuBlock::current().switch_internals.switch_result.take();
        if let Some(guards) = pending {
            // Releases the write locks of the previous and the next context.
            drop(guards);
        } else {
            // TODO: unreachable_unchecked()?
            crate::arch::stop::emergency_reset();
        }

        arch::CONTEXT_SWITCH_LOCK.store(false, Ordering::SeqCst);
        crate::percpu::switch_arch_hook();
    }
}
/// Result of a call to [`switch`].
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum SwitchResult {
    /// A switch to another context took place. Also returned when the current context is
    /// not preemptable and `switch` therefore returns early.
    Switched,
    /// No context to switch to was found; the CPU state is marked as idle.
    AllContextsIdle,
}
/// This function performs the context switch, using select_next_context to
/// actually select the next context to switch to.
///
/// Steps, in order: take the global context switch lock, write-lock the current context,
/// wake up sleeping contexts, account per-CPU and per-context time, select the next context,
/// park both guards in the per-CPU `switch_result` cell and call `arch::switch_to`.
///
/// # Warning
/// This is not memory-unsafe to call. But do NOT call this while holding locks!
///
/// # Returns
/// - `SwitchResult::Switched`: Indicates a successful switch to a new context. This is also
///   returned when the current context is not preemptable and nothing was switched.
/// - `SwitchResult::AllContextsIdle`: Indicates all contexts are idle, and the CPU will switch
///   to an idle context.
pub fn switch(token: &mut CleanLockToken) -> SwitchResult {
    let switch_time = crate::time::monotonic(token);
    let percpu = PercpuBlock::current();
    cpu_stats::add_context_switch();
    //set PIT Interrupt counter to 0, giving each process same amount of PIT ticks
    percpu.switch_internals.pit_ticks.set(0);
    // Acquire the global lock to ensure exclusive access during context switch and avoid
    // issues that would be caused by the unsafe operations below
    // TODO: Better memory orderings?
    while arch::CONTEXT_SWITCH_LOCK
        .compare_exchange_weak(false, true, Ordering::SeqCst, Ordering::Relaxed)
        .is_err()
    {
        hint::spin_loop();
        // Keep servicing TLB shootdown requests while spinning on the lock.
        percpu.maybe_handle_tlb_shootdown();
    }
    // Lock the previous context.
    let prev_context_lock = crate::context::current();
    // We are careful not to lock this context twice
    let mut prev_context_guard = unsafe { prev_context_lock.write_arc() };
    if !prev_context_guard.is_preemptable() {
        // The current context must keep running: release the global lock and bail out.
        arch::CONTEXT_SWITCH_LOCK.store(false, Ordering::SeqCst);
        return SwitchResult::Switched;
    }
    // Alarm (previously in update_runnable)
    wakeup_contexts(token, percpu, switch_time);
    let cpu_id = crate::cpu_id();
    // Update per-cpu times
    let percpu_nanos = switch_time.saturating_sub(percpu.switch_internals.switch_time.get()) as u64;
    let percpu_ms = percpu_nanos / 1_000_000;
    // `add_time` returns the state the time was accounted to; remember whether it was idle.
    let was_idle = percpu.stats.add_time(percpu_ms) == CpuState::Idle as u8;
    percpu.switch_internals.switch_time.set(switch_time);
    // NOTE(review): `select_next_context` currently only returns `Ok`. If it ever returned
    // `Err`, this early return would leave CONTEXT_SWITCH_LOCK held.
    let switch_context_opt = match select_next_context(
        token,
        percpu,
        cpu_id,
        switch_time,
        was_idle,
        &mut prev_context_guard,
    ) {
        Ok(opt) => opt,
        Err(early_ret) => return early_ret,
    };
    // Switch process states, TSS stack pointer, and store new context ID
    match switch_context_opt {
        Some(mut next_context_guard) => {
            // Update context states and prepare for the switch.
            let prev_context = &mut *prev_context_guard;
            let next_context = &mut *next_context_guard;
            // Set the previous context as "not running"
            prev_context.running = false;
            // Remember where the previous context last ran (read back by the vruntime pass
            // of the queue pickers).
            prev_context.last_cpu = prev_context.cpu_id;
            // Set the next context as "running"
            next_context.running = true;
            // Set the CPU ID for the next context
            next_context.cpu_id = Some(cpu_id);
            // Update times
            if !was_idle {
                prev_context.cpu_time += switch_time.saturating_sub(prev_context.switch_time);
                if prev_context.sched_policy == SchedPolicy::Other {
                    // Advance the virtual runtime by the time actually run, scaled by the
                    // ratio of the default weight to the weight of the context's priority.
                    let actual_ns = switch_time.saturating_sub(prev_context.switch_time);
                    let weight =
                        SCHED_PRIO_TO_WEIGHT[prev_context.sched_static_prio.min(39)] as u128;
                    let default_weight = SCHED_PRIO_TO_WEIGHT[20] as u128;
                    let delta = actual_ns.saturating_mul(default_weight) / weight.max(1);
                    prev_context.vruntime = prev_context.vruntime.saturating_add(delta);
                }
            }
            next_context.switch_time = switch_time;
            if next_context.userspace {
                percpu.stats.set_state(cpu_stats::CpuState::User);
            } else {
                percpu.stats.set_state(cpu_stats::CpuState::Kernel);
            }
            unsafe {
                percpu.switch_internals.set_current_context(Arc::clone(
                    ArcContextLockWriteGuard::rwlock(&next_context_guard),
                ));
            }
            // FIXME set the switch result in arch::switch_to instead
            // The transmutes detach the references from the guards' borrows so that the
            // guards can be moved into `switch_result` below while the references are still
            // used up to `arch::switch_to`.
            let prev_context = unsafe {
                mem::transmute::<&'_ mut Context, &'_ mut Context>(&mut *prev_context_guard)
            };
            let next_context = unsafe {
                mem::transmute::<&'_ mut Context, &'_ mut Context>(&mut *next_context_guard)
            };
            // Park both guards; `switch_finish_hook` drops them after the switch.
            percpu
                .switch_internals
                .switch_result
                .set(Some(SwitchResultInner {
                    _prev_guard: prev_context_guard,
                    _next_guard: next_context_guard,
                }));
            /*let (ptrace_session, ptrace_flags) = if let Some((session, bp)) = ptrace::sessions()
            .get(&next_context.pid)
            .map(|s| (Arc::downgrade(s), s.data.lock().breakpoint))
            {
            (Some(session), bp.map_or(PtraceFlags::empty(), |f| f.flags))
            } else {
            (None, PtraceFlags::empty())
            };*/
            let ptrace_flags = PtraceFlags::empty();
            //*percpu.ptrace_session.borrow_mut() = ptrace_session;
            percpu.ptrace_flags.set(ptrace_flags);
            // Swap the per-CPU syscall flag: save the old value into the previous context
            // and install the value of the next context.
            prev_context.inside_syscall =
                percpu.inside_syscall.replace(next_context.inside_syscall);
            #[cfg(feature = "syscall_debug")]
            {
                prev_context.syscall_debug_info = percpu
                    .syscall_debug_info
                    .replace(next_context.syscall_debug_info);
                prev_context.syscall_debug_info.on_switch_from(token);
                next_context.syscall_debug_info.on_switch_to(token);
            }
            percpu
                .switch_internals
                .being_sigkilled
                .set(next_context.being_sigkilled);
            unsafe {
                arch::switch_to(prev_context, next_context);
            }
            // NOTE: After switch_to is called, the return address can even be different from the
            // current return address, meaning that we cannot use local variables here, and that we
            // need to use the `switch_finish_hook` to be able to release the locks. Newly created
            // contexts will return directly to the function pointer passed to context::spawn, and not
            // reach this code until the next context switch back.
            SwitchResult::Switched
        }
        _ => {
            // No target was found, unset global lock and return
            arch::CONTEXT_SWITCH_LOCK.store(false, Ordering::SeqCst);
            percpu.stats.set_state(cpu_stats::CpuState::Idle);
            SwitchResult::AllContextsIdle
        }
    }
}
/// Puts the context that is being switched away from back into a scheduler list.
///
/// The idle context is never queued. A runnable context goes to the back of this CPU's run
/// queue for its priority; any other context is appended to the idle context list.
fn queue_previous_context(
    token: &mut CleanLockToken,
    percpu: &PercpuBlock,
    prev_context_lock: &Arc<ContextLock>,
    prev_context_guard: &ArcRwLockWriteGuard<L4, Context>,
    idle_context: &Arc<ContextLock>,
) {
    if Arc::ptr_eq(prev_context_lock, idle_context) {
        return;
    }

    let weak_prev = WeakContextRef(Arc::downgrade(prev_context_lock));

    if !prev_context_guard.status.is_runnable() {
        idle_contexts(token.downgrade()).push_back(weak_prev);
        return;
    }

    let queue_index = prev_context_guard.prio;
    let mut local_queues = SchedQueuesLock::new(&percpu.sched);
    unsafe {
        local_queues.queues_mut()[queue_index].push_back(weak_prev);
    }
}
/// Removes the first context from `queues` that can be migrated to `target_cpu`.
///
/// Each queue is scanned once, in queue index order. Dead references are discarded, the idle
/// context and contexts that must be skipped are rotated to the back of their queue, and
/// blocked contexts are moved to the idle context list. The selected context is pinned to
/// `target_cpu` before it is returned.
///
/// # Returns
/// The queue index the context was taken from together with a weak reference to it, or
/// `None` if no queue holds a movable context.
fn pop_movable_context(
    token: &mut CleanLockToken,
    queues: &mut [alloc::collections::VecDeque<WeakContextRef>; RUN_QUEUE_COUNT],
    target_cpu: LogicalCpuId,
    switch_time: u128,
    idle_context: &Arc<ContextLock>,
) -> Option<(usize, WeakContextRef)> {
    for (prio, queue) in queues.iter_mut().enumerate() {
        // Visit every entry that was present at the start exactly once.
        for _ in 0..queue.len() {
            let Some(candidate_ref) = queue.pop_front() else {
                break;
            };
            let Some(candidate) = candidate_ref.upgrade() else {
                continue;
            };
            if Arc::ptr_eq(&candidate, idle_context) {
                queue.push_back(candidate_ref);
                continue;
            }

            let mut candidate_guard = unsafe { candidate.write_arc() };
            match unsafe { update_stealable(&mut candidate_guard, switch_time) } {
                UpdateResult::CanSwitch => {
                    assign_context_to_cpu(&mut candidate_guard, target_cpu);
                    let migrated = WeakContextRef(Arc::downgrade(
                        ArcContextLockWriteGuard::rwlock(&candidate_guard),
                    ));
                    drop(candidate_guard);
                    return Some((prio, migrated));
                }
                UpdateResult::Blocked => {
                    idle_contexts(token.downgrade()).push_back(candidate_ref);
                }
                UpdateResult::Skip => {
                    queue.push_back(candidate_ref);
                }
            }
        }
    }
    None
}
/// Tries to steal a runnable context from the run queues of another CPU.
///
/// The other CPUs are visited in ascending order starting after `cpu_id` (wrapping around).
/// CPUs without a per-CPU block or without an initialized idle context are skipped. The
/// first stealable context found is pinned to `cpu_id` and returned write-locked; it is no
/// longer part of the victim's queues.
///
/// Blocked contexts encountered on the way are moved to the idle context list when that list
/// can be locked without waiting; otherwise they stay in the victim's queue so that they are
/// never lost from the scheduler.
///
/// # Returns
/// The write guard of the stolen context, or `None` if there is nothing to steal.
fn steal_work(
    token: &mut CleanLockToken,
    cpu_id: LogicalCpuId,
    switch_time: u128,
) -> Option<ArcContextLockWriteGuard> {
    let cpu_count = crate::cpu_count();
    if cpu_count <= 1 {
        return None;
    }
    for offset in 1..cpu_count {
        let victim_id = LogicalCpuId::new((cpu_id.get() + offset) % cpu_count);
        let Some(victim) = get_percpu_block(victim_id) else {
            continue;
        };
        // APs that have not run `context::init()` yet have no idle context; skip them.
        let Some(victim_idle) = victim.switch_internals.try_idle_context() else {
            continue;
        };
        let mut victim_lock = SchedQueuesLock::new(&victim.sched);
        let victim_queues = unsafe { victim_lock.queues_mut() };
        for prio in 0..RUN_QUEUE_COUNT {
            let len = victim_queues[prio].len();
            for _ in 0..len {
                let Some(context_ref) = victim_queues[prio].pop_front() else {
                    break;
                };
                let Some(context_lock) = context_ref.upgrade() else {
                    // The context is gone; drop the dead reference.
                    continue;
                };
                if Arc::ptr_eq(&context_lock, &victim_idle) {
                    victim_queues[prio].push_back(context_ref);
                    continue;
                }
                let mut context_guard = unsafe { context_lock.write_arc() };
                let sw = unsafe { update_stealable(&mut context_guard, switch_time) };
                if let UpdateResult::CanSwitch = sw {
                    assign_context_to_cpu(&mut context_guard, cpu_id);
                    SCHED_STEAL_COUNT.fetch_add(1, Ordering::Relaxed);
                    return Some(context_guard);
                }
                if matches!(sw, UpdateResult::Blocked) {
                    // Use try_lock to avoid deadlocking with another CPU that holds
                    // IDLE_CONTEXTS and then waits for our victim_lock. If IDLE_CONTEXTS
                    // is busy, the context must NOT be dropped: it would then be in no
                    // queue at all and could never be woken up again. Keep it in the
                    // victim's run queue instead; the victim's own scheduler moves blocked
                    // contexts to the idle list when it encounters them.
                    if let Some(mut idle) = idle_contexts_try(token.downgrade()) {
                        idle.push_back(context_ref);
                    } else {
                        victim_queues[prio].push_back(context_ref);
                    }
                } else {
                    victim_queues[prio].push_back(context_ref);
                }
            }
        }
    }
    None
}
/// Returns the total number of entries in all run queues of `percpu`.
///
/// The scheduler lock of that CPU is held while the queues are counted.
fn queue_depth(percpu: &PercpuBlock) -> usize {
    let mut queues_guard = SchedQueuesLock::new(&percpu.sched);
    let queues = unsafe { queues_guard.queues_mut() };

    let mut depth = 0;
    for queue in queues.iter() {
        depth += queue.len();
    }
    depth
}
/// Moves one movable context from the run queues of `source_id` to those of `target_id`.
///
/// The source lock is released before the target lock is taken, so both locks are never
/// held at the same time. The context keeps its queue index on the target CPU.
///
/// # Returns
/// `true` if a context was migrated. `false` if either per-CPU block is missing, the source
/// CPU has no idle context yet, or the source has no movable context.
fn migrate_one_context(
    token: &mut CleanLockToken,
    source_id: LogicalCpuId,
    target_id: LogicalCpuId,
    switch_time: u128,
) -> bool {
    let Some(source) = get_percpu_block(source_id) else {
        return false;
    };
    let Some(target) = get_percpu_block(target_id) else {
        return false;
    };
    let Some(source_idle) = source.switch_internals.try_idle_context() else {
        return false;
    };

    // Scope the source lock so that it is released before the target is locked.
    let popped = {
        let mut source_guard = SchedQueuesLock::new(&source.sched);
        let queues = unsafe { source_guard.queues_mut() };
        pop_movable_context(token, queues, target_id, switch_time, &source_idle)
    };

    match popped {
        Some((prio, context_ref)) => {
            let mut target_guard = SchedQueuesLock::new(&target.sched);
            unsafe {
                target_guard.queues_mut()[prio].push_back(context_ref);
            }
            true
        }
        None => false,
    }
}
/// Periodically evens out run queue lengths between CPUs.
///
/// Only the BSP balances, and at most once per `LOAD_BALANCE_INTERVAL_NS`. For every CPU
/// whose run queues are empty, one context is migrated from the CPU with the deepest queues,
/// provided that depth exceeds the (rounded up) average depth by more than one.
fn maybe_balance_queues(token: &mut CleanLockToken, percpu: &PercpuBlock, balance_time: u128) {
    if crate::cpu_count() <= 1 || percpu.cpu_id != LogicalCpuId::BSP {
        return;
    }

    let since_last = balance_time.saturating_sub(percpu.sched.last_balance_time.get());
    if since_last < LOAD_BALANCE_INTERVAL_NS {
        return;
    }
    percpu.sched.last_balance_time.set(balance_time);

    // Snapshot of (cpu, queue depth) for every CPU that has a per-CPU block.
    let mut depths = Vec::new();
    let mut total_depth = 0usize;
    for raw_id in 0..crate::cpu_count() {
        let cpu_id = LogicalCpuId::new(raw_id);
        if let Some(block) = get_percpu_block(cpu_id) {
            let depth = queue_depth(block);
            total_depth += depth;
            depths.push((cpu_id, depth));
        }
    }

    let cpus = depths.len();
    if cpus <= 1 || total_depth == 0 {
        return;
    }
    // Average depth, rounded up.
    let avg_depth = (total_depth + cpus.saturating_sub(1)) / cpus;

    for target_index in 0..cpus {
        // Only CPUs with nothing queued receive work.
        if depths[target_index].1 != 0 {
            continue;
        }

        // Deepest overloaded CPU; on ties the lowest index wins.
        let busiest = depths
            .iter()
            .enumerate()
            .filter(|&(idx, &(_, depth))| idx != target_index && depth > avg_depth + 1)
            .fold(
                None,
                |best: Option<(usize, usize)>, (idx, &(_, depth))| match best {
                    Some((_, best_depth)) if best_depth >= depth => best,
                    _ => Some((idx, depth)),
                },
            );
        let Some((source_index, _)) = busiest else {
            continue;
        };

        let source_id = depths[source_index].0;
        let target_id = depths[target_index].0;
        if migrate_one_context(token, source_id, target_id, balance_time) {
            // Keep the snapshot in sync with the migration.
            depths[source_index].1 = depths[source_index].1.saturating_sub(1);
            depths[target_index].1 += 1;
        }
    }
}
/// Moves contexts that are ready to run from the idle context list to this CPU's run queues.
///
/// A context is moved when it is soft-blocked with a wake time that has been reached, or
/// when it is runnable and not currently running. The current context and contexts whose
/// lock cannot be read without waiting stay in the idle list. Every moved context is pinned
/// to this CPU and appended to the run queue for its priority.
///
/// If the idle context list is busy, the whole wake-up pass is skipped.
fn wakeup_contexts(token: &mut CleanLockToken, percpu: &PercpuBlock, switch_time: u128) {
    // TODO: Optimise this somehow. Perhaps using a separate timer queue?
    let current_context = context::current();
    let Some(idle_guard) = idle_contexts_try(token.downgrade()) else {
        // Other CPUs may be spawning or killing contexts, so skip the wake-ups this time
        // to avoid contention.
        return;
    };
    let (mut idle_list, mut split_token) = idle_guard.into_split();

    let mut wakeups = Vec::new();
    // Visit every entry that was present at the start exactly once.
    for _ in 0..idle_list.len() {
        let Some(context_ref) = idle_list.pop_front() else {
            break;
        };
        let Some(context) = context_ref.upgrade() else {
            // The context is gone; drop the dead reference.
            continue;
        };
        if Arc::ptr_eq(&context, &current_context) {
            idle_list.push_back(context_ref);
            continue;
        }
        let Some(guard) = context.try_read(split_token.token()) else {
            idle_list.push_back(context_ref);
            continue;
        };

        let wake_reached =
            guard.status.is_soft_blocked() && guard.wake.is_some_and(|wake| switch_time >= wake);
        let ready = guard.status.is_runnable() && !guard.running;
        let prio = guard.prio;
        drop(guard);

        if wake_reached || ready {
            wakeups.push((prio, context_ref));
        } else {
            idle_list.push_back(context_ref);
        }
    }

    // Drop IDLE_CONTEXTS before acquiring SchedQueuesLock to avoid a deadlock with
    // steal_work on another CPU: holding IDLE_CONTEXTS while waiting for this CPU's
    // SchedQueuesLock, while steal_work holds that SchedQueuesLock and waits for
    // IDLE_CONTEXTS, would be a circular wait. The wakeups Vec is the only data needed
    // from IDLE_CONTEXTS past this point.
    drop(idle_list);

    if wakeups.is_empty() {
        return;
    }

    let mut sched_lock = SchedQueuesLock::new(&percpu.sched);
    let run_queues = unsafe { sched_lock.queues_mut() };
    for (prio, context_ref) in wakeups {
        // A context may have died since the scan above; do not enqueue dead references.
        let Some(context_lock) = context_ref.upgrade() else {
            continue;
        };
        {
            let mut context_guard = unsafe { context_lock.write_arc() };
            assign_context_to_cpu(&mut context_guard, percpu.cpu_id);
        }
        run_queues[prio].push_back(context_ref);
    }
}
/// Picks the next context to run from the given (per-CPU) run queues.
///
/// Selection happens in three phases:
/// 1. Every queue is scanned once in index order; the first eligible context with the
///    `Fifo` or `RoundRobin` policy is returned immediately.
/// 2. Among the eligible `SchedPolicy::Other` contexts, the one with the smallest virtual
///    runtime is chosen (contexts that last ran on `cpu_id` get a 1/8 discount).
/// 3. Otherwise the queues are visited round-robin starting after `*i`, using `balance` as
///    a per-queue credit counter, and the first context accepted by `update_runnable` wins.
///
/// "Eligible" means runnable, not running, and allowed on `cpu_id`. The previous context
/// and the idle context are never selected. A returned context has been removed from its
/// queue; blocked contexts found in phase 3 are moved to the idle context list.
///
/// # Parameters
/// - `balance`: per-queue credit, updated in place.
/// - `i`: index of the queue visited last, updated in place.
///
/// # Returns
/// The write guard of the selected context, or `None` if nothing can run.
fn pick_next_from_queues(
    token: &mut CleanLockToken,
    contexts_list: &mut [alloc::collections::VecDeque<WeakContextRef>; RUN_QUEUE_COUNT],
    cpu_id: LogicalCpuId,
    switch_time: u128,
    prev_context_lock: &Arc<ContextLock>,
    idle_context: &Arc<ContextLock>,
    balance: &mut [usize; RUN_QUEUE_COUNT],
    i: &mut usize,
) -> Option<ArcContextLockWriteGuard> {
    let mut empty_queues = 0;
    let mut total_iters = 0;
    let total_contexts: usize = contexts_list.iter().map(|q| q.len()).sum();
    let mut skipped_contexts = 0;
    // Phase 1: Fifo / RoundRobin contexts take precedence over everything else.
    for prio in 0..RUN_QUEUE_COUNT {
        let rt_contexts = contexts_list
            .get_mut(prio)
            .expect("prio should be between [0, 39]");
        let len = rt_contexts.len();
        for _ in 0..len {
            let (rt_ref, rt_lock) = match rt_contexts.pop_front() {
                Some(lock) => match lock.upgrade() {
                    Some(l) => (lock, l),
                    None => {
                        // Dead reference: it is dropped from the queue.
                        skipped_contexts += 1;
                        continue;
                    }
                },
                None => break,
            };
            if Arc::ptr_eq(&rt_lock, idle_context) || Arc::ptr_eq(&rt_lock, prev_context_lock) {
                rt_contexts.push_back(rt_ref);
                continue;
            }
            let rt_guard = unsafe { rt_lock.write_arc() };
            if !rt_guard.status.is_runnable()
                || rt_guard.running
                || !rt_guard.sched_affinity.contains(cpu_id)
            {
                rt_contexts.push_back(rt_ref);
                continue;
            }
            if rt_guard.sched_policy == SchedPolicy::Fifo
                || rt_guard.sched_policy == SchedPolicy::RoundRobin
            {
                // `rt_ref` is not pushed back, so the context leaves the queue.
                return Some(rt_guard);
            }
            rt_contexts.push_back(rt_ref);
        }
    }
    // Phase 2: pick the `Other` context with the smallest virtual runtime.
    {
        let mut min_vruntime = u128::MAX;
        let mut best: Option<(usize, WeakContextRef)> = None;
        for (prio, queue) in contexts_list.iter().enumerate() {
            for ctx_ref in queue.iter() {
                if let Some(ctx_lock) = ctx_ref.upgrade() {
                    if Arc::ptr_eq(&ctx_lock, prev_context_lock)
                        || Arc::ptr_eq(&ctx_lock, idle_context)
                    {
                        continue;
                    }
                    // Contexts whose lock cannot be read without waiting are not considered.
                    if let Some(guard) = ctx_lock.try_read(token.token()) {
                        if guard.status.is_runnable()
                            && !guard.running
                            && guard.sched_affinity.contains(cpu_id)
                            && guard.sched_policy == SchedPolicy::Other
                        {
                            let mut vruntime = guard.vruntime;
                            // Favour contexts that last ran on this CPU.
                            if guard.last_cpu == Some(cpu_id) {
                                vruntime = vruntime.saturating_sub(vruntime / 8);
                            }
                            drop(guard);
                            if vruntime < min_vruntime {
                                min_vruntime = vruntime;
                                best = Some((prio, ctx_ref.clone()));
                            }
                        }
                    }
                }
            }
        }
        if let Some((best_prio, ctx_ref)) = best {
            // Take the candidate out of its queue, then re-check it under the write lock
            // because its state may have changed since the read above.
            contexts_list[best_prio].retain(|r| !WeakContextRef::eq(r, &ctx_ref));
            if let Some(ctx_lock) = ctx_ref.upgrade() {
                let guard = unsafe { ctx_lock.write_arc() };
                if guard.status.is_runnable()
                    && !guard.running
                    && guard.sched_affinity.contains(cpu_id)
                    && guard.sched_policy == SchedPolicy::Other
                {
                    return Some(guard);
                }
                drop(guard);
                contexts_list[best_prio].push_back(ctx_ref);
            }
        }
    }
    // Phase 3: round-robin over the queues with per-queue credit.
    'priority: loop {
        *i = (*i + 1) % RUN_QUEUE_COUNT;
        total_iters += 1;
        // Hard upper bound on the number of queue visits.
        if total_iters >= 5000 {
            break 'priority;
        }
        if skipped_contexts > total_contexts && total_contexts > 0 {
            break 'priority;
        }
        let contexts = contexts_list
            .get_mut(*i)
            .expect("i should be between [0, 39]!");
        if contexts.is_empty() {
            // Stop once a full round of consecutive empty queues has been seen.
            empty_queues += 1;
            if empty_queues >= RUN_QUEUE_COUNT {
                break 'priority;
            }
            continue;
        }
        empty_queues = 0;
        // A queue may only run a context once it has at least the default weight in credit;
        // until then it is topped up by the weight of its own priority on each visit.
        if balance[*i] < SCHED_PRIO_TO_WEIGHT[20] {
            balance[*i] += SCHED_PRIO_TO_WEIGHT[*i];
            continue;
        }
        let len = contexts.len();
        for _ in 0..len {
            let (next_context_ref, next_context_lock) = match contexts.pop_front() {
                Some(lock) => match lock.upgrade() {
                    Some(new_lock) => (lock, new_lock),
                    None => {
                        skipped_contexts += 1;
                        continue;
                    }
                },
                None => break,
            };
            if Arc::ptr_eq(&next_context_lock, prev_context_lock)
                || Arc::ptr_eq(&next_context_lock, idle_context)
            {
                contexts.push_back(next_context_ref);
                continue;
            }
            let mut next_context_guard = unsafe { next_context_lock.write_arc() };
            let sw = unsafe { update_runnable(&mut next_context_guard, cpu_id, switch_time) };
            if let UpdateResult::CanSwitch = sw {
                // Running a context costs the queue one default weight of credit.
                balance[*i] -= SCHED_PRIO_TO_WEIGHT[20];
                return Some(next_context_guard);
            }
            if matches!(sw, UpdateResult::Blocked) {
                idle_contexts(token.downgrade()).push_back(next_context_ref);
            } else {
                contexts.push_back(next_context_ref);
            }
            skipped_contexts += 1;
            if skipped_contexts >= total_contexts {
                break 'priority;
            }
        }
    }
    None
}
/// Picks the next context to run from the global run queues.
///
/// This mirrors `pick_next_from_queues`; the only differences are the lock token type and
/// the way the idle context list is locked (`token.token()`). Selection happens in three
/// phases:
/// 1. Every queue is scanned once in index order; the first eligible context with the
///    `Fifo` or `RoundRobin` policy is returned immediately.
/// 2. Among the eligible `SchedPolicy::Other` contexts, the one with the smallest virtual
///    runtime is chosen (contexts that last ran on `cpu_id` get a 1/8 discount).
/// 3. Otherwise the queues are visited round-robin starting after `*i`, using `balance` as
///    a per-queue credit counter, and the first context accepted by `update_runnable` wins.
///
/// "Eligible" means runnable, not running, and allowed on `cpu_id`. The previous context
/// and the idle context are never selected. A returned context has been removed from its
/// queue; blocked contexts found in phase 3 are moved to the idle context list.
///
/// # Parameters
/// - `balance`: per-queue credit, updated in place.
/// - `i`: index of the queue visited last, updated in place.
///
/// # Returns
/// The write guard of the selected context, or `None` if nothing can run.
fn pick_next_from_global_queues(
    token: &mut LockToken<L1>,
    contexts_list: &mut [alloc::collections::VecDeque<WeakContextRef>; RUN_QUEUE_COUNT],
    cpu_id: LogicalCpuId,
    switch_time: u128,
    prev_context_lock: &Arc<ContextLock>,
    idle_context: &Arc<ContextLock>,
    balance: &mut [usize; RUN_QUEUE_COUNT],
    i: &mut usize,
) -> Option<ArcContextLockWriteGuard> {
    let mut empty_queues = 0;
    let mut total_iters = 0;
    let total_contexts: usize = contexts_list.iter().map(|q| q.len()).sum();
    let mut skipped_contexts = 0;
    // Phase 1: Fifo / RoundRobin contexts take precedence over everything else.
    for prio in 0..RUN_QUEUE_COUNT {
        let rt_contexts = contexts_list
            .get_mut(prio)
            .expect("prio should be between [0, 39]");
        let len = rt_contexts.len();
        for _ in 0..len {
            let (rt_ref, rt_lock) = match rt_contexts.pop_front() {
                Some(lock) => match lock.upgrade() {
                    Some(l) => (lock, l),
                    None => {
                        // Dead reference: it is dropped from the queue.
                        skipped_contexts += 1;
                        continue;
                    }
                },
                None => break,
            };
            if Arc::ptr_eq(&rt_lock, idle_context) || Arc::ptr_eq(&rt_lock, prev_context_lock) {
                rt_contexts.push_back(rt_ref);
                continue;
            }
            let rt_guard = unsafe { rt_lock.write_arc() };
            if !rt_guard.status.is_runnable()
                || rt_guard.running
                || !rt_guard.sched_affinity.contains(cpu_id)
            {
                rt_contexts.push_back(rt_ref);
                continue;
            }
            if rt_guard.sched_policy == SchedPolicy::Fifo
                || rt_guard.sched_policy == SchedPolicy::RoundRobin
            {
                // `rt_ref` is not pushed back, so the context leaves the queue.
                return Some(rt_guard);
            }
            rt_contexts.push_back(rt_ref);
        }
    }
    // Phase 2: pick the `Other` context with the smallest virtual runtime.
    {
        let mut min_vruntime = u128::MAX;
        let mut best: Option<(usize, WeakContextRef)> = None;
        for (prio, queue) in contexts_list.iter().enumerate() {
            for ctx_ref in queue.iter() {
                if let Some(ctx_lock) = ctx_ref.upgrade() {
                    if Arc::ptr_eq(&ctx_lock, prev_context_lock)
                        || Arc::ptr_eq(&ctx_lock, idle_context)
                    {
                        continue;
                    }
                    // Contexts whose lock cannot be read without waiting are not considered.
                    if let Some(guard) = ctx_lock.try_read(token.token()) {
                        if guard.status.is_runnable()
                            && !guard.running
                            && guard.sched_affinity.contains(cpu_id)
                            && guard.sched_policy == SchedPolicy::Other
                        {
                            let mut vruntime = guard.vruntime;
                            // Favour contexts that last ran on this CPU.
                            if guard.last_cpu == Some(cpu_id) {
                                vruntime = vruntime.saturating_sub(vruntime / 8);
                            }
                            drop(guard);
                            if vruntime < min_vruntime {
                                min_vruntime = vruntime;
                                best = Some((prio, ctx_ref.clone()));
                            }
                        }
                    }
                }
            }
        }
        if let Some((best_prio, ctx_ref)) = best {
            // Take the candidate out of its queue, then re-check it under the write lock
            // because its state may have changed since the read above.
            contexts_list[best_prio].retain(|r| !WeakContextRef::eq(r, &ctx_ref));
            if let Some(ctx_lock) = ctx_ref.upgrade() {
                let guard = unsafe { ctx_lock.write_arc() };
                if guard.status.is_runnable()
                    && !guard.running
                    && guard.sched_affinity.contains(cpu_id)
                    && guard.sched_policy == SchedPolicy::Other
                {
                    return Some(guard);
                }
                drop(guard);
                contexts_list[best_prio].push_back(ctx_ref);
            }
        }
    }
    // Phase 3: round-robin over the queues with per-queue credit.
    'priority: loop {
        *i = (*i + 1) % RUN_QUEUE_COUNT;
        total_iters += 1;
        // Hard upper bound on the number of queue visits.
        if total_iters >= 5000 {
            break 'priority;
        }
        if skipped_contexts > total_contexts && total_contexts > 0 {
            break 'priority;
        }
        let contexts = contexts_list
            .get_mut(*i)
            .expect("i should be between [0, 39]!");
        if contexts.is_empty() {
            // Stop once a full round of consecutive empty queues has been seen.
            empty_queues += 1;
            if empty_queues >= RUN_QUEUE_COUNT {
                break 'priority;
            }
            continue;
        }
        empty_queues = 0;
        // A queue may only run a context once it has at least the default weight in credit;
        // until then it is topped up by the weight of its own priority on each visit.
        if balance[*i] < SCHED_PRIO_TO_WEIGHT[20] {
            balance[*i] += SCHED_PRIO_TO_WEIGHT[*i];
            continue;
        }
        let len = contexts.len();
        for _ in 0..len {
            let (next_context_ref, next_context_lock) = match contexts.pop_front() {
                Some(lock) => match lock.upgrade() {
                    Some(new_lock) => (lock, new_lock),
                    None => {
                        skipped_contexts += 1;
                        continue;
                    }
                },
                None => break,
            };
            if Arc::ptr_eq(&next_context_lock, prev_context_lock)
                || Arc::ptr_eq(&next_context_lock, idle_context)
            {
                contexts.push_back(next_context_ref);
                continue;
            }
            let mut next_context_guard = unsafe { next_context_lock.write_arc() };
            let sw = unsafe { update_runnable(&mut next_context_guard, cpu_id, switch_time) };
            if let UpdateResult::CanSwitch = sw {
                // Running a context costs the queue one default weight of credit.
                balance[*i] -= SCHED_PRIO_TO_WEIGHT[20];
                return Some(next_context_guard);
            }
            if matches!(sw, UpdateResult::Blocked) {
                idle_contexts(token.token()).push_back(next_context_ref);
            } else {
                contexts.push_back(next_context_ref);
            }
            skipped_contexts += 1;
            if skipped_contexts >= total_contexts {
                break 'priority;
            }
        }
    }
    None
}
/// Decides whether `context` can be taken over by another CPU right now.
///
/// This is the affinity-agnostic counterpart of `update_runnable`. A soft-blocked context
/// whose wake time has been reached is unblocked (without IPI) as a side effect before its
/// status is evaluated.
///
/// # Returns
/// - `UpdateResult::Skip` if the context is already running.
/// - `UpdateResult::CanSwitch` if the context is runnable.
/// - `UpdateResult::Blocked` otherwise.
///
/// # Safety
/// NOTE(review): nothing in the body is visibly unsafe; the contract is presumably inherited
/// from the scheduler's locking rules — confirm with callers.
unsafe fn update_stealable(context: &mut Context, switch_time: u128) -> UpdateResult {
    if context.running {
        return UpdateResult::Skip;
    }

    // Wake up sleepers whose deadline has passed.
    let wake_reached = context.status.is_soft_blocked()
        && context.wake.is_some_and(|deadline| switch_time >= deadline);
    if wake_reached {
        context.wake = None;
        context.unblock_no_ipi();
    }

    if !context.status.is_runnable() {
        return UpdateResult::Blocked;
    }
    UpdateResult::CanSwitch
}
/// This is the scheduler function which currently utilises Deficit Weighted Round Robin Scheduler
///
/// Candidates are looked up in three places, stopping at the first hit:
/// 1. the run queues of this CPU,
/// 2. the run queues of other CPUs (work stealing),
/// 3. the global run queues.
///
/// Once the lookup is finished, the previous context is queued again. If nothing was found,
/// the idle context is selected unless the CPU was already idle or is already running it.
///
/// # Returns
/// `Ok(Some(guard))` with the write guard of the context to switch to, or `Ok(None)` if the
/// CPU should stay where it is. The `Err` variant is currently never produced.
fn select_next_context(
    token: &mut CleanLockToken,
    percpu: &PercpuBlock,
    cpu_id: LogicalCpuId,
    switch_time: u128,
    was_idle: bool,
    prev_context_guard: &mut ArcRwLockWriteGuard<L4, Context>,
) -> Result<Option<ArcContextLockWriteGuard>, SwitchResult> {
    let idle_context = percpu.switch_internals.idle_context();
    let prev_context_lock = crate::context::current();
    let sched = &percpu.sched;

    // 1. Local run queues. The block scope releases the queue lock before anything else
    //    (work stealing, re-queueing the previous context) needs a scheduler lock.
    let mut chosen = {
        let mut local_queues = SchedQueuesLock::new(sched);
        let mut balance = sched.balance.get();
        let mut cursor = sched.last_queue.get() % RUN_QUEUE_COUNT;
        let picked = pick_next_from_queues(
            token,
            unsafe { local_queues.queues_mut() },
            cpu_id,
            switch_time,
            &prev_context_lock,
            &idle_context,
            &mut balance,
            &mut cursor,
        );
        sched.balance.set(balance);
        sched.last_queue.set(cursor);
        picked
    };

    // 2. Work stealing from the other CPUs.
    if chosen.is_none() {
        chosen = steal_work(token, cpu_id, switch_time);
    }

    // 3. Global run queues.
    if chosen.is_none() {
        let global_queues = run_contexts(token.token());
        let (mut global_queues, mut global_token) = global_queues.into_split();
        let mut balance = sched.balance.get();
        let mut cursor = sched.last_queue.get() % RUN_QUEUE_COUNT;
        let picked = pick_next_from_global_queues(
            &mut global_token,
            &mut global_queues.set,
            cpu_id,
            switch_time,
            &prev_context_lock,
            &idle_context,
            &mut balance,
            &mut cursor,
        );
        sched.balance.set(balance);
        sched.last_queue.set(cursor);
        chosen = picked;
    }

    // The previous context is re-queued in every case, after the lookup is complete.
    queue_previous_context(
        token,
        percpu,
        &prev_context_lock,
        prev_context_guard,
        &idle_context,
    );

    if chosen.is_some() {
        return Ok(chosen);
    }

    // Nothing to run: fall back to the idle context unless we are effectively there already.
    if was_idle || Arc::ptr_eq(&prev_context_lock, &idle_context) {
        return Ok(None);
    }
    Ok(Some(unsafe { idle_context.write_arc() }))
}
/// Holds per-CPU state necessary for context switching.
///
/// This struct contains information such as the idle context, current context, and PIT tick counts,
/// as well as fields required for managing ptrace sessions and signals.
pub struct ContextSwitchPercpu {
    /// Guards of the previous and next context of an in-flight switch. Set by `switch` and
    /// taken (and dropped) by `switch_finish_hook`.
    switch_result: Cell<Option<SwitchResultInner>>,
    /// Monotonic time recorded by the last `switch` on this CPU that got past the
    /// preemptability check.
    switch_time: Cell<u128>,
    /// Ticks counted by `tick` since the last `switch`, which resets it to 0.
    pit_ticks: Cell<usize>,
    /// The context currently installed as running on this CPU, if any.
    current_ctxt: RefCell<Option<Arc<ContextLock>>>,
    /// The idle process.
    idle_ctxt: RefCell<Option<Arc<ContextLock>>>,
    /// Copy of the `being_sigkilled` flag of the context that was switched to last.
    pub(crate) being_sigkilled: Cell<bool>,
}
impl ContextSwitchPercpu {
    /// Creates the initial per-CPU state: no current context, no idle context, all counters
    /// and flags cleared.
    pub const fn default() -> Self {
        Self {
            switch_result: Cell::new(None),
            switch_time: Cell::new(0),
            pit_ticks: Cell::new(0),
            current_ctxt: RefCell::new(None),
            idle_ctxt: RefCell::new(None),
            being_sigkilled: Cell::new(false),
        }
    }

    /// Runs `f` with a reference to the current context and returns its result.
    ///
    /// The current context stays borrowed while `f` runs.
    ///
    /// # Panics
    /// Panics if no current context has been set on this CPU.
    pub fn with_context<T>(&self, f: impl FnOnce(&Arc<ContextLock>) -> T) -> T {
        let current = self.current_ctxt.borrow();
        let context = current.as_ref().expect("not inside of context");
        f(context)
    }

    /// Runs `f` with the current context, if there is one, and returns its result.
    ///
    /// Unlike [`Self::with_context`] this never panics for a missing context; `f` receives
    /// `None` in that case. The current context stays borrowed while `f` runs.
    pub fn try_with_context<T>(&self, f: impl FnOnce(Option<&Arc<ContextLock>>) -> T) -> T {
        let current = self.current_ctxt.borrow();
        f(current.as_ref())
    }

    /// Installs `new` as the current context of this CPU.
    ///
    /// # Safety
    /// This function is unsafe as it modifies the context state directly.
    pub unsafe fn set_current_context(&self, new: Arc<ContextLock>) {
        let mut slot = self.current_ctxt.borrow_mut();
        *slot = Some(new);
    }

    /// Installs `new` as the idle context of this CPU.
    ///
    /// # Safety
    /// This function is unsafe as it modifies the idle context state directly.
    pub unsafe fn set_idle_context(&self, new: Arc<ContextLock>) {
        let mut slot = self.idle_ctxt.borrow_mut();
        *slot = Some(new);
    }

    /// Returns a new handle (cloned `Arc`) to the idle context of this CPU.
    ///
    /// # Panics
    /// Panics if the idle context has not been set yet. Use [`Self::try_idle_context`] when
    /// that can legitimately happen.
    pub fn idle_context(&self) -> Arc<ContextLock> {
        self.try_idle_context().expect("no idle context present")
    }

    /// Returns a new handle (cloned `Arc`) to the idle context if it has been initialized.
    ///
    /// This is the fallible variant of [`Self::idle_context`], intended for cross-CPU paths
    /// (`steal_work`, `migrate_one_context`) that may access a PercpuBlock whose
    /// `context::init()` has not run yet during AP bring-up.
    ///
    /// # Returns
    /// `Some(Arc<ContextLock>)` if the idle context is set, `None` otherwise.
    pub fn try_idle_context(&self) -> Option<Arc<ContextLock>> {
        let idle = self.idle_ctxt.borrow();
        idle.as_ref().cloned()
    }
}