RedBear-OS/src/context/switch.rs

//! This module provides a context-switching mechanism that utilizes a simple round-robin scheduler.
//! The scheduler iterates over available contexts, selecting the next context to run, while
//! handling process states and synchronization.

use crate::{
    context::{
        self, arch, idle_contexts, idle_contexts_try, run_contexts, ArcContextLockWriteGuard,
        Context, ContextLock, SchedPolicy, WeakContextRef, RUN_QUEUE_COUNT,
    },
    cpu_set::{LogicalCpuId, LogicalCpuSet},
    cpu_stats::{self, CpuState},
    percpu::{get_percpu_block, PerCpuSched, PercpuBlock},
    sync::{ArcRwLockWriteGuard, CleanLockToken, LockToken, L1, L4},
};
use alloc::{sync::Arc, vec::Vec};
use core::{
    cell::{Cell, RefCell},
    hint, mem,
    sync::atomic::{AtomicUsize, Ordering},
};
use syscall::PtraceFlags;

/// Outcome of checking whether a queued context may be switched to
/// (returned by `update_runnable` and `update_stealable`).
enum UpdateResult {
    /// The context is runnable and may be selected.
    CanSwitch,
    /// The context must be left where it is: it is already running or, in
    /// `update_runnable`, its affinity set does not contain the requesting CPU.
    Skip,
    /// The context is not runnable (even after an expired wake-up was applied).
    Blocked,
}

// Scheduling weight for each of the 40 priority levels, listed in descending order.
// The table is roughly a geometric series where value[i] ~= value[i - 1] / 1.25
// (equivalently value[i - 1] ~= value[i] * 1.25). Index 20 (weight 1024) is the
// reference weight used by the vruntime and deficit calculations in this file.
const SCHED_PRIO_TO_WEIGHT: [usize; 40] = [
    88761, 71755, 56483, 46273, 36291, 29154, 23254, 18705, 14949, 11916, 9548, 7620, 6100, 4904,
    3906, 3121, 2501, 1991, 1586, 1277, 1024, 820, 655, 526, 423, 335, 272, 215, 172, 137, 110, 87,
    70, 56, 45, 36, 29, 23, 18, 15,
];

/// Minimum gap between two load-balancing passes in `maybe_balance_queues`, compared against
/// differences of `crate::time::monotonic` timestamps. Going by the name the unit is
/// nanoseconds, i.e. 100 ms — NOTE(review): confirm the unit of `monotonic`.
const LOAD_BALANCE_INTERVAL_NS: u128 = 100_000_000;

/// Running count of contexts taken from another CPU's run queues; `steal_work` increments it
/// (relaxed ordering) each time a steal succeeds.
static SCHED_STEAL_COUNT: AtomicUsize = AtomicUsize::new(0);

/// Scoped holder of a `PerCpuSched` lock: `new` calls `take_lock` on the scheduler and the
/// `Drop` implementation calls `release_lock`, so the lock is held exactly as long as this
/// value is alive.
struct SchedQueuesLock<'a> {
    // Scheduler whose lock is held for the lifetime of this value.
    sched: &'a PerCpuSched,
}

impl<'a> SchedQueuesLock<'a> {
    fn new(sched: &'a PerCpuSched) -> Self {
        sched.take_lock();
        Self { sched }
    }

    unsafe fn queues_mut(
        &mut self,
    ) -> &mut [alloc::collections::VecDeque<WeakContextRef>; RUN_QUEUE_COUNT] {
        unsafe { self.sched.queues_mut() }
    }
}

/// Releases the scheduler lock taken in `SchedQueuesLock::new`.
impl Drop for SchedQueuesLock<'_> {
    fn drop(&mut self) {
        // Counterpart of the `take_lock` call made when the guard was created.
        self.sched.release_lock();
    }
}

/// Replaces the affinity set of `context` with a set containing only `cpu_id`.
///
/// Any CPUs previously present in `sched_affinity` are discarded.
fn assign_context_to_cpu(context: &mut Context, cpu_id: LogicalCpuId) {
    let affinity = &mut context.sched_affinity;
    // Start from an empty set, then add the single allowed CPU.
    *affinity = LogicalCpuSet::empty();
    affinity.atomic_set(cpu_id);
}

/// Decides whether `context` may be switched to on `cpu_id` at `switch_time`.
///
/// A soft-blocked context whose wake-up time has been reached is unblocked (without an IPI)
/// as a side effect before its runnable state is evaluated.
///
/// # Returns
/// - `UpdateResult::Skip` if the context is already running or its affinity excludes `cpu_id`.
/// - `UpdateResult::CanSwitch` if the context is runnable.
/// - `UpdateResult::Blocked` otherwise.
unsafe fn update_runnable(
    context: &mut Context,
    cpu_id: LogicalCpuId,
    switch_time: u128,
) -> UpdateResult {
    // Already on a CPU, or not allowed on this one: leave it untouched.
    if context.running || !context.sched_affinity.contains(cpu_id) {
        return UpdateResult::Skip;
    }

    // Apply a pending timed wake-up whose deadline has passed.
    let wake_elapsed = context.status.is_soft_blocked()
        && matches!(context.wake, Some(deadline) if switch_time >= deadline);
    if wake_elapsed {
        context.wake = None;
        context.unblock_no_ipi();
    }

    match context.status.is_runnable() {
        true => UpdateResult::CanSwitch,
        false => UpdateResult::Blocked,
    }
}

/// Write guards of the two contexts taking part in a switch.
///
/// `switch` stores this in the per-CPU `switch_result` cell just before calling
/// `arch::switch_to`; `switch_finish_hook` takes it out and drops it, which releases both
/// context locks.
struct SwitchResultInner {
    // Guard of the context being switched away from.
    _prev_guard: ArcContextLockWriteGuard,
    // Guard of the context being switched to.
    _next_guard: ArcContextLockWriteGuard,
}

/// Timer-tick entry point of the scheduler.
///
/// Every call bumps the per-CPU tick counter and gives the load balancer a chance to run
/// (`maybe_balance_queues` decides on its own whether a pass is due). Once the counter has
/// reached 3, a context switch is requested and the signal handler is invoked afterwards.
/// The counter itself is reset inside `switch`.
pub fn tick(token: &mut CleanLockToken) {
    let percpu = PercpuBlock::current();
    let pit_ticks = &percpu.switch_internals.pit_ticks;

    let elapsed_ticks = pit_ticks.get() + 1;
    pit_ticks.set(elapsed_ticks);

    let now = crate::time::monotonic(token);
    maybe_balance_queues(token, percpu, now);

    // Fewer than 3 ticks since the last switch: nothing more to do on this tick.
    if elapsed_ticks < 3 {
        return;
    }

    switch(token);
    crate::context::signal::signal_handler(token);
}

/// Completes a context switch started by `switch`.
///
/// Takes the pending `SwitchResultInner` out of the per-CPU `switch_result` cell and drops it
/// (releasing the guards it holds), then clears `CONTEXT_SWITCH_LOCK` and runs the
/// architecture-specific per-CPU switch hook. If no switch result is pending, the machine is
/// reset via `emergency_reset`.
///
/// # Safety
/// Must only be called as the tail of a context switch, i.e. after `switch` has stored a
/// switch result and taken `CONTEXT_SWITCH_LOCK`.
pub unsafe extern "C" fn switch_finish_hook() {
    unsafe {
        let pending = PercpuBlock::current().switch_internals.switch_result.take();
        if let Some(guards) = pending {
            // Dropping the guards unlocks both contexts involved in the switch.
            drop(guards);
        } else {
            // TODO: unreachable_unchecked()?
            crate::arch::stop::emergency_reset();
        }
        arch::CONTEXT_SWITCH_LOCK.store(false, Ordering::SeqCst);
        crate::percpu::switch_arch_hook();
    }
}

/// Result reported by `switch`.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum SwitchResult {
    /// A next context was selected and switched to. `switch` also returns this value when
    /// the current context is not preemptable and no switch is attempted.
    Switched,
    /// No context to switch to was found; the CPU state was set to idle.
    AllContextsIdle,
}

/// This function performs the context switch, using select_next_context to
/// actually select the next context to switch to.
///
/// The per-CPU tick counter is reset, the global `CONTEXT_SWITCH_LOCK` is taken, expired
/// wake-ups are processed, per-CPU and per-context time accounting is updated, and finally
/// `arch::switch_to` is invoked. The guards of both contexts are parked in the per-CPU
/// `switch_result` cell and released by `switch_finish_hook`.
///
/// # Warning
/// This is not memory-unsafe to call. But do NOT call this while holding locks!
///
/// # Returns
/// - `SwitchResult::Switched`: Indicates a successful switch to a new context. Also returned
///   when the current context is not preemptable, in which case no switch is attempted.
/// - `SwitchResult::AllContextsIdle`: Indicates all contexts are idle, and the CPU will switch
///   to an idle context.
/// - Whatever `select_next_context` returns in its `Err` variant is passed through unchanged.
pub fn switch(token: &mut CleanLockToken) -> SwitchResult {
    let switch_time = crate::time::monotonic(token);

    let percpu = PercpuBlock::current();
    cpu_stats::add_context_switch();

    // Set the PIT interrupt counter to 0, giving each process the same amount of PIT ticks.
    percpu.switch_internals.pit_ticks.set(0);

    // Acquire the global lock to ensure exclusive access during context switch and avoid
    // issues that would be caused by the unsafe operations below
    // TODO: Better memory orderings?
    while arch::CONTEXT_SWITCH_LOCK
        .compare_exchange_weak(false, true, Ordering::SeqCst, Ordering::Relaxed)
        .is_err()
    {
        hint::spin_loop();
        // Keep servicing TLB shootdown requests while spinning.
        percpu.maybe_handle_tlb_shootdown();
    }

    // Lock the previous context.
    let prev_context_lock = crate::context::current();
    // We are careful not to lock this context twice
    let mut prev_context_guard = unsafe { prev_context_lock.write_arc() };

    // A non-preemptable context keeps the CPU: release the global lock and report `Switched`.
    if !prev_context_guard.is_preemptable() {
        arch::CONTEXT_SWITCH_LOCK.store(false, Ordering::SeqCst);
        return SwitchResult::Switched;
    }

    // Alarm (previously in update_runnable)
    wakeup_contexts(token, percpu, switch_time);

    let cpu_id = crate::cpu_id();

    // Update per-cpu times
    let percpu_nanos = switch_time.saturating_sub(percpu.switch_internals.switch_time.get()) as u64;
    let percpu_ms = percpu_nanos / 1_000_000;
    let was_idle = percpu.stats.add_time(percpu_ms) == CpuState::Idle as u8;
    percpu.switch_internals.switch_time.set(switch_time);

    // NOTE(review): the `Err` path below returns without releasing `CONTEXT_SWITCH_LOCK` in
    // this function — confirm that `select_next_context` releases it before returning `Err`.
    let switch_context_opt = match select_next_context(
        token,
        percpu,
        cpu_id,
        switch_time,
        was_idle,
        &mut prev_context_guard,
    ) {
        Ok(opt) => opt,
        Err(early_ret) => return early_ret,
    };

    // Switch process states, TSS stack pointer, and store new context ID
    match switch_context_opt {
        Some(mut next_context_guard) => {
            // Update context states and prepare for the switch.
            let prev_context = &mut *prev_context_guard;
            let next_context = &mut *next_context_guard;

            // Set the previous context as "not running"
            prev_context.running = false;
            // Remember where it last ran; the vruntime selection favours that CPU.
            prev_context.last_cpu = prev_context.cpu_id;

            // Set the next context as "running"
            next_context.running = true;
            // Set the CPU ID for the next context
            next_context.cpu_id = Some(cpu_id);

            // Update times
            if !was_idle {
                prev_context.cpu_time += switch_time.saturating_sub(prev_context.switch_time);
                if prev_context.sched_policy == SchedPolicy::Other {
                    // Charge virtual runtime scaled by default_weight / weight, so contexts
                    // with a larger weight accumulate vruntime more slowly.
                    let actual_ns = switch_time.saturating_sub(prev_context.switch_time);
                    let weight =
                        SCHED_PRIO_TO_WEIGHT[prev_context.sched_static_prio.min(39)] as u128;
                    let default_weight = SCHED_PRIO_TO_WEIGHT[20] as u128;
                    let delta = actual_ns.saturating_mul(default_weight) / weight.max(1);
                    prev_context.vruntime = prev_context.vruntime.saturating_add(delta);
                }
            }
            next_context.switch_time = switch_time;
            if next_context.userspace {
                percpu.stats.set_state(cpu_stats::CpuState::User);
            } else {
                percpu.stats.set_state(cpu_stats::CpuState::Kernel);
            }
            unsafe {
                percpu.switch_internals.set_current_context(Arc::clone(
                    ArcContextLockWriteGuard::rwlock(&next_context_guard),
                ));
            }

            // FIXME set the switch result in arch::switch_to instead
            // The transmutes detach the references from the guards' borrows so that they
            // can still be used after the guards are moved into `switch_result` below.
            let prev_context = unsafe {
                mem::transmute::<&'_ mut Context, &'_ mut Context>(&mut *prev_context_guard)
            };
            let next_context = unsafe {
                mem::transmute::<&'_ mut Context, &'_ mut Context>(&mut *next_context_guard)
            };

            // Park both guards; `switch_finish_hook` takes and drops them after the switch.
            percpu
                .switch_internals
                .switch_result
                .set(Some(SwitchResultInner {
                    _prev_guard: prev_context_guard,
                    _next_guard: next_context_guard,
                }));

            /*let (ptrace_session, ptrace_flags) = if let Some((session, bp)) = ptrace::sessions()
                .get(&next_context.pid)
                .map(|s| (Arc::downgrade(s), s.data.lock().breakpoint))
            {
                (Some(session), bp.map_or(PtraceFlags::empty(), |f| f.flags))
            } else {
                (None, PtraceFlags::empty())
            };*/
            let ptrace_flags = PtraceFlags::empty();

            //*percpu.ptrace_session.borrow_mut() = ptrace_session;
            percpu.ptrace_flags.set(ptrace_flags);
            // Swap the per-CPU "inside syscall" flag: save the old value into the previous
            // context and install the next context's value.
            prev_context.inside_syscall =
                percpu.inside_syscall.replace(next_context.inside_syscall);

            #[cfg(feature = "syscall_debug")]
            {
                prev_context.syscall_debug_info = percpu
                    .syscall_debug_info
                    .replace(next_context.syscall_debug_info);
                prev_context.syscall_debug_info.on_switch_from(token);
                next_context.syscall_debug_info.on_switch_to(token);
            }

            percpu
                .switch_internals
                .being_sigkilled
                .set(next_context.being_sigkilled);

            unsafe {
                arch::switch_to(prev_context, next_context);
            }

            // NOTE: After switch_to is called, the return address can even be different from the
            // current return address, meaning that we cannot use local variables here, and that we
            // need to use the `switch_finish_hook` to be able to release the locks. Newly created
            // contexts will return directly to the function pointer passed to context::spawn, and not
            // reach this code until the next context switch back.
            SwitchResult::Switched
        }
        _ => {
            // No target was found, unset global lock and return
            arch::CONTEXT_SWITCH_LOCK.store(false, Ordering::SeqCst);

            percpu.stats.set_state(cpu_stats::CpuState::Idle);

            SwitchResult::AllContextsIdle
        }
    }
}

/// Puts the context being switched away from back onto a queue.
///
/// - The per-CPU idle context is never queued.
/// - A runnable context is appended to this CPU's run queue for its `prio`.
/// - Any other context is appended to the global idle list.
fn queue_previous_context(
    token: &mut CleanLockToken,
    percpu: &PercpuBlock,
    prev_context_lock: &Arc<ContextLock>,
    prev_context_guard: &ArcRwLockWriteGuard<L4, Context>,
    idle_context: &Arc<ContextLock>,
) {
    if Arc::ptr_eq(prev_context_lock, idle_context) {
        return;
    }

    let outgoing = WeakContextRef(Arc::downgrade(prev_context_lock));

    // Not runnable: park it on the idle list until it is woken up.
    if !prev_context_guard.status.is_runnable() {
        idle_contexts(token.downgrade()).push_back(outgoing);
        return;
    }

    let level = prev_context_guard.prio;
    let mut local_queues = SchedQueuesLock::new(&percpu.sched);
    unsafe {
        local_queues.queues_mut()[level].push_back(outgoing);
    }
}

/// Removes the first context from `queues` that can be moved to `target_cpu`.
///
/// Queues are scanned in index order and each entry present at the start of a queue's scan is
/// examined once. Dead weak references are dropped, `idle_context` and contexts that are
/// currently running are rotated to the back of their queue, and blocked contexts are moved
/// to the global idle list. The first context that `update_stealable` reports as switchable
/// gets its affinity set to `target_cpu` and is returned together with its queue index.
///
/// Returns `None` when no queue holds a movable context.
fn pop_movable_context(
    token: &mut CleanLockToken,
    queues: &mut [alloc::collections::VecDeque<WeakContextRef>; RUN_QUEUE_COUNT],
    target_cpu: LogicalCpuId,
    switch_time: u128,
    idle_context: &Arc<ContextLock>,
) -> Option<(usize, WeakContextRef)> {
    for (prio, queue) in queues.iter_mut().enumerate() {
        let initial_len = queue.len();
        for _ in 0..initial_len {
            let Some(candidate_ref) = queue.pop_front() else {
                break;
            };
            let Some(candidate) = candidate_ref.upgrade() else {
                continue;
            };
            if Arc::ptr_eq(&candidate, idle_context) {
                queue.push_back(candidate_ref);
                continue;
            }

            let mut guard = unsafe { candidate.write_arc() };
            match unsafe { update_stealable(&mut guard, switch_time) } {
                UpdateResult::CanSwitch => {
                    assign_context_to_cpu(&mut guard, target_cpu);
                    let moved =
                        WeakContextRef(Arc::downgrade(ArcContextLockWriteGuard::rwlock(&guard)));
                    drop(guard);
                    return Some((prio, moved));
                }
                UpdateResult::Blocked => {
                    idle_contexts(token.downgrade()).push_back(candidate_ref);
                }
                UpdateResult::Skip => {
                    queue.push_back(candidate_ref);
                }
            }
        }
    }

    None
}

/// Tries to take a runnable context from another CPU's run queues.
///
/// The other CPUs are visited in ring order starting at `cpu_id + 1`. For each victim its
/// scheduler lock is held while its queues are scanned in index order; every entry present
/// at the start of a queue's scan is examined once:
/// - dead weak references are dropped,
/// - the victim's idle context and contexts that are currently running are rotated to the
///   back of their queue,
/// - blocked contexts are moved to the global idle list when that list can be locked without
///   waiting, and are otherwise kept on the victim's queue,
/// - the first context that `update_stealable` reports as switchable gets its affinity set to
///   `cpu_id`, `SCHED_STEAL_COUNT` is incremented, and its write guard is returned.
///
/// Returns `None` on single-CPU systems or when nothing could be stolen.
fn steal_work(
    token: &mut CleanLockToken,
    cpu_id: LogicalCpuId,
    switch_time: u128,
) -> Option<ArcContextLockWriteGuard> {
    let cpu_count = crate::cpu_count();
    if cpu_count <= 1 {
        return None;
    }

    for offset in 1..cpu_count {
        let victim_id = LogicalCpuId::new((cpu_id.get() + offset) % cpu_count);
        let Some(victim) = get_percpu_block(victim_id) else {
            continue;
        };

        let Some(victim_idle) = victim.switch_internals.try_idle_context() else {
            continue;
        };
        let mut victim_lock = SchedQueuesLock::new(&victim.sched);
        let victim_queues = unsafe { victim_lock.queues_mut() };

        for prio in 0..RUN_QUEUE_COUNT {
            let len = victim_queues[prio].len();
            for _ in 0..len {
                let Some(context_ref) = victim_queues[prio].pop_front() else {
                    break;
                };
                let Some(context_lock) = context_ref.upgrade() else {
                    continue;
                };
                if Arc::ptr_eq(&context_lock, &victim_idle) {
                    victim_queues[prio].push_back(context_ref);
                    continue;
                }

                let mut context_guard = unsafe { context_lock.write_arc() };
                let sw = unsafe { update_stealable(&mut context_guard, switch_time) };
                match sw {
                    UpdateResult::CanSwitch => {
                        assign_context_to_cpu(&mut context_guard, cpu_id);
                        SCHED_STEAL_COUNT.fetch_add(1, Ordering::Relaxed);
                        return Some(context_guard);
                    }
                    UpdateResult::Blocked => {
                        // Use try_lock to avoid deadlocking with a holder of IDLE_CONTEXTS
                        // that is waiting for the victim's scheduler lock we hold.
                        if let Some(mut idle) = idle_contexts_try(token.downgrade()) {
                            idle.push_back(context_ref);
                        } else {
                            // IDLE_CONTEXTS is busy. The reference must stay on *some* queue:
                            // dropping it here would leave the context on neither the run
                            // queues nor the idle list, so nothing would ever wake or
                            // schedule it again. Keep it on the victim's queue; a later
                            // scheduling pass can move it to the idle list.
                            victim_queues[prio].push_back(context_ref);
                        }
                    }
                    UpdateResult::Skip => {
                        victim_queues[prio].push_back(context_ref);
                    }
                }
            }
        }
    }

    None
}

/// Returns the total number of entries across all run queues of `percpu`.
///
/// The CPU's scheduler lock is held while the queue lengths are read.
fn queue_depth(percpu: &PercpuBlock) -> usize {
    let mut guard = SchedQueuesLock::new(&percpu.sched);
    let queues = unsafe { guard.queues_mut() };

    let mut waiting = 0usize;
    for queue in queues.iter() {
        waiting += queue.len();
    }
    waiting
}

/// Moves one movable context from the run queues of `source_id` to those of `target_id`.
///
/// The context keeps its queue index (priority) on the target CPU. The source CPU's scheduler
/// lock is released before the target's is taken, so the two locks are never held together.
///
/// Returns `true` if a context was migrated, and `false` if either per-CPU block or the
/// source's idle context is unavailable, or if the source has nothing movable.
fn migrate_one_context(
    token: &mut CleanLockToken,
    source_id: LogicalCpuId,
    target_id: LogicalCpuId,
    switch_time: u128,
) -> bool {
    let Some(source) = get_percpu_block(source_id) else {
        return false;
    };
    let Some(target) = get_percpu_block(target_id) else {
        return false;
    };
    let Some(source_idle) = source.switch_internals.try_idle_context() else {
        return false;
    };

    // The donor lock lives only inside this block.
    let picked = {
        let mut donor_lock = SchedQueuesLock::new(&source.sched);
        let donor_queues = unsafe { donor_lock.queues_mut() };
        pop_movable_context(token, donor_queues, target_id, switch_time, &source_idle)
    };

    match picked {
        None => false,
        Some((prio, context_ref)) => {
            let mut receiver_lock = SchedQueuesLock::new(&target.sched);
            unsafe {
                receiver_lock.queues_mut()[prio].push_back(context_ref);
            }
            true
        }
    }
}

/// Periodic load balancer; it only does work on the boot CPU of a multi-CPU system.
///
/// At most once per `LOAD_BALANCE_INTERVAL_NS` it snapshots the run-queue depth of every CPU
/// and, for each CPU whose queues are empty, tries to migrate one context from the most
/// loaded CPU whose depth exceeds the (rounded-up) average by more than one. The snapshot is
/// adjusted after every successful migration.
fn maybe_balance_queues(token: &mut CleanLockToken, percpu: &PercpuBlock, balance_time: u128) {
    if crate::cpu_count() <= 1 || percpu.cpu_id != LogicalCpuId::BSP {
        return;
    }

    let since_last = balance_time.saturating_sub(percpu.sched.last_balance_time.get());
    if since_last < LOAD_BALANCE_INTERVAL_NS {
        return;
    }
    percpu.sched.last_balance_time.set(balance_time);

    // Snapshot of (cpu, queue depth) for every CPU that has a per-CPU block.
    let mut depths = Vec::new();
    let mut total_depth = 0usize;
    for raw_id in 0..crate::cpu_count() {
        let cpu_id = LogicalCpuId::new(raw_id);
        if let Some(block) = get_percpu_block(cpu_id) {
            let depth = queue_depth(block);
            total_depth += depth;
            depths.push((cpu_id, depth));
        }
    }

    let cpus_seen = depths.len();
    if cpus_seen <= 1 || total_depth == 0 {
        return;
    }

    // Average depth, rounded up; a donor must exceed it by more than one.
    let avg_depth = (total_depth + cpus_seen.saturating_sub(1)) / cpus_seen;
    let overload_floor = avg_depth + 1;

    for target_index in 0..cpus_seen {
        let (target_id, target_depth) = depths[target_index];
        if target_depth != 0 {
            continue;
        }

        // Deepest overloaded CPU other than the target; the earliest one wins ties.
        let mut busiest: Option<(usize, usize)> = None;
        for (idx, &(_, depth)) in depths.iter().enumerate() {
            if idx == target_index || depth <= overload_floor {
                continue;
            }
            let deeper = match busiest {
                Some((_, best_depth)) => depth > best_depth,
                None => true,
            };
            if deeper {
                busiest = Some((idx, depth));
            }
        }

        let Some((source_index, _)) = busiest else {
            continue;
        };

        let source_id = depths[source_index].0;
        if migrate_one_context(token, source_id, target_id, balance_time) {
            depths[source_index].1 = depths[source_index].1.saturating_sub(1);
            depths[target_index].1 += 1;
        }
    }
}

/// Moves contexts that are ready to run from the global idle list to this CPU's run queues.
///
/// The idle list is only tried, never waited for: if it is currently locked elsewhere the
/// whole pass is skipped. Each entry present at the start of the pass is examined once:
/// - dead weak references are dropped,
/// - the current context and contexts whose lock cannot be read-acquired stay on the list,
/// - soft-blocked contexts whose wake-up time has been reached, and runnable contexts that
///   are not running, are collected for wake-up,
/// - everything else stays on the list.
///
/// After the idle list has been released, every collected context that is still alive has
/// its affinity set to this CPU and is appended to the run queue for its `prio`. The
/// soft-blocked state itself is not cleared here; that happens when the context is later
/// evaluated by `update_runnable` / `update_stealable`.
fn wakeup_contexts(token: &mut CleanLockToken, percpu: &PercpuBlock, switch_time: u128) {
    // TODO: Optimise this somehow. Perhaps using a separate timer queue?
    let mut wakeups = Vec::new();
    let current_context = context::current();
    let Some(idle_contexts) = idle_contexts_try(token.downgrade()) else {
        // Other CPUs may be spawning or killing contexts, so skip wakeups to avoid contention.
        return;
    };
    let (mut idle_contexts, mut token) = idle_contexts.into_split();
    let len = idle_contexts.len();
    for _ in 0..len {
        let Some(context_ref) = idle_contexts.pop_front() else {
            break;
        };
        let Some(context) = context_ref.upgrade() else {
            continue;
        };
        if Arc::ptr_eq(&context, &current_context) {
            idle_contexts.push_back(context_ref);
            continue;
        }
        let Some(guard) = context.try_read(token.token()) else {
            idle_contexts.push_back(context_ref);
            continue;
        };
        if guard.status.is_soft_blocked()
            && let Some(wake) = guard.wake
            && switch_time >= wake
        {
            let prio = guard.prio;
            drop(guard);
            wakeups.push((prio, context_ref));
            continue;
        }

        if guard.status.is_runnable() && !guard.running {
            let prio = guard.prio;
            drop(guard);
            wakeups.push((prio, context_ref));
            continue;
        }

        drop(guard);
        idle_contexts.push_back(context_ref);
    }

    // Drop IDLE_CONTEXTS before acquiring SchedQueuesLock to avoid a
    // deadlock with steal_work on another CPU. The previous code held
    // both locks simultaneously: wakeup_contexts (this fn) held
    // IDLE_CONTEXTS, then waited for SchedQueuesLock on its own CPU;
    // steal_work on another CPU held that same SchedQueuesLock and
    // waited for IDLE_CONTEXTS. Classic circular wait.
    //
    // The wakeups Vec is the only data we need from IDLE_CONTEXTS;
    // releasing the lock here means steal_work on another CPU can
    // proceed while this CPU is acquiring SchedQueuesLock.
    drop(idle_contexts);

    if wakeups.is_empty() {
        return;
    }

    let mut sched_lock = SchedQueuesLock::new(&percpu.sched);
    let run_queues = unsafe { sched_lock.queues_mut() };
    for (prio, context_ref) in wakeups {
        // The context may have been destroyed since it was collected. A dead weak reference
        // can never be scheduled, and queueing it would only inflate the run-queue length
        // that `queue_depth` reports to the load balancer, so it is dropped here instead.
        let Some(context_lock) = context_ref.upgrade() else {
            continue;
        };
        {
            // Scope the write guard so it is released before the reference is queued.
            let mut context_guard = unsafe { context_lock.write_arc() };
            assign_context_to_cpu(&mut context_guard, percpu.cpu_id);
        }
        run_queues[prio].push_back(context_ref);
    }
}

/// Selects the next context to run on `cpu_id` from `contexts_list`, returning its write
/// guard, or `None` if nothing suitable was found.
///
/// Selection happens in three phases:
/// 1. Every queue is scanned in index order for a runnable, not-running context with matching
///    affinity whose policy is `SchedPolicy::Fifo` or `SchedPolicy::RoundRobin`; the first one
///    found is returned.
/// 2. Among the remaining entries, the runnable `SchedPolicy::Other` context with the smallest
///    virtual runtime is chosen (contexts that last ran on `cpu_id` are compared with their
///    vruntime reduced by one eighth).
/// 3. A weighted round-robin pass over the queues, continuing after index `*i`: a queue must
///    accumulate at least `SCHED_PRIO_TO_WEIGHT[20]` in `balance` before its entries are tried
///    via `update_runnable`.
///
/// `prev_context_lock` and `idle_context` are never selected. Dead weak references are
/// dropped when encountered in phases 1 and 3, and contexts found blocked in phase 3 are
/// moved to the global idle list. `balance` and `i` are updated in place so the caller can
/// persist them.
fn pick_next_from_queues(
    token: &mut CleanLockToken,
    contexts_list: &mut [alloc::collections::VecDeque<WeakContextRef>; RUN_QUEUE_COUNT],
    cpu_id: LogicalCpuId,
    switch_time: u128,
    prev_context_lock: &Arc<ContextLock>,
    idle_context: &Arc<ContextLock>,
    balance: &mut [usize; RUN_QUEUE_COUNT],
    i: &mut usize,
) -> Option<ArcContextLockWriteGuard> {
    let mut empty_queues = 0;
    let mut total_iters = 0;
    // Number of entries at entry; used below to bound the phase 3 search.
    let total_contexts: usize = contexts_list.iter().map(|q| q.len()).sum();
    let mut skipped_contexts = 0;

    // Phase 1: Fifo / RoundRobin contexts take precedence, scanning queues in index order.
    for prio in 0..RUN_QUEUE_COUNT {
        let rt_contexts = contexts_list
            .get_mut(prio)
            .expect("prio should be between [0, 39]");
        // Examine each entry present at the start once; rejected entries rotate to the back.
        let len = rt_contexts.len();
        for _ in 0..len {
            let (rt_ref, rt_lock) = match rt_contexts.pop_front() {
                Some(lock) => match lock.upgrade() {
                    Some(l) => (lock, l),
                    None => {
                        // Dead reference: drop it and count it as skipped.
                        skipped_contexts += 1;
                        continue;
                    }
                },
                None => break,
            };
            if Arc::ptr_eq(&rt_lock, idle_context) || Arc::ptr_eq(&rt_lock, prev_context_lock) {
                rt_contexts.push_back(rt_ref);
                continue;
            }
            let rt_guard = unsafe { rt_lock.write_arc() };
            if !rt_guard.status.is_runnable()
                || rt_guard.running
                || !rt_guard.sched_affinity.contains(cpu_id)
            {
                rt_contexts.push_back(rt_ref);
                continue;
            }
            if rt_guard.sched_policy == SchedPolicy::Fifo
                || rt_guard.sched_policy == SchedPolicy::RoundRobin
            {
                return Some(rt_guard);
            }
            rt_contexts.push_back(rt_ref);
        }
    }

    // Phase 2: pick the `SchedPolicy::Other` context with the smallest virtual runtime.
    {
        let mut min_vruntime = u128::MAX;
        let mut best: Option<(usize, WeakContextRef)> = None;
        for (prio, queue) in contexts_list.iter().enumerate() {
            for ctx_ref in queue.iter() {
                if let Some(ctx_lock) = ctx_ref.upgrade() {
                    if Arc::ptr_eq(&ctx_lock, prev_context_lock)
                        || Arc::ptr_eq(&ctx_lock, idle_context)
                    {
                        continue;
                    }
                    // Only a non-blocking read is attempted; contexts whose lock is
                    // unavailable are simply not considered in this phase.
                    if let Some(guard) = ctx_lock.try_read(token.token()) {
                        if guard.status.is_runnable()
                            && !guard.running
                            && guard.sched_affinity.contains(cpu_id)
                            && guard.sched_policy == SchedPolicy::Other
                        {
                            let mut vruntime = guard.vruntime;
                            // Favour contexts that last ran on this CPU by comparing them
                            // with 7/8 of their actual vruntime.
                            if guard.last_cpu == Some(cpu_id) {
                                vruntime = vruntime.saturating_sub(vruntime / 8);
                            }
                            drop(guard);
                            if vruntime < min_vruntime {
                                min_vruntime = vruntime;
                                best = Some((prio, ctx_ref.clone()));
                            }
                        }
                    }
                }
            }
        }
        if let Some((best_prio, ctx_ref)) = best {
            // Take the winner off its queue, then re-validate it under the write lock since
            // its state may have changed after the read guard was released.
            contexts_list[best_prio].retain(|r| !WeakContextRef::eq(r, &ctx_ref));
            if let Some(ctx_lock) = ctx_ref.upgrade() {
                let guard = unsafe { ctx_lock.write_arc() };
                if guard.status.is_runnable()
                    && !guard.running
                    && guard.sched_affinity.contains(cpu_id)
                    && guard.sched_policy == SchedPolicy::Other
                {
                    return Some(guard);
                }

                drop(guard);
                contexts_list[best_prio].push_back(ctx_ref);
            }
        }
    }

    // Phase 3: weighted round-robin over the queues, continuing after the queue in `*i`.
    'priority: loop {
        *i = (*i + 1) % RUN_QUEUE_COUNT;
        total_iters += 1;

        // Hard cap on the number of queue visits per call.
        if total_iters >= 5000 {
            break 'priority;
        }

        if skipped_contexts > total_contexts && total_contexts > 0 {
            break 'priority;
        }

        let contexts = contexts_list
            .get_mut(*i)
            .expect("i should be between [0, 39]!");

        if contexts.is_empty() {
            // Stop once a full lap of consecutive empty queues has been seen.
            empty_queues += 1;
            if empty_queues >= RUN_QUEUE_COUNT {
                break 'priority;
            }
            continue;
        }

        empty_queues = 0;

        // The queue has to accumulate enough credit before it may be served; each visit
        // adds the weight of its priority level.
        if balance[*i] < SCHED_PRIO_TO_WEIGHT[20] {
            balance[*i] += SCHED_PRIO_TO_WEIGHT[*i];
            continue;
        }

        let len = contexts.len();
        for _ in 0..len {
            let (next_context_ref, next_context_lock) = match contexts.pop_front() {
                Some(lock) => match lock.upgrade() {
                    Some(new_lock) => (lock, new_lock),
                    None => {
                        skipped_contexts += 1;
                        continue;
                    }
                },
                None => break,
            };

            if Arc::ptr_eq(&next_context_lock, prev_context_lock)
                || Arc::ptr_eq(&next_context_lock, idle_context)
            {
                contexts.push_back(next_context_ref);
                continue;
            }
            let mut next_context_guard = unsafe { next_context_lock.write_arc() };

            let sw = unsafe { update_runnable(&mut next_context_guard, cpu_id, switch_time) };
            if let UpdateResult::CanSwitch = sw {
                // Serving a context costs the queue one reference weight of credit.
                balance[*i] -= SCHED_PRIO_TO_WEIGHT[20];
                return Some(next_context_guard);
            }

            if matches!(sw, UpdateResult::Blocked) {
                // Blocked contexts leave the run queue for the global idle list.
                idle_contexts(token.downgrade()).push_back(next_context_ref);
            } else {
                contexts.push_back(next_context_ref);
            }
            skipped_contexts += 1;

            if skipped_contexts >= total_contexts {
                break 'priority;
            }
        }
    }

    None
}

/// Selects the next context to run on `cpu_id` from `contexts_list`, returning its write
/// guard, or `None` if nothing suitable was found.
///
/// The body follows the same steps as `pick_next_from_queues`; it differs in taking a
/// `LockToken<L1>` and therefore locking the idle list with `token.token()`.
///
/// Selection happens in three phases:
/// 1. The first runnable, not-running context with matching affinity whose policy is
///    `SchedPolicy::Fifo` or `SchedPolicy::RoundRobin`, scanning queues in index order.
/// 2. Otherwise the runnable `SchedPolicy::Other` context with the smallest virtual runtime
///    (reduced by one eighth for contexts that last ran on `cpu_id`).
/// 3. Otherwise a weighted round-robin pass continuing after index `*i`, in which a queue
///    must accumulate at least `SCHED_PRIO_TO_WEIGHT[20]` in `balance` before it is served.
///
/// `prev_context_lock` and `idle_context` are never selected. `balance` and `i` are updated
/// in place so the caller can persist them.
fn pick_next_from_global_queues(
    token: &mut LockToken<L1>,
    contexts_list: &mut [alloc::collections::VecDeque<WeakContextRef>; RUN_QUEUE_COUNT],
    cpu_id: LogicalCpuId,
    switch_time: u128,
    prev_context_lock: &Arc<ContextLock>,
    idle_context: &Arc<ContextLock>,
    balance: &mut [usize; RUN_QUEUE_COUNT],
    i: &mut usize,
) -> Option<ArcContextLockWriteGuard> {
    let mut empty_queues = 0;
    let mut total_iters = 0;
    // Number of entries at entry; used below to bound the phase 3 search.
    let total_contexts: usize = contexts_list.iter().map(|q| q.len()).sum();
    let mut skipped_contexts = 0;

    // Phase 1: Fifo / RoundRobin contexts take precedence, scanning queues in index order.
    for prio in 0..RUN_QUEUE_COUNT {
        let rt_contexts = contexts_list
            .get_mut(prio)
            .expect("prio should be between [0, 39]");
        // Examine each entry present at the start once; rejected entries rotate to the back.
        let len = rt_contexts.len();
        for _ in 0..len {
            let (rt_ref, rt_lock) = match rt_contexts.pop_front() {
                Some(lock) => match lock.upgrade() {
                    Some(l) => (lock, l),
                    None => {
                        // Dead reference: drop it and count it as skipped.
                        skipped_contexts += 1;
                        continue;
                    }
                },
                None => break,
            };
            if Arc::ptr_eq(&rt_lock, idle_context) || Arc::ptr_eq(&rt_lock, prev_context_lock) {
                rt_contexts.push_back(rt_ref);
                continue;
            }
            let rt_guard = unsafe { rt_lock.write_arc() };
            if !rt_guard.status.is_runnable()
                || rt_guard.running
                || !rt_guard.sched_affinity.contains(cpu_id)
            {
                rt_contexts.push_back(rt_ref);
                continue;
            }
            if rt_guard.sched_policy == SchedPolicy::Fifo
                || rt_guard.sched_policy == SchedPolicy::RoundRobin
            {
                return Some(rt_guard);
            }
            rt_contexts.push_back(rt_ref);
        }
    }

    // Phase 2: pick the `SchedPolicy::Other` context with the smallest virtual runtime.
    {
        let mut min_vruntime = u128::MAX;
        let mut best: Option<(usize, WeakContextRef)> = None;
        for (prio, queue) in contexts_list.iter().enumerate() {
            for ctx_ref in queue.iter() {
                if let Some(ctx_lock) = ctx_ref.upgrade() {
                    if Arc::ptr_eq(&ctx_lock, prev_context_lock)
                        || Arc::ptr_eq(&ctx_lock, idle_context)
                    {
                        continue;
                    }
                    // Only a non-blocking read is attempted; contexts whose lock is
                    // unavailable are simply not considered in this phase.
                    if let Some(guard) = ctx_lock.try_read(token.token()) {
                        if guard.status.is_runnable()
                            && !guard.running
                            && guard.sched_affinity.contains(cpu_id)
                            && guard.sched_policy == SchedPolicy::Other
                        {
                            let mut vruntime = guard.vruntime;
                            // Favour contexts that last ran on this CPU by comparing them
                            // with 7/8 of their actual vruntime.
                            if guard.last_cpu == Some(cpu_id) {
                                vruntime = vruntime.saturating_sub(vruntime / 8);
                            }
                            drop(guard);
                            if vruntime < min_vruntime {
                                min_vruntime = vruntime;
                                best = Some((prio, ctx_ref.clone()));
                            }
                        }
                    }
                }
            }
        }
        if let Some((best_prio, ctx_ref)) = best {
            // Take the winner off its queue, then re-validate it under the write lock since
            // its state may have changed after the read guard was released.
            contexts_list[best_prio].retain(|r| !WeakContextRef::eq(r, &ctx_ref));
            if let Some(ctx_lock) = ctx_ref.upgrade() {
                let guard = unsafe { ctx_lock.write_arc() };
                if guard.status.is_runnable()
                    && !guard.running
                    && guard.sched_affinity.contains(cpu_id)
                    && guard.sched_policy == SchedPolicy::Other
                {
                    return Some(guard);
                }

                drop(guard);
                contexts_list[best_prio].push_back(ctx_ref);
            }
        }
    }

    // Phase 3: weighted round-robin over the queues, continuing after the queue in `*i`.
    'priority: loop {
        *i = (*i + 1) % RUN_QUEUE_COUNT;
        total_iters += 1;

        // Hard cap on the number of queue visits per call.
        if total_iters >= 5000 {
            break 'priority;
        }

        if skipped_contexts > total_contexts && total_contexts > 0 {
            break 'priority;
        }

        let contexts = contexts_list
            .get_mut(*i)
            .expect("i should be between [0, 39]!");

        if contexts.is_empty() {
            // Stop once a full lap of consecutive empty queues has been seen.
            empty_queues += 1;
            if empty_queues >= RUN_QUEUE_COUNT {
                break 'priority;
            }
            continue;
        }

        empty_queues = 0;

        // The queue has to accumulate enough credit before it may be served; each visit
        // adds the weight of its priority level.
        if balance[*i] < SCHED_PRIO_TO_WEIGHT[20] {
            balance[*i] += SCHED_PRIO_TO_WEIGHT[*i];
            continue;
        }

        let len = contexts.len();
        for _ in 0..len {
            let (next_context_ref, next_context_lock) = match contexts.pop_front() {
                Some(lock) => match lock.upgrade() {
                    Some(new_lock) => (lock, new_lock),
                    None => {
                        skipped_contexts += 1;
                        continue;
                    }
                },
                None => break,
            };

            if Arc::ptr_eq(&next_context_lock, prev_context_lock)
                || Arc::ptr_eq(&next_context_lock, idle_context)
            {
                contexts.push_back(next_context_ref);
                continue;
            }
            let mut next_context_guard = unsafe { next_context_lock.write_arc() };

            let sw = unsafe { update_runnable(&mut next_context_guard, cpu_id, switch_time) };
            if let UpdateResult::CanSwitch = sw {
                // Serving a context costs the queue one reference weight of credit.
                balance[*i] -= SCHED_PRIO_TO_WEIGHT[20];
                return Some(next_context_guard);
            }

            if matches!(sw, UpdateResult::Blocked) {
                // Blocked contexts leave the run queue for the global idle list.
                idle_contexts(token.token()).push_back(next_context_ref);
            } else {
                contexts.push_back(next_context_ref);
            }
            skipped_contexts += 1;

            if skipped_contexts >= total_contexts {
                break 'priority;
            }
        }
    }

    None
}

/// Decides whether `context` may be taken over by another CPU at `switch_time`.
///
/// This performs the same checks as `update_runnable` except that the context's CPU affinity
/// is not consulted. A soft-blocked context whose wake-up time has been reached is unblocked
/// (without an IPI) as a side effect.
///
/// # Returns
/// - `UpdateResult::Skip` if the context is currently running.
/// - `UpdateResult::CanSwitch` if the context is runnable.
/// - `UpdateResult::Blocked` otherwise.
unsafe fn update_stealable(context: &mut Context, switch_time: u128) -> UpdateResult {
    if context.running {
        return UpdateResult::Skip;
    }

    // Apply a pending timed wake-up whose deadline has passed.
    let wake_elapsed = context.status.is_soft_blocked()
        && matches!(context.wake, Some(deadline) if switch_time >= deadline);
    if wake_elapsed {
        context.wake = None;
        context.unblock_no_ipi();
    }

    match context.status.is_runnable() {
        true => UpdateResult::CanSwitch,
        false => UpdateResult::Blocked,
    }
}

/// Scheduler entry point: picks the context this CPU should switch to next, currently using
/// a Deficit Weighted Round Robin policy for the queue pickers.
///
/// Candidates are searched in a fixed order, stopping at the first hit:
/// 1. this CPU's own run queues (`pick_next_from_queues`, under the per-CPU queue lock),
/// 2. `steal_work`,
/// 3. the global run queues (`pick_next_from_global_queues`).
///
/// The `balance` and `last_queue` cursors are loaded from `percpu.sched` before each queue
/// pick and stored back afterwards. Whatever the outcome of the search, the previous context
/// is handed to `queue_previous_context` exactly once, after the search has finished.
///
/// # Returns
/// - `Ok(Some(guard))` holding the write guard of the selected context, or of the idle
///   context when nothing was found and the CPU was not already idle.
/// - `Ok(None)` when nothing was found and either `was_idle` is set or the previous context
///   is the idle context.
///
/// This body never produces an `Err`.
fn select_next_context(
    token: &mut CleanLockToken,
    percpu: &PercpuBlock,
    cpu_id: LogicalCpuId,
    switch_time: u128,
    was_idle: bool,
    prev_context_guard: &mut ArcRwLockWriteGuard<L4, Context>,
) -> Result<Option<ArcContextLockWriteGuard>, SwitchResult> {
    let idle_context = percpu.switch_internals.idle_context();
    let prev_context_lock = crate::context::current();
    let sched = &percpu.sched;

    // Stage 1: this CPU's own run queues. The queue lock lives only for this block.
    let mut selected = {
        let mut queue_lock = SchedQueuesLock::new(sched);
        let mut balance = sched.balance.get();
        let mut last_queue = sched.last_queue.get() % RUN_QUEUE_COUNT;
        let picked = pick_next_from_queues(
            token,
            // NOTE(review): presumably sound because `queue_lock` holds the per-CPU queue
            // lock for the duration of the call — confirm against `queues_mut`'s contract.
            unsafe { queue_lock.queues_mut() },
            cpu_id,
            switch_time,
            &prev_context_lock,
            &idle_context,
            &mut balance,
            &mut last_queue,
        );
        sched.balance.set(balance);
        sched.last_queue.set(last_queue);
        picked
    };

    // Stage 2: nothing local, so try to take work from elsewhere.
    if selected.is_none() {
        selected = steal_work(token, cpu_id, switch_time);
    }

    // Stage 3: fall back to the global run queues. Their guard is released at the end of
    // this block, before the previous context is queued.
    if selected.is_none() {
        let run_queue_guard = run_contexts(token.token());
        let (mut contexts_data, mut contexts_token) = run_queue_guard.into_split();
        let mut balance = sched.balance.get();
        let mut last_queue = sched.last_queue.get() % RUN_QUEUE_COUNT;
        let picked = pick_next_from_global_queues(
            &mut contexts_token,
            &mut contexts_data.set,
            cpu_id,
            switch_time,
            &prev_context_lock,
            &idle_context,
            &mut balance,
            &mut last_queue,
        );
        sched.balance.set(balance);
        sched.last_queue.set(last_queue);
        selected = picked;
    }

    // The previous context is queued on every path, after the search is complete.
    queue_previous_context(
        token,
        percpu,
        &prev_context_lock,
        prev_context_guard,
        &idle_context,
    );

    if selected.is_some() {
        return Ok(selected);
    }

    // Nothing to run: stay put if this CPU is already idle, otherwise drop to the idle context.
    if was_idle || Arc::ptr_eq(&prev_context_lock, &idle_context) {
        return Ok(None);
    }
    Ok(Some(unsafe { idle_context.write_arc() }))
}

/// Holds per-CPU state necessary for context switching.
///
/// This struct contains the current context, the idle context, a PIT tick counter, and
/// bookkeeping for the most recent switch, plus a `being_sigkilled` flag. All fields use
/// interior mutability (`Cell`/`RefCell`), so they are updated through `&self`.
pub struct ContextSwitchPercpu {
    /// NOTE(review): presumably the outcome of a pending/most recent switch — the code that
    /// reads and writes it is outside this definition; confirm.
    switch_result: Cell<Option<SwitchResultInner>>,
    /// NOTE(review): presumably the timestamp of the last switch, in the same unit as the
    /// scheduler's `switch_time: u128` arguments — confirm.
    switch_time: Cell<u128>,
    /// PIT tick counter.
    pit_ticks: Cell<usize>,

    /// The context currently running on this CPU; `None` until `set_current_context` is called.
    current_ctxt: RefCell<Option<Arc<ContextLock>>>,

    /// The idle process; `None` until `set_idle_context` is called.
    idle_ctxt: RefCell<Option<Arc<ContextLock>>>,
    /// NOTE(review): presumably set while the current context is being killed by SIGKILL —
    /// the users of this flag are outside this definition; confirm.
    pub(crate) being_sigkilled: Cell<bool>,
}

impl ContextSwitchPercpu {
    /// Builds the initial per-CPU state: no current context, no idle context, zeroed
    /// counters and a cleared `being_sigkilled` flag.
    ///
    /// This is a `const fn`, so it can be evaluated in constant contexts.
    pub const fn default() -> Self {
        Self {
            current_ctxt: RefCell::new(None),
            idle_ctxt: RefCell::new(None),
            switch_result: Cell::new(None),
            switch_time: Cell::new(0),
            pit_ticks: Cell::new(0),
            being_sigkilled: Cell::new(false),
        }
    }

    /// Runs `f` with a reference to the current context and returns its result.
    ///
    /// The current-context slot stays immutably borrowed for the whole duration of `f`.
    ///
    /// # Parameters
    /// - `f`: A closure that receives the current context.
    ///
    /// # Panics
    /// Panics with "not inside of context" if no current context has been set, and if the
    /// slot is mutably borrowed at the time of the call.
    pub fn with_context<T>(&self, f: impl FnOnce(&Arc<ContextLock>) -> T) -> T {
        let slot = self.current_ctxt.borrow();
        let current = slot.as_ref().expect("not inside of context");
        f(current)
    }

    /// Runs `f` with the current context, if any, and returns its result.
    ///
    /// Unlike [`Self::with_context`], a missing current context is passed to `f` as `None`
    /// instead of causing a panic. The slot stays immutably borrowed while `f` runs.
    ///
    /// # Parameters
    /// - `f`: A closure that receives `Some(context)` or `None`.
    ///
    /// # Panics
    /// Panics if the slot is mutably borrowed at the time of the call.
    pub fn try_with_context<T>(&self, f: impl FnOnce(Option<&Arc<ContextLock>>) -> T) -> T {
        let slot = self.current_ctxt.borrow();
        f(slot.as_ref())
    }

    /// Replaces the current context with `new`.
    ///
    /// # Safety
    /// This function is unsafe as it modifies the context state directly.
    ///
    /// # Parameters
    /// - `new`: The context to record as the current one.
    ///
    /// # Panics
    /// Panics if the slot is borrowed at the time of the call, e.g. from inside the closure
    /// passed to [`Self::with_context`].
    pub unsafe fn set_current_context(&self, new: Arc<ContextLock>) {
        let mut slot = self.current_ctxt.borrow_mut();
        *slot = Some(new);
    }

    /// Replaces the idle context with `new`.
    ///
    /// # Safety
    /// This function is unsafe as it modifies the idle context state directly.
    ///
    /// # Parameters
    /// - `new`: The context to record as the idle one.
    ///
    /// # Panics
    /// Panics if the idle-context slot is borrowed at the time of the call.
    pub unsafe fn set_idle_context(&self, new: Arc<ContextLock>) {
        let mut slot = self.idle_ctxt.borrow_mut();
        *slot = Some(new);
    }

    /// Returns a new `Arc` handle to the idle context.
    ///
    /// # Panics
    /// Panics with "no idle context present" if the idle context has not been set. Use
    /// [`Self::try_idle_context`] when that is a legitimate possibility.
    pub fn idle_context(&self) -> Arc<ContextLock> {
        let slot = self.idle_ctxt.borrow();
        let idle = slot.as_ref().expect("no idle context present");
        Arc::clone(idle)
    }

    /// Returns a new `Arc` handle to the idle context if it has been initialized.
    ///
    /// This is the fallible variant of [`Self::idle_context`], intended for
    /// cross-CPU paths (`steal_work`, `migrate_one_context`) that may
    /// access a PercpuBlock whose `context::init()` has not run yet
    /// during AP bring-up.
    ///
    /// # Returns
    /// `Some(Arc<ContextLock>)` if the idle context is set, `None` otherwise.
    pub fn try_idle_context(&self) -> Option<Arc<ContextLock>> {
        let slot = self.idle_ctxt.borrow();
        slot.as_ref().cloned()
    }
}