feat: comprehensive P0-P6 kernel scheduler and relibc threading implementation

P0-P2: SMP-safe barriers, sigmask/pthread_kill race fixes, robust mutexes, RT scheduling, POSIX sched API
P3: PerCpuSched struct, per-CPU wiring, work stealing, load balancing, initial placement
P4: 64-shard futex table, REQUEUE, PI futexes (LOCK_PI/UNLOCK_PI/TRYLOCK_PI), robust futexes, vruntime tracking, min-vruntime SCHED_OTHER selection (a condensed sketch follows this list)
P5: setpriority/getpriority, pthread_setaffinity_np, pthread_setname_np, pthread_setschedparam (Redox)
P6: Cache-affine scheduling (last_cpu + vruntime bonus), NUMA topology kernel hints + numad userspace daemon
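
The P4 vruntime accounting and the P6 cache-affinity bonus are the formula-heavy parts of this series; both appear in the switch.rs hunks below. The standalone sketch here only condenses that arithmetic for reference: the helper names and the nice-0 weight of 1024 are assumptions (the patch indexes the full 40-entry SCHED_PRIO_TO_WEIGHT table, whose tail is visible in the diff), while the expressions themselves mirror the patch.

    fn vruntime_delta(actual_ns: u128, weight: u128, default_weight: u128) -> u128 {
        // vruntime advances by wall time scaled by w(nice 0) / w(context),
        // the expression added to switch.rs for SchedPolicy::Other.
        actual_ns.saturating_mul(default_weight) / weight.max(1)
    }

    fn effective_vruntime(vruntime: u128, last_cpu: Option<u32>, this_cpu: u32) -> u128 {
        // P6 cache-affinity bonus: a context that last ran on this CPU has its
        // vruntime discounted by 1/8 when candidates are compared.
        if last_cpu == Some(this_cpu) {
            vruntime.saturating_sub(vruntime / 8)
        } else {
            vruntime
        }
    }

    fn main() {
        // 10 ms slice charged at nice 0 (weight 1024, assumed) vs nice +5 (weight 335).
        let slice_ns: u128 = 10_000_000;
        assert_eq!(vruntime_delta(slice_ns, 1024, 1024), 10_000_000);
        assert_eq!(vruntime_delta(slice_ns, 335, 1024), 30_567_164);
        // A warm context with 8 ms of vruntime compares like a cold one with 7 ms.
        assert_eq!(effective_vruntime(8_000_000, Some(2), 2), 7_000_000);
    }

Under these formulas a nice +5 context accrues vruntime roughly three times faster than a nice 0 context, and a context that stays on its last CPU keeps a 12.5% head start in the min-vruntime comparison.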

Stability fixes: make_consistent stores 0 (dead-TID fix), cond.rs error propagation, adaptive spinning via SPIN_COUNT, Sys::open &str fix, PI futex CAS race fix, proc.rs lock ordering, barrier destroy handling
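
The relibc side of the SPIN_COUNT change is not part of the file shown below, so the following is only a minimal sketch of the spin-then-sleep pattern it refers to: the SPIN_COUNT value, the three-state word encoding, and the futex_wait stub are illustrative assumptions, not the actual relibc code.

    use core::hint;
    use core::sync::atomic::{AtomicU32, Ordering};

    // Illustrative bound; the relibc patch is described as adapting this, not hard-coding it.
    const SPIN_COUNT: usize = 100;

    // Hypothetical stand-in for the futex-wait syscall wrapper; body omitted.
    fn futex_wait(_word: &AtomicU32, _expected: u32) {}

    // Spin briefly in userspace before sleeping in the kernel on a contended word.
    fn lock(word: &AtomicU32) {
        for _ in 0..SPIN_COUNT {
            if word
                .compare_exchange_weak(0, 1, Ordering::Acquire, Ordering::Relaxed)
                .is_ok()
            {
                return; // acquired while spinning, no syscall needed
            }
            hint::spin_loop();
        }
        // Still contended: mark the word "contended" (2) and block until woken.
        while word.swap(2, Ordering::Acquire) != 0 {
            futex_wait(word, 2);
        }
    }

    static WORD: AtomicU32 = AtomicU32::new(0);

    fn main() {
        lock(&WORD);
        assert_eq!(WORD.load(Ordering::Relaxed), 1);
    }

Spinning a bounded number of times before the futex syscall avoids a kernel round trip for locks held only for a few instructions; the adaptive part of the fix is about choosing that bound, which this sketch hard-codes.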

Patches: 33 kernel + 58 relibc patches, all tracked in recipes
Docs: KERNEL-SCHEDULER-MULTITHREAD-IMPROVEMENT-PLAN.md updated, SCHEDULER-REVIEW-FINAL.md created
Architecture: NUMA topology parsing stays in userspace (numad daemon); the kernel stores lightweight NumaTopology hints
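
None of the NumaTopology code is in the file below, so this sketch only illustrates the division of labour described above: the struct shape, field name, and same_node helper are hypothetical, standing in for whatever the kernel patches actually store after numad publishes its hints.

    // Hypothetical shape of the "lightweight NumaTopology hints"; the real
    // field names live in other patches of this series and may differ.
    #[derive(Clone, Debug, Default)]
    struct NumaTopology {
        // node_of_cpu[logical_cpu_id] = NUMA node id, as published by numad.
        node_of_cpu: Vec<u8>,
    }

    impl NumaTopology {
        // A scheduler-side query: prefer wake-ups and steals within one node.
        fn same_node(&self, a: usize, b: usize) -> bool {
            match (self.node_of_cpu.get(a), self.node_of_cpu.get(b)) {
                (Some(x), Some(y)) => x == y,
                // No hints published yet: treat all CPUs as equally close.
                _ => true,
            }
        }
    }

    fn main() {
        // Two nodes with two CPUs each.
        let topo = NumaTopology { node_of_cpu: vec![0, 0, 1, 1] };
        assert!(topo.same_node(0, 1));
        assert!(!topo.same_node(1, 2));
    }

Keeping the parsing (e.g. of firmware affinity tables) in the numad daemon keeps that complexity out of the kernel, which only consults the hints userspace last published.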
commit 34360e1e4f
parent 55d00c3a24
Date: 2026-04-30 18:21:48 +01:00
70 changed files with 15268 additions and 10 deletions
diff --git a/src/context/switch.rs b/src/context/switch.rs
index 86684c8..d054734 100644
--- a/src/context/switch.rs
+++ b/src/context/switch.rs
@@ -5,18 +5,18 @@
use crate::{
context::{
self, arch, idle_contexts, idle_contexts_try, run_contexts, ArcContextLockWriteGuard,
- Context, ContextLock, WeakContextRef,
+ Context, ContextLock, SchedPolicy, WeakContextRef, RUN_QUEUE_COUNT,
},
- cpu_set::LogicalCpuId,
+ cpu_set::{LogicalCpuId, LogicalCpuSet},
cpu_stats::{self, CpuState},
- percpu::PercpuBlock,
- sync::{ArcRwLockWriteGuard, CleanLockToken, L4},
+ percpu::{get_percpu_block, PerCpuSched, PercpuBlock},
+ sync::{ArcRwLockWriteGuard, CleanLockToken, LockToken, L1, L4},
};
use alloc::{sync::Arc, vec::Vec};
use core::{
cell::{Cell, RefCell},
hint, mem,
- sync::atomic::Ordering,
+ sync::atomic::{AtomicUsize, Ordering},
};
use syscall::PtraceFlags;
@@ -33,35 +33,49 @@ const SCHED_PRIO_TO_WEIGHT: [usize; 40] = [
70, 56, 45, 36, 29, 23, 18, 15,
];
-/// Determines if a given context is eligible to be scheduled on a given CPU (in
-/// principle, the current CPU).
-///
-/// # Safety
-/// This function is unsafe because it modifies the `context`'s state directly without synchronization.
-///
-/// # Parameters
-/// - `context`: The context (process/thread) to be checked.
-/// - `cpu_id`: The logical ID of the CPU on which the context is being scheduled.
-///
-/// # Returns
-/// - `UpdateResult::CanSwitch`: If the context can be switched to.
-/// - `UpdateResult::Skip`: If the context should be skipped (e.g., it's running on another CPU).
+const LOAD_BALANCE_INTERVAL_NS: u128 = 100_000_000;
+
+static SCHED_STEAL_COUNT: AtomicUsize = AtomicUsize::new(0);
+
+struct SchedQueuesLock<'a> {
+ sched: &'a PerCpuSched,
+}
+
+impl<'a> SchedQueuesLock<'a> {
+ fn new(sched: &'a PerCpuSched) -> Self {
+ sched.take_lock();
+ Self { sched }
+ }
+
+ unsafe fn queues_mut(
+ &mut self,
+ ) -> &mut [alloc::collections::VecDeque<WeakContextRef>; RUN_QUEUE_COUNT] {
+ unsafe { self.sched.queues_mut() }
+ }
+}
+
+impl Drop for SchedQueuesLock<'_> {
+ fn drop(&mut self) {
+ self.sched.release_lock();
+ }
+}
+
+fn assign_context_to_cpu(context: &mut Context, cpu_id: LogicalCpuId) {
+ context.sched_affinity = LogicalCpuSet::empty();
+ context.sched_affinity.atomic_set(cpu_id);
+}
+
unsafe fn update_runnable(
context: &mut Context,
cpu_id: LogicalCpuId,
switch_time: u128,
) -> UpdateResult {
- // Ignore contexts that are already running.
if context.running {
return UpdateResult::Skip;
}
-
- // Ignore contexts assigned to other CPUs.
if !context.sched_affinity.contains(cpu_id) {
return UpdateResult::Skip;
}
-
- // If context is soft-blocked and has a wake-up time, check if it should wake up.
if context.status.is_soft_blocked()
&& let Some(wake) = context.wake
&& switch_time >= wake
@@ -69,8 +83,6 @@ unsafe fn update_runnable(
context.wake = None;
context.unblock_no_ipi();
}
-
- // If the context is runnable, indicate it can be switched to.
if context.status.is_runnable() {
UpdateResult::CanSwitch
} else {
@@ -90,12 +102,16 @@ struct SwitchResultInner {
///
/// The function also calls the signal handler after switching contexts.
pub fn tick(token: &mut CleanLockToken) {
- let ticks_cell = &PercpuBlock::current().switch_internals.pit_ticks;
+ let percpu = PercpuBlock::current();
+ let ticks_cell = &percpu.switch_internals.pit_ticks;
let new_ticks = ticks_cell.get() + 1;
ticks_cell.set(new_ticks);
- // Trigger a context switch after every 3 ticks (approx. 6.75 ms).
+ let balance_time = crate::time::monotonic(token);
+ maybe_balance_queues(token, percpu, balance_time);
+
+ // Trigger a context switch after every 3 ticks.
if new_ticks >= 3 {
switch(token);
crate::context::signal::signal_handler(token);
@@ -167,22 +183,12 @@ pub fn switch(token: &mut CleanLockToken) -> SwitchResult {
let mut prev_context_guard = unsafe { prev_context_lock.write_arc() };
if !prev_context_guard.is_preemptable() {
- // Unset global lock
arch::CONTEXT_SWITCH_LOCK.store(false, Ordering::SeqCst);
-
- // Pretend to have finished switching, so CPU is not idled
return SwitchResult::Switched;
}
// Alarm (previously in update_runnable)
- let wakeups = wakeup_contexts(token, switch_time);
-
- if wakeups.len() > 0 {
- let mut run_contexts = run_contexts(token.token());
- for (prio, context_lock) in wakeups {
- run_contexts.set[prio].push_back(context_lock);
- }
- }
+ wakeup_contexts(token, percpu, switch_time);
let cpu_id = crate::cpu_id();
@@ -213,6 +219,7 @@ pub fn switch(token: &mut CleanLockToken) -> SwitchResult {
// Set the previous context as "not running"
prev_context.running = false;
+ prev_context.last_cpu = prev_context.cpu_id;
// Set the next context as "running"
next_context.running = true;
@@ -222,6 +229,14 @@ pub fn switch(token: &mut CleanLockToken) -> SwitchResult {
// Update times
if !was_idle {
prev_context.cpu_time += switch_time.saturating_sub(prev_context.switch_time);
+ if prev_context.sched_policy == SchedPolicy::Other {
+ let actual_ns = switch_time.saturating_sub(prev_context.switch_time);
+ let weight =
+ SCHED_PRIO_TO_WEIGHT[prev_context.sched_static_prio.min(39)] as u128;
+ let default_weight = SCHED_PRIO_TO_WEIGHT[20] as u128;
+ let delta = actual_ns.saturating_mul(default_weight) / weight.max(1);
+ prev_context.vruntime = prev_context.vruntime.saturating_add(delta);
+ }
}
next_context.switch_time = switch_time;
if next_context.userspace {
@@ -302,13 +317,234 @@ pub fn switch(token: &mut CleanLockToken) -> SwitchResult {
}
}
-fn wakeup_contexts(token: &mut CleanLockToken, switch_time: u128) -> Vec<(usize, WeakContextRef)> {
+fn queue_previous_context(
+ token: &mut CleanLockToken,
+ percpu: &PercpuBlock,
+ prev_context_lock: &Arc<ContextLock>,
+ prev_context_guard: &ArcRwLockWriteGuard<L4, Context>,
+ idle_context: &Arc<ContextLock>,
+) {
+ if Arc::ptr_eq(prev_context_lock, idle_context) {
+ return;
+ }
+
+ let prev_ctx = WeakContextRef(Arc::downgrade(prev_context_lock));
+ if prev_context_guard.status.is_runnable() {
+ let prio = prev_context_guard.prio;
+ let mut sched_lock = SchedQueuesLock::new(&percpu.sched);
+ unsafe {
+ sched_lock.queues_mut()[prio].push_back(prev_ctx);
+ }
+ } else {
+ idle_contexts(token.downgrade()).push_back(prev_ctx);
+ }
+}
+
+fn pop_movable_context(
+ token: &mut CleanLockToken,
+ queues: &mut [alloc::collections::VecDeque<WeakContextRef>; RUN_QUEUE_COUNT],
+ target_cpu: LogicalCpuId,
+ switch_time: u128,
+ idle_context: &Arc<ContextLock>,
+) -> Option<(usize, WeakContextRef)> {
+ for prio in 0..RUN_QUEUE_COUNT {
+ let len = queues[prio].len();
+ for _ in 0..len {
+ let Some(context_ref) = queues[prio].pop_front() else {
+ break;
+ };
+ let Some(context_lock) = context_ref.upgrade() else {
+ continue;
+ };
+ if Arc::ptr_eq(&context_lock, idle_context) {
+ queues[prio].push_back(context_ref);
+ continue;
+ }
+
+ let mut context_guard = unsafe { context_lock.write_arc() };
+ let sw = unsafe { update_stealable(&mut context_guard, switch_time) };
+ if let UpdateResult::CanSwitch = sw {
+ assign_context_to_cpu(&mut context_guard, target_cpu);
+ let moved_ref = WeakContextRef(Arc::downgrade(ArcContextLockWriteGuard::rwlock(
+ &context_guard,
+ )));
+ drop(context_guard);
+ return Some((prio, moved_ref));
+ }
+
+ if matches!(sw, UpdateResult::Blocked) {
+ idle_contexts(token.downgrade()).push_back(context_ref);
+ } else {
+ queues[prio].push_back(context_ref);
+ }
+ }
+ }
+
+ None
+}
+
+fn steal_work(
+ token: &mut CleanLockToken,
+ cpu_id: LogicalCpuId,
+ switch_time: u128,
+) -> Option<ArcContextLockWriteGuard> {
+ let cpu_count = crate::cpu_count();
+ if cpu_count <= 1 {
+ return None;
+ }
+
+ for offset in 1..cpu_count {
+ let victim_id = LogicalCpuId::new((cpu_id.get() + offset) % cpu_count);
+ let Some(victim) = get_percpu_block(victim_id) else {
+ continue;
+ };
+
+ let victim_idle = victim.switch_internals.idle_context();
+ let mut victim_lock = SchedQueuesLock::new(&victim.sched);
+ let victim_queues = unsafe { victim_lock.queues_mut() };
+
+ for prio in 0..RUN_QUEUE_COUNT {
+ let len = victim_queues[prio].len();
+ for _ in 0..len {
+ let Some(context_ref) = victim_queues[prio].pop_front() else {
+ break;
+ };
+ let Some(context_lock) = context_ref.upgrade() else {
+ continue;
+ };
+ if Arc::ptr_eq(&context_lock, &victim_idle) {
+ victim_queues[prio].push_back(context_ref);
+ continue;
+ }
+
+ let mut context_guard = unsafe { context_lock.write_arc() };
+ let sw = unsafe { update_stealable(&mut context_guard, switch_time) };
+ if let UpdateResult::CanSwitch = sw {
+ assign_context_to_cpu(&mut context_guard, cpu_id);
+ SCHED_STEAL_COUNT.fetch_add(1, Ordering::Relaxed);
+ return Some(context_guard);
+ }
+
+ if matches!(sw, UpdateResult::Blocked) {
+ idle_contexts(token.downgrade()).push_back(context_ref);
+ } else {
+ victim_queues[prio].push_back(context_ref);
+ }
+ }
+ }
+ }
+
+ None
+}
+
+fn queue_depth(percpu: &PercpuBlock) -> usize {
+ let mut sched_lock = SchedQueuesLock::new(&percpu.sched);
+ unsafe {
+ sched_lock
+ .queues_mut()
+ .iter()
+ .map(|queue| queue.len())
+ .sum()
+ }
+}
+
+fn migrate_one_context(
+ token: &mut CleanLockToken,
+ source_id: LogicalCpuId,
+ target_id: LogicalCpuId,
+ switch_time: u128,
+) -> bool {
+ let Some(source) = get_percpu_block(source_id) else {
+ return false;
+ };
+ let Some(target) = get_percpu_block(target_id) else {
+ return false;
+ };
+
+ let source_idle = source.switch_internals.idle_context();
+ let moved = {
+ let mut source_lock = SchedQueuesLock::new(&source.sched);
+ let source_queues = unsafe { source_lock.queues_mut() };
+ pop_movable_context(token, source_queues, target_id, switch_time, &source_idle)
+ };
+
+ let Some((prio, context_ref)) = moved else {
+ return false;
+ };
+
+ let mut target_lock = SchedQueuesLock::new(&target.sched);
+ unsafe {
+ target_lock.queues_mut()[prio].push_back(context_ref);
+ }
+ true
+}
+
+fn maybe_balance_queues(token: &mut CleanLockToken, percpu: &PercpuBlock, balance_time: u128) {
+ if crate::cpu_count() <= 1 || percpu.cpu_id != LogicalCpuId::BSP {
+ return;
+ }
+ if balance_time.saturating_sub(percpu.sched.last_balance_time.get()) < LOAD_BALANCE_INTERVAL_NS
+ {
+ return;
+ }
+
+ percpu.sched.last_balance_time.set(balance_time);
+
+ let mut depths = Vec::new();
+ let mut total_depth = 0usize;
+ for raw_id in 0..crate::cpu_count() {
+ let cpu_id = LogicalCpuId::new(raw_id);
+ let Some(cpu_percpu) = get_percpu_block(cpu_id) else {
+ continue;
+ };
+ let depth = queue_depth(cpu_percpu);
+ total_depth += depth;
+ depths.push((cpu_id, depth));
+ }
+
+ if depths.len() <= 1 || total_depth == 0 {
+ return;
+ }
+
+ let avg_depth = (total_depth + depths.len().saturating_sub(1)) / depths.len();
+
+ for target_index in 0..depths.len() {
+ if depths[target_index].1 != 0 {
+ continue;
+ }
+
+ let mut source_index = None;
+ let mut source_depth = 0usize;
+ for (idx, &(_, depth)) in depths.iter().enumerate() {
+ if idx == target_index {
+ continue;
+ }
+ if depth > avg_depth + 1 && depth > source_depth {
+ source_index = Some(idx);
+ source_depth = depth;
+ }
+ }
+
+ let Some(source_index) = source_index else {
+ continue;
+ };
+
+ let source_id = depths[source_index].0;
+ let target_id = depths[target_index].0;
+ if migrate_one_context(token, source_id, target_id, balance_time) {
+ depths[source_index].1 = depths[source_index].1.saturating_sub(1);
+ depths[target_index].1 += 1;
+ }
+ }
+}
+
+fn wakeup_contexts(token: &mut CleanLockToken, percpu: &PercpuBlock, switch_time: u128) {
// TODO: Optimise this somehow. Perhaps using a separate timer queue?
let mut wakeups = Vec::new();
let current_context = context::current();
let Some(idle_contexts) = idle_contexts_try(token.downgrade()) else {
// other CPUs may be spawning or killing contexts, so skip wakeups to avoid contention
- return wakeups;
+ return;
};
let (mut idle_contexts, mut token) = idle_contexts.into_split();
let len = idle_contexts.len();
@@ -327,15 +563,14 @@ fn wakeup_contexts(token: &mut CleanLockToken, switch_time: u128) -> Vec<(usize,
idle_contexts.push_back(context_ref);
continue;
};
- if guard.status.is_soft_blocked() {
- if let Some(wake) = guard.wake {
- if switch_time >= wake {
- let prio = guard.prio;
- drop(guard);
- wakeups.push((prio, context_ref));
- continue;
- }
- }
+ if guard.status.is_soft_blocked()
+ && let Some(wake) = guard.wake
+ && switch_time >= wake
+ {
+ let prio = guard.prio;
+ drop(guard);
+ wakeups.push((prio, context_ref));
+ continue;
}
if guard.status.is_runnable() && !guard.running {
@@ -348,43 +583,127 @@ fn wakeup_contexts(token: &mut CleanLockToken, switch_time: u128) -> Vec<(usize,
drop(guard);
idle_contexts.push_back(context_ref);
}
- wakeups
+
+ if wakeups.is_empty() {
+ return;
+ }
+
+ let mut sched_lock = SchedQueuesLock::new(&percpu.sched);
+ let run_queues = unsafe { sched_lock.queues_mut() };
+ for (prio, context_ref) in wakeups {
+ if let Some(context_lock) = context_ref.upgrade() {
+ let mut context_guard = unsafe { context_lock.write_arc() };
+ assign_context_to_cpu(&mut context_guard, percpu.cpu_id);
+ }
+ run_queues[prio].push_back(context_ref);
+ }
}
-/// This is the scheduler function which currently utilises Deficit Weighted Round Robin Scheduler
-fn select_next_context(
+fn pick_next_from_queues(
token: &mut CleanLockToken,
- percpu: &PercpuBlock,
+ contexts_list: &mut [alloc::collections::VecDeque<WeakContextRef>; RUN_QUEUE_COUNT],
cpu_id: LogicalCpuId,
switch_time: u128,
- was_idle: bool,
- prev_context_guard: &mut ArcRwLockWriteGuard<L4, Context>,
-) -> Result<Option<ArcContextLockWriteGuard>, SwitchResult> {
- let contexts_data = run_contexts(token.token());
- let (mut contexts_data, mut token) = contexts_data.into_split();
- let contexts_list = &mut contexts_data.set;
- let idle_context = percpu.switch_internals.idle_context();
- let mut balance = percpu.balance.get();
- let mut i = percpu.last_queue.get() % 40;
-
- // Lock the previous context.
- let prev_context_lock = crate::context::current();
-
+ prev_context_lock: &Arc<ContextLock>,
+ idle_context: &Arc<ContextLock>,
+ balance: &mut [usize; RUN_QUEUE_COUNT],
+ i: &mut usize,
+) -> Option<ArcContextLockWriteGuard> {
let mut empty_queues = 0;
let mut total_iters = 0;
- let mut next_context_guard_opt = None;
-
let total_contexts: usize = contexts_list.iter().map(|q| q.len()).sum();
let mut skipped_contexts = 0;
+ for prio in 0..RUN_QUEUE_COUNT {
+ let rt_contexts = contexts_list
+ .get_mut(prio)
+ .expect("prio should be between [0, 39]");
+ let len = rt_contexts.len();
+ for _ in 0..len {
+ let (rt_ref, rt_lock) = match rt_contexts.pop_front() {
+ Some(lock) => match lock.upgrade() {
+ Some(l) => (lock, l),
+ None => {
+ skipped_contexts += 1;
+ continue;
+ }
+ },
+ None => break,
+ };
+ if Arc::ptr_eq(&rt_lock, idle_context) || Arc::ptr_eq(&rt_lock, prev_context_lock) {
+ rt_contexts.push_back(rt_ref);
+ continue;
+ }
+ let rt_guard = unsafe { rt_lock.write_arc() };
+ if !rt_guard.status.is_runnable()
+ || rt_guard.running
+ || !rt_guard.sched_affinity.contains(cpu_id)
+ {
+ rt_contexts.push_back(rt_ref);
+ continue;
+ }
+ if rt_guard.sched_policy == SchedPolicy::Fifo
+ || rt_guard.sched_policy == SchedPolicy::RoundRobin
+ {
+ return Some(rt_guard);
+ }
+ rt_contexts.push_back(rt_ref);
+ }
+ }
+
+ {
+ let mut min_vruntime = u128::MAX;
+ let mut best: Option<(usize, WeakContextRef)> = None;
+ for (prio, queue) in contexts_list.iter().enumerate() {
+ for ctx_ref in queue.iter() {
+ if let Some(ctx_lock) = ctx_ref.upgrade() {
+ if Arc::ptr_eq(&ctx_lock, prev_context_lock)
+ || Arc::ptr_eq(&ctx_lock, idle_context)
+ {
+ continue;
+ }
+ if let Some(guard) = ctx_lock.try_read(token.token()) {
+ if guard.status.is_runnable()
+ && !guard.running
+ && guard.sched_affinity.contains(cpu_id)
+ && guard.sched_policy == SchedPolicy::Other
+ {
+ let mut vruntime = guard.vruntime;
+ if guard.last_cpu == Some(cpu_id) {
+ vruntime = vruntime.saturating_sub(vruntime / 8);
+ }
+ drop(guard);
+ if vruntime < min_vruntime {
+ min_vruntime = vruntime;
+ best = Some((prio, ctx_ref.clone()));
+ }
+ }
+ }
+ }
+ }
+ }
+ if let Some((best_prio, ctx_ref)) = best {
+ contexts_list[best_prio].retain(|r| !WeakContextRef::eq(r, &ctx_ref));
+ if let Some(ctx_lock) = ctx_ref.upgrade() {
+ let guard = unsafe { ctx_lock.write_arc() };
+ if guard.status.is_runnable()
+ && !guard.running
+ && guard.sched_affinity.contains(cpu_id)
+ && guard.sched_policy == SchedPolicy::Other
+ {
+ return Some(guard);
+ }
+
+ drop(guard);
+ contexts_list[best_prio].push_back(ctx_ref);
+ }
+ }
+ }
+
'priority: loop {
- i = (i + 1) % 40;
+ *i = (*i + 1) % RUN_QUEUE_COUNT;
total_iters += 1;
- // The least prioritised queue takes <5000 iters to build up
- // balance = sched_prio_to_weight[20], if we have already spent
- // that many iters and not found any context, it is better to just
- // skip for now
if total_iters >= 5000 {
break 'priority;
}
@@ -394,24 +713,21 @@ fn select_next_context(
}
let contexts = contexts_list
- .get_mut(i)
+ .get_mut(*i)
.expect("i should be between [0, 39]!");
if contexts.is_empty() {
empty_queues += 1;
- if empty_queues >= 40 {
- // If all queues are empty, just break out
+ if empty_queues >= RUN_QUEUE_COUNT {
break 'priority;
}
continue;
- } else {
- empty_queues = 0;
}
- if balance[i] < SCHED_PRIO_TO_WEIGHT[20] {
- // This queue does not have enough balance to run,
- // increment the balance!
- balance[i] += SCHED_PRIO_TO_WEIGHT[i];
+ empty_queues = 0;
+
+ if balance[*i] < SCHED_PRIO_TO_WEIGHT[20] {
+ balance[*i] += SCHED_PRIO_TO_WEIGHT[*i];
continue;
}
@@ -422,67 +738,331 @@ fn select_next_context(
Some(new_lock) => (lock, new_lock),
None => {
skipped_contexts += 1;
- continue; // Ghost Process, just continue
+ continue;
}
},
- None => break, // Empty Queue
+ None => break,
};
- if Arc::ptr_eq(&next_context_lock, &prev_context_lock) {
+ if Arc::ptr_eq(&next_context_lock, prev_context_lock)
+ || Arc::ptr_eq(&next_context_lock, idle_context)
+ {
contexts.push_back(next_context_ref);
continue;
}
- if Arc::ptr_eq(&next_context_lock, &idle_context) {
+ let mut next_context_guard = unsafe { next_context_lock.write_arc() };
+
+ let sw = unsafe { update_runnable(&mut next_context_guard, cpu_id, switch_time) };
+ if let UpdateResult::CanSwitch = sw {
+ balance[*i] -= SCHED_PRIO_TO_WEIGHT[20];
+ return Some(next_context_guard);
+ }
+
+ if matches!(sw, UpdateResult::Blocked) {
+ idle_contexts(token.downgrade()).push_back(next_context_ref);
+ } else {
+ contexts.push_back(next_context_ref);
+ }
+ skipped_contexts += 1;
+
+ if skipped_contexts >= total_contexts {
+ break 'priority;
+ }
+ }
+ }
+
+ None
+}
+
+fn pick_next_from_global_queues(
+ token: &mut LockToken<L1>,
+ contexts_list: &mut [alloc::collections::VecDeque<WeakContextRef>; RUN_QUEUE_COUNT],
+ cpu_id: LogicalCpuId,
+ switch_time: u128,
+ prev_context_lock: &Arc<ContextLock>,
+ idle_context: &Arc<ContextLock>,
+ balance: &mut [usize; RUN_QUEUE_COUNT],
+ i: &mut usize,
+) -> Option<ArcContextLockWriteGuard> {
+ let mut empty_queues = 0;
+ let mut total_iters = 0;
+ let total_contexts: usize = contexts_list.iter().map(|q| q.len()).sum();
+ let mut skipped_contexts = 0;
+
+ for prio in 0..RUN_QUEUE_COUNT {
+ let rt_contexts = contexts_list
+ .get_mut(prio)
+ .expect("prio should be between [0, 39]");
+ let len = rt_contexts.len();
+ for _ in 0..len {
+ let (rt_ref, rt_lock) = match rt_contexts.pop_front() {
+ Some(lock) => match lock.upgrade() {
+ Some(l) => (lock, l),
+ None => {
+ skipped_contexts += 1;
+ continue;
+ }
+ },
+ None => break,
+ };
+ if Arc::ptr_eq(&rt_lock, idle_context) || Arc::ptr_eq(&rt_lock, prev_context_lock) {
+ rt_contexts.push_back(rt_ref);
+ continue;
+ }
+ let rt_guard = unsafe { rt_lock.write_arc() };
+ if !rt_guard.status.is_runnable()
+ || rt_guard.running
+ || !rt_guard.sched_affinity.contains(cpu_id)
+ {
+ rt_contexts.push_back(rt_ref);
+ continue;
+ }
+ if rt_guard.sched_policy == SchedPolicy::Fifo
+ || rt_guard.sched_policy == SchedPolicy::RoundRobin
+ {
+ return Some(rt_guard);
+ }
+ rt_contexts.push_back(rt_ref);
+ }
+ }
+
+ {
+ let mut min_vruntime = u128::MAX;
+ let mut best: Option<(usize, WeakContextRef)> = None;
+ for (prio, queue) in contexts_list.iter().enumerate() {
+ for ctx_ref in queue.iter() {
+ if let Some(ctx_lock) = ctx_ref.upgrade() {
+ if Arc::ptr_eq(&ctx_lock, prev_context_lock)
+ || Arc::ptr_eq(&ctx_lock, idle_context)
+ {
+ continue;
+ }
+ if let Some(guard) = ctx_lock.try_read(token.token()) {
+ if guard.status.is_runnable()
+ && !guard.running
+ && guard.sched_affinity.contains(cpu_id)
+ && guard.sched_policy == SchedPolicy::Other
+ {
+ let mut vruntime = guard.vruntime;
+ if guard.last_cpu == Some(cpu_id) {
+ vruntime = vruntime.saturating_sub(vruntime / 8);
+ }
+ drop(guard);
+ if vruntime < min_vruntime {
+ min_vruntime = vruntime;
+ best = Some((prio, ctx_ref.clone()));
+ }
+ }
+ }
+ }
+ }
+ }
+ if let Some((best_prio, ctx_ref)) = best {
+ contexts_list[best_prio].retain(|r| !WeakContextRef::eq(r, &ctx_ref));
+ if let Some(ctx_lock) = ctx_ref.upgrade() {
+ let guard = unsafe { ctx_lock.write_arc() };
+ if guard.status.is_runnable()
+ && !guard.running
+ && guard.sched_affinity.contains(cpu_id)
+ && guard.sched_policy == SchedPolicy::Other
+ {
+ return Some(guard);
+ }
+
+ drop(guard);
+ contexts_list[best_prio].push_back(ctx_ref);
+ }
+ }
+ }
+
+ 'priority: loop {
+ *i = (*i + 1) % RUN_QUEUE_COUNT;
+ total_iters += 1;
+
+ if total_iters >= 5000 {
+ break 'priority;
+ }
+
+ if skipped_contexts > total_contexts && total_contexts > 0 {
+ break 'priority;
+ }
+
+ let contexts = contexts_list
+ .get_mut(*i)
+ .expect("i should be between [0, 39]!");
+
+ if contexts.is_empty() {
+ empty_queues += 1;
+ if empty_queues >= RUN_QUEUE_COUNT {
+ break 'priority;
+ }
+ continue;
+ }
+
+ empty_queues = 0;
+
+ if balance[*i] < SCHED_PRIO_TO_WEIGHT[20] {
+ balance[*i] += SCHED_PRIO_TO_WEIGHT[*i];
+ continue;
+ }
+
+ let len = contexts.len();
+ for _ in 0..len {
+ let (next_context_ref, next_context_lock) = match contexts.pop_front() {
+ Some(lock) => match lock.upgrade() {
+ Some(new_lock) => (lock, new_lock),
+ None => {
+ skipped_contexts += 1;
+ continue;
+ }
+ },
+ None => break,
+ };
+
+ if Arc::ptr_eq(&next_context_lock, prev_context_lock)
+ || Arc::ptr_eq(&next_context_lock, idle_context)
+ {
contexts.push_back(next_context_ref);
continue;
}
let mut next_context_guard = unsafe { next_context_lock.write_arc() };
- // Is this context runnable on this CPU?
let sw = unsafe { update_runnable(&mut next_context_guard, cpu_id, switch_time) };
if let UpdateResult::CanSwitch = sw {
- next_context_guard_opt = Some(next_context_guard);
- balance[i] -= SCHED_PRIO_TO_WEIGHT[20];
- break 'priority;
+ balance[*i] -= SCHED_PRIO_TO_WEIGHT[20];
+ return Some(next_context_guard);
+ }
+
+ if matches!(sw, UpdateResult::Blocked) {
+ idle_contexts(token.token()).push_back(next_context_ref);
} else {
- if matches!(sw, UpdateResult::Blocked) {
- idle_contexts(token.token()).push_back(next_context_ref);
- } else {
- contexts.push_back(next_context_ref);
- };
- skipped_contexts += 1;
+ contexts.push_back(next_context_ref);
+ }
+ skipped_contexts += 1;
- if skipped_contexts >= total_contexts {
- break 'priority;
- }
+ if skipped_contexts >= total_contexts {
+ break 'priority;
}
}
}
- percpu.balance.set(balance);
- percpu.last_queue.set(i);
-
- if !Arc::ptr_eq(&prev_context_lock, &idle_context) {
- // Send the old process to the back of the line (if it is still runnable)
- let prev_ctx = WeakContextRef(Arc::downgrade(&prev_context_lock));
- if prev_context_guard.status.is_runnable() {
- let prio = prev_context_guard.prio;
- contexts_list[prio].push_back(prev_ctx);
- } else {
- idle_contexts(token.token()).push_back(prev_ctx);
- }
+
+ None
+}
+
+unsafe fn update_stealable(context: &mut Context, switch_time: u128) -> UpdateResult {
+ if context.running {
+ return UpdateResult::Skip;
}
+ if context.status.is_soft_blocked()
+ && let Some(wake) = context.wake
+ && switch_time >= wake
+ {
+ context.wake = None;
+ context.unblock_no_ipi();
+ }
+ if context.status.is_runnable() {
+ UpdateResult::CanSwitch
+ } else {
+ UpdateResult::Blocked
+ }
+}
- if let Some(next_context_guard) = next_context_guard_opt {
- // We found a new process!
+/// This is the scheduler function, which currently utilises a Deficit Weighted Round Robin scheduler
+fn select_next_context(
+ token: &mut CleanLockToken,
+ percpu: &PercpuBlock,
+ cpu_id: LogicalCpuId,
+ switch_time: u128,
+ was_idle: bool,
+ prev_context_guard: &mut ArcRwLockWriteGuard<L4, Context>,
+) -> Result<Option<ArcContextLockWriteGuard>, SwitchResult> {
+ let idle_context = percpu.switch_internals.idle_context();
+ let prev_context_lock = crate::context::current();
+
+ let local_next = {
+ let mut sched_lock = SchedQueuesLock::new(&percpu.sched);
+ let mut balance = percpu.sched.balance.get();
+ let mut last_queue = percpu.sched.last_queue.get() % RUN_QUEUE_COUNT;
+ let next = pick_next_from_queues(
+ token,
+ unsafe { sched_lock.queues_mut() },
+ cpu_id,
+ switch_time,
+ &prev_context_lock,
+ &idle_context,
+ &mut balance,
+ &mut last_queue,
+ );
+ percpu.sched.balance.set(balance);
+ percpu.sched.last_queue.set(last_queue);
+ next
+ };
+
+ if let Some(next_context_guard) = local_next {
+ queue_previous_context(
+ token,
+ percpu,
+ &prev_context_lock,
+ prev_context_guard,
+ &idle_context,
+ );
+ return Ok(Some(next_context_guard));
+ }
+
+ if let Some(next_context_guard) = steal_work(token, cpu_id, switch_time) {
+ queue_previous_context(
+ token,
+ percpu,
+ &prev_context_lock,
+ prev_context_guard,
+ &idle_context,
+ );
+ return Ok(Some(next_context_guard));
+ }
+
+ let global_next = {
+ let contexts_data = run_contexts(token.token());
+ let (mut contexts_data, mut contexts_token) = contexts_data.into_split();
+ let mut balance = percpu.sched.balance.get();
+ let mut last_queue = percpu.sched.last_queue.get() % RUN_QUEUE_COUNT;
+ let next = pick_next_from_global_queues(
+ &mut contexts_token,
+ &mut contexts_data.set,
+ cpu_id,
+ switch_time,
+ &prev_context_lock,
+ &idle_context,
+ &mut balance,
+ &mut last_queue,
+ );
+ percpu.sched.balance.set(balance);
+ percpu.sched.last_queue.set(last_queue);
+ next
+ };
+
+ if let Some(next_context_guard) = global_next {
+ queue_previous_context(
+ token,
+ percpu,
+ &prev_context_lock,
+ prev_context_guard,
+ &idle_context,
+ );
return Ok(Some(next_context_guard));
+ }
+
+ queue_previous_context(
+ token,
+ percpu,
+ &prev_context_lock,
+ prev_context_guard,
+ &idle_context,
+ );
+
+ if !was_idle && !Arc::ptr_eq(&prev_context_lock, &idle_context) {
+ Ok(Some(unsafe { idle_context.write_arc() }))
} else {
- if !was_idle && !Arc::ptr_eq(&prev_context_lock, &idle_context) {
- // We switch into the idle context
- Ok(Some(unsafe { idle_context.write_arc() }))
- } else {
- // We found no other process to run.
- Ok(None)
- }
+ Ok(None)
}
}