feat: P0-P6 kernel scheduler + relibc threading comprehensive implementation

P0-P2: Barrier SMP, sigmask/pthread_kill races, robust mutexes, RT scheduling, POSIX sched API P3: PerCpuSched struct, per-CPU wiring, work stealing, load balancing, initial placement P4: 64-shard futex table, REQUEUE, PI futexes (LOCK_PI/UNLOCK_PI/TRYLOCK_PI), robust futexes, vruntime tracking, min-vruntime SCHED_OTHER selection P5: setpriority/getpriority, pthread_setaffinity_np, pthread_setname_np, pthread_setschedparam (Redox) P6: Cache-affine scheduling (last_cpu + vruntime bonus), NUMA topology kernel hints + numad userspace daemon Stability fixes: make_consistent stores 0 (dead TID fix), cond.rs error propagation, SPIN_COUNT adaptive spinning, Sys::open &str fix, PI futex CAS race, proc.rs lock ordering, barrier destroy Patches: 33 kernel + 58 relibc patches, all tracked in recipes Docs: KERNEL-SCHEDULER-MULTITHREAD-IMPROVEMENT-PLAN.md updated, SCHEDULER-REVIEW-FINAL.md created Architecture: NUMA topology parsing stays userspace (numad daemon), kernel stores lightweight NumaTopology hints
2026-04-30 18:21:48 +01:00
parent 7d5d364c01
commit 2e99d4073b
70 changed files with 15268 additions and 10 deletions
@@ -0,0 +1,214 @@
+diff --git a/src/context/switch.rs b/src/context/switch.rs
+index 86684c8..74dd5f1 100644
+--- a/src/context/switch.rs
+++ b/src/context/switch.rs
+@@ -5,7 +5,7 @@
+ use crate::{
+     context::{
+         self, arch, idle_contexts, idle_contexts_try, run_contexts, ArcContextLockWriteGuard,
+-        Context, ContextLock, WeakContextRef,
+        Context, ContextLock, SchedPolicy, WeakContextRef,
+     },
+     cpu_set::LogicalCpuId,
+     cpu_stats::{self, CpuState},
+@@ -33,35 +33,17 @@ const SCHED_PRIO_TO_WEIGHT: [usize; 40] = [
+     70, 56, 45, 36, 29, 23, 18, 15,
+ ];
+ 
+-/// Determines if a given context is eligible to be scheduled on a given CPU (in
+-/// principle, the current CPU).
+-///
+-/// # Safety
+-/// This function is unsafe because it modifies the `context`'s state directly without synchronization.
+-///
+-/// # Parameters
+-/// - `context`: The context (process/thread) to be checked.
+-/// - `cpu_id`: The logical ID of the CPU on which the context is being scheduled.
+-///
+-/// # Returns
+-/// - `UpdateResult::CanSwitch`: If the context can be switched to.
+-/// - `UpdateResult::Skip`: If the context should be skipped (e.g., it's running on another CPU).
+ unsafe fn update_runnable(
+     context: &mut Context,
+     cpu_id: LogicalCpuId,
+     switch_time: u128,
+ ) -> UpdateResult {
+-    // Ignore contexts that are already running.
+     if context.running {
+         return UpdateResult::Skip;
+     }
+-
+-    // Ignore contexts assigned to other CPUs.
+     if !context.sched_affinity.contains(cpu_id) {
+         return UpdateResult::Skip;
+     }
+-
+-    // If context is soft-blocked and has a wake-up time, check if it should wake up.
+     if context.status.is_soft_blocked()
+         && let Some(wake) = context.wake
+         && switch_time >= wake
+@@ -69,8 +51,6 @@ unsafe fn update_runnable(
+         context.wake = None;
+         context.unblock_no_ipi();
+     }
+-
+-    // If the context is runnable, indicate it can be switched to.
+     if context.status.is_runnable() {
+         UpdateResult::CanSwitch
+     } else {
+@@ -95,7 +75,7 @@ pub fn tick(token: &mut CleanLockToken) {
+     let new_ticks = ticks_cell.get() + 1;
+     ticks_cell.set(new_ticks);
+ 
+-    // Trigger a context switch after every 3 ticks (approx. 6.75 ms).
+    // Trigger a context switch after every 3 ticks.
+     if new_ticks >= 3 {
+         switch(token);
+         crate::context::signal::signal_handler(token);
+@@ -167,10 +147,7 @@ pub fn switch(token: &mut CleanLockToken) -> SwitchResult {
+     let mut prev_context_guard = unsafe { prev_context_lock.write_arc() };
+ 
+     if !prev_context_guard.is_preemptable() {
+-        // Unset global lock
+         arch::CONTEXT_SWITCH_LOCK.store(false, Ordering::SeqCst);
+-
+-        // Pretend to have finished switching, so CPU is not idled
+         return SwitchResult::Switched;
+     }
+ 
+@@ -222,6 +199,13 @@ pub fn switch(token: &mut CleanLockToken) -> SwitchResult {
+             // Update times
+             if !was_idle {
+                 prev_context.cpu_time += switch_time.saturating_sub(prev_context.switch_time);
+            if prev_context.sched_policy == SchedPolicy::Other {
+                let actual_ns = switch_time.saturating_sub(prev_context.switch_time);
+                let weight = SCHED_PRIO_TO_WEIGHT[prev_context.sched_static_prio.min(39)] as u128;
+                let default_weight = SCHED_PRIO_TO_WEIGHT[20] as u128;
+                let delta = actual_ns.saturating_mul(default_weight) / weight.max(1);
+                prev_context.vruntime = prev_context.vruntime.saturating_add(delta);
+            }
+             }
+             next_context.switch_time = switch_time;
+             if next_context.userspace {
+@@ -377,6 +361,121 @@ fn select_next_context(
+     let total_contexts: usize = contexts_list.iter().map(|q| q.len()).sum();
+     let mut skipped_contexts = 0;
+ 
+    // PASS 0: SCHED_FIFO and SCHED_RR — scan for RT contexts to schedule.
+    // When a runnable RT context is found, it takes priority over all SCHED_OTHER.
+    for prio in 0..40 {
+        let rt_contexts = contexts_list
+            .get_mut(prio)
+            .expect("prio should be between [0, 39]");
+        let len = rt_contexts.len();
+        for _ in 0..len {
+            let (rt_ref, rt_lock) = match rt_contexts.pop_front() {
+                Some(lock) => match lock.upgrade() {
+                    Some(l) => (lock, l),
+                    None => {
+                        skipped_contexts += 1;
+                        continue;
+                    }
+                },
+                None => break,
+            };
+            if Arc::ptr_eq(&rt_lock, &idle_context) {
+                rt_contexts.push_back(rt_ref);
+                continue;
+            }
+            // Current RT thread: if runnable with no higher-prio RT found yet,
+            // keep it running (no demotion to SCHED_OTHER)
+            if Arc::ptr_eq(&rt_lock, &prev_context_lock) {
+                let rt_guard = unsafe { rt_lock.write_arc() };
+                if rt_guard.status.is_runnable()
+                    && (rt_guard.sched_policy == SchedPolicy::Fifo
+                        || rt_guard.sched_policy == SchedPolicy::RoundRobin)
+                {
+                    percpu.balance.set(balance);
+                    percpu.last_queue.set(i);
+                    return Ok(Some(rt_guard));
+                }
+                rt_contexts.push_back(rt_ref);
+                continue;
+            }
+            let rt_guard = unsafe { rt_lock.write_arc() };
+            if !rt_guard.status.is_runnable() || rt_guard.running
+                || !rt_guard.sched_affinity.contains(cpu_id)
+            {
+                rt_contexts.push_back(rt_ref);
+                continue;
+            }
+            if rt_guard.sched_policy == SchedPolicy::Fifo
+                || rt_guard.sched_policy == SchedPolicy::RoundRobin
+            {
+                percpu.balance.set(balance);
+                percpu.last_queue.set(i);
+                if !Arc::ptr_eq(&prev_context_lock, &idle_context) {
+                    let prev_ctx = WeakContextRef(Arc::downgrade(&prev_context_lock));
+                    if prev_context_guard.status.is_runnable() {
+                        contexts_list[prev_context_guard.prio].push_back(prev_ctx);
+                    } else {
+                        idle_contexts(token.token()).push_back(prev_ctx);
+                    }
+                }
+                return Ok(Some(rt_guard));
+            }
+            rt_contexts.push_back(rt_ref);
+        }
+    }
+
+    // PASS 1: SCHED_OTHER — minimum-vruntime selection
+    {
+        let mut min_vruntime = u128::MAX;
+        let mut best: Option<(usize, WeakContextRef)> = None;
+        for (prio, queue) in contexts_list.iter().enumerate() {
+            for ctx_ref in queue.iter() {
+                if let Some(ctx_lock) = ctx_ref.upgrade() {
+                    if Arc::ptr_eq(&ctx_lock, &prev_context_lock) || Arc::ptr_eq(&ctx_lock, &idle_context) {
+                        continue;
+                    }
+                    if let Some(guard) = ctx_lock.try_read(token.token()) {
+                        if guard.status.is_runnable() && !guard.running
+                            && guard.sched_affinity.contains(cpu_id)
+                            && guard.sched_policy == SchedPolicy::Other
+                        {
+                            let v = guard.vruntime;
+                            drop(guard);
+                            if v < min_vruntime {
+                                min_vruntime = v;
+                                best = Some((prio, ctx_ref.clone()));
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        if let Some((best_prio, ctx_ref)) = best {
+            {
+                let queue = contexts_list.get_mut(best_prio).expect("valid prio");
+                queue.retain(|r| !WeakContextRef::eq(r, &ctx_ref));
+            }
+            if let Some(ctx_lock) = ctx_ref.upgrade() {
+                let guard = unsafe { ctx_lock.write_arc() };
+                if guard.status.is_runnable() {
+                    percpu.balance.set(balance);
+                    percpu.last_queue.set(i);
+                    if !Arc::ptr_eq(&prev_context_lock, &idle_context) {
+                        let prev_ctx = WeakContextRef(Arc::downgrade(&prev_context_lock));
+                        if prev_context_guard.status.is_runnable() {
+                            contexts_list[prev_context_guard.prio].push_back(prev_ctx);
+                        } else {
+                            idle_contexts(token.token()).push_back(prev_ctx);
+                        }
+                    }
+                    return Ok(Some(guard));
+                }
+            }
+        }
+    }
+
+    // PASS 2: fallback DWRR deficit tracking
+
+     'priority: loop {
+         i = (i + 1) % 40;
+         total_iters += 1;