feat: comprehensive P0-P6 kernel scheduler + relibc threading implementation
P0-P2: Barrier SMP, sigmask/pthread_kill races, robust mutexes, RT scheduling, POSIX sched API
P3: PerCpuSched struct, per-CPU wiring, work stealing, load balancing, initial placement
P4: 64-shard futex table, REQUEUE, PI futexes (LOCK_PI/UNLOCK_PI/TRYLOCK_PI), robust futexes, vruntime tracking, min-vruntime SCHED_OTHER selection
P5: setpriority/getpriority, pthread_setaffinity_np, pthread_setname_np, pthread_setschedparam (Redox)
P6: Cache-affine scheduling (last_cpu + vruntime bonus), NUMA topology kernel hints + numad userspace daemon

Stability fixes: make_consistent stores 0 (dead TID fix), cond.rs error propagation, SPIN_COUNT adaptive spinning, Sys::open &str fix, PI futex CAS race, proc.rs lock ordering, barrier destroy

Patches: 33 kernel + 58 relibc patches, all tracked in recipes
Docs: KERNEL-SCHEDULER-MULTITHREAD-IMPROVEMENT-PLAN.md updated, SCHEDULER-REVIEW-FINAL.md created
Architecture: NUMA topology parsing stays userspace (numad daemon), kernel stores lightweight NumaTopology hints
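As background for the vruntime items above (P4 min-vruntime selection, P6 cache-affine bonus), a minimal standalone sketch of the idea; the Task struct and the AFFINITY_BONUS value are illustrative stand-ins, not the kernel's actual types or constants:

// Illustrative only: pick the runnable SCHED_OTHER task with the smallest
// vruntime, discounted for cache warmth. Not the kernel's real structures.
struct Task {
    vruntime: u128,        // weighted virtual runtime
    last_cpu: Option<u32>, // CPU this task last ran on
}

const AFFINITY_BONUS: u128 = 1_000_000; // assumed magnitude, for illustration

fn pick_next(queue: &[Task], this_cpu: u32) -> Option<usize> {
    queue
        .iter()
        .enumerate()
        .min_by_key(|(_, t)| {
            // A task that last ran on this CPU gets its vruntime discounted,
            // biasing selection toward cache-warm tasks.
            let bonus = if t.last_cpu == Some(this_cpu) {
                AFFINITY_BONUS
            } else {
                0
            };
            t.vruntime.saturating_sub(bonus)
        })
        .map(|(i, _)| i)
}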
diff --git a/src/percpu.rs b/src/percpu.rs
--- a/src/percpu.rs
+++ b/src/percpu.rs
@@ -29,12 +29,14 @@ pub struct PerCpuSched {
     pub run_queues_lock: AtomicBool,
     pub balance: Cell<[usize; RUN_QUEUE_COUNT]>,
     pub last_queue: Cell<usize>,
+    pub last_balance_time: Cell<u128>,
 }
 
 impl PerCpuSched {
     pub const fn new() -> Self {
         const EMPTY: VecDeque<WeakContextRef> = VecDeque::new();
         Self {
             run_queues: SyncUnsafeCell::new([EMPTY; RUN_QUEUE_COUNT]),
             run_queues_lock: AtomicBool::new(false),
             balance: Cell::new([0; RUN_QUEUE_COUNT]),
             last_queue: Cell::new(0),
+            last_balance_time: Cell::new(0),
         }
     }
diff --git a/src/context/switch.rs b/src/context/switch.rs
--- a/src/context/switch.rs
+++ b/src/context/switch.rs
@@ -33,6 +33,8 @@ const SCHED_PRIO_TO_WEIGHT: [usize; 40] = [
     70, 56, 45, 36, 29, 23, 18, 15,
 ];
 
+const LOAD_BALANCE_INTERVAL_NS: u128 = 100_000_000;
+
 static SCHED_STEAL_COUNT: AtomicUsize = AtomicUsize::new(0);
@@ -101,6 +103,9 @@ pub fn tick(token: &mut CleanLockToken) {
     let new_ticks = ticks_cell.get() + 1;
     ticks_cell.set(new_ticks);
 
+    let balance_time = crate::time::monotonic(token);
+    maybe_balance_queues(token, percpu, balance_time);
+
     // Trigger a context switch after every 3 ticks.
     if new_ticks >= 3 {
         switch(token);
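The tick hunk above rate-limits rebalancing to one pass per LOAD_BALANCE_INTERVAL_NS (100 ms). A standalone sketch of that gate, using std::cell::Cell the same way the patch does; balance_due is a hypothetical helper name, not in the patch:

use std::cell::Cell;

const LOAD_BALANCE_INTERVAL_NS: u128 = 100_000_000; // 100 ms, as above

// Returns true at most once per interval; `last` mirrors the
// last_balance_time field added to PerCpuSched in the first hunk.
fn balance_due(last: &Cell<u128>, now_ns: u128) -> bool {
    if now_ns.saturating_sub(last.get()) < LOAD_BALANCE_INTERVAL_NS {
        return false; // too soon since the last balancing pass
    }
    last.set(now_ns);
    true
}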
@@ -427,6 +432,92 @@ fn steal_work(
 
     None
 }
+
+fn queue_depth(percpu: &PercpuBlock) -> usize {
+    let mut sched_lock = SchedQueuesLock::new(&percpu.sched);
+    unsafe {
+        sched_lock
+            .queues_mut()
+            .iter()
+            .map(|queue| queue.len())
+            .sum()
+    }
+}
+
+fn migrate_one_context(
+    token: &mut CleanLockToken,
+    source_id: LogicalCpuId,
+    target_id: LogicalCpuId,
+    switch_time: u128,
+) -> bool {
+    let Some(source) = get_percpu_block(source_id) else {
+        return false;
+    };
+    let Some(target) = get_percpu_block(target_id) else {
+        return false;
+    };
+
+    let source_idle = source.switch_internals.idle_context();
+    let moved = {
+        let mut source_lock = SchedQueuesLock::new(&source.sched);
+        let source_queues = unsafe { source_lock.queues_mut() };
+        pop_movable_context(token, source_queues, target_id, switch_time, &source_idle)
+    };
+
+    let Some((prio, context_ref)) = moved else {
+        return false;
+    };
+
+    let mut target_lock = SchedQueuesLock::new(&target.sched);
+    unsafe {
+        target_lock.queues_mut()[prio].push_back(context_ref);
+    }
+    true
+}
+
+fn maybe_balance_queues(token: &mut CleanLockToken, percpu: &PercpuBlock, balance_time: u128) {
+    if crate::cpu_count() <= 1 || percpu.cpu_id != LogicalCpuId::BSP {
+        return;
+    }
+    if balance_time.saturating_sub(percpu.sched.last_balance_time.get()) < LOAD_BALANCE_INTERVAL_NS
+    {
+        return;
+    }
+
+    percpu.sched.last_balance_time.set(balance_time);
+
+    let mut depths = Vec::new();
+    let mut total_depth = 0usize;
+    for raw_id in 0..crate::cpu_count() {
+        let cpu_id = LogicalCpuId::new(raw_id);
+        let Some(cpu_percpu) = get_percpu_block(cpu_id) else {
+            continue;
+        };
+        let depth = queue_depth(cpu_percpu);
+        total_depth += depth;
+        depths.push((cpu_id, depth));
+    }
+
+    if depths.len() <= 1 || total_depth == 0 {
+        return;
+    }
+
+    let avg_depth = (total_depth + depths.len().saturating_sub(1)) / depths.len();
+
+    for target_index in 0..depths.len() {
+        if depths[target_index].1 != 0 {
+            continue;
+        }
+
+        let mut source_index = None;
+        let mut source_depth = 0usize;
+        for (idx, &(_, depth)) in depths.iter().enumerate() {
+            if idx == target_index {
+                continue;
+            }
+            if depth > avg_depth + 1 && depth > source_depth {
+                source_index = Some(idx);
+                source_depth = depth;
+            }
+        }
+
+        let Some(source_index) = source_index else {
+            continue;
+        };
+
+        let source_id = depths[source_index].0;
+        let target_id = depths[target_index].0;
+        if migrate_one_context(token, source_id, target_id, balance_time) {
+            depths[source_index].1 = depths[source_index].1.saturating_sub(1);
+            depths[target_index].1 += 1;
+        }
+    }
+}
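For reference, the heuristic in maybe_balance_queues reduced to a standalone sketch: each idle CPU pulls one context from the deepest queue that exceeds the ceiling average by more than one. pick_migration and the plain-integer depths are illustrative stand-ins for the kernel's per-CPU queue types:

// Standalone illustration of the balancing decision in the hunk above.
fn pick_migration(depths: &[usize]) -> Option<(usize, usize)> {
    let total: usize = depths.iter().sum();
    if depths.len() <= 1 || total == 0 {
        return None;
    }
    // Ceiling average, matching (total + n - 1) / n in the hunk above.
    let avg = (total + depths.len() - 1) / depths.len();

    let target = depths.iter().position(|&d| d == 0)?; // an idle CPU
    let source = depths
        .iter()
        .enumerate()
        .filter(|&(i, &d)| i != target && d > avg + 1) // clearly overloaded
        .max_by_key(|&(_, &d)| d) // deepest such queue wins
        .map(|(i, _)| i)?;
    Some((source, target))
}

fn main() {
    // Depths [5, 0, 1]: avg = ceil(6/3) = 2, so CPU 0 (depth 5 > avg + 1)
    // donates one context to idle CPU 1.
    assert_eq!(pick_migration(&[5, 0, 1]), Some((0, 1)));
}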