Tier 2: per-CPU sched stats, NUMA-aware scheduling, init numa

- CpuStats: add context_switches and steals AtomicU64 counters; remove redundant per_cpu field from CpuStatsData
- context/switch.rs: increment the per-CPU switch counter at each context switch and the steal counter at each work-steal; add a NUMA vruntime bonus (1/8 for an exact-CPU match, 1/16 for a same-node match)
- context/mod.rs: least_loaded_cpu() is now NUMA-aware and prefers same-node CPUs (accepts <=1 extra queued context vs. the cross-node best)
- scheme/sys/sched.rs: new kernel handler exposing per-CPU scheduler stats (switches, steals, queue_depth) via /scheme/sys/sched
- startup/mod.rs: call numa::init_default() during boot (was dead code)
2026-07-02 21:40:20 +03:00
parent e812356cf0
commit c6a5b7a1ad
6 changed files with 83 additions and 13 deletions
@@ -127,8 +127,11 @@ pub fn run_contexts(token: LockToken<'_, L0>) -> MutexGuard<'_, L1, RunContextDa

 fn least_loaded_cpu() -> LogicalCpuId {
    let current_cpu = crate::cpu_id();
+    let topo = crate::numa::topology();
    let mut best_cpu = current_cpu;
    let mut best_depth = usize::MAX;
+    let mut best_local_cpu = current_cpu;
+    let mut best_local_depth = usize::MAX;

    for raw_id in 0..crate::cpu_count() {
        let cpu_id = LogicalCpuId::new(raw_id);
@@ -144,9 +147,18 @@ fn least_loaded_cpu() -> LogicalCpuId {
            best_depth = depth;
            best_cpu = cpu_id;
        }
+
+        if topo.same_node(current_cpu, cpu_id) && depth < best_local_depth {
+            best_local_depth = depth;
+            best_local_cpu = cpu_id;
+        }
    }

-    best_cpu
+    if best_local_depth < usize::MAX && best_local_depth <= best_depth + 1 {
+        best_local_cpu
+    } else {
+        best_cpu
+    }
 }

 pub fn init(token: &mut CleanLockToken) {
@@ -162,6 +162,7 @@ pub fn switch(token: &mut CleanLockToken) -> SwitchResult {

    let percpu = PercpuBlock::current();
    cpu_stats::add_context_switch();
+    percpu.stats.context_switches.fetch_add(1, Ordering::Relaxed);

    //set PIT Interrupt counter to 0, giving each process same amount of PIT ticks
    percpu.switch_internals.pit_ticks.set(0);
@@ -424,6 +425,9 @@ fn steal_work(
                if let UpdateResult::CanSwitch = sw {
                    assign_context_to_cpu(&mut context_guard, cpu_id);
                    SCHED_STEAL_COUNT.fetch_add(1, Ordering::Relaxed);
+                    if let Some(thief) = get_percpu_block(cpu_id) {
+                        thief.stats.steals.fetch_add(1, Ordering::Relaxed);
+                    }
                    return Some(context_guard);
                }

@@ -695,6 +699,10 @@ fn pick_next_from_queues(
                            let mut vruntime = guard.vruntime;
                            if guard.last_cpu == Some(cpu_id) {
                                vruntime = vruntime.saturating_sub(vruntime / 8);
+                            } else if let Some(last_cpu) = guard.last_cpu {
+                                if crate::numa::topology().same_node(cpu_id, last_cpu) {
+                                    vruntime = vruntime.saturating_sub(vruntime / 16);
+                                }
                            }
                            drop(guard);
                            if vruntime < min_vruntime {
@@ -870,6 +878,10 @@ fn pick_next_from_global_queues(
                            let mut vruntime = guard.vruntime;
                            if guard.last_cpu == Some(cpu_id) {
                                vruntime = vruntime.saturating_sub(vruntime / 8);
+                            } else if let Some(last_cpu) = guard.last_cpu {
+                                if crate::numa::topology().same_node(cpu_id, last_cpu) {
+                                    vruntime = vruntime.saturating_sub(vruntime / 16);
+                                }
                            }
                            drop(guard);
                            if vruntime < min_vruntime {
@@ -28,18 +28,14 @@ pub enum CpuState {
 /// Statistics for the CPUs.
 #[derive(Debug, Default)]
 pub struct CpuStats {
-    /// Number of ticks spent on userspace contexts
    user: AtomicU64,
-    /// Number of ticks spent on Niced userspace contexts
    nice: AtomicU64,
-    /// Number of ticks spent on kernel contexts
    kernel: AtomicU64,
-    /// Number of ticks spent idle
    idle: AtomicU64,
-    /// Number of times the CPU handled an interrupt
    irq: AtomicU64,
-    /// Current state of the CPU
    state: AtomicU8,
+    pub context_switches: AtomicU64,
+    pub steals: AtomicU64,
 }

 impl CpuStats {
@@ -51,21 +47,20 @@ impl CpuStats {
            idle: AtomicU64::new(0),
            irq: AtomicU64::new(0),
            state: AtomicU8::new(0),
+            context_switches: AtomicU64::new(0),
+            steals: AtomicU64::new(0),
        }
    }
 }

 pub struct CpuStatsData {
-    /// Number of ticks spent on userspace contexts
    pub user: u64,
-    /// Number of ticks spent on Niced userspace contexts
    pub nice: u64,
-    /// Number of ticks spent on kernel contexts
    pub kernel: u64,
-    /// Number of ticks spent idle
    pub idle: u64,
-    /// Number of times the CPU handled an interrupt
    pub irq: u64,
+    pub context_switches: u64,
+    pub steals: u64,
 }

 impl CpuStats {
@@ -128,6 +123,8 @@ impl From<&CpuStats> for CpuStatsData {
            kernel: val.kernel.load(Ordering::Relaxed),
            idle: val.idle.load(Ordering::Relaxed),
            irq: val.irq.load(Ordering::Relaxed),
+            context_switches: val.context_switches.load(Ordering::Relaxed),
+            steals: val.steals.load(Ordering::Relaxed),
        }
    }
 }
@@ -34,6 +34,7 @@ mod iostat;
 mod irq;
 mod log;
 mod msr;
+mod sched;
 mod stat;
 mod syscall;
 mod uname;
@@ -115,6 +116,7 @@ const FILES: &[(&str, Kind)] = &[
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    ("spurious_irq", Rd(interrupt::irq::spurious_irq_resource)),
    ("stat", Rd(stat::resource)),
+    ("sched", Rd(sched::resource)),
    // Disabled because the debugger is inherently unsafe and probably will break the system.
    /*
    ("trigger_debugger", Rd(|token| unsafe {
@@ -0,0 +1,46 @@
+use core::fmt::Write as _;
+
+use crate::{
+    percpu::{get_all_stats, get_percpu_block},
+    sync::CleanLockToken,
+    syscall::error::Result,
+};
+use alloc::{string::String, vec::Vec};
+/// Renders scheduler statistics as text (one line per CPU, then a totals line); always returns `Ok` with the bytes.
+pub fn resource(_token: &mut CleanLockToken) -> Result<Vec<u8>> {
+    let stats = get_all_stats();
+    let mut out = String::new();
+    let mut total_switches: u64 = 0;
+    let mut total_steals: u64 = 0;
+    // Counters come from the `stats` read above; queue depth is sampled per CPU inside the loop.
+    for (id, stat) in &stats {
+        let queue_depth = get_percpu_block(*id)
+            .map(|p| {
+                p.sched.take_lock(); // paired with the explicit release_lock() below (no guard object)
+                let d = unsafe { p.sched.queues().iter().map(|q| q.len()).sum::<usize>() };
+                p.sched.release_lock(); // NOTE(review): presumably holding this lock is what makes queues() sound - confirm
+                d
+            })
+            .unwrap_or(0); // no per-CPU block for this id: report a depth of 0
+        // The result of writeln! into the String buffer is discarded via `let _`.
+        let _ = writeln!(
+            &mut out,
+            "cpu{} switches {} steals {} queue_depth {}",
+            id.get(),
+            stat.context_switches,
+            stat.steals,
+            queue_depth,
+        );
+
+        total_switches += stat.context_switches;
+        total_steals += stat.steals;
+    }
+    // Final line: the sums accumulated over every CPU listed above.
+    let _ = writeln!(
+        &mut out,
+        "total switches {} steals {}",
+        total_switches, total_steals,
+    );
+
+    Ok(out.into_bytes())
+}
@@ -159,7 +159,8 @@ pub(crate) fn kmain(bootstrap: Bootstrap) -> ! {
    //Initialize the first context, stored in kernel/src/context/mod.rs
    context::init(&mut token);

-    //Initialize global schemes, such as `acpi:`.
+    crate::numa::init_default();
+
    scheme::init_globals();

    debug!("BSP: {} CPUs", crate::cpu_count());