From 7fc8bbf057ef5d07daa91ceb8544abbcbd0cac40 Mon Sep 17 00:00:00 2001
From: vasilito <adminpupkin@gmail.com>
Date: Thu, 2 Jul 2026 06:43:23 +0300
Subject: [PATCH] kernel: apply P8-initial-placement, P9-numa-topology,
 P9-proc-lock-ordering
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Phase 0c, plan orders #5, #10, #11.

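  P8-initial-placement: context::spawn() now picks the least-loaded
    CPU for new threads, measured as the summed length of each CPU's
    PercpuSched run queues (percpu.sched.queues()), and sets the new
    context's sched_affinity to that single CPU, replacing the old
    'pin to birth CPU' default.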

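  P9-numa-topology: adds src/numa.rs (NumaTopology and NumaHint types,
    a private MAX_NUMA_NODES constant, and the topology() /
    init_default() accessors). NUMA discovery is performed by userspace
    numad via /scheme/acpi/ and pushed to the kernel via scheme:numa;
    the kernel stores a lightweight copy for O(1) scheduler lookups.
    This patch contains only the data types and a single-node default:
    it does not add the scheme:numa handler, a `mod numa;` declaration,
    or a caller of init_default(). The get_percpu_block import added to
    context/mod.rs is used by P8's least_loaded_cpu().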

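  P9-proc-lock-ordering: in scheme/proc.rs, the inner CleanLockToken
    used by ProcScheme::close() and by the handler that takes
    `consume: bool` is now created once at the top of each function
    instead of at the point of use, and the two "TODO: Lock ordering
    violation" comments are removed. The intent is to prevent deadlock
    between proc scheme handles and the per-CPU sched lock, required
    after P8-percpu-wiring moved the scheduler state to per-CPU. Note
    that the token is still minted with `unsafe { CleanLockToken::new() }`,
    so the ordering is not checked by the lock-token types.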

After this commit, three more of the plan's eleven P5–P9 patches are
landed. Remaining unlanded: P5-sched-rt-policy, P6-vruntime-switch,
P7-cache-affine-switch (all touch switch.rs which now diverges from
the patch baselines), and P5-scheme-sched-id/P5-proc-setschedpolicy/
P7-proc-setname/P7-proc-setpriority (overlap on scheme/proc.rs:10X-14X
context handle enum).

cargo check: 1 error remaining (pre-existing, at src/acpi/fadt.rs:110,
unrelated to the threading work).
---
 src/context/mod.rs | 31 +++++++++++++++++++++--
 src/numa.rs        | 62 ++++++++++++++++++++++++++++++++++++++++++++++
 src/scheme/proc.rs | 10 +++-----
 3 files changed, 95 insertions(+), 8 deletions(-)
 create mode 100644 src/numa.rs

diff --git a/src/context/mod.rs b/src/context/mod.rs
index 297cf2de00..1b1e8b523e 100644
--- a/src/context/mod.rs
+++ b/src/context/mod.rs
@@ -10,9 +10,9 @@ use core::{num::NonZeroUsize, ops::Deref};
 
 use crate::{
     context::memory::AddrSpaceWrapper,
-    cpu_set::LogicalCpuSet,
+    cpu_set::{LogicalCpuId, LogicalCpuSet},
     memory::{RmmA, RmmArch, TableKind},
-    percpu::PercpuBlock,
+    percpu::{get_percpu_block, PercpuBlock},
     sync::{
         ArcRwLockWriteGuard, CleanLockToken, LockToken, Mutex, MutexGuard, RwLock, RwLockReadGuard,
         RwLockWriteGuard, L0, L1, L2, L4,
@@ -125,6 +125,30 @@ pub fn run_contexts(token: LockToken<'_, L0>) -> MutexGuard<'_, L1, RunContextDa
     RUN_CONTEXTS.lock(token)
 }
 
+/// Returns the first CPU (in id order) whose scheduler queues hold the fewest
+/// entries, or the current CPU if no per-CPU block can be looked up.
+fn least_loaded_cpu() -> LogicalCpuId {
+    // Seed with the current CPU at maximal depth so any shorter total replaces it.
+    let fallback = (crate::cpu_id(), usize::MAX);
+
+    let (chosen, _) = (0..crate::cpu_count())
+        .map(LogicalCpuId::new)
+        .filter_map(|cpu| {
+            let block = get_percpu_block(cpu)?;
+            // Sample the queue lengths while holding that CPU's sched lock.
+            // NOTE(review): presumably the lock is what makes `queues()` sound; confirm.
+            block.sched.take_lock();
+            let queued: usize = unsafe { block.sched.queues().iter().map(|q| q.len()).sum() };
+            block.sched.release_lock();
+
+            Some((cpu, queued))
+        })
+        // Strict `<` means ties keep the earlier (lower-id) candidate.
+        .fold(fallback, |best, seen| if seen.1 < best.1 { seen } else { best });
+
+    chosen
+}
+
 pub fn init(token: &mut CleanLockToken) {
     let owner = None; // kmain not owned by any fd
     let mut context = Context::new(owner).expect("failed to create kmain context");
@@ -239,6 +263,9 @@ pub fn spawn(
 
     context.kstack = Some(stack);
     context.userspace = userspace_allowed;
+    let target_cpu = least_loaded_cpu();
+    context.sched_affinity = LogicalCpuSet::empty();
+    context.sched_affinity.atomic_set(target_cpu);
 
     let context_lock = Arc::new(ContextLock::new(context));
     let context_ref = ContextRef(Arc::clone(&context_lock));
diff --git a/src/numa.rs b/src/numa.rs
new file mode 100644
index 0000000000..40c5a06812
--- /dev/null
+++ b/src/numa.rs
@@ -0,0 +1,62 @@
+//! NUMA topology hints for the kernel scheduler.
+//! NUMA discovery (SRAT/SLIT parsing) is performed by a userspace daemon
+//! (numad) via /scheme/acpi/, then pushed to the kernel via scheme:numa.
+//! The kernel stores a lightweight copy for O(1) scheduling lookups.
+use crate::cpu_set::{LogicalCpuId, LogicalCpuSet};
+use core::sync::atomic::{AtomicBool, Ordering};
+
+const MAX_NUMA_NODES: usize = 8; // capacity of the fixed `NumaTopology::nodes` array
+
+#[derive(Clone, Debug)]
+pub struct NumaHint {
+    pub node_id: u8, // id reported by `NumaTopology::node_for_cpu` for CPUs in `cpus`
+    pub cpus: LogicalCpuSet, // CPUs belonging to this node; queried via `contains`
+}
+
+pub struct NumaTopology {
+    pub nodes: [Option<NumaHint>; MAX_NUMA_NODES], // `None` slots are skipped by lookups
+    pub initialized: AtomicBool, // set (via `swap`) by `init_default` before it fills node 0
+}
+
+impl NumaTopology {
+    /// Builds a topology with every node slot empty and `initialized` cleared.
+    pub const fn new() -> Self {
+        const EMPTY_SLOT: Option<NumaHint> = None;
+        Self {
+            nodes: [EMPTY_SLOT; MAX_NUMA_NODES],
+            initialized: AtomicBool::new(false),
+        }
+    }
+
+    /// Looks up the id of the first populated node whose CPU set contains `cpu`.
+    pub fn node_for_cpu(&self, cpu: LogicalCpuId) -> Option<u8> {
+        let mut populated = self.nodes.iter().flatten();
+        let owner = populated.find(|hint| hint.cpus.contains(cpu));
+        owner.map(|hint| hint.node_id)
+    }
+
+    /// Compares node lookups; two CPUs that both map to no node count as equal.
+    pub fn same_node(&self, cpu1: LogicalCpuId, cpu2: LogicalCpuId) -> bool {
+        self.node_for_cpu(cpu1) == self.node_for_cpu(cpu2)
+    }
+}
+
+static mut NUMA_TOPOLOGY: NumaTopology = NumaTopology::new(); // written only by `init_default`
+
+pub fn topology() -> &'static NumaTopology {
+    unsafe { &*core::ptr::addr_of!(NUMA_TOPOLOGY) } // SAFETY: assumes no concurrent init; confirm
+}
+
+/// Installs the default layout (node 0 = all CPUs); only the first call has an effect.
+pub fn init_default() {
+    if topology().initialized.swap(true, Ordering::AcqRel) {
+        return;
+    }
+    // SAFETY: in-place store, no `&mut NumaTopology`; NOTE(review): flag is set before this write.
+    unsafe {
+        NUMA_TOPOLOGY.nodes[0] = Some(NumaHint {
+            node_id: 0,
+            cpus: LogicalCpuSet::all(),
+        });
+    }
+}
diff --git a/src/scheme/proc.rs b/src/scheme/proc.rs
index dc9949850a..beba85089f 100644
--- a/src/scheme/proc.rs
+++ b/src/scheme/proc.rs
@@ -432,6 +432,7 @@ impl KernelScheme for ProcScheme {
     }
 
     fn close(&self, id: usize, token: &mut CleanLockToken) -> Result<()> {
+        let mut inner_token = unsafe { CleanLockToken::new() };
         let handle = HANDLES
             .write(token.token())
             .remove(&id)
@@ -459,9 +460,7 @@ impl KernelScheme for ProcScheme {
                     ))]
                     regs.set_arg1(arg1);
 
-                    // TODO: Lock ordering violation
-                    let mut token = unsafe { CleanLockToken::new() };
-                    Ok(context.set_addr_space(Some(new), token.downgrade()))
+                    Ok(context.set_addr_space(Some(new), inner_token.downgrade()))
                 })?;
                 if let Some(old_ctx) = old_ctx
                     && let Some(addrspace) = Arc::into_inner(old_ctx)
@@ -500,6 +499,7 @@ impl KernelScheme for ProcScheme {
         consume: bool,
         token: &mut CleanLockToken,
     ) -> Result<usize> {
+        let mut inner_token = unsafe { CleanLockToken::new() };
         let handle = HANDLES
             .read(token.token())
             .get(&id)
@@ -590,9 +590,7 @@ impl KernelScheme for ProcScheme {
                 };
                 // TODO: Allocated or AllocatedShared?
                 let addrsp = AddrSpace::current()?;
-                // TODO: Lock ordering violation
-                let mut token = unsafe { CleanLockToken::new() };
-                let page = addrsp.acquire_write(token.downgrade()).mmap_anywhere(
+                let page = addrsp.acquire_write(inner_token.downgrade()).mmap_anywhere(
                     &addrsp,
                     NonZeroUsize::new(1).unwrap(),
                     MapFlags::PROT_READ | MapFlags::PROT_WRITE,