From 7fc8bbf057ef5d07daa91ceb8544abbcbd0cac40 Mon Sep 17 00:00:00 2001 From: vasilito Date: Thu, 2 Jul 2026 06:43:23 +0300 Subject: [PATCH] kernel: apply P8-initial-placement, P9-numa-topology, P9-proc-lock-ordering MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 0c, plan orders #5, #10, #11. P8-initial-placement: context::spawn() now picks the least-loaded CPU for new threads via the new least_loaded_cpu() helper, which sums the lengths of each CPU's run queues (percpu.sched.queues()) under that CPU's sched lock, and sets the new context's sched_affinity to that single CPU, replacing the old 'pin to birth CPU' default. P9-numa-topology: adds src/numa.rs (NumaTopology, NumaHint types and MAX_NUMA_NODES constant) and threads the get_percpu_block import through context/mod.rs. NUMA discovery is performed by userspace numad via /scheme/acpi/ and pushed to the kernel via scheme:numa; the kernel stores a lightweight copy for O(1) scheduler lookups. Note that this patch does not add a `mod numa;` declaration, so src/numa.rs is not yet part of the build. P9-proc-lock-ordering: changes the lock-token acquisition order in scheme/proc.rs to prevent a deadlock between proc scheme handles and the per-CPU sched lock: the inner CleanLockToken is now created at function entry, before HANDLES is locked, instead of at the nested call site, and the two 'Lock ordering violation' TODOs are removed. Required after P8-percpu-wiring made the scheduler state per-CPU. After this commit, three more of the plan's eleven P5–P9 patches have landed. Remaining unlanded: P5-sched-rt-policy, P6-vruntime-switch, P7-cache-affine-switch (all touch switch.rs, which now diverges from the patch baselines), and P5-scheme-sched-id/P5-proc-setschedpolicy/P7-proc-setname/P7-proc-setpriority (these overlap on the context handle enum at scheme/proc.rs:10X-14X). cargo check: 1 error remaining (pre-existing, src/acpi/fadt.rs:110, unrelated to the threading work). 
--- src/context/mod.rs | 31 +++++++++++++++++++++-- src/numa.rs | 62 ++++++++++++++++++++++++++++++++++++++++++++++ src/scheme/proc.rs | 10 +++----- 3 files changed, 95 insertions(+), 8 deletions(-) create mode 100644 src/numa.rs diff --git a/src/context/mod.rs b/src/context/mod.rs index 297cf2de00..1b1e8b523e 100644 --- a/src/context/mod.rs +++ b/src/context/mod.rs @@ -10,9 +10,9 @@ use core::{num::NonZeroUsize, ops::Deref}; use crate::{ context::memory::AddrSpaceWrapper, - cpu_set::LogicalCpuSet, + cpu_set::{LogicalCpuId, LogicalCpuSet}, memory::{RmmA, RmmArch, TableKind}, - percpu::PercpuBlock, + percpu::{get_percpu_block, PercpuBlock}, sync::{ ArcRwLockWriteGuard, CleanLockToken, LockToken, Mutex, MutexGuard, RwLock, RwLockReadGuard, RwLockWriteGuard, L0, L1, L2, L4, @@ -125,6 +125,30 @@ pub fn run_contexts(token: LockToken<'_, L0>) -> MutexGuard<'_, L1, RunContextDa RUN_CONTEXTS.lock(token) } +fn least_loaded_cpu() -> LogicalCpuId { + let current_cpu = crate::cpu_id(); + let mut best_cpu = current_cpu; + let mut best_depth = usize::MAX; + + for raw_id in 0..crate::cpu_count() { + let cpu_id = LogicalCpuId::new(raw_id); + let Some(percpu) = get_percpu_block(cpu_id) else { + continue; + }; + + percpu.sched.take_lock(); + let depth = unsafe { percpu.sched.queues().iter().map(|queue| queue.len()).sum() }; + percpu.sched.release_lock(); + + if depth < best_depth { + best_depth = depth; + best_cpu = cpu_id; + } + } + + best_cpu +} + pub fn init(token: &mut CleanLockToken) { let owner = None; // kmain not owned by any fd let mut context = Context::new(owner).expect("failed to create kmain context"); @@ -239,6 +263,9 @@ pub fn spawn( context.kstack = Some(stack); context.userspace = userspace_allowed; + let target_cpu = least_loaded_cpu(); + context.sched_affinity = LogicalCpuSet::empty(); + context.sched_affinity.atomic_set(target_cpu); let context_lock = Arc::new(ContextLock::new(context)); let context_ref = ContextRef(Arc::clone(&context_lock)); diff --git 
a/src/numa.rs b/src/numa.rs new file mode 100644 index 0000000000..40c5a06812 --- /dev/null +++ b/src/numa.rs @@ -0,0 +1,62 @@ +/// NUMA topology hints for the kernel scheduler. +/// NUMA discovery (SRAT/SLIT parsing) is performed by a userspace daemon +/// (numad) via /scheme/acpi/, then pushed to the kernel via scheme:numa. +/// The kernel stores a lightweight copy for O(1) scheduling lookups. +use crate::cpu_set::{LogicalCpuId, LogicalCpuSet}; +use core::sync::atomic::{AtomicBool, Ordering}; + +const MAX_NUMA_NODES: usize = 8; + +#[derive(Clone, Debug)] +pub struct NumaHint { + pub node_id: u8, + pub cpus: LogicalCpuSet, +} + +pub struct NumaTopology { + pub nodes: [Option<NumaHint>; MAX_NUMA_NODES], + pub initialized: AtomicBool, +} + +impl NumaTopology { + pub const fn new() -> Self { + const NONE: Option<NumaHint> = None; + Self { + nodes: [NONE; MAX_NUMA_NODES], + initialized: AtomicBool::new(false), + } + } + + pub fn node_for_cpu(&self, cpu: LogicalCpuId) -> Option<u8> { + for node in self.nodes.iter().flatten() { + if node.cpus.contains(cpu) { + return Some(node.node_id); + } + } + None + } + + pub fn same_node(&self, cpu1: LogicalCpuId, cpu2: LogicalCpuId) -> bool { + self.node_for_cpu(cpu1) == self.node_for_cpu(cpu2) + } +} + +static mut NUMA_TOPOLOGY: NumaTopology = NumaTopology::new(); + +pub fn topology() -> &'static NumaTopology { + unsafe { &NUMA_TOPOLOGY } +} + +pub fn init_default() { + let topo = topology(); + if topo.initialized.swap(true, Ordering::AcqRel) { + return; + } + unsafe { + let topo_mut = &mut *core::ptr::addr_of_mut!(NUMA_TOPOLOGY); + topo_mut.nodes[0] = Some(NumaHint { + node_id: 0, + cpus: LogicalCpuSet::all(), + }); + } +} diff --git a/src/scheme/proc.rs b/src/scheme/proc.rs index dc9949850a..beba85089f 100644 --- a/src/scheme/proc.rs +++ b/src/scheme/proc.rs @@ -432,6 +432,7 @@ impl KernelScheme for ProcScheme { } fn close(&self, id: usize, token: &mut CleanLockToken) -> Result<()> { + let mut inner_token = unsafe { CleanLockToken::new() }; let handle 
= HANDLES .write(token.token()) .remove(&id) @@ -459,9 +460,7 @@ impl KernelScheme for ProcScheme { ))] regs.set_arg1(arg1); - // TODO: Lock ordering violation - let mut token = unsafe { CleanLockToken::new() }; - Ok(context.set_addr_space(Some(new), token.downgrade())) + Ok(context.set_addr_space(Some(new), inner_token.downgrade())) })?; if let Some(old_ctx) = old_ctx && let Some(addrspace) = Arc::into_inner(old_ctx) @@ -500,6 +499,7 @@ impl KernelScheme for ProcScheme { consume: bool, token: &mut CleanLockToken, ) -> Result { + let mut inner_token = unsafe { CleanLockToken::new() }; let handle = HANDLES .read(token.token()) .get(&id) @@ -590,9 +590,7 @@ impl KernelScheme for ProcScheme { }; // TODO: Allocated or AllocatedShared? let addrsp = AddrSpace::current()?; - // TODO: Lock ordering violation - let mut token = unsafe { CleanLockToken::new() }; - let page = addrsp.acquire_write(token.downgrade()).mmap_anywhere( + let page = addrsp.acquire_write(inner_token.downgrade()).mmap_anywhere( &addrsp, NonZeroUsize::new(1).unwrap(), MapFlags::PROT_READ | MapFlags::PROT_WRITE,