diff --git a/src/main.rs b/src/main.rs
--- a/src/main.rs
+++ b/src/main.rs
@@ -70,6 +70,9 @@ mod log;
 /// Memory management
 mod memory;
 
+/// NUMA topology
+mod numa;
+
 /// Panic
 mod panic;
 
diff --git a/src/acpi/madt/arch/x86.rs b/src/acpi/madt/arch/x86.rs
--- a/src/acpi/madt/arch/x86.rs
+++ b/src/acpi/madt/arch/x86.rs
@@ -18,6 +18,33 @@
 
 use super::{Madt, MadtEntry};
 
+use alloc::vec::Vec;
+
+/// Maximum number of APIC→CPU mappings we track for NUMA topology.
+const MAX_APIC_MAPPINGS: usize = 256;
+
+struct ApicMapping {
+    apic_id: u32,
+    cpu_id: LogicalCpuId,
+}
+
+const UNINIT_MAPPING: ApicMapping = ApicMapping { apic_id: u32::MAX, cpu_id: LogicalCpuId::new(0) };
+
+static mut APIC_MAPPINGS: [ApicMapping; MAX_APIC_MAPPINGS] = [UNINIT_MAPPING; MAX_APIC_MAPPINGS];
+static mut APIC_MAPPING_COUNT: usize = 0;
+
+/// Appends an APIC→CPU pair to `APIC_MAPPINGS`; pairs past `MAX_APIC_MAPPINGS` are dropped.
+///
+/// # Safety
+/// Accesses `static mut` state without synchronization; the caller must rule out concurrent use.
+unsafe fn record_apic_mapping(apic_id: u32, cpu_id: LogicalCpuId) {
+    let count = APIC_MAPPING_COUNT;
+    if count < MAX_APIC_MAPPINGS {
+        APIC_MAPPINGS[count] = ApicMapping { apic_id, cpu_id };
+        APIC_MAPPING_COUNT = count + 1;
+    }
+}
+
 const AP_SPIN_LIMIT: u32 = 1_000_000;
 const TRAMPOLINE: usize = 0x8000;
 static TRAMPOLINE_DATA: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/trampoline"));
@@ -61,6 +86,10 @@ pub(super) fn init(madt: Madt) {
     }
 
     if cfg!(not(feature = "multi_core")) {
+        unsafe {
+            record_apic_mapping(me.get(), LogicalCpuId::new(0));
+        }
+        crate::numa::init_default();
         return;
     }
 
@@ -216,6 +245,17 @@ pub(super) fn init(madt: Madt) {
 
                     crate::CPU_COUNT.fetch_add(1, Ordering::Relaxed);
 
+                    // Record APIC→CPU mapping for NUMA topology.
+                    unsafe {
+                        record_apic_mapping(u32::from(ap_local_apic.id), cpu_id);
+                    }
+                    // Set NUMA node from SRAT data.
+                    if let Some(percpu) = crate::percpu::get_for_cpu(cpu_id) {
+                        if let Some(node) = crate::acpi::srat::numa_node_for_apic(u32::from(ap_local_apic.id)) {
+                            percpu.numa_node.set(node);
+                        }
+                    }
+
                     RmmA::invalidate_all();
                 }
             } else if let MadtEntry::LocalX2Apic(ap_x2apic) = madt_entry {
@@ -325,6 +365,18 @@ pub(super) fn init(madt: Madt) {
                     }
 
                     crate::CPU_COUNT.fetch_add(1, Ordering::Relaxed);
+
+                    // Record APIC→CPU mapping for NUMA topology.
+                    unsafe {
+                        record_apic_mapping(apic_id, cpu_id);
+                    }
+                    // Set NUMA node from SRAT data.
+                    if let Some(percpu) = crate::percpu::get_for_cpu(cpu_id) {
+                        if let Some(node) = crate::acpi::srat::numa_node_for_apic(apic_id) {
+                            percpu.numa_node.set(node);
+                        }
+                    }
+
                     RmmA::invalidate_all();
                 }
             } else if let MadtEntry::LocalApicNmi(nmi) = madt_entry {
@@ -342,6 +394,22 @@ pub(super) fn init(madt: Madt) {
         }
     }
 
+    // Initialize NUMA topology from APIC→CPU mappings and SRAT.
+    {
+        let mappings = unsafe { &APIC_MAPPINGS[..APIC_MAPPING_COUNT] };
+        // On this path `record_apic_mapping` is only called while bringing up APs, so add
+        // the BSP (logical CPU 0, as on the single-core path) to place it in a node as well.
+        let mappings_ref: Vec<(u32, LogicalCpuId)> =
+            core::iter::once((me.get(), LogicalCpuId::new(0)))
+                .chain(mappings.iter().map(|m| (m.apic_id, m.cpu_id)))
+                .collect();
+        crate::numa::init_from_srat(&mappings_ref);
+    }
+    // Set BSP's NUMA node from SRAT.
+    if let Some(node) = crate::acpi::srat::numa_node_for_apic(me.get()) {
+        crate::percpu::PercpuBlock::current().numa_node.set(node);
+    }
+
     // Unmap trampoline
     if let Some((_frame, _, flush)) = unsafe {
         KernelMapper::lock_rw()
diff --git a/src/acpi/mod.rs b/src/acpi/mod.rs
--- a/src/acpi/mod.rs
+++ b/src/acpi/mod.rs
@@ -20,6 +20,8 @@ mod rxsdt;
 pub mod sdt;
 #[cfg(target_arch = "aarch64")]
 mod spcr;
+pub mod slit;
+pub mod srat;
 mod xsdt;
 
 unsafe fn map_linearly(addr: PhysicalAddress, len: usize, mapper: &mut crate::memory::PageMapper) {
@@ -163,7 +165,14 @@ pub unsafe fn init(already_supplied_rsdp: Option<*const u8>) {
 
         // TODO: Enumerate processors in userspace, and then provide an ACPI-independent interface
         // to initialize enumerated processors to userspace?
+        // Parse SRAT BEFORE MADT so NUMA node mapping is available
+        // when APs are started and PercpuBlocks are created.
+        srat::init();
+
         Madt::init();
+
+        // Parse SLIT after MADT for the NUMA distance matrix.
+        slit::init();
         //TODO: support this on any arch
         // SPCR must be initialized after MADT for interrupt controllers
         #[cfg(target_arch = "aarch64")]
diff --git a/src/acpi/slit.rs b/src/acpi/slit.rs
--- /dev/null
+++ b/src/acpi/slit.rs
@@ -0,0 +1,67 @@
+//! SLIT (System Locality Information Table) parser.
+//!
+//! Parses the NUMA distance matrix for scheduler NUMA-aware work stealing.
+
+use super::sdt::Sdt;
+use crate::acpi::find_sdt;
+
+/// Largest number of nodes the distance matrix can hold.
+const MAX_NODES: usize = 8;
+
+static mut SLIT_MATRIX: [[u8; MAX_NODES]; MAX_NODES] = [[10u8; MAX_NODES]; MAX_NODES];
+static mut SLIT_NUM_NODES: usize = 0;
+static mut SLIT_AVAILABLE: bool = false;
+
+/// Returns `true` once `init()` has loaded a distance matrix.
+pub fn is_available() -> bool { unsafe { SLIT_AVAILABLE } }
+
+/// Returns the node count of the loaded matrix, or 0 if none was loaded.
+pub fn num_nodes() -> usize { unsafe { SLIT_NUM_NODES } }
+
+/// Returns the SLIT distance from node `from` to node `to`.
+///
+/// Yields 10 when no SLIT was loaded or either node is `>= MAX_NODES`.
+pub fn distance(from: u8, to: u8) -> u8 {
+    if !is_available() { return 10; }
+    let (from, to) = (usize::from(from), usize::from(to));
+    if from >= MAX_NODES || to >= MAX_NODES { return 10; }
+    unsafe { SLIT_MATRIX[from][to] }
+}
+
+/// Returns `true` when the distance between the two nodes is at most 20.
+pub fn same_socket(node1: u8, node2: u8) -> bool { distance(node1, node2) <= 20 }
+
+/// Loads the distance matrix from the SLIT, if exactly one such table exists.
+///
+/// Returns without loading anything when the table is missing, duplicated, shorter
+/// than its own matrix, or reports zero or more than `MAX_NODES` nodes.
+pub fn init() {
+    let sdt = match find_sdt("SLIT").as_slice() {
+        [] => return,
+        [x] => *x,
+        xs => { println!("SLIT: {} tables found, expected 1", xs.len()); return; }
+    };
+    if &sdt.signature != b"SLIT" { return; }
+    let data_addr = sdt.data_address();
+    let data_len = sdt.data_len();
+    if data_len < 8 { return; }
+    // `data_address()` is not known to be 8-byte aligned (the body follows the SDT
+    // header), so the 64-bit node count is read without assuming alignment.
+    let raw_nodes = unsafe { (data_addr as *const u64).read_unaligned() };
+    let num_nodes = match usize::try_from(raw_nodes) {
+        Ok(n) if (1..=MAX_NODES).contains(&n) => n,
+        _ => { println!("SLIT: {raw_nodes} nodes (max {MAX_NODES}), ignoring"); return; }
+    };
+    let matrix_start = 8;
+    let matrix_size = num_nodes * num_nodes;
+    if data_len < matrix_start + matrix_size { println!("SLIT: matrix truncated ({data_len} < {})", matrix_start + matrix_size); return; }
+    // SAFETY: assumes nothing else touches SLIT_MATRIX while `init()` runs (TODO confirm).
+    let matrix = unsafe { &mut *core::ptr::addr_of_mut!(SLIT_MATRIX) };
+    for i in 0..num_nodes {
+        for j in 0..num_nodes {
+            matrix[i][j] = unsafe { *((data_addr + matrix_start + i * num_nodes + j) as *const u8) };
+        }
+    }
+    unsafe { SLIT_NUM_NODES = num_nodes; SLIT_AVAILABLE = true; }
+    debug!("SLIT: {} nodes, distance matrix loaded", num_nodes);
+}
diff --git a/src/acpi/srat.rs b/src/acpi/srat.rs
--- /dev/null
+++ b/src/acpi/srat.rs
@@ -0,0 +1,146 @@
+//! SRAT (System Resource Affinity Table) parser.
+//!
+//! Parses CPU-to-NUMA-node and memory-to-NUMA-node affinity information.
+//! Called before MADT init so that NUMA data is available during AP startup.
+
+use super::sdt::Sdt;
+use crate::acpi::find_sdt;
+
+const MAX_CPU_ENTRIES: usize = 256;
+const MAX_MEM_ENTRIES: usize = 64;
+
+#[derive(Clone, Copy)]
+struct SratCpuEntry { apic_id: u32, node: u8, enabled: bool }
+
+#[derive(Clone, Copy)]
+struct SratMemEntry { node: u8, base: u64, length: u64, enabled: bool }
+
+const CPU_NONE: SratCpuEntry = SratCpuEntry { apic_id: u32::MAX, node: 0, enabled: false };
+const MEM_NONE: SratMemEntry = SratMemEntry { node: 0, base: 0, length: 0, enabled: false };
+
+static mut SRAT_CPU_ENTRIES: [SratCpuEntry; MAX_CPU_ENTRIES] = [CPU_NONE; MAX_CPU_ENTRIES];
+static mut SRAT_MEM_ENTRIES: [SratMemEntry; MAX_MEM_ENTRIES] = [MEM_NONE; MAX_MEM_ENTRIES];
+static mut SRAT_CPU_COUNT: usize = 0;
+static mut SRAT_MEM_COUNT: usize = 0;
+static mut SRAT_AVAILABLE: bool = false;
+
+/// Returns `true` once `init()` has parsed an SRAT.
+pub fn is_available() -> bool { unsafe { SRAT_AVAILABLE } }
+
+/// Narrows a 32-bit SRAT proximity domain to the `u8` node id stored by this module.
+///
+/// Returns `None` for domains that do not fit in a `u8`, and for `u8::MAX`, which
+/// `PercpuBlock::numa_node` uses as its "unknown" marker.
+fn node_from_domain(domain: u32) -> Option<u8> {
+    u8::try_from(domain).ok().filter(|&node| node != u8::MAX)
+}
+
+/// Returns the CPU entries recorded by `init()`.
+fn recorded_cpus() -> &'static [SratCpuEntry] {
+    // SAFETY: the table is written only by `init()`; assumes no reader runs
+    // concurrently with it (TODO confirm against the boot sequence).
+    unsafe { &(*core::ptr::addr_of!(SRAT_CPU_ENTRIES))[..SRAT_CPU_COUNT] }
+}
+
+/// Looks up the NUMA node of the enabled CPU entry with the given APIC ID.
+///
+/// Returns `None` when no SRAT was parsed or no enabled entry matches.
+pub fn numa_node_for_apic(apic_id: u32) -> Option<u8> {
+    if !is_available() { return None; }
+    recorded_cpus().iter().find(|e| e.enabled && e.apic_id == apic_id).map(|e| e.node)
+}
+
+/// Returns one more than the highest node id among enabled CPU entries, or 1 when
+/// no SRAT was parsed or it lists no enabled CPU.
+pub fn numa_node_count() -> usize {
+    if !is_available() { return 1; }
+    recorded_cpus().iter().filter(|e| e.enabled).map(|e| usize::from(e.node) + 1).max().unwrap_or(1)
+}
+
+/// Body of a type 0 (local APIC affinity) entry, after the 2-byte type/length prefix.
+#[repr(C, packed)]
+struct SratLocalApic { _proximity_lo: u8, apic_id: u8, flags: u32, _local_sapic_eid: u8, _proximity_hi: [u8; 3], _clock_domain: u32 }
+
+/// Body of a type 1 (memory affinity) entry, after the 2-byte type/length prefix.
+#[repr(C, packed)]
+struct SratMemoryAffinity { proximity_domain: u32, _reserved1: u16, base_address_lo: u32, base_address_hi: u32, length_lo: u32, length_hi: u32, _reserved2: u32, flags: u32, _reserved3: u64 }
+
+/// Body of a type 2 (local x2APIC affinity) entry, after the 2-byte type/length prefix.
+#[repr(C, packed)]
+struct SratLocalX2Apic { _reserved: u16, proximity_domain: u32, x2apic_id: u32, flags: u32, _clock_domain: u32, _reserved2: u32 }
+
+/// Parses the SRAT, if exactly one is present, into the static affinity tables.
+///
+/// Entries whose proximity domain cannot be stored as a node id, or that arrive
+/// after the corresponding table is full, are left out and counted in a warning.
+pub fn init() {
+    let sdt = match find_sdt("SRAT").as_slice() {
+        [] => return,
+        [x] => *x,
+        xs => { println!("SRAT: {} tables found, expected 1", xs.len()); return; }
+    };
+    if &sdt.signature != b"SRAT" { return; }
+    let data_addr = sdt.data_address();
+    let data_len = sdt.data_len();
+    if data_len < 12 { println!("SRAT: table too short ({data_len} bytes)"); return; }
+    // Affinity entries start 12 bytes into the table body.
+    let mut offset: usize = 12;
+    // SAFETY: assumes nothing else touches the SRAT tables while `init()` runs
+    // (TODO confirm against the boot sequence).
+    let cpu_entries = unsafe { &mut *core::ptr::addr_of_mut!(SRAT_CPU_ENTRIES) };
+    let mem_entries = unsafe { &mut *core::ptr::addr_of_mut!(SRAT_MEM_ENTRIES) };
+    let mut cpu_count: usize = 0;
+    let mut mem_count: usize = 0;
+    // Entries left out because their domain is unrepresentable or the table is full.
+    let mut dropped: usize = 0;
+    while offset + 2 <= data_len {
+        // Each entry begins with a type byte followed by a total-length byte.
+        let entry_type = unsafe { *((data_addr + offset) as *const u8) };
+        let entry_len = unsafe { *((data_addr + offset + 1) as *const u8) } as usize;
+        if entry_len < 2 || offset + entry_len > data_len { break; }
+        let entry_data = data_addr + offset + 2;
+        match entry_type {
+            0x0 if entry_len >= size_of::<SratLocalApic>() + 2 => {
+                let e = unsafe { &*(entry_data as *const SratLocalApic) };
+                let enabled = (e.flags & 1) == 1;
+                let domain = u32::from_le_bytes([e._proximity_lo, e._proximity_hi[0], e._proximity_hi[1], e._proximity_hi[2]]);
+                match node_from_domain(domain) {
+                    Some(node) if cpu_count < MAX_CPU_ENTRIES => {
+                        cpu_entries[cpu_count] = SratCpuEntry { apic_id: u32::from(e.apic_id), node, enabled };
+                        cpu_count += 1;
+                    }
+                    _ => dropped += 1,
+                }
+            }
+            0x1 if entry_len >= size_of::<SratMemoryAffinity>() + 2 => {
+                let e = unsafe { &*(entry_data as *const SratMemoryAffinity) };
+                let enabled = (e.flags & 1) == 1;
+                let base = (e.base_address_hi as u64) << 32 | e.base_address_lo as u64;
+                let length = (e.length_hi as u64) << 32 | e.length_lo as u64;
+                match node_from_domain(e.proximity_domain) {
+                    Some(node) if mem_count < MAX_MEM_ENTRIES => {
+                        mem_entries[mem_count] = SratMemEntry { node, base, length, enabled };
+                        mem_count += 1;
+                    }
+                    _ => dropped += 1,
+                }
+            }
+            0x2 if entry_len >= size_of::<SratLocalX2Apic>() + 2 => {
+                let e = unsafe { &*(entry_data as *const SratLocalX2Apic) };
+                let enabled = (e.flags & 1) == 1;
+                match node_from_domain(e.proximity_domain) {
+                    Some(node) if cpu_count < MAX_CPU_ENTRIES => {
+                        cpu_entries[cpu_count] = SratCpuEntry { apic_id: e.x2apic_id, node, enabled };
+                        cpu_count += 1;
+                    }
+                    _ => dropped += 1,
+                }
+            }
+            _ => {}
+        }
+        offset += entry_len;
+    }
+    unsafe { SRAT_CPU_COUNT = cpu_count; SRAT_MEM_COUNT = mem_count; SRAT_AVAILABLE = true; }
+    if dropped > 0 { println!("SRAT: {dropped} entries dropped (proximity domain out of range or table full)"); }
+    debug!("SRAT: {} CPU entries, {} memory entries", cpu_count, mem_count);
+}
diff --git a/src/numa.rs b/src/numa.rs
--- a/src/numa.rs
+++ b/src/numa.rs
@@ -1,13 +1,15 @@
 /// NUMA topology hints for the kernel scheduler.
-/// NUMA discovery (SRAT/SLIT parsing) is performed by a userspace daemon
-/// (numad) via /scheme/acpi/, then pushed to the kernel via scheme:numa.
-/// The kernel stores a lightweight copy for O(1) scheduling lookups.
+///
+/// NUMA discovery (SRAT/SLIT parsing) is performed during kernel ACPI init
+/// (`acpi::init()`). The kernel stores a lightweight copy for O(1) scheduling
+/// lookups. If no SRAT is found, `init_default()` creates a single-node topology.
+use crate::acpi::srat;
 use crate::cpu_set::{LogicalCpuId, LogicalCpuSet};
 use core::sync::atomic::{AtomicBool, Ordering};
 
 const MAX_NUMA_NODES: usize = 8;
 
-#[derive(Clone, Debug)]
+#[derive(Debug)]
 pub struct NumaHint {
     pub node_id: u8,
     pub cpus: LogicalCpuSet,
@@ -21,17 +23,12 @@
 impl NumaTopology {
     pub const fn new() -> Self {
         const NONE: Option<NumaHint> = None;
-        Self {
-            nodes: [NONE; MAX_NUMA_NODES],
-            initialized: AtomicBool::new(false),
-        }
+        Self { nodes: [NONE; MAX_NUMA_NODES], initialized: AtomicBool::new(false) }
     }
 
     pub fn node_for_cpu(&self, cpu: LogicalCpuId) -> Option<u8> {
         for node in self.nodes.iter().flatten() {
-            if node.cpus.contains(cpu) {
-                return Some(node.node_id);
-            }
+            if node.cpus.contains(cpu) { return Some(node.node_id); }
         }
         None
     }
@@ -43,20 +40,50 @@
 
 static mut NUMA_TOPOLOGY: NumaTopology = NumaTopology::new();
 
-pub fn topology() -> &'static NumaTopology {
-    unsafe { &NUMA_TOPOLOGY }
+pub fn topology() -> &'static NumaTopology { unsafe { &NUMA_TOPOLOGY } }
+
+/// Initialize NUMA topology from SRAT data parsed during ACPI init.
+///
+/// `apic_ids` pairs each APIC ID with its logical CPU; every CPU whose SRAT node
+/// is below `MAX_NUMA_NODES` joins that node's CPU set. Without an SRAT, or if no
+/// CPU could be placed, a single node covering all CPUs is installed instead.
+/// Only the first call to this function or `init_default()` has any effect.
+pub fn init_from_srat(apic_ids: &[(u32, LogicalCpuId)]) {
+    let topo = topology();
+    // NOTE(review): `initialized` flips to true before the nodes are filled in, so it
+    // guards against re-initialization but does not signal that the data is ready.
+    if topo.initialized.swap(true, Ordering::AcqRel) { return; }
+    if !srat::is_available() { init_default_inner(); return; }
+    unsafe {
+        let topo_mut = &mut *core::ptr::addr_of_mut!(NUMA_TOPOLOGY);
+        for &(apic_id, cpu_id) in apic_ids {
+            if let Some(node) = srat::numa_node_for_apic(apic_id) {
+                let idx = node as usize;
+                if idx < MAX_NUMA_NODES {
+                    topo_mut.nodes[idx].get_or_insert_with(|| NumaHint { node_id: node, cpus: LogicalCpuSet::empty() }).cpus.atomic_set(cpu_id);
+                }
+            }
+        }
+        if topo_mut.nodes.iter().all(|n| n.is_none()) {
+            topo_mut.nodes[0] = Some(NumaHint { node_id: 0, cpus: LogicalCpuSet::all() });
+        }
+    }
+    let node_count = topology().nodes.iter().flatten().count();
+    debug!("NUMA: {node_count} node(s) from SRAT");
 }
 
+/// Fallback: single-node topology.
 pub fn init_default() {
     let topo = topology();
-    if topo.initialized.swap(true, Ordering::AcqRel) {
-        return;
-    }
+    if topo.initialized.swap(true, Ordering::AcqRel) { return; }
+    init_default_inner();
+}
+
+/// Installs node 0 covering every CPU; shared by both init paths.
+fn init_default_inner() {
     unsafe {
         let topo_mut = &mut *core::ptr::addr_of_mut!(NUMA_TOPOLOGY);
-        topo_mut.nodes[0] = Some(NumaHint {
-            node_id: 0,
-            cpus: LogicalCpuSet::all(),
-        });
+        topo_mut.nodes[0] = Some(NumaHint { node_id: 0, cpus: LogicalCpuSet::all() });
     }
+    debug!("NUMA: single-node topology (no SRAT)");
 }
diff --git a/src/percpu.rs b/src/percpu.rs
--- a/src/percpu.rs
+++ b/src/percpu.rs
@@ -62,6 +62,10 @@ pub struct PercpuBlock {
     /// from the spin loop. Default 39 (lowest priority).
     pub current_prio: Cell,
 
+    /// NUMA proximity domain for this CPU. Set during ACPI init from SRAT.
+    /// `u8::MAX` means unknown (no SRAT or APIC ID not listed).
+    pub numa_node: Cell<u8>,
+
     // TODO: Put mailbox queues here, e.g. for TLB shootdown? Just be sure to 128-byte align it
     // first to avoid cache invalidation.
     pub profiling: Option<&'static crate::profiling::RingBuffer>,
@@ -354,6 +358,7 @@ impl PercpuBlock {
             tlb_flush_count: AtomicU32::new(0),
             pi_donated_prio: AtomicU32::new(u32::MAX),
             current_prio: Cell::new(39),
+            numa_node: Cell::new(u8::MAX),
             ptrace_flags: Cell::new(PtraceFlags::empty()),
             ptrace_session: RefCell::new(None),
             inside_syscall: Cell::new(false),