diff --git a/recipes/core/kernel/source/src/acpi/madt/arch/x86.rs.orig b/recipes/core/kernel/source/src/acpi/madt/arch/x86.rs.orig deleted file mode 100644 index 637f96ac0e..0000000000 --- a/recipes/core/kernel/source/src/acpi/madt/arch/x86.rs.orig +++ /dev/null @@ -1,162 +0,0 @@ -use core::{ - hint, - sync::atomic::{AtomicU8, Ordering}, -}; - -use crate::{ - arch::{ - device::local_apic::the_local_apic, - start::{kstart_ap, KernelArgsAp}, - }, - cpu_set::LogicalCpuId, - memory::{ - allocate_p2frame, Frame, KernelMapper, Page, PageFlags, PhysicalAddress, RmmA, RmmArch, - VirtualAddress, PAGE_SIZE, - }, - startup::AP_READY, -}; - -use super::{Madt, MadtEntry}; - -const TRAMPOLINE: usize = 0x8000; -static TRAMPOLINE_DATA: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/trampoline")); - -pub(super) fn init(madt: Madt) { - let local_apic = unsafe { the_local_apic() }; - let me = local_apic.id(); - - if local_apic.x2 { - debug!(" X2APIC {}", me.get()); - } else { - debug!(" XAPIC {}: {:>08X}", me.get(), local_apic.address); - } - - if cfg!(not(feature = "multi_core")) { - return; - } - - // Map trampoline - let trampoline_frame = Frame::containing(PhysicalAddress::new(TRAMPOLINE)); - let trampoline_page = Page::containing_address(VirtualAddress::new(TRAMPOLINE)); - let (result, page_table_physaddr) = unsafe { - //TODO: do not have writable and executable! - let mut mapper = KernelMapper::lock_rw(); - - let result = mapper - .map_phys( - trampoline_page.start_address(), - trampoline_frame.base(), - PageFlags::new().execute(true).write(true), - ) - .expect("failed to map trampoline"); - - (result, mapper.table().phys().data()) - }; - result.flush(); - - // Write trampoline, make sure TRAMPOLINE page is free for use - for (i, val) in TRAMPOLINE_DATA.iter().enumerate() { - unsafe { - (*((TRAMPOLINE as *mut u8).add(i) as *const AtomicU8)).store(*val, Ordering::SeqCst); - } - } - - unsafe { - let preliminary_cpu_count = madt.iter().filter(|e| matches!(e, MadtEntry::LocalApic(entry) if u32::from(entry.id) == me.get() || entry.flags & 1 == 1)).count(); - crate::profiling::allocate(preliminary_cpu_count as u32); - } - - for madt_entry in madt.iter() { - debug!(" {:x?}", madt_entry); - if let MadtEntry::LocalApic(ap_local_apic) = madt_entry { - if u32::from(ap_local_apic.id) == me.get() { - debug!(" This is my local APIC"); - } else if ap_local_apic.flags & 1 == 1 { - let cpu_id = LogicalCpuId::next(); - - // Allocate a stack - let stack_start = RmmA::phys_to_virt( - allocate_p2frame(4) - .expect("no more frames in acpi stack_start") - .base(), - ) - .data(); - let stack_end = stack_start + (PAGE_SIZE << 4); - - let pcr_ptr = crate::arch::gdt::allocate_and_init_pcr(cpu_id, stack_end); - - let idt_ptr = crate::arch::idt::allocate_and_init_idt(cpu_id); - - let args = KernelArgsAp { - stack_end: stack_end as *mut u8, - cpu_id, - pcr_ptr, - idt_ptr, - }; - - let ap_ready = (TRAMPOLINE + 8) as *mut u64; - let ap_args_ptr = unsafe { ap_ready.add(1) }; - let ap_page_table = unsafe { ap_ready.add(2) }; - let ap_code = unsafe { ap_ready.add(3) }; - - // Set the ap_ready to 0, volatile - unsafe { - ap_ready.write(0); - ap_args_ptr.write(&args as *const _ as u64); - ap_page_table.write(page_table_physaddr as u64); - #[expect(clippy::fn_to_numeric_cast)] - ap_code.write(kstart_ap as u64); - - // Ensure all trampoline writes are visible to the AP before - // it starts executing. asm!("") is only a compiler barrier; - // fence(SeqCst) is a full hardware memory barrier. - core::sync::atomic::fence(Ordering::SeqCst); - }; - AP_READY.store(false, Ordering::SeqCst); - - // Send INIT IPI - { - let mut icr = 0x4500; - if local_apic.x2 { - icr |= u64::from(ap_local_apic.id) << 32; - } else { - icr |= u64::from(ap_local_apic.id) << 56; - } - local_apic.set_icr(icr); - } - - // Send START IPI - { - let ap_segment = (TRAMPOLINE >> 12) & 0xFF; - let mut icr = 0x4600 | ap_segment as u64; - - if local_apic.x2 { - icr |= u64::from(ap_local_apic.id) << 32; - } else { - icr |= u64::from(ap_local_apic.id) << 56; - } - - local_apic.set_icr(icr); - } - - // Wait for trampoline ready - while unsafe { (*ap_ready.cast::()).load(Ordering::SeqCst) } == 0 { - hint::spin_loop(); - } - while !AP_READY.load(Ordering::SeqCst) { - hint::spin_loop(); - } - - RmmA::invalidate_all(); - } - } - } - - // Unmap trampoline - let (_frame, _, flush) = unsafe { - KernelMapper::lock_rw() - .unmap_phys(trampoline_page.start_address()) - .expect("failed to unmap trampoline page") - }; - flush.flush(); -} diff --git a/recipes/core/kernel/source/src/acpi/madt/arch/x86.rs.rej b/recipes/core/kernel/source/src/acpi/madt/arch/x86.rs.rej deleted file mode 100644 index 76763f99cb..0000000000 --- a/recipes/core/kernel/source/src/acpi/madt/arch/x86.rs.rej +++ /dev/null @@ -1,194 +0,0 @@ ---- src/acpi/madt/arch/x86.rs -+++ src/acpi/madt/arch/x86.rs -@@ -20,6 +22,7 @@ - - use super::{Madt, MadtEntry}; - -+use alloc::collections::BTreeSet; - use alloc::vec::Vec; - - /// Maximum number of APIC→CPU mappings we track for NUMA topology. -@@ -47,6 +50,67 @@ - const TRAMPOLINE: usize = 0x8000; - static TRAMPOLINE_DATA: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/trampoline")); - -+/// Estimate TSC frequency in MHz from CPUID. -+/// -+/// Tries CPUID leaf 0x16 (Processor Frequency Information) first, -+/// then CPUID leaf 0x15 (TSC/Core Crystal Clock Ratio). -+/// Returns None if frequency cannot be determined. -+fn tsc_freq_mhz_cpuid() -> Option { -+ let max_leaf = unsafe { core::arch::x86_64::__cpuid(0).eax as u32 }; -+ -+ // CPUID leaf 0x16: EAX = Core Base Frequency in MHz (Intel) -+ if max_leaf >= 0x16 { -+ let mhz = unsafe { core::arch::x86_64::__cpuid(0x16) }.eax as u64; -+ if mhz > 0 { -+ return Some(mhz); -+ } -+ } -+ -+ // CPUID leaf 0x15: EAX = denominator, EBX = numerator, ECX = crystal Hz -+ if max_leaf >= 0x15 { -+ let res = unsafe { core::arch::x86_64::__cpuid(0x15) }; -+ let denom = res.eax as u64; -+ let numer = res.ebx as u64; -+ let crystal_hz = res.ecx as u64; -+ if denom > 0 && numer > 0 && crystal_hz > 0 { -+ // TSC freq = crystal_hz * numer / denom -+ let tsc_hz = crystal_hz * numer / denom; -+ return Some(tsc_hz / 1_000_000); // Hz → MHz -+ } -+ } -+ -+ None -+} -+ -+/// Early-boot microsecond delay using the Time Stamp Counter. -+/// -+/// Uses CPUID-based TSC frequency estimation when available. -+/// Falls back to a conservative spin loop calibrated for the -+/// minimum expected CPU speed (1 GHz). -+/// -+/// # Safety -+/// Must only be called after the BSP TSC is running (always true -+/// after CPU reset on x86). -+fn early_udelay(us: u64) { -+ if let Some(mhz) = tsc_freq_mhz_cpuid() { -+ // TSC-based delay: precise on invariant TSC (all modern x86). -+ // MHz = cycles per µs. -+ let target = unsafe { rdtsc() } + us * mhz; -+ while unsafe { rdtsc() } < target { -+ hint::spin_loop(); -+ } -+ } else { -+ // Fallback: conservative spin loop. -+ // spin_loop() (PAUSE) is ~40 cycles on modern Intel, ~1 on AMD. -+ // At 1 GHz minimum: 1000 cycles/µs ÷ 40 cycles/iter = 25 iters/µs. -+ // Use 50 iters/µs for safety margin on slower/variable CPUs. -+ let iters = us.saturating_mul(50); -+ for _ in 0..iters { -+ hint::spin_loop(); -+ } -+ } -+} -+ - fn current_x2apic_processor_uid(madt: &Madt, apic_id: u32) -> Option { - madt.iter().find_map(|entry| match entry { - MadtEntry::LocalX2Apic(x2apic) if x2apic.x2apic_id == apic_id => Some(x2apic.processor_uid), -@@ -235,20 +329,53 @@ - local_apic.set_icr(icr); - } - -- // Send START IPI -+ // Intel SDM Vol 3A §8.4.4: wait 10ms after INIT deassert -+ // before sending first SIPI. Modern CPUs may need less, -+ // but 10ms is the safe specification-compliant value. -+ early_udelay(10_000); -+ -+ // Send START IPI #1 - { - let ap_segment = (TRAMPOLINE >> 12) & 0xFF; -- let mut icr = 0x4600 | ap_segment as u64; -- -+ // ICR: Delivery Mode=StartUp(110), Vector=ap_segment -+ // Note: bit 14 (Level) must be 0 for SIPI per Intel SDM. -+ let mut icr = 0x0600 | ap_segment as u64; - if local_apic.x2 { - icr |= u64::from(ap_local_apic.id) << 32; - } else { - icr |= u64::from(ap_local_apic.id) << 56; - } -+ local_apic.set_icr(icr); -+ } -+ -+ // Intel SDM: wait 200µs between SIPIs -+ early_udelay(200); - -+ // Send START IPI #2 (recommended for compatibility) -+ { -+ let ap_segment = (TRAMPOLINE >> 12) & 0xFF; -+ let mut icr = 0x0600 | ap_segment as u64; -+ if local_apic.x2 { -+ icr |= u64::from(ap_local_apic.id) << 32; -+ } else { -+ icr |= u64::from(ap_local_apic.id) << 56; -+ } - local_apic.set_icr(icr); - } - -+ // Wait briefly for SIPI to be accepted -+ early_udelay(200); -+ -+ // Check ESR for delivery errors after SIPI sequence. -+ // Bit 5 = Send Accept Error, Bit 6 = Send Illegal Vector. -+ let esr_val = unsafe { local_apic.esr() }; -+ if esr_val != 0 { -+ println!( -+ "KERNEL AP: CPU {} SIPI delivery error (ESR={:#x}), continuing", -+ ap_local_apic.id, esr_val -+ ); -+ } -+ - // Wait for trampoline ready with timeout - let mut trampoline_ready = false; - for _ in 0..AP_SPIN_LIMIT { -@@ -343,34 +470,50 @@ - } - AP_READY.store(false, Ordering::SeqCst); - -+ // Clear APIC Error Status Register before starting AP. -+ unsafe { local_apic.esr(); } -+ -+ // Send INIT IPI (Assert) - { - let mut icr = 0x4500u64; - icr |= u64::from(apic_id) << 32; - local_apic.set_icr(icr); - } - -- for _ in 0..100_000 { -- hint::spin_loop(); -- } -+ // Intel SDM Vol 3A §8.4.4: wait 10ms after INIT -+ early_udelay(10_000); - -+ // Send START IPI #1 - { - let ap_segment = (TRAMPOLINE >> 12) & 0xFF; -- let mut icr = 0x4600u64 | ap_segment as u64; -+ let mut icr = 0x0600u64 | ap_segment as u64; - icr |= u64::from(apic_id) << 32; - local_apic.set_icr(icr); - } - -- for _ in 0..2_000_000 { -- hint::spin_loop(); -- } -+ // Intel SDM: wait 200µs between SIPIs -+ early_udelay(200); - -+ // Send START IPI #2 (recommended for compatibility) - { - let ap_segment = (TRAMPOLINE >> 12) & 0xFF; -- let mut icr = 0x4600u64 | ap_segment as u64; -+ let mut icr = 0x0600u64 | ap_segment as u64; - icr |= u64::from(apic_id) << 32; - local_apic.set_icr(icr); - } - -+ // Wait briefly for SIPI acceptance -+ early_udelay(200); -+ -+ // Check ESR for delivery errors. -+ let esr_val = unsafe { local_apic.esr() }; -+ if esr_val != 0 { -+ println!( -+ "KERNEL AP: CPU {} SIPI delivery error (ESR={:#x}), continuing", -+ apic_id, esr_val -+ ); -+ } -+ - let mut trampoline_ready = false; - for _ in 0..AP_SPIN_LIMIT { - if unsafe { (*ap_ready.cast::()).load(Ordering::SeqCst) } != 0 { diff --git a/recipes/core/kernel/source/src/arch/x86_shared/idt.rs.rej b/recipes/core/kernel/source/src/arch/x86_shared/idt.rs.rej deleted file mode 100644 index fa130181dc..0000000000 --- a/recipes/core/kernel/source/src/arch/x86_shared/idt.rs.rej +++ /dev/null @@ -1,11 +0,0 @@ ---- src/arch/x86_shared/idt.rs -+++ src/arch/x86_shared/idt.rs -@@ -110,6 +110,8 @@ - } - - pub fn available_irqs_iter(cpu_id: LogicalCpuId) -> impl Iterator + 'static { -+ let count = (32..=254).filter(|&index| !is_reserved(cpu_id, index)).count(); -+ info!("available_irqs_iter: cpu_id={} count={}", cpu_id.get(), count); - (32..=254).filter(move |&index| !is_reserved(cpu_id, index)) - } - diff --git a/recipes/core/kernel/source/src/context/switch.rs.rej b/recipes/core/kernel/source/src/context/switch.rs.rej deleted file mode 100644 index 4dab10c9a5..0000000000 --- a/recipes/core/kernel/source/src/context/switch.rs.rej +++ /dev/null @@ -1,87 +0,0 @@ ---- src/context/switch.rs -+++ src/context/switch.rs -@@ -361,6 +361,7 @@ - } - - /// This is the scheduler function which currently utilises Deficit Weighted Round Robin Scheduler -+/// with NUMA-aware context selection preference. - fn select_next_context( - token: &mut CleanLockToken, - percpu: &PercpuBlock, -@@ -386,6 +387,10 @@ - let total_contexts: usize = contexts_list.iter().map(|q| q.len()).sum(); - let mut skipped_contexts = 0; - -+ // NUMA-aware selection: remember cross-node fallback candidate. -+ let my_numa_node = percpu.numa_node.get(); -+ let mut cross_node_fallback: Option<(usize, ArcContextLockWriteGuard)> = None; -+ - 'priority: loop { - i = (i + 1) % 40; - total_iters += 1; -@@ -450,11 +455,44 @@ - // Is this context runnable on this CPU? - let sw = unsafe { update_runnable(&mut next_context_guard, cpu_id, switch_time) }; - if let UpdateResult::CanSwitch = sw { -- // Cache the new context's priority for MCS lock priority donation. -- percpu.current_prio.set(next_context_guard.prio); -- next_context_guard_opt = Some(next_context_guard); -- balance[i] -= SCHED_PRIO_TO_WEIGHT[20]; -- break 'priority; -+ // NUMA-aware selection: check if this context's last CPU was on the same node. -+ let same_node = if my_numa_node != u8::MAX { -+ next_context_guard.cpu_id -+ .map(|cid| { -+ crate::percpu::get_for_cpu(cid) -+ .map(|p| p.numa_node.get() == my_numa_node) -+ .unwrap_or(false) -+ }) -+ .unwrap_or(true) // New context (no last CPU) — treat as same node -+ } else { -+ true // No NUMA info — treat all as same node -+ }; -+ -+ if same_node { -+ // Cache-warm: select immediately -+ percpu.current_prio.set(next_context_guard.prio); -+ next_context_guard_opt = Some(next_context_guard); -+ balance[i] -= SCHED_PRIO_TO_WEIGHT[20]; -+ break 'priority; -+ } else { -+ // Cross-node candidate: save as fallback, keep scanning for same-node -+ if cross_node_fallback.is_none() { -+ // Cache the priority and balance for later -+ cross_node_fallback = -+ Some((next_context_guard.prio, next_context_guard)); -+ balance[i] -= SCHED_PRIO_TO_WEIGHT[20]; -+ // Don't break — keep looking for a same-node context -+ continue; -+ } else { -+ // Already have a cross-node fallback; push this one back -+ contexts.push_back(next_context_ref); -+ skipped_contexts += 1; -+ if skipped_contexts >= total_contexts { -+ break 'priority; -+ } -+ continue; -+ } -+ } - } else { - if matches!(sw, UpdateResult::Blocked) { - idle_contexts(token.token()).push_back(next_context_ref); -@@ -469,6 +507,15 @@ - } - } - } -+ -+ // If we found a cross-node fallback but no same-node context, use it -+ if next_context_guard_opt.is_none() { -+ if let Some((prio, guard)) = cross_node_fallback { -+ percpu.current_prio.set(prio); -+ next_context_guard_opt = Some(guard); -+ } -+ } -+ - percpu.balance.set(balance); - percpu.last_queue.set(i); -