diff --git a/recipes/core/kernel/source/src/acpi/madt/arch/x86.rs.orig b/recipes/core/kernel/source/src/acpi/madt/arch/x86.rs.orig
deleted file mode 100644
index 637f96ac0e..0000000000
--- a/recipes/core/kernel/source/src/acpi/madt/arch/x86.rs.orig
+++ /dev/null
@@ -1,162 +0,0 @@
-use core::{
-    hint,
-    sync::atomic::{AtomicU8, Ordering},
-};
-
-use crate::{
-    arch::{
-        device::local_apic::the_local_apic,
-        start::{kstart_ap, KernelArgsAp},
-    },
-    cpu_set::LogicalCpuId,
-    memory::{
-        allocate_p2frame, Frame, KernelMapper, Page, PageFlags, PhysicalAddress, RmmA, RmmArch,
-        VirtualAddress, PAGE_SIZE,
-    },
-    startup::AP_READY,
-};
-
-use super::{Madt, MadtEntry};
-
-const TRAMPOLINE: usize = 0x8000;
-static TRAMPOLINE_DATA: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/trampoline"));
-
-pub(super) fn init(madt: Madt) {
-    let local_apic = unsafe { the_local_apic() };
-    let me = local_apic.id();
-
-    if local_apic.x2 {
-        debug!("    X2APIC {}", me.get());
-    } else {
-        debug!("    XAPIC {}: {:>08X}", me.get(), local_apic.address);
-    }
-
-    if cfg!(not(feature = "multi_core")) {
-        return;
-    }
-
-    // Map trampoline
-    let trampoline_frame = Frame::containing(PhysicalAddress::new(TRAMPOLINE));
-    let trampoline_page = Page::containing_address(VirtualAddress::new(TRAMPOLINE));
-    let (result, page_table_physaddr) = unsafe {
-        //TODO: do not have writable and executable!
-        let mut mapper = KernelMapper::lock_rw();
-
-        let result = mapper
-            .map_phys(
-                trampoline_page.start_address(),
-                trampoline_frame.base(),
-                PageFlags::new().execute(true).write(true),
-            )
-            .expect("failed to map trampoline");
-
-        (result, mapper.table().phys().data())
-    };
-    result.flush();
-
-    // Write trampoline, make sure TRAMPOLINE page is free for use
-    for (i, val) in TRAMPOLINE_DATA.iter().enumerate() {
-        unsafe {
-            (*((TRAMPOLINE as *mut u8).add(i) as *const AtomicU8)).store(*val, Ordering::SeqCst);
-        }
-    }
-
-    unsafe {
-        let preliminary_cpu_count = madt.iter().filter(|e| matches!(e, MadtEntry::LocalApic(entry) if u32::from(entry.id) == me.get() || entry.flags & 1 == 1)).count();
-        crate::profiling::allocate(preliminary_cpu_count as u32);
-    }
-
-    for madt_entry in madt.iter() {
-        debug!("      {:x?}", madt_entry);
-        if let MadtEntry::LocalApic(ap_local_apic) = madt_entry {
-            if u32::from(ap_local_apic.id) == me.get() {
-                debug!("        This is my local APIC");
-            } else if ap_local_apic.flags & 1 == 1 {
-                let cpu_id = LogicalCpuId::next();
-
-                // Allocate a stack
-                let stack_start = RmmA::phys_to_virt(
-                    allocate_p2frame(4)
-                        .expect("no more frames in acpi stack_start")
-                        .base(),
-                )
-                .data();
-                let stack_end = stack_start + (PAGE_SIZE << 4);
-
-                let pcr_ptr = crate::arch::gdt::allocate_and_init_pcr(cpu_id, stack_end);
-
-                let idt_ptr = crate::arch::idt::allocate_and_init_idt(cpu_id);
-
-                let args = KernelArgsAp {
-                    stack_end: stack_end as *mut u8,
-                    cpu_id,
-                    pcr_ptr,
-                    idt_ptr,
-                };
-
-                let ap_ready = (TRAMPOLINE + 8) as *mut u64;
-                let ap_args_ptr = unsafe { ap_ready.add(1) };
-                let ap_page_table = unsafe { ap_ready.add(2) };
-                let ap_code = unsafe { ap_ready.add(3) };
-
-                // Set the ap_ready to 0, volatile
-                unsafe {
-                    ap_ready.write(0);
-                    ap_args_ptr.write(&args as *const _ as u64);
-                    ap_page_table.write(page_table_physaddr as u64);
-                    #[expect(clippy::fn_to_numeric_cast)]
-                    ap_code.write(kstart_ap as u64);
-
-                    // Ensure all trampoline writes are visible to the AP before
-                    // it starts executing.  asm!("") is only a compiler barrier;
-                    // fence(SeqCst) is a full hardware memory barrier.
-                    core::sync::atomic::fence(Ordering::SeqCst);
-                };
-                AP_READY.store(false, Ordering::SeqCst);
-
-                // Send INIT IPI
-                {
-                    let mut icr = 0x4500;
-                    if local_apic.x2 {
-                        icr |= u64::from(ap_local_apic.id) << 32;
-                    } else {
-                        icr |= u64::from(ap_local_apic.id) << 56;
-                    }
-                    local_apic.set_icr(icr);
-                }
-
-                // Send START IPI
-                {
-                    let ap_segment = (TRAMPOLINE >> 12) & 0xFF;
-                    let mut icr = 0x4600 | ap_segment as u64;
-
-                    if local_apic.x2 {
-                        icr |= u64::from(ap_local_apic.id) << 32;
-                    } else {
-                        icr |= u64::from(ap_local_apic.id) << 56;
-                    }
-
-                    local_apic.set_icr(icr);
-                }
-
-                // Wait for trampoline ready
-                while unsafe { (*ap_ready.cast::<AtomicU8>()).load(Ordering::SeqCst) } == 0 {
-                    hint::spin_loop();
-                }
-                while !AP_READY.load(Ordering::SeqCst) {
-                    hint::spin_loop();
-                }
-
-                RmmA::invalidate_all();
-            }
-        }
-    }
-
-    // Unmap trampoline
-    let (_frame, _, flush) = unsafe {
-        KernelMapper::lock_rw()
-            .unmap_phys(trampoline_page.start_address())
-            .expect("failed to unmap trampoline page")
-    };
-    flush.flush();
-}
diff --git a/recipes/core/kernel/source/src/acpi/madt/arch/x86.rs.rej b/recipes/core/kernel/source/src/acpi/madt/arch/x86.rs.rej
deleted file mode 100644
index 76763f99cb..0000000000
--- a/recipes/core/kernel/source/src/acpi/madt/arch/x86.rs.rej
+++ /dev/null
@@ -1,194 +0,0 @@
---- src/acpi/madt/arch/x86.rs
-+++ src/acpi/madt/arch/x86.rs
-@@ -20,6 +22,7 @@
- 
- use super::{Madt, MadtEntry};
- 
-+use alloc::collections::BTreeSet;
- use alloc::vec::Vec;
- 
- /// Maximum number of APIC→CPU mappings we track for NUMA topology.
-@@ -47,6 +50,67 @@
- const TRAMPOLINE: usize = 0x8000;
- static TRAMPOLINE_DATA: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/trampoline"));
- 
-+/// Estimate TSC frequency in MHz from CPUID.
-+///
-+/// Tries CPUID leaf 0x16 (Processor Frequency Information) first,
-+/// then CPUID leaf 0x15 (TSC/Core Crystal Clock Ratio).
-+/// Returns None if frequency cannot be determined.
-+fn tsc_freq_mhz_cpuid() -> Option<u64> {
-+    let max_leaf = unsafe { core::arch::x86_64::__cpuid(0).eax as u32 };
-+
-+    // CPUID leaf 0x16: EAX = Core Base Frequency in MHz (Intel)
-+    if max_leaf >= 0x16 {
-+        let mhz = unsafe { core::arch::x86_64::__cpuid(0x16) }.eax as u64;
-+        if mhz > 0 {
-+            return Some(mhz);
-+        }
-+    }
-+
-+    // CPUID leaf 0x15: EAX = denominator, EBX = numerator, ECX = crystal Hz
-+    if max_leaf >= 0x15 {
-+        let res = unsafe { core::arch::x86_64::__cpuid(0x15) };
-+        let denom = res.eax as u64;
-+        let numer = res.ebx as u64;
-+        let crystal_hz = res.ecx as u64;
-+        if denom > 0 && numer > 0 && crystal_hz > 0 {
-+            // TSC freq = crystal_hz * numer / denom
-+            let tsc_hz = crystal_hz * numer / denom;
-+            return Some(tsc_hz / 1_000_000); // Hz → MHz
-+        }
-+    }
-+
-+    None
-+}
-+
-+/// Early-boot microsecond delay using the Time Stamp Counter.
-+///
-+/// Uses CPUID-based TSC frequency estimation when available.
-+/// Falls back to a conservative spin loop calibrated for the
-+/// minimum expected CPU speed (1 GHz).
-+///
-+/// # Safety
-+/// Must only be called after the BSP TSC is running (always true
-+/// after CPU reset on x86).
-+fn early_udelay(us: u64) {
-+    if let Some(mhz) = tsc_freq_mhz_cpuid() {
-+        // TSC-based delay: precise on invariant TSC (all modern x86).
-+        // MHz = cycles per µs.
-+        let target = unsafe { rdtsc() } + us * mhz;
-+        while unsafe { rdtsc() } < target {
-+            hint::spin_loop();
-+        }
-+    } else {
-+        // Fallback: conservative spin loop.
-+        // spin_loop() (PAUSE) is ~40 cycles on modern Intel, ~1 on AMD.
-+        // At 1 GHz minimum: 1000 cycles/µs ÷ 40 cycles/iter = 25 iters/µs.
-+        // Use 50 iters/µs for safety margin on slower/variable CPUs.
-+        let iters = us.saturating_mul(50);
-+        for _ in 0..iters {
-+            hint::spin_loop();
-+        }
-+    }
-+}
-+
- fn current_x2apic_processor_uid(madt: &Madt, apic_id: u32) -> Option<u32> {
-     madt.iter().find_map(|entry| match entry {
-         MadtEntry::LocalX2Apic(x2apic) if x2apic.x2apic_id == apic_id => Some(x2apic.processor_uid),
-@@ -235,20 +329,53 @@
-                     local_apic.set_icr(icr);
-                 }
- 
--                // Send START IPI
-+                // Intel SDM Vol 3A §8.4.4: wait 10ms after INIT deassert
-+                // before sending first SIPI. Modern CPUs may need less,
-+                // but 10ms is the safe specification-compliant value.
-+                early_udelay(10_000);
-+
-+                // Send START IPI #1
-                 {
-                     let ap_segment = (TRAMPOLINE >> 12) & 0xFF;
--                    let mut icr = 0x4600 | ap_segment as u64;
--
-+                    // ICR: Delivery Mode=StartUp(110), Vector=ap_segment
-+                    // Note: bit 14 (Level) must be 0 for SIPI per Intel SDM.
-+                    let mut icr = 0x0600 | ap_segment as u64;
-                     if local_apic.x2 {
-                         icr |= u64::from(ap_local_apic.id) << 32;
-                     } else {
-                         icr |= u64::from(ap_local_apic.id) << 56;
-                     }
-+                    local_apic.set_icr(icr);
-+                }
-+
-+                // Intel SDM: wait 200µs between SIPIs
-+                early_udelay(200);
- 
-+                // Send START IPI #2 (recommended for compatibility)
-+                {
-+                    let ap_segment = (TRAMPOLINE >> 12) & 0xFF;
-+                    let mut icr = 0x0600 | ap_segment as u64;
-+                    if local_apic.x2 {
-+                        icr |= u64::from(ap_local_apic.id) << 32;
-+                    } else {
-+                        icr |= u64::from(ap_local_apic.id) << 56;
-+                    }
-                     local_apic.set_icr(icr);
-                 }
- 
-+                // Wait briefly for SIPI to be accepted
-+                early_udelay(200);
-+
-+                // Check ESR for delivery errors after SIPI sequence.
-+                // Bit 5 = Send Accept Error, Bit 6 = Send Illegal Vector.
-+                let esr_val = unsafe { local_apic.esr() };
-+                if esr_val != 0 {
-+                    println!(
-+                        "KERNEL AP: CPU {} SIPI delivery error (ESR={:#x}), continuing",
-+                        ap_local_apic.id, esr_val
-+                    );
-+                }
-+
-                 // Wait for trampoline ready with timeout
-                 let mut trampoline_ready = false;
-                 for _ in 0..AP_SPIN_LIMIT {
-@@ -343,34 +470,50 @@
-                 }
-                 AP_READY.store(false, Ordering::SeqCst);
- 
-+                // Clear APIC Error Status Register before starting AP.
-+                unsafe { local_apic.esr(); }
-+
-+                // Send INIT IPI (Assert)
-                 {
-                     let mut icr = 0x4500u64;
-                     icr |= u64::from(apic_id) << 32;
-                     local_apic.set_icr(icr);
-                 }
- 
--                for _ in 0..100_000 {
--                    hint::spin_loop();
--                }
-+                // Intel SDM Vol 3A §8.4.4: wait 10ms after INIT
-+                early_udelay(10_000);
- 
-+                // Send START IPI #1
-                 {
-                     let ap_segment = (TRAMPOLINE >> 12) & 0xFF;
--                    let mut icr = 0x4600u64 | ap_segment as u64;
-+                    let mut icr = 0x0600u64 | ap_segment as u64;
-                     icr |= u64::from(apic_id) << 32;
-                     local_apic.set_icr(icr);
-                 }
- 
--                for _ in 0..2_000_000 {
--                    hint::spin_loop();
--                }
-+                // Intel SDM: wait 200µs between SIPIs
-+                early_udelay(200);
- 
-+                // Send START IPI #2 (recommended for compatibility)
-                 {
-                     let ap_segment = (TRAMPOLINE >> 12) & 0xFF;
--                    let mut icr = 0x4600u64 | ap_segment as u64;
-+                    let mut icr = 0x0600u64 | ap_segment as u64;
-                     icr |= u64::from(apic_id) << 32;
-                     local_apic.set_icr(icr);
-                 }
- 
-+                // Wait briefly for SIPI acceptance
-+                early_udelay(200);
-+
-+                // Check ESR for delivery errors.
-+                let esr_val = unsafe { local_apic.esr() };
-+                if esr_val != 0 {
-+                    println!(
-+                        "KERNEL AP: CPU {} SIPI delivery error (ESR={:#x}), continuing",
-+                        apic_id, esr_val
-+                    );
-+                }
-+
-                 let mut trampoline_ready = false;
-                 for _ in 0..AP_SPIN_LIMIT {
-                     if unsafe { (*ap_ready.cast::<AtomicU8>()).load(Ordering::SeqCst) } != 0 {
diff --git a/recipes/core/kernel/source/src/arch/x86_shared/idt.rs.rej b/recipes/core/kernel/source/src/arch/x86_shared/idt.rs.rej
deleted file mode 100644
index fa130181dc..0000000000
--- a/recipes/core/kernel/source/src/arch/x86_shared/idt.rs.rej
+++ /dev/null
@@ -1,11 +0,0 @@
---- src/arch/x86_shared/idt.rs
-+++ src/arch/x86_shared/idt.rs
-@@ -110,6 +110,8 @@
- }
- 
- pub fn available_irqs_iter(cpu_id: LogicalCpuId) -> impl Iterator<Item = u8> + 'static {
-+    let count = (32..=254).filter(|&index| !is_reserved(cpu_id, index)).count();
-+    info!("available_irqs_iter: cpu_id={} count={}", cpu_id.get(), count);
-     (32..=254).filter(move |&index| !is_reserved(cpu_id, index))
- }
- 
diff --git a/recipes/core/kernel/source/src/context/switch.rs.rej b/recipes/core/kernel/source/src/context/switch.rs.rej
deleted file mode 100644
index 4dab10c9a5..0000000000
--- a/recipes/core/kernel/source/src/context/switch.rs.rej
+++ /dev/null
@@ -1,87 +0,0 @@
---- src/context/switch.rs
-+++ src/context/switch.rs
-@@ -361,6 +361,7 @@
- }
- 
- /// This is the scheduler function which currently utilises Deficit Weighted Round Robin Scheduler
-+/// with NUMA-aware context selection preference.
- fn select_next_context(
-     token: &mut CleanLockToken,
-     percpu: &PercpuBlock,
-@@ -386,6 +387,10 @@
-     let total_contexts: usize = contexts_list.iter().map(|q| q.len()).sum();
-     let mut skipped_contexts = 0;
- 
-+    // NUMA-aware selection: remember cross-node fallback candidate.
-+    let my_numa_node = percpu.numa_node.get();
-+    let mut cross_node_fallback: Option<(usize, ArcContextLockWriteGuard)> = None;
-+
-     'priority: loop {
-         i = (i + 1) % 40;
-         total_iters += 1;
-@@ -450,11 +455,44 @@
-             // Is this context runnable on this CPU?
-             let sw = unsafe { update_runnable(&mut next_context_guard, cpu_id, switch_time) };
-             if let UpdateResult::CanSwitch = sw {
--                // Cache the new context's priority for MCS lock priority donation.
--                percpu.current_prio.set(next_context_guard.prio);
--                next_context_guard_opt = Some(next_context_guard);
--                balance[i] -= SCHED_PRIO_TO_WEIGHT[20];
--                break 'priority;
-+                // NUMA-aware selection: check if this context's last CPU was on the same node.
-+                let same_node = if my_numa_node != u8::MAX {
-+                    next_context_guard.cpu_id
-+                        .map(|cid| {
-+                            crate::percpu::get_for_cpu(cid)
-+                                .map(|p| p.numa_node.get() == my_numa_node)
-+                                .unwrap_or(false)
-+                        })
-+                        .unwrap_or(true) // New context (no last CPU) — treat as same node
-+                } else {
-+                    true // No NUMA info — treat all as same node
-+                };
-+
-+                if same_node {
-+                    // Cache-warm: select immediately
-+                    percpu.current_prio.set(next_context_guard.prio);
-+                    next_context_guard_opt = Some(next_context_guard);
-+                    balance[i] -= SCHED_PRIO_TO_WEIGHT[20];
-+                    break 'priority;
-+                } else {
-+                    // Cross-node candidate: save as fallback, keep scanning for same-node
-+                    if cross_node_fallback.is_none() {
-+                        // Cache the priority and balance for later
-+                        cross_node_fallback =
-+                            Some((next_context_guard.prio, next_context_guard));
-+                        balance[i] -= SCHED_PRIO_TO_WEIGHT[20];
-+                        // Don't break — keep looking for a same-node context
-+                        continue;
-+                    } else {
-+                        // Already have a cross-node fallback; push this one back
-+                        contexts.push_back(next_context_ref);
-+                        skipped_contexts += 1;
-+                        if skipped_contexts >= total_contexts {
-+                            break 'priority;
-+                        }
-+                        continue;
-+                    }
-+                }
-             } else {
-                 if matches!(sw, UpdateResult::Blocked) {
-                     idle_contexts(token.token()).push_back(next_context_ref);
-@@ -469,6 +507,15 @@
-             }
-         }
-     }
-+
-+    // If we found a cross-node fallback but no same-node context, use it
-+    if next_context_guard_opt.is_none() {
-+        if let Some((prio, guard)) = cross_node_fallback {
-+            percpu.current_prio.set(prio);
-+            next_context_guard_opt = Some(guard);
-+        }
-+    }
-+
-     percpu.balance.set(balance);
-     percpu.last_queue.set(i);
-