feat: raw framebuffer fallback for fbbootlogd when DRM unavailable

- Add RawFb struct: direct framebuffer rendering via physmap
- Add RawTextScreen: simple text renderer using orbclient font
- Fallback in FbbootlogScheme::new() when V2GraphicsHandle fails
- Reads FRAMEBUFFER_ADDR/WIDTH/HEIGHT/STRIDE from bootloader env
- Scroll via ptr::copy on pixel rows, clear bottom line
- No DRM, no shadow buffer, no GPU required — like MS-DOS text mode
- Add common dependency to fbbootlogd Cargo.toml
2026-05-17 14:56:50 +03:00
parent 20853c41f5
commit 2bfe4b427b
58 changed files with 1691 additions and 3602 deletions
@@ -12,7 +12,6 @@ cc = "1.0"
 toml = "0.8"

 [dependencies]
-acpi_ext = { package = "acpi", git = "https://gitlab.redox-os.org/redox-os/acpi.git", branch = "redox-6.x" }
 arrayvec = { version = "0.7.4", default-features = false }
 bitfield = "0.13.2"
 bitflags = "2"
@@ -1,4 +1,3 @@
-# Red Bear OS kernel patches applied via individual patch files
 .PHONY: all check

 SOURCE:=$(dir $(realpath $(lastword $(MAKEFILE_LIST))))
@@ -77,7 +77,6 @@ fn main() {
        }
        "x86_64" => {
            println!("cargo::rerun-if-changed=src/asm/x86_64/trampoline.asm");
-            println!("cargo::rerun-if-changed=src/asm/x86_64/s3_wakeup.asm");

            let status = Command::new("nasm")
                .arg("-f")
@@ -90,18 +89,6 @@ fn main() {
            if !status.success() {
                panic!("nasm failed with exit status {}", status);
            }
-
-            let status = Command::new("nasm")
-                .arg("-f")
-                .arg("bin")
-                .arg("-o")
-                .arg(format!("{}/s3_wakeup", out_dir))
-                .arg("src/asm/x86_64/s3_wakeup.asm")
-                .status()
-                .expect("failed to run nasm");
-            if !status.success() {
-                panic!("nasm failed with exit status {}", status);
-            }
        }
        "riscv64" => {
            println!("cargo::rustc-cfg=dtb");
@@ -0,0 +1,591 @@
+use core::{
+    hint,
+    sync::atomic::{AtomicU8, Ordering},
+};
+
+use x86::time::rdtsc;
+
+use crate::{
+    arch::{
+        device::local_apic::the_local_apic,
+        start::{kstart_ap, KernelArgsAp},
+    },
+    cpu_set::LogicalCpuId,
+    memory::{
+        allocate_p2frame, map_device_memory, Frame, KernelMapper, Page, PageFlags,
+        PhysicalAddress, RmmA, RmmArch, VirtualAddress, PAGE_SIZE,
+    },
+    startup::AP_READY,
+};
+
+use super::{Madt, MadtEntry};
+
+use alloc::collections::BTreeSet;
+use alloc::vec::Vec;
+
+/// Capacity of `APIC_MAPPINGS`: the maximum number of APIC→CPU mappings we track for NUMA topology.
+const MAX_APIC_MAPPINGS: usize = 256;
+/// One recorded association between a hardware APIC ID and the logical CPU ID stored with it.
+struct ApicMapping {
+    apic_id: u32, // APIC ID as passed to `record_apic_mapping`
+    cpu_id: LogicalCpuId, // logical CPU ID recorded alongside that APIC ID
+}
+/// Filler value for unused table slots: `apic_id` is `u32::MAX`, `cpu_id` is logical CPU 0.
+const UNINIT_MAPPING: ApicMapping = ApicMapping { apic_id: u32::MAX, cpu_id: LogicalCpuId::new(0) };
+/// Fixed-size mapping table and its fill count; both are `static mut` with no locking visible in this file.
+static mut APIC_MAPPINGS: [ApicMapping; MAX_APIC_MAPPINGS] = [UNINIT_MAPPING; MAX_APIC_MAPPINGS];
+static mut APIC_MAPPING_COUNT: usize = 0; // slots `[..APIC_MAPPING_COUNT]` hold recorded mappings
+
+/// Appends an APIC→CPU mapping to the boot-time table read back when NUMA
+/// topology is initialized.
+///
+/// When the table already holds `MAX_APIC_MAPPINGS` entries the mapping is
+/// dropped and a warning is logged, so an incomplete topology shows up in the
+/// boot log instead of being lost silently.
+///
+/// # Safety
+/// Mutates `APIC_MAPPINGS` and `APIC_MAPPING_COUNT`, which are `static mut`
+/// without any synchronization; the caller must guarantee that nothing else
+/// accesses them concurrently.
+unsafe fn record_apic_mapping(apic_id: u32, cpu_id: LogicalCpuId) {
+    let count = APIC_MAPPING_COUNT;
+    if count >= MAX_APIC_MAPPINGS {
+        warn!(
+            "APIC mapping table full ({} entries), dropping APIC ID {} (CPU {})",
+            MAX_APIC_MAPPINGS,
+            apic_id,
+            cpu_id.get()
+        );
+        return;
+    }
+
+    APIC_MAPPINGS[count] = ApicMapping { apic_id, cpu_id };
+    APIC_MAPPING_COUNT = count + 1;
+}
+
+const AP_SPIN_LIMIT: u32 = 1_000_000; // max spin iterations for each AP readiness wait in `init`
+const TRAMPOLINE: usize = 0x8000; // address the trampoline page is mapped at (used as both physical and virtual address)
+static TRAMPOLINE_DATA: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/trampoline")); // trampoline image read from OUT_DIR at compile time
+
+/// Estimate the TSC frequency in MHz from CPUID.
+///
+/// Prefers CPUID leaf 0x15 (TSC / core crystal clock ratio), which describes
+/// the TSC itself, and falls back to leaf 0x16 (processor base frequency),
+/// which is only an approximation of the TSC rate.
+///
+/// Returns `None` when neither leaf yields a non-zero MHz value, so callers
+/// never receive `Some(0)`.
+fn tsc_freq_mhz_cpuid() -> Option<u64> {
+    let max_leaf = unsafe { core::arch::x86_64::__cpuid(0) }.eax;
+
+    // CPUID leaf 0x15: EAX = denominator, EBX = numerator, ECX = crystal Hz
+    if max_leaf >= 0x15 {
+        let res = unsafe { core::arch::x86_64::__cpuid(0x15) };
+        let denom = u64::from(res.eax);
+        let numer = u64::from(res.ebx);
+        let crystal_hz = u64::from(res.ecx);
+        if denom > 0 && numer > 0 && crystal_hz > 0 {
+            // TSC freq = crystal_hz * numer / denom. Both factors come from
+            // 32-bit registers, so the product always fits in a u64.
+            let tsc_mhz = crystal_hz * numer / denom / 1_000_000; // Hz → MHz
+            // A sub-MHz result would turn every TSC-based delay into a
+            // no-op; treat it as "unknown" and try the next source.
+            if tsc_mhz > 0 {
+                return Some(tsc_mhz);
+            }
+        }
+    }
+
+    // CPUID leaf 0x16: EAX[15:0] = Processor Base Frequency in MHz (Intel);
+    // the upper bits are reserved, so mask them off.
+    if max_leaf >= 0x16 {
+        let mhz = u64::from(unsafe { core::arch::x86_64::__cpuid(0x16) }.eax & 0xFFFF);
+        if mhz > 0 {
+            return Some(mhz);
+        }
+    }
+
+    None
+}
+
+/// Early-boot busy-wait for at least `us` microseconds.
+///
+/// Uses the Time Stamp Counter when `tsc_freq_mhz_cpuid` can report its
+/// frequency; otherwise falls back to a fixed-count spin loop.
+///
+/// The TSC path measures elapsed cycles with wrapping arithmetic and a
+/// saturating cycle budget, so neither a large `us` nor a counter value close
+/// to `u64::MAX` can overflow and cut the delay short.
+///
+/// Expects the TSC of the calling CPU to be running.
+fn early_udelay(us: u64) {
+    if let Some(mhz) = tsc_freq_mhz_cpuid() {
+        // TSC-based delay. MHz = cycles per µs.
+        let cycles = us.saturating_mul(mhz);
+        let start = unsafe { rdtsc() };
+        while unsafe { rdtsc() }.wrapping_sub(start) < cycles {
+            hint::spin_loop();
+        }
+    } else {
+        // Fallback: fixed-count spin loop at 50 iterations per µs.
+        // The original sizing assumed spin_loop() (PAUSE) costs ~40 cycles
+        // at a 1 GHz minimum clock (25 iters/µs), doubled for margin.
+        // NOTE(review): on CPUs where PAUSE is much cheaper than assumed
+        // this under-delays — confirm whether a calibrated timer is available.
+        let iters = us.saturating_mul(50);
+        for _ in 0..iters {
+            hint::spin_loop();
+        }
+    }
+}
+
+/// Looks up the `processor_uid` of the first MADT Local x2APIC entry whose
+/// `x2apic_id` equals `apic_id`.
+///
+/// Returns `None` when no Local x2APIC entry matches.
+fn current_x2apic_processor_uid(madt: &Madt, apic_id: u32) -> Option<u32> {
+    for entry in madt.iter() {
+        if let MadtEntry::LocalX2Apic(candidate) = entry {
+            if candidate.x2apic_id == apic_id {
+                return Some(candidate.processor_uid);
+            }
+        }
+    }
+    None
+}
+
+/// Re-points `local_apic.address` at the register window named by a MADT
+/// Local APIC Address Override entry.
+///
+/// Does nothing when `local_apic.x2` is set or when `address` is zero. An
+/// address that does not fit in `usize` is logged and ignored.
+fn apply_lapic_address_override(
+    local_apic: &mut crate::arch::device::local_apic::LocalApic,
+    address: u64,
+) {
+    if address == 0 || local_apic.x2 {
+        return;
+    }
+
+    match usize::try_from(address) {
+        Ok(physaddr) => {
+            // Map 4096 bytes of device memory at the override address and
+            // use the resulting virtual address from now on.
+            let phys = PhysicalAddress::new(physaddr);
+            local_apic.address = unsafe { map_device_memory(phys, 4096) }.data();
+            debug!("Applied LAPIC address override: {:#x}", address);
+        }
+        Err(_) => {
+            warn!(
+                "Ignoring LAPIC address override {:#x}: does not fit host usize",
+                address
+            );
+        }
+    }
+}
+/// Starts the application processors (APs) listed in the MADT via INIT/SIPI, applies local-APIC NMI and address-override entries, then initializes NUMA topology.
+pub(super) fn init(madt: Madt) {
+    let local_apic = unsafe { the_local_apic() };
+    let me = local_apic.id();
+
+    if local_apic.x2 {
+        debug!("    X2APIC {}", me.get());
+    } else {
+        debug!("    XAPIC {}: {:>08X}", me.get(), local_apic.address);
+    }
+
+    if cfg!(not(feature = "multi_core")) {
+        unsafe {
+            record_apic_mapping(me.get(), LogicalCpuId::new(0));
+        }
+        crate::numa::init_default();
+        return;
+    }
+
+    // Map the trampoline page at TRAMPOLINE (virtual address equal to physical address)
+    let trampoline_frame = Frame::containing(PhysicalAddress::new(TRAMPOLINE));
+    let trampoline_page = Page::containing_address(VirtualAddress::new(TRAMPOLINE));
+    let (result, page_table_physaddr) = unsafe {
+        //TODO: do not have writable and executable!
+        let mut mapper = KernelMapper::lock_rw();
+
+        let result = match mapper.map_phys(
+            trampoline_page.start_address(),
+            trampoline_frame.base(),
+            PageFlags::new().execute(true).write(true),
+        ) {
+            Some(result) => result,
+            None => {
+                println!("KERNEL AP: failed to map trampoline page, AP bring-up disabled");
+                return;
+            }
+        };
+
+        (result, mapper.table().phys().data())
+    };
+    result.flush();
+
+    // Copy the trampoline image byte by byte into the mapped page (assumes the TRAMPOLINE page is free for use)
+    for (i, val) in TRAMPOLINE_DATA.iter().enumerate() {
+        unsafe {
+            (*((TRAMPOLINE as *mut u8).add(i) as *const AtomicU8)).store(*val, Ordering::SeqCst);
+        }
+    }
+
+    unsafe {
+        let preliminary_cpu_count = madt
+            .iter()
+            .filter(|entry| match entry {
+                MadtEntry::LocalApic(local) => u32::from(local.id) == me.get() || local.flags & 1 == 1,
+                MadtEntry::LocalX2Apic(local) => local.x2apic_id == me.get() || local.flags & 1 == 1,
+                _ => false,
+            })
+            .count();
+        crate::profiling::allocate(preliminary_cpu_count as u32);
+    }
+
+    // Firmware bug detection: check for duplicate APIC IDs in MADT.
+    // Some firmware (especially on early BIOS/UEFI) may list the same
+    // processor multiple times. This pass only warns; `seen_apic_ids` is not consulted by the bring-up loop below.
+    let mut seen_apic_ids: BTreeSet<u32> = BTreeSet::new();
+    {
+        let _ = seen_apic_ids.insert(me.get()); // BSP
+        for entry in madt.iter() {
+            match entry {
+                MadtEntry::LocalApic(local) if local.flags & 1 == 1 => {
+                    let id = u32::from(local.id);
+                    if !seen_apic_ids.insert(id) {
+                        warn!("MADT: duplicate APIC ID {} in LocalApic entry, firmware bug", id);
+                    }
+                }
+                MadtEntry::LocalX2Apic(local) if local.flags & 1 == 1 => {
+                    let id = local.x2apic_id;
+                    if !seen_apic_ids.insert(id) {
+                        warn!("MADT: duplicate x2APIC ID {} in LocalX2Apic entry, firmware bug", id);
+                    }
+                }
+                _ => {}
+            }
+        }
+    }
+
+    for madt_entry in madt.iter() {
+        debug!("      {:x?}", madt_entry);
+        if let MadtEntry::LocalApic(ap_local_apic) = madt_entry {
+            if u32::from(ap_local_apic.id) == me.get() {
+                debug!("        This is my local APIC");
+            } else if ap_local_apic.flags & 1 == 1 {
+                // Allocate the AP's stack via `allocate_p2frame(4)`; the code below treats it as PAGE_SIZE << 4 bytes
+                let alloc = match allocate_p2frame(4) {
+                    Some(frame) => frame,
+                    None => {
+                        println!("KERNEL AP: CPU {} no memory for stack, skipping", ap_local_apic.id);
+                        continue;
+                    }
+                };
+                let stack_start = RmmA::phys_to_virt(alloc.base()).data();
+                let stack_end = stack_start + (PAGE_SIZE << 4);
+
+                // Atomically allocate a CPU ID (SeqCst fetch_add). NOTE(review): the ID is taken before the limit
+                // check and is not released on any later `continue`, so CPU_COUNT may exceed the CPUs actually started.
+                let cpu_id = LogicalCpuId::new(crate::CPU_COUNT.fetch_add(1, Ordering::SeqCst));
+                if cpu_id.get() >= crate::cpu_set::MAX_CPU_COUNT {
+                    println!(
+                        "KERNEL AP: CPU {} exceeds logical CPU limit, skipping",
+                        ap_local_apic.id
+                    );
+                    continue;
+                }
+
+                let pcr_ptr = crate::arch::gdt::allocate_and_init_pcr(cpu_id, stack_end);
+
+                let idt_ptr = crate::arch::idt::allocate_and_init_idt(cpu_id);
+
+                let args = KernelArgsAp {
+                    stack_end: stack_end as *mut u8,
+                    cpu_id,
+                    pcr_ptr,
+                    idt_ptr,
+                };
+
+                let ap_ready = (TRAMPOLINE + 8) as *mut u64;
+                let ap_args_ptr = unsafe { ap_ready.add(1) };
+                let ap_page_table = unsafe { ap_ready.add(2) };
+                let ap_code = unsafe { ap_ready.add(3) };
+
+                // Clear ap_ready and publish the AP arguments (plain pointer writes, not volatile; followed by the fence below)
+                unsafe {
+                    ap_ready.write(0);
+                    ap_args_ptr.write(&args as *const _ as u64);
+                    ap_page_table.write(page_table_physaddr as u64);
+                    #[expect(clippy::fn_to_numeric_cast)]
+                    ap_code.write(kstart_ap as u64);
+
+                    // Ensure all trampoline writes are visible to the AP before
+                    // it starts executing.  asm!("") is only a compiler barrier;
+                    // fence(SeqCst) is a full hardware memory barrier.
+                    core::sync::atomic::fence(Ordering::SeqCst);
+                };
+                AP_READY.store(false, Ordering::SeqCst);
+
+                // Clear APIC Error Status Register before starting AP.
+                // Intel SDM §8.4.4: ESR should be cleared before sending SIPI.
+                unsafe { local_apic.esr(); }
+
+                // Send INIT IPI (Assert)
+                {
+                    // ICR: Delivery Mode=INIT(101), Level=Assert, Trigger=Edge
+                    let mut icr = 0x4500u64;
+                    if local_apic.x2 {
+                        icr |= u64::from(ap_local_apic.id) << 32;
+                    } else {
+                        icr |= u64::from(ap_local_apic.id) << 56;
+                    }
+                    local_apic.set_icr(icr);
+                }
+
+                // Intel SDM Vol 3A §8.4.4: wait 10ms after INIT (no separate
+                // INIT deassert IPI is sent here) before sending first SIPI.
+                // Modern CPUs may need less, but 10ms is the safe specification-compliant value.
+                early_udelay(10_000);
+
+                // Send START IPI #1
+                {
+                    let ap_segment = (TRAMPOLINE >> 12) & 0xFF;
+                    // ICR: Delivery Mode=StartUp(110), Vector=ap_segment
+                    // Note: bit 14 (Level) must be 0 for SIPI per Intel SDM.
+                    let mut icr = 0x0600 | ap_segment as u64;
+                    if local_apic.x2 {
+                        icr |= u64::from(ap_local_apic.id) << 32;
+                    } else {
+                        icr |= u64::from(ap_local_apic.id) << 56;
+                    }
+                    local_apic.set_icr(icr);
+                }
+
+                // Intel SDM: wait 200µs between SIPIs
+                early_udelay(200);
+
+                // Send START IPI #2 (recommended for compatibility)
+                {
+                    let ap_segment = (TRAMPOLINE >> 12) & 0xFF;
+                    let mut icr = 0x0600 | ap_segment as u64;
+                    if local_apic.x2 {
+                        icr |= u64::from(ap_local_apic.id) << 32;
+                    } else {
+                        icr |= u64::from(ap_local_apic.id) << 56;
+                    }
+                    local_apic.set_icr(icr);
+                }
+
+                // Wait briefly for SIPI to be accepted
+                early_udelay(200);
+
+                // Check ESR for delivery errors after SIPI sequence.
+                // Bit 2 = Send Accept Error, Bit 5 = Send Illegal Vector.
+                let esr_val = unsafe { local_apic.esr() };
+                if esr_val != 0 {
+                    println!(
+                        "KERNEL AP: CPU {} SIPI delivery error (ESR={:#x}), continuing",
+                        ap_local_apic.id, esr_val
+                    );
+                }
+
+                // Wait for trampoline ready with timeout. NOTE(review): on timeout the trampoline still holds a pointer to `args`, which goes out of scope at `continue` — confirm a late AP cannot read it.
+                let mut trampoline_ready = false;
+                for _ in 0..AP_SPIN_LIMIT {
+                    if unsafe { (*ap_ready.cast::<AtomicU8>()).load(Ordering::SeqCst) } != 0 {
+                        trampoline_ready = true;
+                        break;
+                    }
+                    hint::spin_loop();
+                }
+                if !trampoline_ready {
+                    println!("KERNEL AP: CPU {} trampoline timeout, skipping", ap_local_apic.id);
+                    continue;
+                }
+
+                let mut kernel_ready = false;
+                for _ in 0..AP_SPIN_LIMIT {
+                    if AP_READY.load(Ordering::SeqCst) {
+                        kernel_ready = true;
+                        break;
+                    }
+                    hint::spin_loop();
+                }
+                if !kernel_ready {
+                    println!("KERNEL AP: CPU {} AP_READY timeout, skipping", ap_local_apic.id);
+                    continue;
+                }
+
+                // Record APIC→CPU mapping for NUMA topology.
+                unsafe {
+                    record_apic_mapping(u32::from(ap_local_apic.id), cpu_id);
+                }
+                // Set NUMA node from SRAT data.
+                if let Some(percpu) = crate::percpu::get_for_cpu(cpu_id) {
+                    if let Some(node) = crate::acpi::srat::numa_node_for_apic(u32::from(ap_local_apic.id)) {
+                        percpu.numa_node.set(node);
+                    }
+                }
+
+                RmmA::invalidate_all();
+            } else {
+                debug!("KERNEL AP: LAPIC CPU {} disabled in MADT, skipping", u32::from(ap_local_apic.id));
+            }
+        } else if let MadtEntry::LocalX2Apic(ap_x2apic) = madt_entry {
+            let apic_id = ap_x2apic.x2apic_id;
+            let flags = ap_x2apic.flags;
+
+            if apic_id == me.get() {
+                debug!("        This is my local x2APIC");
+            } else if flags & 1 == 1 {
+                let alloc = match allocate_p2frame(4) {
+                    Some(frame) => frame,
+                    None => {
+                        println!("KERNEL AP: CPU {} no memory for stack, skipping", apic_id);
+                        continue;
+                    }
+                };
+                let stack_start = RmmA::phys_to_virt(alloc.base()).data();
+                let stack_end = stack_start + (PAGE_SIZE << 4);
+
+                // Atomically allocate a CPU ID (SeqCst fetch_add). NOTE(review): as in the LocalApic branch,
+                // the ID is not released when this entry is skipped by a later `continue`.
+                let cpu_id = LogicalCpuId::new(crate::CPU_COUNT.fetch_add(1, Ordering::SeqCst));
+                if cpu_id.get() >= crate::cpu_set::MAX_CPU_COUNT {
+                    println!(
+                        "KERNEL AP: CPU {} exceeds logical CPU limit, skipping",
+                        apic_id
+                    );
+                    continue;
+                }
+
+                let pcr_ptr = crate::arch::gdt::allocate_and_init_pcr(cpu_id, stack_end);
+                let idt_ptr = crate::arch::idt::allocate_and_init_idt(cpu_id);
+
+                let args = KernelArgsAp {
+                    stack_end: stack_end as *mut u8,
+                    cpu_id,
+                    pcr_ptr,
+                    idt_ptr,
+                };
+
+                let ap_ready = (TRAMPOLINE + 8) as *mut u64;
+                let ap_args_ptr = unsafe { ap_ready.add(1) };
+                let ap_page_table = unsafe { ap_ready.add(2) };
+                let ap_code = unsafe { ap_ready.add(3) };
+
+                unsafe {
+                    ap_ready.write(0);
+                    ap_args_ptr.write(&args as *const _ as u64);
+                    ap_page_table.write(page_table_physaddr as u64);
+                    #[expect(clippy::fn_to_numeric_cast)]
+                    ap_code.write(kstart_ap as u64);
+                    // Ensure all trampoline writes are visible to the AP.
+                    core::sync::atomic::fence(Ordering::SeqCst);
+                }
+                AP_READY.store(false, Ordering::SeqCst);
+
+                // Clear APIC Error Status Register before starting AP.
+                unsafe { local_apic.esr(); }
+
+                // Send INIT IPI (Assert)
+                {
+                    let mut icr = 0x4500u64;
+                    if local_apic.x2 {
+                        icr |= u64::from(apic_id) << 32;
+                    } else {
+                        icr |= u64::from(apic_id as u8) << 56; // NOTE(review): `as u8` truncates IDs above 255 when not in x2APIC mode
+                    }
+                    local_apic.set_icr(icr);
+                }
+
+                // Intel SDM Vol 3A §8.4.4: wait 10ms after INIT
+                early_udelay(10_000);
+
+                // Send START IPI #1
+                {
+                    let ap_segment = (TRAMPOLINE >> 12) & 0xFF;
+                    let mut icr = 0x0600u64 | ap_segment as u64;
+                    if local_apic.x2 {
+                        icr |= u64::from(apic_id) << 32;
+                    } else {
+                        icr |= u64::from(apic_id as u8) << 56;
+                    }
+                    local_apic.set_icr(icr);
+                }
+
+                // Intel SDM: wait 200µs between SIPIs
+                early_udelay(200);
+
+                // Send START IPI #2 (recommended for compatibility)
+                {
+                    let ap_segment = (TRAMPOLINE >> 12) & 0xFF;
+                    let mut icr = 0x0600u64 | ap_segment as u64;
+                    if local_apic.x2 {
+                        icr |= u64::from(apic_id) << 32;
+                    } else {
+                        icr |= u64::from(apic_id as u8) << 56;
+                    }
+                    local_apic.set_icr(icr);
+                }
+
+                // Wait briefly for SIPI acceptance
+                early_udelay(200);
+
+                // Check ESR for delivery errors.
+                let esr_val = unsafe { local_apic.esr() };
+                if esr_val != 0 {
+                    println!(
+                        "KERNEL AP: CPU {} SIPI delivery error (ESR={:#x}), continuing",
+                        apic_id, esr_val
+                    );
+                }
+
+                let mut trampoline_ready = false;
+                for _ in 0..AP_SPIN_LIMIT {
+                    if unsafe { (*ap_ready.cast::<AtomicU8>()).load(Ordering::SeqCst) } != 0 {
+                        trampoline_ready = true;
+                        break;
+                    }
+                    hint::spin_loop();
+                }
+                if !trampoline_ready {
+                    println!("KERNEL AP: CPU {} trampoline timeout, skipping", apic_id);
+                    continue;
+                }
+
+                let mut kernel_ready = false;
+                for _ in 0..AP_SPIN_LIMIT {
+                    if AP_READY.load(Ordering::SeqCst) {
+                        kernel_ready = true;
+                        break;
+                    }
+                    hint::spin_loop();
+                }
+                if !kernel_ready {
+                    println!("KERNEL AP: CPU {} AP_READY timeout, skipping", apic_id);
+                    continue;
+                }
+
+                // Record APIC→CPU mapping for NUMA topology.
+                unsafe {
+                    record_apic_mapping(apic_id, cpu_id);
+                }
+                // Set NUMA node from SRAT data.
+                if let Some(percpu) = crate::percpu::get_for_cpu(cpu_id) {
+                    if let Some(node) = crate::acpi::srat::numa_node_for_apic(apic_id) {
+                        percpu.numa_node.set(node);
+                    }
+                }
+
+                RmmA::invalidate_all();
+            } else {
+                debug!("KERNEL AP: x2APIC CPU {} disabled in MADT (flags={:#x}), skipping", apic_id, flags);
+            }
+        } else if let MadtEntry::LocalApicNmi(nmi) = madt_entry {
+            let target_apic = nmi.processor;
+            if target_apic == 0xFF || target_apic == local_apic.id().get() as u8 { // NOTE(review): ACPI describes this field as a processor UID, but it is compared to the APIC ID — confirm
+                unsafe { local_apic.set_lvt_nmi(nmi.nmi_pin, nmi.flags) };
+            }
+        } else if let MadtEntry::LocalX2ApicNmi(nmi) = madt_entry {
+            let current_uid = current_x2apic_processor_uid(&madt, me.get());
+            if nmi.processor_uid == u32::MAX || current_uid == Some(nmi.processor_uid) {
+                unsafe { local_apic.set_lvt_nmi(nmi.nmi_pin, nmi.flags) };
+            }
+        } else if let MadtEntry::LapicAddressOverride(override_entry) = madt_entry {
+            apply_lapic_address_override(local_apic, override_entry.local_apic_address);
+        }
+    }
+
+    // Initialize NUMA topology from APIC→CPU mappings and SRAT.
+    {
+        let mappings = unsafe { &APIC_MAPPINGS[..APIC_MAPPING_COUNT] };
+        let mappings_ref: Vec<(u32, LogicalCpuId)> = mappings
+            .iter()
+            .map(|m| (m.apic_id, m.cpu_id))
+            .collect();
+        crate::numa::init_from_srat(&mappings_ref);
+    }
+    // Set BSP's NUMA node from SRAT.
+    if let Some(node) = crate::acpi::srat::numa_node_for_apic(me.get()) {
+        crate::percpu::PercpuBlock::current().numa_node.set(node);
+    }
+
+    // Log final CPU count vs maximum
+    let cpu_count = crate::CPU_COUNT.load(Ordering::SeqCst);
+    info!(
+        "SMP: {} CPUs online (max {})",
+        cpu_count, crate::cpu_set::MAX_CPU_COUNT
+    );
+    if cpu_count > crate::cpu_set::MAX_CPU_COUNT * 80 / 100 {
+        warn!(
+            "SMP: CPU count approaching MAX_CPU_COUNT limit ({}/{})",
+            cpu_count, crate::cpu_set::MAX_CPU_COUNT
+        );
+    }
+
+    // Unmap trampoline
+    if let Some((_frame, _, flush)) = unsafe {
+        KernelMapper::lock_rw()
+            .unmap_phys(trampoline_page.start_address())
+    } {
+        flush.flush();
+    } else {
+        println!("KERNEL AP: failed to unmap trampoline page (non-fatal)");
+    }
+}
@@ -0,0 +1,160 @@
+use core::{
+    hint,
+    sync::atomic::{AtomicU8, Ordering},
+};
+
+use crate::{
+    arch::{
+        device::local_apic::the_local_apic,
+        start::{kstart_ap, KernelArgsAp},
+    },
+    cpu_set::LogicalCpuId,
+    memory::{
+        allocate_p2frame, Frame, KernelMapper, Page, PageFlags, PhysicalAddress, RmmA, RmmArch,
+        VirtualAddress, PAGE_SIZE,
+    },
+    startup::AP_READY,
+};
+
+use super::{Madt, MadtEntry};
+
+const TRAMPOLINE: usize = 0x8000; // address the trampoline page is mapped at (used as both physical and virtual address)
+static TRAMPOLINE_DATA: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/trampoline")); // trampoline image read from OUT_DIR at compile time
+/// Starts each enabled AP that has a Local APIC entry in the MADT, one at a time, through the trampoline page; panics via `expect` if mapping, stack allocation, or unmapping fails.
+pub(super) fn init(madt: Madt) {
+    let local_apic = unsafe { the_local_apic() };
+    let me = local_apic.id();
+
+    if local_apic.x2 {
+        debug!("    X2APIC {}", me.get());
+    } else {
+        debug!("    XAPIC {}: {:>08X}", me.get(), local_apic.address);
+    }
+
+    if cfg!(not(feature = "multi_core")) {
+        return;
+    }
+
+    // Map the trampoline page at TRAMPOLINE (virtual address equal to physical address)
+    let trampoline_frame = Frame::containing(PhysicalAddress::new(TRAMPOLINE));
+    let trampoline_page = Page::containing_address(VirtualAddress::new(TRAMPOLINE));
+    let (result, page_table_physaddr) = unsafe {
+        //TODO: do not have writable and executable!
+        let mut mapper = KernelMapper::lock_rw();
+
+        let result = mapper
+            .map_phys(
+                trampoline_page.start_address(),
+                trampoline_frame.base(),
+                PageFlags::new().execute(true).write(true),
+            )
+            .expect("failed to map trampoline");
+
+        (result, mapper.table().phys().data())
+    };
+    result.flush();
+
+    // Copy the trampoline image byte by byte into the mapped page (assumes the TRAMPOLINE page is free for use)
+    for (i, val) in TRAMPOLINE_DATA.iter().enumerate() {
+        unsafe {
+            (*((TRAMPOLINE as *mut u8).add(i) as *const AtomicU8)).store(*val, Ordering::SeqCst);
+        }
+    }
+
+    unsafe {
+        let preliminary_cpu_count = madt.iter().filter(|e| matches!(e, MadtEntry::LocalApic(entry) if u32::from(entry.id) == me.get() || entry.flags & 1 == 1)).count();
+        crate::profiling::allocate(preliminary_cpu_count as u32);
+    }
+
+    for madt_entry in madt.iter() {
+        debug!("      {:x?}", madt_entry);
+        if let MadtEntry::LocalApic(ap_local_apic) = madt_entry {
+            if u32::from(ap_local_apic.id) == me.get() {
+                debug!("        This is my local APIC");
+            } else if ap_local_apic.flags & 1 == 1 {
+                let cpu_id = LogicalCpuId::next();
+
+                // Allocate the AP's stack via `allocate_p2frame(4)`; treated below as PAGE_SIZE << 4 bytes. Panics if no frames are left.
+                let stack_start = RmmA::phys_to_virt(
+                    allocate_p2frame(4)
+                        .expect("no more frames in acpi stack_start")
+                        .base(),
+                )
+                .data();
+                let stack_end = stack_start + (PAGE_SIZE << 4);
+
+                let pcr_ptr = crate::arch::gdt::allocate_and_init_pcr(cpu_id, stack_end);
+
+                let idt_ptr = crate::arch::idt::allocate_and_init_idt(cpu_id);
+
+                let args = KernelArgsAp {
+                    stack_end: stack_end as *mut u8,
+                    cpu_id,
+                    pcr_ptr,
+                    idt_ptr,
+                };
+
+                let ap_ready = (TRAMPOLINE + 8) as *mut u64;
+                let ap_args_ptr = unsafe { ap_ready.add(1) };
+                let ap_page_table = unsafe { ap_ready.add(2) };
+                let ap_code = unsafe { ap_ready.add(3) };
+
+                // Clear ap_ready and publish the AP arguments (plain pointer writes, not volatile)
+                unsafe {
+                    ap_ready.write(0);
+                    ap_args_ptr.write(&args as *const _ as u64);
+                    ap_page_table.write(page_table_physaddr as u64);
+                    #[expect(clippy::fn_to_numeric_cast)]
+                    ap_code.write(kstart_ap as u64);
+
+                    // TODO: Is this necessary (this fence)? An empty asm! block is at most a compiler barrier, not a hardware fence.
+                    core::arch::asm!("");
+                };
+                AP_READY.store(false, Ordering::SeqCst);
+
+                // Send INIT IPI (ICR 0x4500; destination in bits 32+ when `x2` is set, bits 56+ otherwise)
+                {
+                    let mut icr = 0x4500;
+                    if local_apic.x2 {
+                        icr |= u64::from(ap_local_apic.id) << 32;
+                    } else {
+                        icr |= u64::from(ap_local_apic.id) << 56;
+                    }
+                    local_apic.set_icr(icr);
+                }
+
+                // Send START IPI immediately after INIT (no delay in between); vector = TRAMPOLINE >> 12
+                {
+                    let ap_segment = (TRAMPOLINE >> 12) & 0xFF;
+                    let mut icr = 0x4600 | ap_segment as u64;
+
+                    if local_apic.x2 {
+                        icr |= u64::from(ap_local_apic.id) << 32;
+                    } else {
+                        icr |= u64::from(ap_local_apic.id) << 56;
+                    }
+
+                    local_apic.set_icr(icr);
+                }
+
+                // Wait for trampoline ready, then for AP_READY. Both spins are unbounded: this hangs if the AP never starts.
+                while unsafe { (*ap_ready.cast::<AtomicU8>()).load(Ordering::SeqCst) } == 0 {
+                    hint::spin_loop();
+                }
+                while !AP_READY.load(Ordering::SeqCst) {
+                    hint::spin_loop();
+                }
+
+                RmmA::invalidate_all();
+            }
+        }
+    }
+
+    // Unmap trampoline (panics if the page cannot be unmapped)
+    let (_frame, _, flush) = unsafe {
+        KernelMapper::lock_rw()
+            .unmap_phys(trampoline_page.start_address())
+            .expect("failed to unmap trampoline page")
+    };
+    flush.flush();
+}
@@ -0,0 +1,41 @@
+--- src/acpi/madt/arch/x86.rs
+++ src/acpi/madt/arch/x86.rs
+@@ -446,11 +446,7 @@
+                 // Send INIT IPI (Assert)
+                 {
+                     let mut icr = 0x4500u64;
+-                    if local_apic.x2 {
+-                        icr |= u64::from(apic_id) << 32;
+-                    } else {
+-                        icr |= u64::from(apic_id as u8) << 56;
+-                    }
++                    icr |= u64::from(apic_id) << 32;
+                     local_apic.set_icr(icr);
+                 }
+ 
+@@ -460,11 +456,7 @@
+                 {
+                     let ap_segment = (TRAMPOLINE >> 12) & 0xFF;
+                     let mut icr = 0x0600u64 | ap_segment as u64;
+-                    if local_apic.x2 {
+-                        icr |= u64::from(apic_id) << 32;
+-                    } else {
+-                        icr |= u64::from(apic_id as u8) << 56;
+-                    }
++                    icr |= u64::from(apic_id) << 32;
+                     local_apic.set_icr(icr);
+                 }
+ 
+@@ -476,11 +468,7 @@
+                 {
+                     let ap_segment = (TRAMPOLINE >> 12) & 0xFF;
+                     let mut icr = 0x0600u64 | ap_segment as u64;
+-                    if local_apic.x2 {
+-                        icr |= u64::from(apic_id) << 32;
+-                    } else {
+-                        icr |= u64::from(apic_id as u8) << 56;
+-                    }
++                    icr |= u64::from(apic_id) << 32;
+                     local_apic.set_icr(icr);
+                 }
+ 
@@ -34,12 +34,6 @@ impl Madt {
        let madt = Madt::new(find_one_sdt!("APIC"));

        if let Some(madt) = madt {
-            // Validate MADT checksum per ACPI 6.5 §5.2.2
-            if !madt.sdt.validate_checksum() {
-                error!("MADT checksum validation failed, skipping APIC initialization");
-                return;
-            }
-
            // safe because no APs have been started yet.
            unsafe { MADT.get().write(Some(madt)) };

@@ -152,48 +146,6 @@ pub struct MadtGicd {
    _reserved2: [u8; 3],
 }

-/// MADT Local x2APIC (entry type 0x9)
-#[derive(Clone, Copy, Debug)]
-#[repr(C, packed)]
-pub struct MadtLocalX2Apic {
-    _reserved: u16,
-    pub x2apic_id: u32,
-    pub flags: u32,
-    pub processor_uid: u32,
-}
-
-/// MADT Local APIC NMI (entry type 0x4)
-#[derive(Clone, Copy, Debug)]
-#[repr(C, packed)]
-pub struct MadtLocalApicNmi {
-    pub processor: u8,
-    pub flags: u16,
-    pub nmi_pin: u8,
-}
-
-/// MADT Local APIC address override (entry type 0x5)
-#[derive(Clone, Copy, Debug)]
-#[repr(C, packed)]
-pub struct MadtLapicAddressOverride {
-    _reserved: u16,
-    pub local_apic_address: u64,
-}
-
-/// MADT Local x2APIC NMI (entry type 0xA)
-#[derive(Clone, Copy, Debug)]
-#[repr(C, packed)]
-pub struct MadtLocalX2ApicNmi {
-    _reserved: u16,
-    pub processor_uid: u32,
-    pub flags: u16,
-    pub nmi_pin: u8,
-    _reserved2: u8,
-}
-
-const _: () = assert!(size_of::<MadtLocalApicNmi>() == 4);
-const _: () = assert!(size_of::<MadtLapicAddressOverride>() == 10);
-const _: () = assert!(size_of::<MadtLocalX2ApicNmi>() == 10);
-
 /// MADT Entries
 #[derive(Debug)]
 #[allow(dead_code)]
@@ -204,18 +156,10 @@ pub enum MadtEntry {
    InvalidIoApic(usize),
    IntSrcOverride(&'static MadtIntSrcOverride),
    InvalidIntSrcOverride(usize),
-    LocalApicNmi(&'static MadtLocalApicNmi),
-    InvalidLocalApicNmi(usize),
-    LapicAddressOverride(&'static MadtLapicAddressOverride),
-    InvalidLapicAddressOverride(usize),
    Gicc(&'static MadtGicc),
    InvalidGicc(usize),
    Gicd(&'static MadtGicd),
    InvalidGicd(usize),
-    LocalX2Apic(&'static MadtLocalX2Apic),
-    InvalidLocalX2Apic(usize),
-    LocalX2ApicNmi(&'static MadtLocalX2ApicNmi),
-    InvalidLocalX2ApicNmi(usize),
    Unknown(u8),
 }

@@ -232,10 +176,6 @@ impl Iterator for MadtIter {
            let entry_len =
                unsafe { *(self.sdt.data_address() as *const u8).add(self.i + 1) } as usize;

-            if entry_len < 2 {
-                return None;
-            }
-
            if self.i + entry_len <= self.sdt.data_len() {
                let item = match entry_type {
                    0x0 => {
@@ -266,46 +206,6 @@ impl Iterator for MadtIter {
                            MadtEntry::InvalidIntSrcOverride(entry_len)
                        }
                    }
-                    0x4 => {
-                        if entry_len == size_of::<MadtLocalApicNmi>() + 2 {
-                            MadtEntry::LocalApicNmi(unsafe {
-                                &*((self.sdt.data_address() + self.i + 2)
-                                    as *const MadtLocalApicNmi)
-                            })
-                        } else {
-                            MadtEntry::InvalidLocalApicNmi(entry_len)
-                        }
-                    }
-                    0x5 => {
-                        if entry_len == size_of::<MadtLapicAddressOverride>() + 2 {
-                            MadtEntry::LapicAddressOverride(unsafe {
-                                &*((self.sdt.data_address() + self.i + 2)
-                                    as *const MadtLapicAddressOverride)
-                            })
-                        } else {
-                            MadtEntry::InvalidLapicAddressOverride(entry_len)
-                        }
-                    }
-                    0x9 => {
-                        if entry_len == size_of::<MadtLocalX2Apic>() + 2 {
-                            MadtEntry::LocalX2Apic(unsafe {
-                                &*((self.sdt.data_address() + self.i + 2)
-                                    as *const MadtLocalX2Apic)
-                            })
-                        } else {
-                            MadtEntry::InvalidLocalX2Apic(entry_len)
-                        }
-                    }
-                    0xA => {
-                        if entry_len == size_of::<MadtLocalX2ApicNmi>() + 2 {
-                            MadtEntry::LocalX2ApicNmi(unsafe {
-                                &*((self.sdt.data_address() + self.i + 2)
-                                    as *const MadtLocalX2ApicNmi)
-                            })
-                        } else {
-                            MadtEntry::InvalidLocalX2ApicNmi(entry_len)
-                        }
-                    }
                    0xB => {
                        if entry_len >= size_of::<MadtGicc>() + 2 {
                            MadtEntry::Gicc(unsafe {
@@ -20,8 +20,6 @@ mod rxsdt;
 pub mod sdt;
 #[cfg(target_arch = "aarch64")]
 mod spcr;
-pub mod slit;
-pub mod srat;
 mod xsdt;

 unsafe fn map_linearly(addr: PhysicalAddress, len: usize, mapper: &mut crate::memory::PageMapper) {
@@ -84,14 +82,6 @@ impl Rxsdt for RxsdtEnum {

 pub static RXSDT_ENUM: Once<RxsdtEnum> = Once::new();

-#[derive(Clone, Copy, Debug)]
-pub struct AcpiRootInfo {
-    pub revision: u8,
-    pub root_sdt_address: PhysicalAddress,
-}
-
-pub static ACPI_ROOT_INFO: Once<AcpiRootInfo> = Once::new();
-
 /// Parse the ACPI tables to gather CPU, interrupt, and timer information
 pub unsafe fn init(already_supplied_rsdp: Option<*const u8>) {
    unsafe {
@@ -104,15 +94,6 @@ pub unsafe fn init(already_supplied_rsdp: Option<*const u8>) {
        let rsdp_opt = Rsdp::get_rsdp(already_supplied_rsdp);

        if let Some(rsdp) = rsdp_opt {
-            let root_info = ACPI_ROOT_INFO.call_once(|| AcpiRootInfo {
-                revision: rsdp.revision(),
-                root_sdt_address: rsdp.sdt_address(),
-            });
-
-            if root_info.root_sdt_address != rsdp.sdt_address() || root_info.revision != rsdp.revision() {
-                error!("ACPI_ROOT_INFO already initialized with a different RSDP root");
-            }
-
            debug!("SDT address: {:#x}", rsdp.sdt_address().data());
            let rxsdt = get_sdt(rsdp.sdt_address(), &mut KernelMapper::lock_rw());

@@ -165,14 +146,7 @@ pub unsafe fn init(already_supplied_rsdp: Option<*const u8>) {

            // TODO: Enumerate processors in userspace, and then provide an ACPI-independent interface
            // to initialize enumerated processors to userspace?
-            // Parse SRAT BEFORE MADT so NUMA node mapping is available
-            // when APs are started and PercpuBlocks are created.
-            srat::init();
-
            Madt::init();
-
-            // Parse SLIT after MADT for the NUMA distance matrix.
-            slit::init();
            //TODO: support this on any arch
            // SPCR must be initialized after MADT for interrupt controllers
            #[cfg(target_arch = "aarch64")]
@@ -17,33 +17,9 @@ pub struct Rsdp {

 impl Rsdp {
    pub unsafe fn get_rsdp(already_supplied_rsdp: Option<*const u8>) -> Option<Rsdp> {
-        already_supplied_rsdp.and_then(|rsdp_ptr| {
-            let rsdp = unsafe { *(rsdp_ptr as *const Rsdp) };
-
-            // Validate signature "RSD PTR "
-            if &rsdp.signature != b"RSD PTR " {
-                return None;
-            }
-
-            // ACPI 1.0 checksum: sum of first 20 bytes must be zero
-            let bytes_v1 = unsafe { core::slice::from_raw_parts(rsdp_ptr, 20) };
-            if bytes_v1.iter().fold(0u8, |sum, &b| sum.wrapping_add(b)) != 0 {
-                return None;
-            }
-
-            // ACPI 2.0+ extended checksum: sum of entire table (length bytes) must be zero
-            if rsdp.revision >= 2 {
-                let full_len = rsdp._length as usize;
-                if full_len < 36 || full_len > 256 {
-                    return None;
-                }
-                let bytes_full = unsafe { core::slice::from_raw_parts(rsdp_ptr, full_len) };
-                if bytes_full.iter().fold(0u8, |sum, &b| sum.wrapping_add(b)) != 0 {
-                    return None;
-                }
-            }
-
-            Some(rsdp)
+        already_supplied_rsdp.map(|rsdp_ptr| {
+            // TODO: Validate
+            unsafe { *(rsdp_ptr as *const Rsdp) }
        })
    }

@@ -55,8 +31,4 @@ impl Rsdp {
            self.rsdt_address as usize
        })
    }
-
-    pub fn revision(&self) -> u8 {
-        self.revision
-    }
 }
@@ -24,20 +24,4 @@ impl Sdt {
        let header_size = size_of::<Sdt>();
        total_size.saturating_sub(header_size)
    }
-
-    /// Validate the SDT checksum.
-    ///
-    /// Per ACPI 6.5 §5.2.2: the entire table (including the checksum field)
-    /// must sum to 0 when all bytes are added together as unsigned 8-bit values.
-    pub fn validate_checksum(&self) -> bool {
-        let ptr = self as *const _ as *const u8;
-        let len = self.length as usize;
-        if len < size_of::<Sdt>() {
-            return false;
-        }
-        let sum = unsafe { core::slice::from_raw_parts(ptr, len) }
-            .iter()
-            .fold(0u8, |acc, &b| acc.wrapping_add(b));
-        sum == 0
-    }
 }
@@ -1,45 +0,0 @@
-//! SLIT (System Locality Information Table) parser.
-//!
-//! Parses the NUMA distance matrix for scheduler NUMA-aware work stealing.
-
-use super::sdt::Sdt;
-use crate::acpi::find_sdt;
-
-const MAX_NODES: usize = 8;
-
-static mut SLIT_MATRIX: [[u8; MAX_NODES]; MAX_NODES] = [[10u8; MAX_NODES]; MAX_NODES];
-static mut SLIT_NUM_NODES: usize = 0;
-static mut SLIT_AVAILABLE: bool = false;
-
-pub fn is_available() -> bool { unsafe { SLIT_AVAILABLE } }
-pub fn num_nodes() -> usize { unsafe { SLIT_NUM_NODES } }
-
-pub fn distance(from: u8, to: u8) -> u8 {
-    if !unsafe { SLIT_AVAILABLE } { return 10; }
-    let (from, to) = (from as usize, to as usize);
-    if from >= MAX_NODES || to >= MAX_NODES { return 10; }
-    unsafe { SLIT_MATRIX[from][to] }
-}
-
-pub fn same_socket(node1: u8, node2: u8) -> bool { distance(node1, node2) <= 20 }
-
-pub fn init() {
-    let sdt = match find_sdt("SLIT").as_slice() {
-        [] => return,
-        [x] => *x,
-        xs => { println!("SLIT: {} tables found, expected 1", xs.len()); return; }
-    };
-    if &sdt.signature != b"SLIT" { return; }
-    let data_addr = sdt.data_address();
-    let data_len = sdt.data_len();
-    if data_len < 8 { return; }
-    let num_nodes = unsafe { *(data_addr as *const u64) } as usize;
-    if num_nodes == 0 || num_nodes > MAX_NODES { println!("SLIT: {num_nodes} nodes (max {MAX_NODES}), ignoring"); return; }
-    let matrix_start = 8;
-    let matrix_size = num_nodes * num_nodes;
-    if data_len < matrix_start + matrix_size { println!("SLIT: matrix truncated ({data_len} < {})", matrix_start + matrix_size); return; }
-    let matrix = unsafe { &mut SLIT_MATRIX };
-    for i in 0..num_nodes { for j in 0..num_nodes { matrix[i][j] = unsafe { *((data_addr + matrix_start + i * num_nodes + j) as *const u8) }; } }
-    unsafe { SLIT_NUM_NODES = num_nodes; SLIT_AVAILABLE = true; }
-    debug!("SLIT: {} nodes, distance matrix loaded", num_nodes);
-}
@@ -1,102 +0,0 @@
-//! SRAT (System Resource Affinity Table) parser.
-//!
-//! Parses CPU-to-NUMA-node and memory-to-NUMA-node affinity information.
-//! Called before MADT init so that NUMA data is available during AP startup.
-
-use super::sdt::Sdt;
-use crate::acpi::find_sdt;
-
-const MAX_CPU_ENTRIES: usize = 256;
-const MAX_MEM_ENTRIES: usize = 64;
-
-#[derive(Clone, Copy)]
-struct SratCpuEntry { apic_id: u32, node: u8, enabled: bool }
-
-#[derive(Clone, Copy)]
-struct SratMemEntry { node: u8, base: u64, length: u64, enabled: bool }
-
-const CPU_NONE: SratCpuEntry = SratCpuEntry { apic_id: u32::MAX, node: 0, enabled: false };
-const MEM_NONE: SratMemEntry = SratMemEntry { node: 0, base: 0, length: 0, enabled: false };
-
-static mut SRAT_CPU_ENTRIES: [SratCpuEntry; MAX_CPU_ENTRIES] = [CPU_NONE; MAX_CPU_ENTRIES];
-static mut SRAT_MEM_ENTRIES: [SratMemEntry; MAX_MEM_ENTRIES] = [MEM_NONE; MAX_MEM_ENTRIES];
-static mut SRAT_CPU_COUNT: usize = 0;
-static mut SRAT_MEM_COUNT: usize = 0;
-static mut SRAT_AVAILABLE: bool = false;
-
-pub fn is_available() -> bool { unsafe { SRAT_AVAILABLE } }
-
-pub fn numa_node_for_apic(apic_id: u32) -> Option<u8> {
-    if !unsafe { SRAT_AVAILABLE } { return None; }
-    let count = unsafe { SRAT_CPU_COUNT };
-    let entries = unsafe { &SRAT_CPU_ENTRIES };
-    for i in 0..count {
-        if entries[i].apic_id == apic_id && entries[i].enabled { return Some(entries[i].node); }
-    }
-    None
-}
-
-pub fn numa_node_count() -> usize {
-    if !unsafe { SRAT_AVAILABLE } { return 1; }
-    let mut max_node: u8 = 0;
-    let count = unsafe { SRAT_CPU_COUNT };
-    let entries = unsafe { &SRAT_CPU_ENTRIES };
-    for i in 0..count { if entries[i].enabled && entries[i].node > max_node { max_node = entries[i].node; } }
-    (max_node as usize) + 1
-}
-
-#[repr(C, packed)]
-struct SratLocalApic { _proximity_lo: u8, apic_id: u8, flags: u32, _local_sapic_eid: u8, _proximity_hi: [u8; 3], _clock_domain: u32 }
-
-#[repr(C, packed)]
-struct SratMemoryAffinity { proximity_domain: u32, _reserved1: u16, base_address_lo: u32, base_address_hi: u32, length_lo: u32, length_hi: u32, _reserved2: u32, flags: u32, _reserved3: u64 }
-
-#[repr(C, packed)]
-struct SratLocalX2Apic { _reserved: u16, proximity_domain: u32, x2apic_id: u32, flags: u32, _clock_domain: u32, _reserved2: u32 }
-
-pub fn init() {
-    let sdt = match find_sdt("SRAT").as_slice() {
-        [] => return,
-        [x] => *x,
-        xs => { println!("SRAT: {} tables found, expected 1", xs.len()); return; }
-    };
-    if &sdt.signature != b"SRAT" { return; }
-    let data_addr = sdt.data_address();
-    let data_len = sdt.data_len();
-    if data_len < 12 { println!("SRAT: table too short ({data_len} bytes)"); return; }
-    let mut offset: usize = 12;
-    let cpu_entries = unsafe { &mut SRAT_CPU_ENTRIES };
-    let mem_entries = unsafe { &mut SRAT_MEM_ENTRIES };
-    let mut cpu_count: usize = 0;
-    let mut mem_count: usize = 0;
-    while offset + 2 <= data_len {
-        let entry_type = unsafe { *((data_addr + offset) as *const u8) };
-        let entry_len = unsafe { *((data_addr + offset + 1) as *const u8) } as usize;
-        if entry_len < 2 || offset + entry_len > data_len { break; }
-        let entry_data = data_addr + offset + 2;
-        match entry_type {
-            0x0 if entry_len >= size_of::<SratLocalApic>() + 2 => {
-                let e = unsafe { &*(entry_data as *const SratLocalApic) };
-                let enabled = (e.flags & 1) == 1;
-                let node = (e._proximity_lo as u32) | ((e._proximity_hi[0] as u32) << 8) | ((e._proximity_hi[1] as u32) << 16) | ((e._proximity_hi[2] as u32) << 24);
-                if cpu_count < MAX_CPU_ENTRIES { cpu_entries[cpu_count] = SratCpuEntry { apic_id: e.apic_id as u32, node: node as u8, enabled }; cpu_count += 1; }
-            }
-            0x1 if entry_len >= size_of::<SratMemoryAffinity>() + 2 => {
-                let e = unsafe { &*(entry_data as *const SratMemoryAffinity) };
-                let enabled = (e.flags & 1) == 1;
-                let base = (e.base_address_hi as u64) << 32 | e.base_address_lo as u64;
-                let length = (e.length_hi as u64) << 32 | e.length_lo as u64;
-                if mem_count < MAX_MEM_ENTRIES { mem_entries[mem_count] = SratMemEntry { node: e.proximity_domain as u8, base, length, enabled }; mem_count += 1; }
-            }
-            0x2 if entry_len >= size_of::<SratLocalX2Apic>() + 2 => {
-                let e = unsafe { &*(entry_data as *const SratLocalX2Apic) };
-                let enabled = (e.flags & 1) == 1;
-                if cpu_count < MAX_CPU_ENTRIES { cpu_entries[cpu_count] = SratCpuEntry { apic_id: e.x2apic_id, node: e.proximity_domain as u8, enabled }; cpu_count += 1; }
-            }
-            _ => {}
-        }
-        offset += entry_len;
-    }
-    unsafe { SRAT_CPU_COUNT = cpu_count; SRAT_MEM_COUNT = mem_count; SRAT_AVAILABLE = true; }
-    debug!("SRAT: {} CPU entries, {} memory entries", cpu_count, mem_count);
-}
@@ -7,40 +7,26 @@ mod linked_list;
 /// Size of kernel heap
 const KERNEL_HEAP_SIZE: usize = ::rmm::MEGABYTE;

-#[cold]
-fn halt_kernel_heap_init(message: &str) -> ! {
-    print!("{message}");
-    println!("Kernel heap initialization cannot continue. Halting.");
-    loop {
-        core::hint::spin_loop();
-    }
-}
-
 unsafe fn map_heap(mapper: &mut KernelMapper<true>, offset: usize, size: usize) {
    let mut flush_all = PageFlushAll::new();

    let heap_start_page = Page::containing_address(VirtualAddress::new(offset));
    let heap_end_page = Page::containing_address(VirtualAddress::new(offset + size - 1));
    for page in Page::range_inclusive(heap_start_page, heap_end_page) {
-        let phys = match mapper.allocator_mut().allocate_one() {
-            Some(phys) => phys,
-            None => halt_kernel_heap_init(
-                "FATAL: failed to allocate physical frame for kernel heap\n",
-            ),
-        };
+        let phys = mapper
+            .allocator_mut()
+            .allocate_one()
+            .expect("failed to allocate kernel heap");
        let flush = unsafe {
-            match mapper.map_phys(
-                page.start_address(),
-                phys,
-                PageFlags::new()
-                    .write(true)
-                    .global(cfg!(not(feature = "pti"))),
-            ) {
-                Some(flush) => flush,
-                None => halt_kernel_heap_init(
-                    "FATAL: failed to map kernel heap virtual page\n",
-                ),
-            }
+            mapper
+                .map_phys(
+                    page.start_address(),
+                    phys,
+                    PageFlags::new()
+                        .write(true)
+                        .global(cfg!(not(feature = "pti"))),
+                )
+                .expect("failed to map kernel heap")
        };
        flush_all.consume(flush);
    }
@@ -91,7 +91,7 @@ unsafe extern "C" fn start(args_ptr: *const KernelArgs) -> ! {
                dtb::serial::init_early(dtb);
            }

-            info!("RedBear OS starting...");
+            info!("Redox OS starting...");
            args.print();

            // Initialize RMM
@@ -97,7 +97,7 @@ unsafe extern "C" fn start(args_ptr: *const KernelArgs) -> ! {
                init_early(dtb);
            }

-            info!("RedBear OS starting...");
+            info!("Redox OS starting...");
            args.print();

            if let Some(dtb) = &dtb {
@@ -14,10 +14,6 @@ pub struct IoApicRegs {
    pointer: *const u32,
 }
 impl IoApicRegs {
-    fn redirection_index_valid(&mut self, idx: u8) -> bool {
-        idx <= self.max_redirection_table_entries()
-    }
-
    fn ioregsel(&self) -> *const u32 {
        self.pointer
    }
@@ -48,28 +44,21 @@ impl IoApicRegs {
    pub fn read_ioapicver(&mut self) -> u32 {
        self.read_reg(0x01)
    }
-    pub fn read_ioredtbl(&mut self, idx: u8) -> Option<u64> {
-        if !self.redirection_index_valid(idx) {
-            warn!("IOAPIC read_ioredtbl index {} out of range", idx);
-            return None;
-        }
+    pub fn read_ioredtbl(&mut self, idx: u8) -> u64 {
+        assert!(idx < 24);
        let lo = self.read_reg(0x10 + idx * 2);
        let hi = self.read_reg(0x10 + idx * 2 + 1);

-        Some(u64::from(lo) | (u64::from(hi) << 32))
+        u64::from(lo) | (u64::from(hi) << 32)
    }
-    pub fn write_ioredtbl(&mut self, idx: u8, value: u64) -> bool {
-        if !self.redirection_index_valid(idx) {
-            warn!("IOAPIC write_ioredtbl index {} out of range", idx);
-            return false;
-        }
+    pub fn write_ioredtbl(&mut self, idx: u8, value: u64) {
+        assert!(idx < 24);

        let lo = value as u32;
        let hi = (value >> 32) as u32;

        self.write_reg(0x10 + idx * 2, lo);
        self.write_reg(0x10 + idx * 2 + 1, hi);
-        true
    }

    pub fn max_redirection_table_entries(&mut self) -> u8 {
@@ -103,37 +92,17 @@ impl IoApic {
    }
    /// Map an interrupt vector to a physical local APIC ID of a processor (thus physical mode).
    #[allow(dead_code)]
-    pub fn map(&self, idx: u8, info: MapInfo) -> bool {
-        let Some(raw) = info.as_raw() else {
-            return false;
-        };
-        self.regs.lock().write_ioredtbl(idx, raw)
+    pub fn map(&self, idx: u8, info: MapInfo) {
+        self.regs.lock().write_ioredtbl(idx, info.as_raw())
    }
    pub fn set_mask(&self, gsi: u32, mask: bool) {
        let idx = (gsi - self.gsi_start) as u8;
        let mut guard = self.regs.lock();

-        let Some(mut reg) = guard.read_ioredtbl(idx) else {
-            return;
-        };
+        let mut reg = guard.read_ioredtbl(idx);
        reg &= !(1 << 16);
        reg |= u64::from(mask) << 16;
-        let _ = guard.write_ioredtbl(idx, reg);
-    }
-    /// Change the destination APIC for a GSI by reprogramming the redirection table entry.
-    /// Preserves all other fields (vector, polarity, trigger mode, delivery mode, mask).
-    /// Returns true if the entry was successfully updated.
-    pub fn set_irq_affinity(&self, gsi: u32, dest: ApicId) -> bool {
-        let idx = (gsi - self.gsi_start) as u8;
-        let mut guard = self.regs.lock();
-        let Some(mut entry) = guard.read_ioredtbl(idx) else {
-            return false;
-        };
-        // Clear destination field (bits 63:56 for xAPIC physical mode)
-        // and set new destination APIC ID
-        entry &= !(0xFF_u64 << 56);
-        entry |= u64::from(dest.get()) << 56;
-        guard.write_ioredtbl(idx, entry)
+        guard.write_ioredtbl(idx, reg);
    }
 }

@@ -180,26 +149,19 @@ pub struct MapInfo {
 }

 impl MapInfo {
-    pub fn as_raw(&self) -> Option<u64> {
-        if !(0x20..=0xFE).contains(&self.vector) {
-            warn!(
-                "Refusing to map IOAPIC vector outside valid range: {:#x}",
-                self.vector
-            );
-            return None;
-        }
+    pub fn as_raw(&self) -> u64 {
+        assert!(self.vector >= 0x20);
+        assert!(self.vector <= 0xFE);

        // TODO: Check for reserved fields.

-        Some(
-            (u64::from(self.dest.get()) << 56)
+        (u64::from(self.dest.get()) << 56)
            | (u64::from(self.mask) << 16)
            | ((self.trigger_mode as u64) << 15)
            | ((self.polarity as u64) << 13)
            | ((self.dest_mode as u64) << 11)
            | ((self.delivery_mode as u64) << 8)
-            | u64::from(self.vector),
-        )
+            | u64::from(self.vector)
    }
 }

@@ -213,7 +175,7 @@ impl fmt::Debug for IoApic {

                let count = guard.max_redirection_table_entries();
                f.debug_list()
-                    .entries((0..=count).filter_map(|i| guard.read_ioredtbl(i)))
+                    .entries((0..count).map(|i| guard.read_ioredtbl(i)))
                    .finish()
            }
        }
@@ -275,14 +237,11 @@ pub unsafe fn handle_ioapic(madt_ioapic: &'static MadtIoApic) {
        let ioapic_registers = virt.data() as *const u32;
        let ioapic = IoApic::new(ioapic_registers, madt_ioapic.gsi_base);

-        let detected_id = ioapic.regs.lock().id();
-        if detected_id != madt_ioapic.id {
-            warn!(
-                "mismatched ACPI MADT I/O APIC ID: MADT={}, IOAPIC={}; continuing with detected hardware",
-                madt_ioapic.id,
-                detected_id
-            );
-        }
+        assert_eq!(
+            ioapic.regs.lock().id(),
+            madt_ioapic.id,
+            "mismatched ACPI MADT I/O APIC ID, and the ID reported by the I/O APIC"
+        );

        (*IOAPICS.get()).get_or_insert_with(Vec::new).push(ioapic);
    }
@@ -351,11 +310,11 @@ pub unsafe fn init() {
                }
            }
        }
-        for ioapic in ioapics() {
-            for idx in 0..=ioapic.count {
-                ioapic.set_mask(ioapic.gsi_start + u32::from(idx), true);
-            }
-        }
+        println!(
+            "I/O APICs: {:?}, overrides: {:?}",
+            ioapics(),
+            src_overrides()
+        );

        // map the legacy PC-compatible IRQs (0-15) to 32-47, just like we did with 8259 PIC (if it
        // wouldn't have been disabled due to this I/O APIC)
@@ -370,6 +329,7 @@ pub unsafe fn init() {
                            .iter()
                            .any(|over| over.bus_irq == legacy_irq)
                    {
+                        // there's an IRQ conflict, making this legacy IRQ inaccessible.
                        continue;
                    }
                    (
@@ -389,6 +349,7 @@ pub unsafe fn init() {
            let redir_tbl_index = (gsi - apic.gsi_start) as u8;

            let map_info = MapInfo {
+                // only send to the BSP
                dest: bsp_apic_id,
                dest_mode: DestinationMode::Physical,
                delivery_mode: DeliveryMode::Fixed,
@@ -405,32 +366,7 @@ pub unsafe fn init() {
                },
                vector: 32 + legacy_irq,
            };
-            if !apic.map(redir_tbl_index, map_info) {
-                warn!(
-                    "Unable to map legacy IRQ {} (GSI {}) through IOAPIC index {}",
-                    legacy_irq,
-                    gsi,
-                    redir_tbl_index
-                );
-            }
-
-            if legacy_irq == 0 && gsi != u32::from(legacy_irq) {
-                if let Some(apic0) = find_ioapic(u32::from(legacy_irq)) {
-                    let idx0 = (u32::from(legacy_irq) - apic0.gsi_start) as u8;
-                    let _ = apic0.map(
-                        idx0,
-                        MapInfo {
-                            dest: bsp_apic_id,
-                            dest_mode: DestinationMode::Physical,
-                            delivery_mode: DeliveryMode::Fixed,
-                            mask: false,
-                            polarity: ApicPolarity::ActiveHigh,
-                            trigger_mode: ApicTriggerMode::Edge,
-                            vector: 32,
-                        },
-                    );
-                }
-            }
+            apic.map(redir_tbl_index, map_info);
        }
        println!(
            "I/O APICs: {:?}, overrides: {:?}",
@@ -470,7 +406,7 @@ fn resolve(irq: u8) -> u32 {
 fn find_ioapic(gsi: u32) -> Option<&'static IoApic> {
    ioapics()
        .iter()
-        .find(|apic| gsi >= apic.gsi_start && gsi <= apic.gsi_start + u32::from(apic.count))
+        .find(|apic| gsi >= apic.gsi_start && gsi < apic.gsi_start + u32::from(apic.count))
 }

 pub unsafe fn mask(irq: u8) {
@@ -489,14 +425,3 @@ pub unsafe fn unmask(irq: u8) {
    };
    apic.set_mask(gsi, false);
 }
-
-/// Change the destination CPU for an IRQ by reprogramming the IOAPIC redirection entry.
-/// Resolves the legacy IRQ to its GSI, finds the owning IOAPIC, and updates the destination
-/// APIC ID in the redirection table while preserving all other fields.
-pub unsafe fn set_affinity(irq: u8, dest: ApicId) -> bool {
-    let gsi = resolve(irq);
-    match find_ioapic(gsi) {
-        Some(apic) => apic.set_irq_affinity(gsi, dest),
-        None => false,
-    }
-}
@@ -0,0 +1,312 @@
+use core::{
+    cell::SyncUnsafeCell,
+    ptr::{read_volatile, write_volatile},
+};
+use x86::msr::*;
+
+use crate::{
+    arch::{cpuid::cpuid, ipi::IpiKind},
+    memory::{map_device_memory, PhysicalAddress},
+    percpu::PercpuBlock,
+};
+
+#[derive(Clone, Copy, Debug)]
+pub struct ApicId(u32); // Newtype around a raw 32-bit local APIC ID.
+
+impl ApicId {
+    pub fn new(inner: u32) -> Self { // Wraps `inner` as-is; no range check is performed.
+        Self(inner)
+    }
+
+    pub fn get(&self) -> u32 { // Returns the wrapped raw ID.
+        self.0
+    }
+}
+
+static LOCAL_APIC: SyncUnsafeCell<LocalApic> = SyncUnsafeCell::new(LocalApic { // The single global LocalApic, reached through `the_local_apic()`.
+    address: 0, // placeholder; `LocalApic::init` stores the mapped register address here in xAPIC mode
+    x2: false, // placeholder; `LocalApic::init` overwrites this from CPUID
+});
+pub unsafe fn the_local_apic() -> &'static mut LocalApic { // Returns a mutable reference to the global LOCAL_APIC.
+    unsafe { &mut *LOCAL_APIC.get() } // no lock is taken here — NOTE(review): confirm how callers avoid aliasing this `&mut`
+}
+
+pub unsafe fn init() { // Full initialization of the global local APIC: mode detection plus per-CPU setup (see `LocalApic::init`).
+    unsafe {
+        the_local_apic().init();
+    }
+}
+
+pub unsafe fn init_ap() { // Per-CPU setup only (see `LocalApic::init_ap`); presumably run on each application processor after `init` — confirm against callers.
+    unsafe {
+        the_local_apic().init_ap();
+    }
+}
+
+/// Local APIC state: which access mode is in use and, for xAPIC, where the registers are mapped.
+pub struct LocalApic {
+    pub address: usize, // base address that `read`/`write` add register offsets to; only set by `init` when not in x2APIC mode
+    pub x2: bool, // true: registers are accessed through x2APIC MSRs; false: through memory-mapped registers
+}
+
+impl LocalApic {
+    unsafe fn init(&mut self) { // One-time setup: detect the APIC mode, map the xAPIC register page if needed, then run the per-CPU setup.
+        unsafe {
+            let physaddr = PhysicalAddress::new(rdmsr(IA32_APIC_BASE) as usize & !0xFFF); // the base field starts at bit 12: clear only the low 12 flag/reserved bits so a base that is not 64 KiB-aligned or lies above 4 GiB is kept intact
+
+            self.x2 = cpuid()
+                .get_feature_info()
+                .is_some_and(|feature_info| feature_info.has_x2apic()); // x2APIC mode is chosen whenever CPUID advertises it
+
+            if !self.x2 {
+                info!("Detected xAPIC at {:#x}", physaddr.data());
+                self.address = map_device_memory(physaddr, 4096).data(); // map one 4 KiB page of registers; `read`/`write` offset from this address
+            } else {
+                info!("Detected x2APIC"); // MSR access only, so nothing is mapped and `address` is left untouched
+            }
+
+            self.init_ap(); // the CPU running this needs the per-CPU setup as well
+        }
+    }
+
+    unsafe fn init_ap(&mut self) { // Per-CPU setup: program the spurious-interrupt register, set up the error interrupt, record this CPU's APIC ID.
+        unsafe {
+            if self.x2 {
+                wrmsr(IA32_APIC_BASE, rdmsr(IA32_APIC_BASE) | (1 << 10)); // set bit 10 of IA32_APIC_BASE (the x2APIC enable bit per the Intel SDM)
+                wrmsr(IA32_X2APIC_SIVR, 0x100); // 0x100 = bit 8 set (APIC software enable per the Intel SDM), vector bits zero
+            } else {
+                self.write(0xF0, 0x100); // same value written to MMIO offset 0xF0, the xAPIC counterpart of the SIVR write above
+            }
+            self.setup_error_int(); // defined outside this view
+            //self.setup_timer();
+
+            PercpuBlock::current()
+                .misc_arch_info
+                .apic_id_opt
+                .set(Some(self.id())); // cache the APIC ID of the CPU executing this in its per-CPU block
+        }
+    }
+
+    unsafe fn read(&self, reg: u32) -> u32 { // Volatile 32-bit read of the register at byte offset `reg` from `self.address`.
+        debug_assert!(!self.x2); // the memory-mapped path must not be used in x2APIC mode (checked in debug builds only)
+        unsafe { read_volatile((self.address + reg as usize) as *const u32) }
+    }
+
+    unsafe fn write(&mut self, reg: u32, value: u32) { // Volatile 32-bit write of `value` to the register at byte offset `reg` from `self.address`.
+        debug_assert!(!self.x2); // the memory-mapped path must not be used in x2APIC mode (checked in debug builds only)
+        unsafe {
+            write_volatile((self.address + reg as usize) as *mut u32, value);
+        }
+    }
+
+    pub fn id(&self) -> ApicId { // Reads the APIC ID of the CPU executing this call.
+        ApicId::new(if self.x2 {
+            unsafe { rdmsr(IA32_X2APIC_APICID) as u32 } // x2APIC: low 32 bits of the APICID MSR
+        } else {
+            unsafe { self.read(0x20) >> 24 } // xAPIC: the ID is taken from the top 8 bits (31:24) of register 0x20
+        })
+    }
+
+    pub fn version(&self) -> u32 { // Reads the raw version register value.
+        if self.x2 {
+            unsafe { rdmsr(IA32_X2APIC_VERSION) as u32 } // x2APIC: low 32 bits of the VERSION MSR
+        } else {
+            unsafe { self.read(0x30) } // xAPIC: memory-mapped register at offset 0x30
+        }
+    }
+
+    pub fn icr(&self) -> u64 { // Reads the interrupt command register as one 64-bit value.
+        if self.x2 {
+            unsafe { rdmsr(IA32_X2APIC_ICR) } // x2APIC: a single 64-bit MSR
+        } else {
+            unsafe { ((self.read(0x310) as u64) << 32) | self.read(0x300) as u64 } // xAPIC: register 0x310 is the high half, 0x300 the low half
+        }
+    }
+
+    pub fn set_icr(&mut self, value: u64) { // Writes the 64-bit interrupt command register, busy-waiting on bit 12 both before and after the write.
+        if self.x2 {
+            unsafe {
+                const PENDING: u32 = 1 << 12; // bit 12 of the low ICR half; the loops below spin while it is set
+                while (rdmsr(IA32_X2APIC_ICR) as u32) & PENDING == PENDING {
+                    core::hint::spin_loop();
+                }
+                wrmsr(IA32_X2APIC_ICR, value); // x2APIC: the whole ICR goes out in one 64-bit MSR write
+                while (rdmsr(IA32_X2APIC_ICR) as u32) & PENDING == PENDING {
+                    core::hint::spin_loop();
+                }
+            }
+        } else {
+            unsafe {
+                const PENDING: u32 = 1 << 12; // same bit, read from memory-mapped register 0x300
+                while self.read(0x300) & PENDING == PENDING {
+                    core::hint::spin_loop();
+                }
+                self.write(0x310, (value >> 32) as u32); // high half (bits 63:32 of `value`) is written first
+                self.write(0x300, value as u32); // low half last — NOTE(review): per the Intel SDM this write is what issues the IPI, so keep the order
+                while self.read(0x300) & PENDING == PENDING {
+                    core::hint::spin_loop();
+                }
+            }
+        }
+    }
+
+    pub fn ipi(&mut self, apic_id: ApicId, kind: IpiKind) { // Sends an inter-processor interrupt of the given kind to `apic_id` via `set_icr`.
+        let shift = if self.x2 { 32 } else { 56 }; // the destination ID is placed at bit 32 in x2APIC mode and at bit 56 in xAPIC mode
+        self.set_icr((u64::from(apic_id.get()) << shift) | 0x40 | kind as u64); // low bits: 0x40 OR'd with the IpiKind discriminant — presumably the vector; confirm against IpiKind
+    }
+    pub fn ipi_nmi(&mut self, apic_id: ApicId) { // Sends an NMI-type inter-processor interrupt to `apic_id` via `set_icr`.
+        let shift = if self.x2 { 32 } else { 56 }; // the destination ID is placed at bit 32 in x2APIC mode and at bit 56 in xAPIC mode
+        self.set_icr((u64::from(apic_id.get()) << shift) | (1 << 14) | (0b100 << 8)); // bit 14 set and 0b100 in bits 10:8 (level assert and NMI delivery mode per the Intel SDM)
+    }
+
+    pub unsafe fn eoi(&mut self) { // Writes 0 to the EOI register.
+        unsafe {
+            if self.x2 {
+                wrmsr(IA32_X2APIC_EOI, 0); // x2APIC: EOI MSR
+            } else {
+                self.write(0xB0, 0); // xAPIC: register 0xB0
+            }
+        }
+    }
+    /// Reads the Error Status Register, writing 0 to it first in both APIC modes.
+    pub unsafe fn esr(&mut self) -> u32 {
+        unsafe {
+            if self.x2 {
+                // Write 0 first so the ESR is updated to the current state of the local APIC.
+                wrmsr(IA32_X2APIC_ESR, 0);
+                // Read back the updated value (low 32 bits of the MSR).
+                rdmsr(IA32_X2APIC_ESR) as u32
+            } else {
+                self.write(0x280, 0); // xAPIC: same write-then-read sequence on register 0x280
+                self.read(0x280)
+            }
+        }
+    }
+    pub unsafe fn lvt_timer(&mut self) -> u32 { // Returns the raw LVT timer entry.
+        unsafe {
+            if self.x2 {
+                rdmsr(IA32_X2APIC_LVT_TIMER) as u32 // x2APIC: low 32 bits of the MSR
+            } else {
+                self.read(0x320) // xAPIC: register 0x320
+            }
+        }
+    }
+    pub unsafe fn set_lvt_timer(&mut self, value: u32) { // Writes `value` verbatim as the LVT timer entry.
+        unsafe {
+            if self.x2 {
+                wrmsr(IA32_X2APIC_LVT_TIMER, u64::from(value)); // zero-extended to the 64-bit MSR
+            } else {
+                self.write(0x320, value); // xAPIC: register 0x320
+            }
+        }
+    }
+    pub unsafe fn init_count(&mut self) -> u32 { // Returns the timer's initial-count register.
+        unsafe {
+            if self.x2 {
+                rdmsr(IA32_X2APIC_INIT_COUNT) as u32 // x2APIC: low 32 bits of the MSR
+            } else {
+                self.read(0x380) // xAPIC: register 0x380
+            }
+        }
+    }
+    pub unsafe fn set_init_count(&mut self, initial_count: u32) { // Writes the timer's initial-count register.
+        unsafe {
+            if self.x2 {
+                wrmsr(IA32_X2APIC_INIT_COUNT, u64::from(initial_count)); // zero-extended to the 64-bit MSR
+            } else {
+                self.write(0x380, initial_count); // xAPIC: register 0x380
+            }
+        }
+    }
+    pub unsafe fn cur_count(&mut self) -> u32 { // Returns the timer's current-count register; no setter is defined alongside it here.
+        unsafe {
+            if self.x2 {
+                rdmsr(IA32_X2APIC_CUR_COUNT) as u32 // x2APIC: low 32 bits of the MSR
+            } else {
+                self.read(0x390) // xAPIC: register 0x390
+            }
+        }
+    }
+    pub unsafe fn div_conf(&mut self) -> u32 { // Returns the raw divide-configuration register.
+        unsafe {
+            if self.x2 {
+                rdmsr(IA32_X2APIC_DIV_CONF) as u32 // x2APIC: low 32 bits of the MSR
+            } else {
+                self.read(0x3E0) // xAPIC: register 0x3E0
+            }
+        }
+    }
+    pub unsafe fn set_div_conf(&mut self, div_conf: u32) { // Writes `div_conf` verbatim to the divide-configuration register.
+        unsafe {
+            if self.x2 {
+                wrmsr(IA32_X2APIC_DIV_CONF, u64::from(div_conf)); // zero-extended to the 64-bit MSR
+            } else {
+                self.write(0x3E0, div_conf); // xAPIC: register 0x3E0
+            }
+        }
+    }
+    pub unsafe fn lvt_error(&mut self) -> u32 { // Returns the raw LVT error entry.
+        unsafe {
+            if self.x2 {
+                rdmsr(IA32_X2APIC_LVT_ERROR) as u32 // x2APIC: low 32 bits of the MSR
+            } else {
+                self.read(0x370) // xAPIC: register 0x370
+            }
+        }
+    }
+    pub unsafe fn set_lvt_error(&mut self, lvt_error: u32) { // Writes `lvt_error` verbatim as the LVT error entry.
+        unsafe {
+            if self.x2 {
+                wrmsr(IA32_X2APIC_LVT_ERROR, u64::from(lvt_error)); // zero-extended to the 64-bit MSR
+            } else {
+                self.write(0x370, lvt_error); // xAPIC: register 0x370
+            }
+        }
+    }
+
+    pub unsafe fn set_lvt_nmi(&mut self, pin: u8, flags: u16) { // Programs LINT0 (pin 0) or LINT1 (pin 1) with an LVT value derived from `flags`.
+        let polarity = match flags & 0b11 { // flags bits 1:0
+            0b11 => 1 << 13, // only the 0b11 encoding sets LVT bit 13
+            _ => 0,
+        };
+        let trigger_mode = match (flags >> 2) & 0b11 { // flags bits 3:2
+            0b11 => 1 << 15, // only the 0b11 encoding sets LVT bit 15
+            _ => 0,
+        };
+        let lvt_value = (0b100 << 8) | polarity | trigger_mode; // 0b100 in bits 10:8, the same code `ipi_nmi` puts in the ICR
+
+        unsafe {
+            match pin {
+                0 => {
+                    if self.x2 {
+                        wrmsr(IA32_X2APIC_LVT_LINT0, u64::from(lvt_value));
+                    } else {
+                        self.write(0x350, lvt_value); // xAPIC: LINT0 entry at register 0x350
+                    }
+                }
+                1 => {
+                    if self.x2 {
+                        wrmsr(IA32_X2APIC_LVT_LINT1, u64::from(lvt_value));
+                    } else {
+                        self.write(0x360, lvt_value); // xAPIC: LINT1 entry at register 0x360
+                    }
+                }
+                _ => {} // any other pin number is silently ignored
+            }
+        }
+    }
+
+    unsafe fn setup_error_int(&mut self) { // Sets the LVT error entry to vector 49 through `set_lvt_error`.
+        unsafe {
+            let vector = 49u32; // written verbatim as the LVT value, so every other LVT bit is 0
+            self.set_lvt_error(vector);
+        }
+    }
+}
+
+#[repr(u8)]
+pub enum LvtTimerMode { // Timer mode selector; the discriminants are the raw 2-bit mode codes.
+    OneShot = 0b00,
+    Periodic = 0b01,
+    TscDeadline = 0b10, // NOTE(review): callers presumably shift these into the LVT timer's mode field - confirm
+}
@@ -0,0 +1,14 @@
+--- src/arch/x86_shared/device/local_apic.rs
++++ src/arch/x86_shared/device/local_apic.rs
+@@ -61,9 +61,9 @@
+ 
+             if !self.x2 {
+-                info!("Detected xAPIC at {:#x}", physaddr.data());
++                debug!("Detected xAPIC at {:#x}", physaddr.data());
+                 self.address = map_device_memory(physaddr, 4096).data();
+             } else {
+-                info!("Detected x2APIC");
++                debug!("Detected x2APIC");
+             }
+ 
+ 
@@ -4,11 +4,9 @@ pub mod cpu;
 pub mod hpet;
 pub mod ioapic;
 pub mod local_apic;
-pub mod msi;
 pub mod pic;
 pub mod pit;
 pub mod serial;
-pub mod vector;
 #[cfg(feature = "system76_ec_debug")]
 pub mod system76_ec;

@@ -25,7 +23,8 @@ pub unsafe fn init() {
    }
 }
 pub unsafe fn init_after_acpi() {
-    unsafe { ioapic::init() };
+    // Currently a no-op: the call below (which would disable the IOAPIC if needed) is commented out.
+    //ioapic::init(mapper);
 }

 unsafe fn init_hpet() -> bool {
@@ -1,183 +0,0 @@
-// MSI/MSI-X support for x86 — kernel-level message composition and validation
-// Cross-referenced from Linux 7.0: arch/x86/kernel/apic/msi.c (391 lines)
-
-use crate::arch::device::local_apic::ApicId;
-
-pub const MSI_ADDRESS_BASE: u64 = 0xFEE0_0000;
-pub const MSI_ADDRESS_MASK: u64 = 0xFEEF_F000;
-const MSI_DEST_MODE_LOGICAL: u64 = 1 << 2;
-const MSI_REDIRECTION_HINT: u64 = 1 << 3;
-
-#[derive(Debug, Clone, Copy)]
-pub struct MsiAddress {
-    pub raw: u64,
-}
-
-#[derive(Debug, Clone, Copy)]
-pub struct MsiData {
-    pub raw: u32,
-}
-
-#[derive(Debug, Clone)]
-pub struct MsiMessage {
-    pub address: MsiAddress,
-    pub data: MsiData,
-}
-
-impl MsiAddress {
-    pub fn new(dest_apic_id: u8, redirection_hint: bool, dest_mode_logical: bool) -> Self {
-        let mut addr = MSI_ADDRESS_BASE;
-        addr |= u64::from(dest_apic_id) << 12;
-        if redirection_hint {
-            addr |= MSI_REDIRECTION_HINT;
-        }
-        if dest_mode_logical {
-            addr |= MSI_DEST_MODE_LOGICAL;
-        }
-        Self { raw: addr }
-    }
-
-    pub fn validate(addr: u64) -> bool {
-        (addr & MSI_ADDRESS_MASK) == MSI_ADDRESS_BASE
-    }
-
-    pub fn dest_apic_id(&self) -> u8 {
-        ((self.raw >> 12) & 0xFF) as u8
-    }
-}
-
-impl MsiData {
-    pub fn new(vector: u8, delivery_mode: u8, trigger_mode: u8) -> Self {
-        let mut data = u32::from(vector);
-        data |= u32::from(delivery_mode) << 8;
-        data |= u32::from(trigger_mode) << 15;
-        Self { raw: data }
-    }
-
-    pub fn vector(&self) -> u8 {
-        (self.raw & 0xFF) as u8
-    }
-
-    pub fn delivery_mode(&self) -> u8 {
-        ((self.raw >> 8) & 0x7) as u8
-    }
-
-    pub fn trigger_mode(&self) -> u8 {
-        ((self.raw >> 15) & 0x1) as u8
-    }
-}
-
-impl MsiMessage {
-    pub fn compose(dest: ApicId, vector: u8, delivery_mode: u8, trigger_mode: u8) -> Self {
-        let address = MsiAddress::new(dest.get() as u8, false, false);
-        let data = MsiData::new(vector, delivery_mode, trigger_mode);
-        Self { address, data }
-    }
-
-    pub fn validate(&self) -> bool {
-        MsiAddress::validate(self.address.raw)
-            && self.data.vector() >= 32
-            && self.data.vector() < 255
-    }
-}
-
-pub fn is_valid_msi_address(addr: u64) -> bool {
-    MsiAddress::validate(addr)
-}
-
-pub fn is_valid_msi_vector(vector: u8) -> bool {
-    vector >= 32 && vector < 255
-}
-
-#[derive(Debug)]
-pub struct MsiCapability {
-    pub msg_ctl: u16,
-    pub msg_addr_lo: u32,
-    pub msg_addr_hi: u32,
-    pub msg_data: u16,
-    pub mask_bits: u32,
-    pub pending_bits: u32,
-    pub is_64bit: bool,
-    pub is_maskable: bool,
-    pub multiple_message_capable: u8,
-}
-
-impl MsiCapability {
-    pub fn parse(raw: &[u32; 6], msg_ctl: u16) -> Self {
-        Self {
-            msg_ctl,
-            msg_addr_lo: raw[1],
-            msg_addr_hi: if msg_ctl & (1 << 7) != 0 { raw[2] } else { 0 },
-            msg_data: if msg_ctl & (1 << 7) != 0 {
-                (raw[3] & 0xFFFF) as u16
-            } else {
-                (raw[2] & 0xFFFF) as u16
-            },
-            mask_bits: if msg_ctl & (1 << 8) != 0 {
-                if msg_ctl & (1 << 7) != 0 {
-                    raw[3] >> 16
-                } else {
-                    raw[3]
-                }
-            } else {
-                0
-            },
-            pending_bits: if msg_ctl & (1 << 8) != 0 { raw[4] } else { 0 },
-            is_64bit: msg_ctl & (1 << 7) != 0,
-            is_maskable: msg_ctl & (1 << 8) != 0,
-            multiple_message_capable: ((msg_ctl >> 1) & 0x7) as u8,
-        }
-    }
-}
-
-#[derive(Debug)]
-pub struct MsixCapability {
-    pub msg_ctl: u16,
-    pub table_offset: u32,
-    pub table_bar: u8,
-    pub pba_offset: u32,
-    pub pba_bar: u8,
-    pub table_size: u16,
-}
-
-impl MsixCapability {
-    pub fn parse(raw: &[u32; 3], msg_ctl: u16) -> Self {
-        Self {
-            msg_ctl,
-            table_offset: raw[1] & !0x7,
-            table_bar: (raw[1] & 0x7) as u8,
-            pba_offset: raw[2] & !0x7,
-            pba_bar: (raw[2] & 0x7) as u8,
-            table_size: ((msg_ctl >> 1) & 0x7FF) as u16 + 1,
-        }
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_compose_message() {
-        let msg = MsiMessage::compose(ApicId::new(3), 48, 0b101, 1);
-        assert!(msg.validate());
-        assert_eq!(msg.address.dest_apic_id(), 3);
-        assert_eq!(msg.data.vector(), 48);
-        assert_eq!(msg.data.delivery_mode(), 0b101);
-        assert_eq!(msg.data.trigger_mode(), 1);
-    }
-
-    #[test]
-    fn test_invalid_address() {
-        assert!(!is_valid_msi_address(0xDEAD_BEEF));
-        assert!(is_valid_msi_address(0xFEE0_0000));
-    }
-
-    #[test]
-    fn test_msi_parse() {
-        let raw = [0u32; 6];
-        let cap = MsiCapability::parse(&raw, 0);
-        assert!(!cap.is_64bit);
-        assert!(!cap.is_maskable);
-    }
-}
@@ -1,53 +0,0 @@
-use crate::cpu_set::LogicalCpuId;
-
-const VECTOR_COUNT: usize = 224;
-
-static VECTORS: [core::sync::atomic::AtomicU32; 7] = [
-    core::sync::atomic::AtomicU32::new(0),
-    core::sync::atomic::AtomicU32::new(0),
-    core::sync::atomic::AtomicU32::new(0),
-    core::sync::atomic::AtomicU32::new(0),
-    core::sync::atomic::AtomicU32::new(0),
-    core::sync::atomic::AtomicU32::new(0),
-    core::sync::atomic::AtomicU32::new(0),
-];
-
-pub fn allocate_vector(_cpu: LogicalCpuId) -> Option<u8> {
-    for (bank, slot) in VECTORS.iter().enumerate() {
-        let mut bits = slot.load(core::sync::atomic::Ordering::Acquire);
-        loop {
-            let free = bits.trailing_ones() as usize;
-            if free >= 32 {
-                break;
-            }
-            let bit = 1u32 << free;
-            match slot.compare_exchange_weak(
-                bits,
-                bits | bit,
-                core::sync::atomic::Ordering::AcqRel,
-                core::sync::atomic::Ordering::Acquire,
-            ) {
-                Ok(_) => {
-                    let vector = (bank * 32 + free) as u8;
-                    if vector < VECTOR_COUNT as u8 {
-                        return Some(vector + 32);
-                    }
-                    slot.fetch_and(!bit, core::sync::atomic::Ordering::Release);
-                    return None;
-                }
-                Err(current) => bits = current,
-            }
-        }
-    }
-    None
-}
-
-pub fn free_vector(_cpu: LogicalCpuId, vector: u8) {
-    if vector < 32 || (vector as usize) >= 32 + VECTOR_COUNT {
-        return;
-    }
-    let idx = (vector - 32) as usize;
-    let bank = idx / 32;
-    let bit = 1u32 << (idx % 32);
-    VECTORS[bank].fetch_and(!bit, core::sync::atomic::Ordering::Release);
-}
@@ -192,15 +192,6 @@ impl ProcessorControlRegion {
    }
 }

-#[cold]
-fn halt_pcr_init() -> ! {
-    println!("FATAL: failed to allocate physical memory for Processor Control Region");
-    println!("Processor startup cannot continue. Halting.");
-    loop {
-        core::hint::spin_loop();
-    }
-}
-
 pub unsafe fn pcr() -> *mut ProcessorControlRegion {
    unsafe {
        // Primitive benchmarking of RDFSBASE and RDGSBASE in userspace, appears to indicate that
@@ -384,10 +375,7 @@ pub fn allocate_and_init_pcr(
        .next_power_of_two()
        .trailing_zeros();

-    let pcr_frame = match crate::memory::allocate_p2frame(alloc_order) {
-        Some(frame) => frame,
-        None => halt_pcr_init(),
-    };
+    let pcr_frame = crate::memory::allocate_p2frame(alloc_order).expect("failed to allocate PCR");
    let pcr_ptr = RmmA::phys_to_virt(pcr_frame.base()).data() as *mut ProcessorControlRegion;
    unsafe { core::ptr::write(pcr_ptr, ProcessorControlRegion::new_partial_init(cpu_id)) };

@@ -78,15 +78,6 @@ static INIT_BSP_IDT: SyncUnsafeCell<Idt> = SyncUnsafeCell::new(Idt::new());
 pub(crate) static IDTS: RwLock<HashMap<LogicalCpuId, &'static mut Idt>> =
    RwLock::new(HashMap::with_hasher(DefaultHashBuilder::new()));

-#[cold]
-fn halt_idt_init() -> ! {
-    println!("FATAL: failed to allocate physical pages for backup interrupt stack");
-    println!("Interrupt setup cannot continue. Halting.");
-    loop {
-        core::hint::spin_loop();
-    }
-}
-
 #[inline]
 pub fn is_reserved(cpu_id: LogicalCpuId, index: u8) -> bool {
    if cpu_id == LogicalCpuId::BSP {
@@ -110,8 +101,6 @@ pub fn set_reserved(cpu_id: LogicalCpuId, index: u8, reserved: bool) {
 }

 pub fn available_irqs_iter(cpu_id: LogicalCpuId) -> impl Iterator<Item = u8> + 'static {
-    let count = (32..=254).filter(|&index| !is_reserved(cpu_id, index)).count();
-    info!("available_irqs_iter: cpu_id={} count={}", cpu_id.get(), count);
    (32..=254).filter(move |&index| !is_reserved(cpu_id, index))
 }

@@ -172,10 +161,8 @@ pub fn allocate_and_init_idt(cpu_id: LogicalCpuId) -> *mut Idt {
        .or_insert_with(|| Box::leak(Box::new(Idt::new())));

    use crate::memory::{RmmA, RmmArch};
-    let frames = match crate::memory::allocate_p2frame(4) {
-        Some(frames) => frames,
-        None => halt_idt_init(),
-    };
+    let frames = crate::memory::allocate_p2frame(4)
+        .expect("failed to allocate pages for backup interrupt stack");

    // Physical pages are mapped linearly. So is the linearly mapped virtual memory.
    let base_address = RmmA::phys_to_virt(frames.base());
@@ -1,5 +1,3 @@
-use core::sync::atomic::{AtomicBool, Ordering};
-
 use syscall::Exception;
 use x86::irq::PageFaultError;

@@ -12,22 +10,6 @@ use crate::{
    syscall::flag::*,
 };

-static NMI_IN_PROGRESS: AtomicBool = AtomicBool::new(false);
-
-unsafe fn nmi_raw_serial_write(bytes: &[u8]) {
-    use crate::syscall::io::{Io, Pio};
-
-    let mut com1 = Pio::<u8>::new(0x3F8);
-    let lsr = Pio::<u8>::new(0x3F8 + 5);
-
-    for &byte in bytes {
-        while lsr.read() & (1 << 5) == 0 {
-            core::hint::spin_loop();
-        }
-        com1.write(byte);
-    }
-}
-
 interrupt_stack!(divide_by_zero, |stack| {
    println!("Divide by zero");
    stack.trace();
@@ -73,35 +55,9 @@ interrupt_stack!(non_maskable, @paranoid, |stack| {

    #[cfg(not(all(target_arch = "x86_64", feature = "profiling")))]
    {
-        if NMI_IN_PROGRESS.swap(true, Ordering::SeqCst) {
-            return;
-        }
-
-        unsafe {
-            nmi_raw_serial_write(b"Non-maskable interrupt\n");
-            nmi_raw_serial_write(b"  RIP: ");
-
-            #[cfg(target_arch = "x86")]
-            let instruction_pointer = u64::from(stack.iret.eip);
-            #[cfg(target_arch = "x86_64")]
-            let instruction_pointer = stack.iret.rip;
-
-            let mut buf = [0u8; 19];
-            buf[0] = b'0';
-            buf[1] = b'x';
-            for i in 0..16 {
-                let nibble = ((instruction_pointer >> (60 - i * 4)) & 0xF) as u8;
-                buf[2 + i] = if nibble < 10 {
-                    b'0' + nibble
-                } else {
-                    b'a' + nibble - 10
-                };
-            }
-            buf[18] = b'\n';
-            nmi_raw_serial_write(&buf);
-        }
-
-        NMI_IN_PROGRESS.store(false, Ordering::SeqCst);
+        // TODO: This will likely deadlock
+        println!("Non-maskable interrupt");
+        stack.dump();
    }
 });

@@ -28,8 +28,6 @@ pub mod pti;
 /// Initialization and start function
 pub mod start;

-pub mod sleep;
-
 /// Stop function
 pub mod stop;

@@ -1,712 +0,0 @@
-use alloc::{sync::Arc, vec::Vec};
-use core::{
-    ptr::NonNull,
-    str::FromStr,
-    sync::atomic::{AtomicU32, Ordering},
-};
-
-use acpi_ext::{
-    aml::{namespace::AmlName, object::Object, Interpreter},
-    registers::FixedRegisters,
-    sdt::{facs::Facs, fadt::Fadt, SdtHeader},
-    AcpiTables, Handle, Handler, PhysicalMapping,
-};
-use spin::Mutex;
-use syscall::error::{Error, EINVAL, EIO};
-use x86::{segmentation::SegmentSelector, task, Ring};
-
-use crate::{
-    acpi::ACPI_ROOT_INFO,
-    arch::interrupt,
-    memory::{
-        round_down_pages, round_up_pages, KernelMapper, Page, PageFlags, PhysicalAddress, RmmA,
-        RmmArch, VirtualAddress, PAGE_SIZE,
-    },
-    syscall::io::{Io, Pio},
-};
-
-const ACPI_SLP_TYP_SHIFT: u16 = 10;
-const ACPI_SLP_TYP_MASK: u16 = 0x1C00;
-const ACPI_SLP_EN: u16 = 1 << 13;
-const WAKE_TRAMPOLINE_PHYS: usize = 0x8000;
-const SLEEP_RETURN_OK: usize = 0;
-
-#[cfg(target_arch = "x86_64")]
-static WAKE_TRAMPOLINE_DATA: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/s3_wakeup"));
-
-#[repr(C, packed)]
-#[derive(Clone, Copy, Debug, Default)]
-struct DescriptorTableRegister {
-    limit: u16,
-    base: u64,
-}
-
-#[repr(C, align(64))]
-#[derive(Clone, Copy, Debug)]
-struct FpuState {
-    bytes: [u8; 4096],
-}
-
-impl Default for FpuState {
-    fn default() -> Self {
-        Self { bytes: [0; 4096] }
-    }
-}
-
-#[derive(Clone, Copy, Debug, Eq, PartialEq)]
-pub enum SleepState {
-    S3,
-    S5,
-}
-
-#[derive(Clone, Copy, Debug, Eq, PartialEq)]
-pub enum SleepError {
-    UnsupportedArch,
-    MissingAcpi,
-    MissingFadt,
-    MissingFacs,
-    MissingSleepObject,
-    InvalidSleepObject,
-    UnsupportedPmControl,
-    UnsupportedAmlOperation,
-    SleepDidNotEnter,
-}
-
-impl SleepError {
-    fn code(self) -> usize {
-        match self {
-            Self::UnsupportedArch => EINVAL as usize,
-            Self::MissingAcpi
-            | Self::MissingFadt
-            | Self::MissingFacs
-            | Self::MissingSleepObject
-            | Self::UnsupportedAmlOperation => EIO as usize,
-            Self::InvalidSleepObject | Self::UnsupportedPmControl | Self::SleepDidNotEnter => {
-                EINVAL as usize
-            }
-        }
-    }
-
-    fn from_code(code: usize) -> Self {
-        match code as i32 {
-            x if x == EINVAL => Self::InvalidSleepObject,
-            _ => Self::MissingAcpi,
-        }
-    }
-}
-
-#[derive(Clone, Copy, Debug, Default)]
-struct SavedCpuContext {
-    entry_rsp: usize,
-    runtime_rsp: usize,
-    facs_address: usize,
-    cr0: usize,
-    cr2: usize,
-    cr3: usize,
-    cr4: usize,
-    rflags: usize,
-    gdtr: DescriptorTableRegister,
-    idtr: DescriptorTableRegister,
-    efer: u64,
-    fs_base: u64,
-    gs_base: u64,
-    kernel_gs_base: u64,
-    fpu: FpuState,
-}
-
-static SAVED_CONTEXT: Mutex<Option<SavedCpuContext>> = Mutex::new(None);
-static AML_MUTEX_IDS: AtomicU32 = AtomicU32::new(1);
-
-#[derive(Clone, Copy, Debug)]
-struct SleepTypeData {
-    a: u16,
-    b: u16,
-}
-
-#[derive(Clone, Copy)]
-struct KernelAcpiHandler;
-
-impl KernelAcpiHandler {
-    fn map_range(physical_address: usize, size: usize) -> (*mut u8, usize) {
-        let map_base = round_down_pages(physical_address);
-        let map_offset = physical_address - map_base;
-        let mapped_length = round_up_pages(size + map_offset);
-
-        // SAFETY: The ACPI interpreter only requests firmware-described physical regions.
-        unsafe {
-            let mut mapper = KernelMapper::lock_rw();
-            for page_index in 0..mapped_length / PAGE_SIZE {
-                let (_, flush) = mapper
-                    .map_linearly(
-                        PhysicalAddress::new(map_base + page_index * PAGE_SIZE),
-                        PageFlags::new(),
-                    )
-                    .expect("failed to linearly map ACPI physical region");
-                flush.flush();
-            }
-        }
-
-        let virtual_base = RmmA::phys_to_virt(PhysicalAddress::new(map_base)).data();
-        ((virtual_base + map_offset) as *mut u8, mapped_length)
-    }
-}
-
-impl Handler for KernelAcpiHandler {
-    unsafe fn map_physical_region<T>(&self, physical_address: usize, size: usize) -> PhysicalMapping<Self, T> {
-        let (virtual_start, mapped_length) = Self::map_range(physical_address, size);
-        PhysicalMapping {
-            physical_start: physical_address,
-            virtual_start: NonNull::new(virtual_start.cast::<T>())
-                .expect("expected mapped ACPI virtual address to be non-null"),
-            region_length: size,
-            mapped_length,
-            handler: *self,
-        }
-    }
-
-    fn unmap_physical_region<T>(_region: &PhysicalMapping<Self, T>) {}
-
-    fn read_u8(&self, address: usize) -> u8 {
-        // SAFETY: AML system-memory accesses are byte-addressable firmware regions.
-        unsafe { core::ptr::read_volatile(RmmA::phys_to_virt(PhysicalAddress::new(address)).data() as *const u8) }
-    }
-
-    fn read_u16(&self, address: usize) -> u16 {
-        // SAFETY: AML system-memory accesses are word-addressable firmware regions.
-        unsafe {
-            core::ptr::read_volatile(RmmA::phys_to_virt(PhysicalAddress::new(address)).data() as *const u16)
-        }
-    }
-
-    fn read_u32(&self, address: usize) -> u32 {
-        // SAFETY: AML system-memory accesses are dword-addressable firmware regions.
-        unsafe {
-            core::ptr::read_volatile(RmmA::phys_to_virt(PhysicalAddress::new(address)).data() as *const u32)
-        }
-    }
-
-    fn read_u64(&self, address: usize) -> u64 {
-        // SAFETY: AML system-memory accesses are qword-addressable firmware regions.
-        unsafe {
-            core::ptr::read_volatile(RmmA::phys_to_virt(PhysicalAddress::new(address)).data() as *const u64)
-        }
-    }
-
-    fn write_u8(&self, address: usize, value: u8) {
-        // SAFETY: AML system-memory accesses are byte-addressable firmware regions.
-        unsafe {
-            core::ptr::write_volatile(RmmA::phys_to_virt(PhysicalAddress::new(address)).data() as *mut u8, value)
-        }
-    }
-
-    fn write_u16(&self, address: usize, value: u16) {
-        // SAFETY: AML system-memory accesses are word-addressable firmware regions.
-        unsafe {
-            core::ptr::write_volatile(
-                RmmA::phys_to_virt(PhysicalAddress::new(address)).data() as *mut u16,
-                value,
-            )
-        }
-    }
-
-    fn write_u32(&self, address: usize, value: u32) {
-        // SAFETY: AML system-memory accesses are dword-addressable firmware regions.
-        unsafe {
-            core::ptr::write_volatile(
-                RmmA::phys_to_virt(PhysicalAddress::new(address)).data() as *mut u32,
-                value,
-            )
-        }
-    }
-
-    fn write_u64(&self, address: usize, value: u64) {
-        // SAFETY: AML system-memory accesses are qword-addressable firmware regions.
-        unsafe {
-            core::ptr::write_volatile(
-                RmmA::phys_to_virt(PhysicalAddress::new(address)).data() as *mut u64,
-                value,
-            )
-        }
-    }
-
-    fn read_io_u8(&self, port: u16) -> u8 {
-        Pio::<u8>::new(port).read()
-    }
-
-    fn read_io_u16(&self, port: u16) -> u16 {
-        Pio::<u16>::new(port).read()
-    }
-
-    fn read_io_u32(&self, port: u16) -> u32 {
-        Pio::<u32>::new(port).read()
-    }
-
-    fn write_io_u8(&self, port: u16, value: u8) {
-        Pio::<u8>::new(port).write(value)
-    }
-
-    fn write_io_u16(&self, port: u16, value: u16) {
-        Pio::<u16>::new(port).write(value)
-    }
-
-    fn write_io_u32(&self, port: u16, value: u32) {
-        Pio::<u32>::new(port).write(value)
-    }
-
-    fn read_pci_u8(&self, _address: acpi_ext::PciAddress, _offset: u16) -> u8 {
-        0
-    }
-
-    fn read_pci_u16(&self, _address: acpi_ext::PciAddress, _offset: u16) -> u16 {
-        0
-    }
-
-    fn read_pci_u32(&self, _address: acpi_ext::PciAddress, _offset: u16) -> u32 {
-        0
-    }
-
-    fn write_pci_u8(&self, _address: acpi_ext::PciAddress, _offset: u16, _value: u8) {}
-
-    fn write_pci_u16(&self, _address: acpi_ext::PciAddress, _offset: u16, _value: u16) {}
-
-    fn write_pci_u32(&self, _address: acpi_ext::PciAddress, _offset: u16, _value: u32) {}
-
-    fn nanos_since_boot(&self) -> u64 {
-        0
-    }
-
-    fn stall(&self, microseconds: u64) {
-        for _ in 0..(microseconds.saturating_mul(64)) {
-            core::hint::spin_loop();
-        }
-    }
-
-    fn sleep(&self, milliseconds: u64) {
-        for _ in 0..(milliseconds.saturating_mul(64_000)) {
-            core::hint::spin_loop();
-        }
-    }
-
-    fn create_mutex(&self) -> Handle {
-        Handle(AML_MUTEX_IDS.fetch_add(1, Ordering::Relaxed))
-    }
-
-    fn acquire(&self, _mutex: Handle, _timeout: u16) -> Result<(), acpi_ext::aml::AmlError> {
-        Ok(())
-    }
-
-    fn release(&self, _mutex: Handle) {}
-}
-
-fn sleep_state_name(state: SleepState) -> &'static str {
-    match state {
-        SleepState::S3 => "\\_S3",
-        SleepState::S5 => "\\_S5",
-    }
-}
-
-fn encode_sleep_type(value: u16) -> u16 {
-    if value <= 0x7 {
-        value << ACPI_SLP_TYP_SHIFT
-    } else {
-        value & ACPI_SLP_TYP_MASK
-    }
-}
-
-fn load_interpreter() -> Result<(
-    Arc<FixedRegisters<KernelAcpiHandler>>,
-    PhysicalMapping<KernelAcpiHandler, Facs>,
-    Interpreter<KernelAcpiHandler>,
-), SleepError> {
-    let root = *ACPI_ROOT_INFO.get().ok_or(SleepError::MissingAcpi)?;
-    let handler = KernelAcpiHandler;
-
-    // SAFETY: ACPI root info is captured from the firmware-provided, already validated root table.
-    let tables = unsafe {
-        AcpiTables::from_rsdt(handler, root.revision, root.root_sdt_address.data())
-            .map_err(|_| SleepError::MissingAcpi)?
-    };
-    let fadt = tables.find_table::<Fadt>().ok_or(SleepError::MissingFadt)?;
-    let registers = Arc::new(
-        FixedRegisters::new(&fadt, handler).map_err(|_| SleepError::UnsupportedPmControl)?,
-    );
-    let facs_address = fadt.facs_address().map_err(|_| SleepError::MissingFacs)?;
-
-    // SAFETY: The FADT-supplied FACS address is used exactly as described by the ACPI spec.
-    let facs = unsafe { handler.map_physical_region::<Facs>(facs_address, core::mem::size_of::<Facs>()) };
-    // SAFETY: The AML interpreter only needs an owned mapping of the same firmware FACS table.
-    let interpreter_facs = unsafe {
-        handler.map_physical_region::<Facs>(facs_address, core::mem::size_of::<Facs>())
-    };
-    let dsdt = tables.dsdt().map_err(|_| SleepError::MissingFadt)?;
-    let interpreter = Interpreter::new(handler, dsdt.revision, Arc::clone(&registers), Some(interpreter_facs));
-
-    // SAFETY: Each AML table mapping is owned by the interpreter during table loading.
-    unsafe {
-        let mapping = handler.map_physical_region::<SdtHeader>(dsdt.phys_address, dsdt.length as usize);
-        let stream = core::slice::from_raw_parts(
-            mapping.virtual_start.as_ptr().byte_add(core::mem::size_of::<SdtHeader>()) as *const u8,
-            dsdt.length as usize - core::mem::size_of::<SdtHeader>(),
-        );
-        interpreter
-            .load_table(stream)
-            .map_err(|_| SleepError::UnsupportedAmlOperation)?;
-
-        for ssdt in tables.ssdts() {
-            let mapping = handler.map_physical_region::<SdtHeader>(ssdt.phys_address, ssdt.length as usize);
-            let stream = core::slice::from_raw_parts(
-                mapping.virtual_start.as_ptr().byte_add(core::mem::size_of::<SdtHeader>()) as *const u8,
-                ssdt.length as usize - core::mem::size_of::<SdtHeader>(),
-            );
-            interpreter
-                .load_table(stream)
-                .map_err(|_| SleepError::UnsupportedAmlOperation)?;
-        }
-    }
-
-    Ok((registers, facs, interpreter))
-}
-
-fn sleep_type_data_from_interpreter(
-    interpreter: &Interpreter<KernelAcpiHandler>,
-    state: SleepState,
-) -> Result<SleepTypeData, SleepError> {
-    let name = AmlName::from_str(sleep_state_name(state)).map_err(|_| SleepError::MissingSleepObject)?;
-    let object = interpreter
-        .evaluate(name, Vec::new())
-        .map_err(|_| SleepError::MissingSleepObject)?;
-
-    let Object::Package(package) = &*object else {
-        return Err(SleepError::InvalidSleepObject);
-    };
-
-    let Some(typa_object) = package.first() else {
-        return Err(SleepError::InvalidSleepObject);
-    };
-    let Some(typb_object) = package.get(1) else {
-        return Err(SleepError::InvalidSleepObject);
-    };
-
-    let Object::Integer(typa) = &**typa_object else {
-        return Err(SleepError::InvalidSleepObject);
-    };
-    let Object::Integer(typb) = &**typb_object else {
-        return Err(SleepError::InvalidSleepObject);
-    };
-
-    Ok(SleepTypeData {
-        a: encode_sleep_type(*typa as u16),
-        b: encode_sleep_type(*typb as u16),
-    })
-}
-
-fn sleep_type_data(state: SleepState) -> Result<SleepTypeData, SleepError> {
-    let (_registers, _facs, interpreter) = load_interpreter()?;
-    sleep_type_data_from_interpreter(&interpreter, state)
-}
-
-fn install_wake_trampoline(stack_rsp: usize, cr3: usize) {
-    let trampoline_page = Page::containing_address(VirtualAddress::new(WAKE_TRAMPOLINE_PHYS));
-    let trampoline_frame = PhysicalAddress::new(WAKE_TRAMPOLINE_PHYS);
-
-    // SAFETY: The 0x8000 low-memory trampoline page is reserved by the kernel for bootstrap stubs.
-    let (result, _) = unsafe {
-        let mut mapper = KernelMapper::lock_rw();
-        let result = mapper
-            .map_phys(
-                trampoline_page.start_address(),
-                trampoline_frame,
-                PageFlags::new().execute(true).write(true),
-            )
-            .expect("failed to map S3 wake trampoline page");
-        (result, mapper.table().phys().data())
-    };
-    result.flush();
-
-    for (index, value) in WAKE_TRAMPOLINE_DATA.iter().enumerate() {
-        // SAFETY: The trampoline page is mapped writable at the same virtual address as the physical page.
-        unsafe {
-            core::ptr::write_volatile((WAKE_TRAMPOLINE_PHYS as *mut u8).add(index), *value);
-        }
-    }
-
-    // SAFETY: The wake trampoline layout reserves three qword fields immediately after the jump.
-    unsafe {
-        let stack_slot = (WAKE_TRAMPOLINE_PHYS + 8) as *mut u64;
-        let page_table_slot = stack_slot.add(1);
-        let code_slot = stack_slot.add(2);
-        stack_slot.write(stack_rsp as u64);
-        page_table_slot.write(cr3 as u64);
-        #[expect(clippy::fn_to_numeric_cast)]
-        code_slot.write(resume_from_s3_trampoline as usize as u64);
-    }
-
-    // SAFETY: The trampoline mapping is no longer needed once the physical page has been populated.
-    let (_frame, _, flush) = unsafe {
-        KernelMapper::lock_rw()
-            .unmap_phys(trampoline_page.start_address())
-            .expect("failed to unmap S3 wake trampoline page")
-    };
-    flush.flush();
-}
-
-fn save_descriptor_tables(context: &mut SavedCpuContext) {
-    // SAFETY: SGDT/SIDT only read the current CPU descriptor-table registers into the provided storage.
-    unsafe {
-        core::arch::asm!("sgdt [{}]", in(reg) &mut context.gdtr, options(nostack, preserves_flags));
-        core::arch::asm!("sidt [{}]", in(reg) &mut context.idtr, options(nostack, preserves_flags));
-    }
-}
-
-fn save_fpu_state(context: &mut SavedCpuContext) {
-    // SAFETY: The kernel owns the current CPU at suspend entry and the FXSAVE buffer is 64-byte aligned.
-    unsafe {
-        core::arch::asm!(
-            "fxsave64 [{}]",
-            in(reg) context.fpu.bytes.as_mut_ptr(),
-        );
-    }
-}
-
-fn restore_fpu_state(context: &SavedCpuContext) {
-    // SAFETY: The saved FXSAVE image belongs to the same CPU context and matches the restore instruction.
-    unsafe {
-        core::arch::asm!(
-            "fxrstor64 [{}]",
-            in(reg) context.fpu.bytes.as_ptr(),
-        );
-    }
-}
-
-fn save_cpu_context(entry_rsp: usize) -> SavedCpuContext {
-    let mut context = SavedCpuContext {
-        entry_rsp,
-        ..SavedCpuContext::default()
-    };
-
-    // SAFETY: Reading control registers and MSRs is required to reconstruct the CPU execution state on wake.
-    unsafe {
-        core::arch::asm!(
-            "mov {}, cr0",
-            out(reg) context.cr0,
-            options(nostack, preserves_flags)
-        );
-        core::arch::asm!(
-            "mov {}, cr2",
-            out(reg) context.cr2,
-            options(nostack, preserves_flags)
-        );
-        core::arch::asm!(
-            "mov {}, cr3",
-            out(reg) context.cr3,
-            options(nostack, preserves_flags)
-        );
-        core::arch::asm!(
-            "mov {}, cr4",
-            out(reg) context.cr4,
-            options(nostack, preserves_flags)
-        );
-        core::arch::asm!(
-            "pushfq",
-            "pop {}",
-            out(reg) context.rflags,
-            options(preserves_flags)
-        );
-        core::arch::asm!("mov {}, rsp", out(reg) context.runtime_rsp, options(nostack, preserves_flags));
-
-        context.efer = x86::msr::rdmsr(x86::msr::IA32_EFER);
-        context.fs_base = x86::msr::rdmsr(x86::msr::IA32_FS_BASE);
-        context.gs_base = x86::msr::rdmsr(x86::msr::IA32_GS_BASE);
-        context.kernel_gs_base = x86::msr::rdmsr(x86::msr::IA32_KERNEL_GSBASE);
-    }
-
-    save_descriptor_tables(&mut context);
-    save_fpu_state(&mut context);
-    context
-}
-
-fn set_firmware_waking_vector(facs: &mut PhysicalMapping<KernelAcpiHandler, Facs>, vector: usize) {
-    facs.firmware_waking_vector = vector as u32;
-    facs.x_firmware_waking_vector = vector as u64;
-}
-
-fn write_pm1_control_block(
-    registers: &FixedRegisters<KernelAcpiHandler>,
-    sleep_type: SleepTypeData,
-) -> Result<(), SleepError> {
-    let current_a = registers
-        .pm1_control_registers
-        .pm1a
-        .read()
-        .map_err(|_| SleepError::UnsupportedPmControl)? as u16;
-    let armed_a = (current_a & !(ACPI_SLP_TYP_MASK | ACPI_SLP_EN)) | sleep_type.a;
-
-    registers
-        .pm1_control_registers
-        .pm1a
-        .write(u64::from(armed_a))
-        .map_err(|_| SleepError::UnsupportedPmControl)?;
-
-    if let Some(pm1b) = &registers.pm1_control_registers.pm1b {
-        let current_b = pm1b.read().map_err(|_| SleepError::UnsupportedPmControl)? as u16;
-        let armed_b = (current_b & !(ACPI_SLP_TYP_MASK | ACPI_SLP_EN)) | sleep_type.b;
-        pm1b.write(u64::from(armed_b))
-            .map_err(|_| SleepError::UnsupportedPmControl)?;
-        pm1b.write(u64::from(armed_b | ACPI_SLP_EN))
-            .map_err(|_| SleepError::UnsupportedPmControl)?;
-    }
-
-    // SAFETY: WBINVD is required here to flush dirty cache lines before firmware powers down the CPU package.
-    unsafe {
-        core::arch::asm!("wbinvd", options(nostack, preserves_flags));
-    }
-
-    registers
-        .pm1_control_registers
-        .pm1a
-        .write(u64::from(armed_a | ACPI_SLP_EN))
-        .map_err(|_| SleepError::UnsupportedPmControl)?;
-
-    Ok(())
-}
-
-#[unsafe(naked)]
-unsafe extern "sysv64" fn enter_sleep_raw(state: usize) -> usize {
-    core::arch::naked_asm!(
-        "mov rsi, rsp",
-        "jmp {inner}",
-        inner = sym enter_sleep_raw_inner,
-    );
-}
-
-extern "C" fn enter_sleep_raw_inner(state: usize, entry_rsp: usize) -> usize {
-    let state = match state {
-        3 => SleepState::S3,
-        5 => SleepState::S5,
-        _ => return SleepError::InvalidSleepObject.code(),
-    };
-
-    let (registers, mut facs, interpreter) = match load_interpreter() {
-        Ok(tuple) => tuple,
-        Err(error) => return error.code(),
-    };
-    let sleep_type = match sleep_type_data_from_interpreter(&interpreter, state) {
-        Ok(data) => data,
-        Err(error) => return error.code(),
-    };
-
-    let mut context = save_cpu_context(entry_rsp);
-    context.facs_address = facs.physical_start;
-    install_wake_trampoline(context.runtime_rsp, context.cr3);
-    set_firmware_waking_vector(&mut facs, WAKE_TRAMPOLINE_PHYS);
-
-    {
-        let mut saved = SAVED_CONTEXT.lock();
-        *saved = Some(context);
-    }
-
-    // SAFETY: Suspend entry must not be interrupted while the wake vector and PM1 control block are being armed.
-    unsafe {
-        interrupt::disable();
-    }
-
-    if let Err(error) = write_pm1_control_block(registers.as_ref(), sleep_type) {
-        return error.code();
-    }
-
-    // SAFETY: The final CLI+HLT sequence is the architectural handoff point after asserting SLP_EN.
-    unsafe {
-        core::arch::asm!("cli; hlt", options(nostack));
-    }
-
-    SleepError::SleepDidNotEnter.code()
-}
-
-extern "C" fn resume_from_s3_trampoline() -> ! {
-    let mut saved = SAVED_CONTEXT.lock();
-    let context = saved.take().expect("S3 wake trampoline resumed without saved CPU context");
-    drop(saved);
-
-    // SAFETY: The saved FACS physical address was captured from the validated FADT during suspend entry.
-    if context.facs_address != 0 {
-        let mut facs = unsafe {
-            KernelAcpiHandler.map_physical_region::<Facs>(
-                context.facs_address,
-                core::mem::size_of::<Facs>(),
-            )
-        };
-        set_firmware_waking_vector(&mut facs, 0);
-    }
-
-    // SAFETY: The wake trampoline already switched to the saved kernel CR3 and long mode, so the remaining restores are architectural register state only.
-    unsafe {
-        x86::msr::wrmsr(x86::msr::IA32_EFER, context.efer);
-        core::arch::asm!("mov cr3, {}", in(reg) context.cr3, options(nostack));
-        core::arch::asm!("mov cr4, {}", in(reg) context.cr4, options(nostack));
-        core::arch::asm!("mov cr2, {}", in(reg) context.cr2, options(nostack));
-        core::arch::asm!("mov cr0, {}", in(reg) context.cr0, options(nostack));
-        core::arch::asm!("lgdt [{}]", in(reg) &context.gdtr, options(nostack));
-        core::arch::asm!("lidt [{}]", in(reg) &context.idtr, options(nostack));
-
-        task::load_tr(SegmentSelector::new(crate::arch::gdt::GDT_TSS as u16, Ring::Ring0));
-
-        x86::msr::wrmsr(x86::msr::IA32_FS_BASE, context.fs_base);
-        x86::msr::wrmsr(x86::msr::IA32_GS_BASE, context.gs_base);
-        x86::msr::wrmsr(x86::msr::IA32_KERNEL_GSBASE, context.kernel_gs_base);
-    }
-
-    restore_fpu_state(&context);
-
-    // SAFETY: Returning with the original entry stack and RFLAGS completes the suspend call as a successful function return.
-    unsafe {
-        core::arch::asm!(
-            "mov rsp, {entry_rsp}",
-            "push {rflags}",
-            "popfq",
-            "xor eax, eax",
-            "ret",
-            entry_rsp = in(reg) context.entry_rsp,
-            rflags = in(reg) context.rflags,
-            options(noreturn)
-        );
-    }
-}
-
-pub fn enter_sleep_state(state: SleepState) -> core::result::Result<(), SleepError> {
-    #[cfg(not(target_arch = "x86_64"))]
-    {
-        let _ = state;
-        return Err(SleepError::UnsupportedArch);
-    }
-
-    #[cfg(target_arch = "x86_64")]
-    {
-        let raw = unsafe {
-            enter_sleep_raw(match state {
-                SleepState::S3 => 3,
-                SleepState::S5 => 5,
-            })
-        };
-        if raw == SLEEP_RETURN_OK {
-            Ok(())
-        } else {
-            Err(SleepError::from_code(raw))
-        }
-    }
-}
-
-pub fn available_sleep_states() -> &'static [u8] {
-    if sleep_type_data(SleepState::S3).is_ok() {
-        b"S3\nS5\n"
-    } else {
-        b"S5\n"
-    }
-}
-
-pub fn trigger_sleep_request(request: &str) -> Result<(), Error> {
-    match request.trim() {
-        "S3" => enter_sleep_state(SleepState::S3).map_err(|_| Error::new(EIO)),
-        "S5" => enter_sleep_state(SleepState::S5).map_err(|_| Error::new(EIO)),
-        _ => Err(Error::new(EINVAL)),
-    }
-}
@@ -82,15 +82,6 @@ extern "C" fn kstart() {
 /// The entry to Rust, all things must be initialized
 unsafe extern "C" fn start(args_ptr: *const KernelArgs, stack_end: usize) -> ! {
    unsafe {
-        // EARLY CANARY: write 'R' to COM1 before any kernel init.
-        // This proves the serial hardware works and the kernel reached Rust entry.
-        // If this character appears but "RedBear OS starting..." does not,
-        // the hang is in args_ptr.read(), serial::init(), or graphical_debug::init().
-        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-        {
-            core::arch::asm!("out dx, al", in("dx") 0x3F8u16, in("al") b'R', options(nostack, preserves_flags));
-        }
-
        let bootstrap = {
            let args = args_ptr.read();

@@ -100,49 +91,27 @@ unsafe extern "C" fn start(args_ptr: *const KernelArgs, stack_end: usize) -> ! {
            // Set up graphical debug
            graphical_debug::init(args.env());

-            // SECOND CANARY: write 'S' to COM1 after serial init.
-            // If 'R' appears but 'S' does not, the hang is in serial::init() or graphical_debug::init().
-            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-            {
-                core::arch::asm!("out dx, al", in("dx") 0x3F8u16, in("al") b'S', options(nostack, preserves_flags));
-            }
-
-            info!("RedBear OS starting...");
+            info!("Redox OS starting...");
            args.print();

-            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-            { core::arch::asm!("out dx, al", in("dx") 0x3F8u16, in("al") b'1', options(nostack, preserves_flags)); }
-
            // Set up GDT
            gdt::init_bsp(stack_end);

-            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-            { core::arch::asm!("out dx, al", in("dx") 0x3F8u16, in("al") b'2', options(nostack, preserves_flags)); }
-
            // Set up IDT
            idt::init_bsp();

-            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-            { core::arch::asm!("out dx, al", in("dx") 0x3F8u16, in("al") b'3', options(nostack, preserves_flags)); }
-
            // Initialize RMM
            #[cfg(target_arch = "x86")]
            crate::startup::memory::init(&args, Some(0x100000), Some(0x40000000));
            #[cfg(target_arch = "x86_64")]
            crate::startup::memory::init(&args, Some(0x100000), None);

-            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-            { core::arch::asm!("out dx, al", in("dx") 0x3F8u16, in("al") b'4', options(nostack, preserves_flags)); }
-
            // Initialize paging
            paging::init();

            #[cfg(target_arch = "x86_64")]
            crate::arch::alternative::early_init(true);

-            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-            { core::arch::asm!("out dx, al", in("dx") 0x3F8u16, in("al") b'5', options(nostack, preserves_flags)); }
-
            // Set up syscall instruction
            interrupt::syscall::init();

@@ -152,9 +121,6 @@ unsafe extern "C" fn start(args_ptr: *const KernelArgs, stack_end: usize) -> ! {
            // Activate memory logging
            crate::log::init();

-            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-            { core::arch::asm!("out dx, al", in("dx") 0x3F8u16, in("al") b'6', options(nostack, preserves_flags)); }
-
            // Initialize miscellaneous processor features
            #[cfg(target_arch = "x86_64")]
            crate::arch::misc::init(LogicalCpuId::BSP);
@@ -162,9 +128,6 @@ unsafe extern "C" fn start(args_ptr: *const KernelArgs, stack_end: usize) -> ! {
            // Initialize devices
            device::init();

-            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-            { core::arch::asm!("out dx, al", in("dx") 0x3F8u16, in("al") b'7', options(nostack, preserves_flags)); }
-
            // Read ACPI tables, starts APs
            if cfg!(feature = "acpi") {
                crate::acpi::init(args.acpi_rsdp());
@@ -1,110 +0,0 @@
-; ACPI S3 wake trampoline
-; compiled with nasm by build.rs, copied to physical 0x8000 before S3 entry
-
-ORG 0x8000
-SECTION .text
-USE16
-
-trampoline:
-    jmp short startup_wake
-    times 8 - ($ - trampoline) nop
-    .stack: dq 0
-    .page_table: dq 0
-    .code: dq 0
-
-startup_wake:
-    cli
-
-    xor ax, ax
-    mov ds, ax
-    mov es, ax
-    mov ss, ax
-    mov sp, 0
-
-    mov edi, [trampoline.page_table]
-    mov cr3, edi
-
-    mov eax, cr0
-    and al, 11110011b
-    or al, 00100010b
-    mov cr0, eax
-
-    mov eax, cr4
-    or eax, 1 << 9 | 1 << 7 | 1 << 5 | 1 << 4
-    mov cr4, eax
-
-    fninit
-
-    lgdt [gdtr]
-
-    mov ecx, 0xC0000080
-    rdmsr
-    or eax, 1 << 11 | 1 << 8
-    wrmsr
-
-    mov ebx, cr0
-    or ebx, 1 << 31 | 1 << 16 | 1
-    mov cr0, ebx
-
-    jmp gdt.kernel_code:long_mode_wake
-
-USE64
-long_mode_wake:
-    mov rax, gdt.kernel_data
-    mov ds, rax
-    mov es, rax
-    mov fs, rax
-    mov gs, rax
-    mov ss, rax
-
-    mov rsp, [trampoline.stack]
-    mov rax, [trampoline.code]
-    jmp rax
-
-struc GDTEntry
-    .limitl resw 1
-    .basel resw 1
-    .basem resb 1
-    .attribute resb 1
-    .flags__limith resb 1
-    .baseh resb 1
-endstruc
-
-attrib:
-    .present              equ 1 << 7
-    .user                 equ 1 << 4
-    .code                 equ 1 << 3
-    .writable             equ 1 << 1
-
-flags:
-    .long_mode equ 1 << 5
-
-gdtr:
-    dw gdt.end + 1
-    dq gdt
-
-gdt:
-.null equ $ - gdt
-    dq 0
-
-.kernel_code equ $ - gdt
-istruc GDTEntry
-    at GDTEntry.limitl, dw 0
-    at GDTEntry.basel, dw 0
-    at GDTEntry.basem, db 0
-    at GDTEntry.attribute, db attrib.present | attrib.user | attrib.code
-    at GDTEntry.flags__limith, db flags.long_mode
-    at GDTEntry.baseh, db 0
-iend
-
-.kernel_data equ $ - gdt
-istruc GDTEntry
-    at GDTEntry.limitl, dw 0
-    at GDTEntry.basel, dw 0
-    at GDTEntry.basem, db 0
-    at GDTEntry.attribute, db attrib.present | attrib.user | attrib.writable
-    at GDTEntry.flags__limith, db 0
-    at GDTEntry.baseh, db 0
-iend
-
-.end equ $ - gdt
@@ -4,10 +4,16 @@ use crate::{
    percpu::PercpuBlock,
    syscall::FloatRegisters,
 };
-use core::{mem::offset_of, ptr};
+use core::{mem::offset_of, ptr, sync::atomic::AtomicBool};
 use spin::Once;
 use syscall::{EnvRegisters, Result};

+/// This must be used by the kernel to ensure that context switches are done atomically.
+/// Compare and exchange this to true when beginning a context switch on any CPU.
+/// The `Context::switch_to` function will set it back to false, allowing other CPUs to switch.
+/// This must be done, as no locks can be held on the stack during a switch.
+pub static CONTEXT_SWITCH_LOCK: AtomicBool = AtomicBool::new(false);
+
 // 512 bytes for registers, extra bytes for fpcr and fpsr
 pub const KFX_ALIGN: usize = 16;

@@ -2,11 +2,13 @@ use crate::{
    arch::interrupt::InterruptStack, context::context::Kstack, memory::RmmA, percpu::PercpuBlock,
    syscall::FloatRegisters,
 };
-use core::mem::offset_of;
+use core::{mem::offset_of, sync::atomic::AtomicBool};
 use rmm::{Arch, VirtualAddress};
 use spin::Once;
 use syscall::{error::*, EnvRegisters};

+pub static CONTEXT_SWITCH_LOCK: AtomicBool = AtomicBool::new(false);
+
 pub const KFX_ALIGN: usize = 16;

 #[derive(Clone, Debug, Default)]
@@ -1,4 +1,4 @@
-use core::mem::offset_of;
+use core::{mem::offset_of, sync::atomic::AtomicBool};
 use rmm::{Arch, VirtualAddress};
 use spin::Once;
 use syscall::{error::*, EnvRegisters};
@@ -14,6 +14,12 @@ use crate::{
    syscall::FloatRegisters,
 };

+/// This must be used by the kernel to ensure that context switches are done atomically.
+/// Compare and exchange this to true when beginning a context switch on any CPU.
+/// The `Context::switch_to` function will set it back to false, allowing other CPUs to switch.
+/// This must be done, as no locks can be held on the stack during a switch.
+pub static CONTEXT_SWITCH_LOCK: AtomicBool = AtomicBool::new(false);
+
 const ST_RESERVED: u128 = 0xFFFF_FFFF_FFFF_0000_0000_0000_0000_0000;

 pub const KFX_ALIGN: usize = 16;
@@ -1,5 +1,6 @@
 use core::{
    ptr::{addr_of, addr_of_mut},
+    sync::atomic::AtomicBool,
 };

 use crate::syscall::FloatRegisters;
@@ -11,6 +12,12 @@ use spin::Once;
 use syscall::{error::*, EnvRegisters};
 use x86::msr;

+/// This must be used by the kernel to ensure that context switches are done atomically.
+/// Compare and exchange this to true when beginning a context switch on any CPU.
+/// The `Context::switch_to` function will set it back to false, allowing other CPUs to switch.
+/// This must be done, as no locks can be held on the stack during a switch.
+pub static CONTEXT_SWITCH_LOCK: AtomicBool = AtomicBool::new(false);
+
 const ST_RESERVED: u128 = 0xFFFF_FFFF_FFFF_0000_0000_0000_0000_0000;

 #[cfg(cpu_feature_never = "xsave")]
@@ -148,8 +148,6 @@ pub struct Context {
    pub euid: u32,
    pub egid: u32,
    pub pid: usize,
-    /// Supplementary group IDs for access control decisions.
-    pub groups: Vec<u32>,

    // See [`PreemptGuard`]
    //
@@ -206,7 +204,6 @@ impl Context {
            euid: 0,
            egid: 0,
            pid: 0,
-            groups: Vec::new(),

            #[cfg(feature = "syscall_debug")]
            syscall_debug_info: crate::syscall::debug::SyscallDebugInfo::default(),
@@ -482,7 +479,6 @@ impl Context {
            uid: self.euid,
            gid: self.egid,
            pid: self.pid,
-            groups: self.groups.clone(),
        }
    }
 }
@@ -4,7 +4,7 @@ use crate::{
    event,
    scheme::{self, SchemeId},
    sync::{CleanLockToken, RwLock, L6},
-    syscall::error::{Error, Result, ESTALE},
+    syscall::error::Result,
 };
 use alloc::sync::Arc;
 use syscall::{schemev2::NewFdFlags, RwFlags, O_APPEND, O_NONBLOCK};
@@ -18,7 +18,6 @@ pub struct FileDescription {
    pub offset: u64,
    /// The scheme that this file refers to
    pub scheme: SchemeId,
-    pub scheme_generation: Option<u64>,
    /// The number the scheme uses to refer to this file
    pub number: usize,
    /// The flags passed to open or fcntl(SETFL)
@@ -33,52 +32,6 @@ bitflags! {
    }
 }
 impl FileDescription {
-    pub fn with_generation(
-        scheme: SchemeId,
-        scheme_generation: Option<u64>,
-        number: usize,
-        offset: u64,
-        flags: u32,
-        internal_flags: InternalFlags,
-    ) -> Self {
-        Self {
-            offset,
-            scheme,
-            scheme_generation,
-            number,
-            flags,
-            internal_flags,
-        }
-    }
-
-    pub fn new(
-        scheme: SchemeId,
-        number: usize,
-        offset: u64,
-        flags: u32,
-        internal_flags: InternalFlags,
-        token: &mut CleanLockToken,
-    ) -> Self {
-        Self::with_generation(
-            scheme,
-            Some(scheme::current_scheme_generation(token.token(), scheme)),
-            number,
-            offset,
-            flags,
-            internal_flags,
-        )
-    }
-
-    pub fn get_scheme(&self, token: &mut CleanLockToken) -> Result<scheme::KernelSchemes> {
-        if let Some(expected_generation) = self.scheme_generation
-            && expected_generation != scheme::current_scheme_generation(token.token(), self.scheme)
-        {
-            return Err(Error::new(ESTALE));
-        }
-
-        scheme::get_scheme(token.token(), self.scheme)
-    }
-
    pub fn rw_flags(&self, rw: RwFlags) -> u32 {
        let mut ret = self.flags & !(O_NONBLOCK | O_APPEND) as u32;
        if rw.contains(RwFlags::APPEND) {
@@ -123,7 +76,7 @@ impl FileDescription {
    pub fn try_close(self, token: &mut CleanLockToken) -> Result<()> {
        event::unregister_file(self.scheme, self.number, token);

-        let scheme = self.get_scheme(token)?;
+        let scheme = scheme::get_scheme(token.token(), self.scheme)?;

        scheme.close(self.number, token)
    }
@@ -132,12 +85,12 @@ impl FileDescription {
 impl FileDescriptor {
    pub fn close(self, token: &mut CleanLockToken) -> Result<()> {
        {
-            let (desc, number, internal_flags) = {
+            let (scheme_id, number, internal_flags) = {
                let desc = self.description.read(token.token());
-                (*desc, desc.number, desc.internal_flags)
+                (desc.scheme, desc.number, desc.internal_flags)
            };
            if internal_flags.contains(InternalFlags::NOTIFY_ON_NEXT_DETACH) {
-                let scheme = desc.get_scheme(token)?;
+                let scheme = scheme::get_scheme(token.token(), scheme_id)?;
                scheme.detach(number, token)?;
            }
        }
@@ -64,13 +64,14 @@ impl UnmapResult {
            return Ok(());
        };

-        let (scheme, number) = {
-            let desc = *description.read(token.token());
-            (desc.get_scheme(token)?, desc.number)
+        let (scheme_id, number) = {
+            let desc = description.write(token.token());
+            (desc.scheme, desc.number)
        };

-        let funmap_result = scheme
-            .kfunmap(number, base_offset, self.size, self.flags, token);
+        let scheme_opt = scheme::get_scheme(token.token(), scheme_id);
+        let funmap_result = scheme_opt
+            .and_then(|scheme| scheme.kfunmap(number, base_offset, self.size, self.flags, token));

        if let Ok(fd) = Arc::try_unwrap(description) {
            fd.into_inner().try_close(token)?;
@@ -2686,13 +2687,20 @@ fn correct_inner<'l>(
            // XXX: This is cheating, but guaranteed we won't deadlock because we've dropped addr_space_guard
            let mut token = unsafe { CleanLockToken::new() };

-            let desc = *file_ref.description.read(token.token());
-            let scheme = desc.get_scheme(&mut token).map_err(|_| PfError::Segv)?;
-            let scheme_number = desc.number;
-            let user_inner = match scheme {
-                KernelSchemes::User(user) => user.inner,
-                _ => return Err(PfError::Segv),
+            let (scheme_id, scheme_number) = {
+                let desc = &file_ref.description.read(token.token());
+                (desc.scheme, desc.number)
            };
+            let user_inner = scheme::get_scheme(token.token(), scheme_id)
+                .ok()
+                .and_then(|s| {
+                    if let KernelSchemes::User(user) = s {
+                        Some(user.inner)
+                    } else {
+                        None
+                    }
+                })
+                .ok_or(PfError::Segv)?;

            let offset = file_ref.base_offset as u64 + (pages_from_grant_start * PAGE_SIZE) as u64;
            user_inner
@@ -14,8 +14,8 @@ use crate::{
    memory::{RmmA, RmmArch, TableKind},
    percpu::PercpuBlock,
    sync::{
-        ArcRwLockWriteGuard, CleanLockToken, LockToken, McsMutex, McsMutexGuard, Mutex,
-        MutexGuard, RwLock, RwLockReadGuard, RwLockWriteGuard, L0, L1, L2, L4,
+        ArcRwLockWriteGuard, CleanLockToken, LockToken, Mutex, MutexGuard, RwLock, RwLockReadGuard,
+        RwLockWriteGuard, L0, L1, L2, L4,
    },
    syscall::error::Result,
 };
@@ -74,12 +74,10 @@ pub use self::arch::empty_cr3;
 // the context file descriptors.
 static CONTEXTS: RwLock<L2, BTreeSet<ContextRef>> = RwLock::new(BTreeSet::new());

-// Actual context store for the scheduler — uses MCS fair spinlock to
-// eliminate cache-line bouncing under multi-CPU contention.
-static RUN_CONTEXTS: McsMutex<L1, RunContextData> = McsMutex::new(RunContextData::new());
+// Actual context store for the scheduler
+static RUN_CONTEXTS: Mutex<L1, RunContextData> = Mutex::new(RunContextData::new());

-// Context that has been pushed out from RUN_CONTEXTS after being idle.
-// Uses regular Mutex (lower contention; wakeup_contexts uses try_lock).
+// Context that has been pushed out from RUN_CONTEXTS after being idle
 static IDLE_CONTEXTS: Mutex<L2, VecDeque<WeakContextRef>> = Mutex::new(VecDeque::new());

 pub struct RunContextData {
@@ -115,7 +113,7 @@ pub fn idle_contexts_try(
    IDLE_CONTEXTS.try_lock(token)
 }

-pub fn run_contexts(token: LockToken<'_, L0>) -> McsMutexGuard<'_, L1, RunContextData> {
+pub fn run_contexts(token: LockToken<'_, L0>) -> MutexGuard<'_, L1, RunContextData> {
    RUN_CONTEXTS.lock(token)
 }

@@ -15,7 +15,7 @@ use crate::{
 use alloc::{sync::Arc, vec::Vec};
 use core::{
    cell::{Cell, RefCell},
-    mem,
+    hint, mem,
    sync::atomic::Ordering,
 };
 use syscall::PtraceFlags;
@@ -26,11 +26,6 @@ enum UpdateResult {
    Blocked,
 }

-/// Default number of PIT ticks before triggering a context switch.
-/// At ~2.25 ms per tick, 3 ticks ≈ 6.75 ms timeslice.
-/// Configurable per-CPU via `ContextSwitchPercpu::preempt_interval`.
-const DEFAULT_PREEMPT_INTERVAL: usize = 3;
-
 // A simple geometric series where value[i] ~= value[i - 1] * 1.25
 const SCHED_PRIO_TO_WEIGHT: [usize; 40] = [
    88761, 71755, 56483, 46273, 36291, 29154, 23254, 18705, 14949, 11916, 9548, 7620, 6100, 4904,
@@ -95,15 +90,13 @@ struct SwitchResultInner {
 ///
 /// The function also calls the signal handler after switching contexts.
 pub fn tick(token: &mut CleanLockToken) {
-    let percpu = PercpuBlock::current();
-    let ticks_cell = &percpu.switch_internals.pit_ticks;
+    let ticks_cell = &PercpuBlock::current().switch_internals.pit_ticks;

    let new_ticks = ticks_cell.get() + 1;
    ticks_cell.set(new_ticks);

-    // Trigger a context switch when the per-CPU preempt interval is reached.
-    let interval = percpu.switch_internals.preempt_interval.get();
-    if new_ticks >= interval {
+    // Trigger a context switch after every 3 ticks (approx. 6.75 ms).
+    if new_ticks >= 3 {
        switch(token);
        crate::context::signal::signal_handler(token);
    }
@@ -127,10 +120,7 @@ pub unsafe extern "C" fn switch_finish_hook() {
                crate::arch::stop::emergency_reset();
            }
        }
-        PercpuBlock::current()
-            .switch_internals
-            .in_context_switch
-            .set(false);
+        arch::CONTEXT_SWITCH_LOCK.store(false, Ordering::SeqCst);
        crate::percpu::switch_arch_hook();
    }
 }
@@ -160,15 +150,16 @@ pub fn switch(token: &mut CleanLockToken) -> SwitchResult {
    //set PIT Interrupt counter to 0, giving each process same amount of PIT ticks
    percpu.switch_internals.pit_ticks.set(0);

-    // Acquire the per-CPU context switch flag. Each CPU can only be in one context
-    // switch at a time. The per-context write locks provide cross-CPU safety; this
-    // flag catches re-entrant switches on the same CPU (a kernel bug).
-    debug_assert!(
-        !percpu.switch_internals.in_context_switch.get(),
-        "context switch re-entry on CPU {}",
-        percpu.cpu_id
-    );
-    percpu.switch_internals.in_context_switch.set(true);
+    // Acquire the global lock to ensure exclusive access during context switch and avoid
+    // issues that would be caused by the unsafe operations below
+    // TODO: Better memory orderings?
+    while arch::CONTEXT_SWITCH_LOCK
+        .compare_exchange_weak(false, true, Ordering::SeqCst, Ordering::Relaxed)
+        .is_err()
+    {
+        hint::spin_loop();
+        percpu.maybe_handle_tlb_shootdown();
+    }

    // Lock the previous context.
    let prev_context_lock = crate::context::current();
@@ -176,8 +167,8 @@ pub fn switch(token: &mut CleanLockToken) -> SwitchResult {
    let mut prev_context_guard = unsafe { prev_context_lock.write_arc() };

    if !prev_context_guard.is_preemptable() {
-        // Unset per-CPU context switch flag
-        percpu.switch_internals.in_context_switch.set(false);
+        // Unset global lock
+        arch::CONTEXT_SWITCH_LOCK.store(false, Ordering::SeqCst);

        // Pretend to have finished switching, so CPU is not idled
        return SwitchResult::Switched;
@@ -301,8 +292,8 @@ pub fn switch(token: &mut CleanLockToken) -> SwitchResult {
            SwitchResult::Switched
        }
        _ => {
-            // No target was found, unset per-CPU context switch flag and return
-            percpu.switch_internals.in_context_switch.set(false);
+            // No target was found, unset global lock and return
+            arch::CONTEXT_SWITCH_LOCK.store(false, Ordering::SeqCst);

            percpu.stats.set_state(cpu_stats::CpuState::Idle);

@@ -361,7 +352,6 @@ fn wakeup_contexts(token: &mut CleanLockToken, switch_time: u128) -> Vec<(usize,
 }

 /// This is the scheduler function which currently utilises Deficit Weighted Round Robin Scheduler
-/// with NUMA-aware context selection preference.
 fn select_next_context(
    token: &mut CleanLockToken,
    percpu: &PercpuBlock,
@@ -387,10 +377,6 @@ fn select_next_context(
    let total_contexts: usize = contexts_list.iter().map(|q| q.len()).sum();
    let mut skipped_contexts = 0;

-    // NUMA-aware selection: remember cross-node fallback candidate.
-    let my_numa_node = percpu.numa_node.get();
-    let mut cross_node_fallback: Option<(usize, ArcContextLockWriteGuard)> = None;
-
    'priority: loop {
        i = (i + 1) % 40;
        total_iters += 1;
@@ -455,44 +441,9 @@ fn select_next_context(
            // Is this context runnable on this CPU?
            let sw = unsafe { update_runnable(&mut next_context_guard, cpu_id, switch_time) };
            if let UpdateResult::CanSwitch = sw {
-                // NUMA-aware selection: check if this context's last CPU was on the same node.
-                let same_node = if my_numa_node != u8::MAX {
-                    next_context_guard.cpu_id
-                        .map(|cid| {
-                            crate::percpu::get_for_cpu(cid)
-                                .map(|p| p.numa_node.get() == my_numa_node)
-                                .unwrap_or(false)
-                        })
-                        .unwrap_or(true) // New context (no last CPU) — treat as same node
-                } else {
-                    true // No NUMA info — treat all as same node
-                };
-
-                if same_node {
-                    // Cache-warm: select immediately
-                    percpu.current_prio.set(next_context_guard.prio);
-                    next_context_guard_opt = Some(next_context_guard);
-                    balance[i] -= SCHED_PRIO_TO_WEIGHT[20];
-                    break 'priority;
-                } else {
-                    // Cross-node candidate: save as fallback, keep scanning for same-node
-                    if cross_node_fallback.is_none() {
-                        // Cache the priority and balance for later
-                        cross_node_fallback =
-                            Some((next_context_guard.prio, next_context_guard));
-                        balance[i] -= SCHED_PRIO_TO_WEIGHT[20];
-                        // Don't break — keep looking for a same-node context
-                        continue;
-                    } else {
-                        // Already have a cross-node fallback; push this one back
-                        contexts.push_back(next_context_ref);
-                        skipped_contexts += 1;
-                        if skipped_contexts >= total_contexts {
-                            break 'priority;
-                        }
-                        continue;
-                    }
-                }
+                next_context_guard_opt = Some(next_context_guard);
+                balance[i] -= SCHED_PRIO_TO_WEIGHT[20];
+                break 'priority;
            } else {
                if matches!(sw, UpdateResult::Blocked) {
                    idle_contexts(token.token()).push_back(next_context_ref);
@@ -507,15 +458,6 @@ fn select_next_context(
            }
        }
    }
-
-    // If we found a cross-node fallback but no same-node context, use it
-    if next_context_guard_opt.is_none() {
-        if let Some((prio, guard)) = cross_node_fallback {
-            percpu.current_prio.set(prio);
-            next_context_guard_opt = Some(guard);
-        }
-    }
-
    percpu.balance.set(balance);
    percpu.last_queue.set(i);

@@ -523,10 +465,7 @@ fn select_next_context(
        // Send the old process to the back of the line (if it is still runnable)
        let prev_ctx = WeakContextRef(Arc::downgrade(&prev_context_lock));
        if prev_context_guard.status.is_runnable() {
-            let raw_prio = prev_context_guard.prio;
-            let prio = percpu.effective_prio(raw_prio);
-            // Clear PI donation — previous context is being re-queued
-            percpu.pi_donated_prio.store(u32::MAX, Ordering::Relaxed);
+            let prio = prev_context_guard.prio;
            contexts_list[prio].push_back(prev_ctx);
        } else {
            idle_contexts(token.token()).push_back(prev_ctx);
@@ -538,8 +477,7 @@ fn select_next_context(
        return Ok(Some(next_context_guard));
    } else {
        if !was_idle && !Arc::ptr_eq(&prev_context_lock, &idle_context) {
-            // Switching to idle context — cache lowest priority
-            percpu.current_prio.set(39);
+            // We switch into the idle context
            Ok(Some(unsafe { idle_context.write_arc() }))
        } else {
            // We found no other process to run.
@@ -556,13 +494,6 @@ pub struct ContextSwitchPercpu {
    switch_result: Cell<Option<SwitchResultInner>>,
    switch_time: Cell<u128>,
    pit_ticks: Cell<usize>,
-    /// Per-CPU context switch flag. Set to true during a context switch on this CPU.
-    /// Replaced the global CONTEXT_SWITCH_LOCK to eliminate cross-CPU serialization.
-    in_context_switch: Cell<bool>,
-    /// Number of PIT ticks before triggering a context switch.
-    /// Default: 3 (≈6.75 ms). Lower values improve interactive responsiveness;
-    /// higher values improve throughput for batch/compute workloads.
-    preempt_interval: Cell<usize>,

    current_ctxt: RefCell<Option<Arc<ContextLock>>>,

@@ -577,8 +508,6 @@ impl ContextSwitchPercpu {
            switch_result: Cell::new(None),
            switch_time: Cell::new(0),
            pit_ticks: Cell::new(0),
-            in_context_switch: Cell::new(false),
-            preempt_interval: Cell::new(DEFAULT_PREEMPT_INTERVAL),
            current_ctxt: RefCell::new(None),
            idle_ctxt: RefCell::new(None),
            being_sigkilled: Cell::new(false),
@@ -42,18 +42,17 @@ impl core::fmt::Display for LogicalCpuId {
 }

 #[cfg(target_pointer_width = "64")]
-pub const MAX_CPU_COUNT: u32 = 256;
+pub const MAX_CPU_COUNT: u32 = 128;

 #[cfg(target_pointer_width = "32")]
 pub const MAX_CPU_COUNT: u32 = 32;

 const SET_WORDS: usize = (MAX_CPU_COUNT / usize::BITS) as usize;

-// TODO: Support more than 256 CPUs.
+// TODO: Support more than 128 CPUs.
 // The maximum number of CPUs on Linux is configurable, and the type for LogicalCpuSet and
 // LogicalCpuId may be optimized accordingly. In that case, box the mask if it's larger than some
-// base size (probably 256 bytes). AMD EPYC has 128C/256T, Threadripper PRO 96C/192T —
-// 256 covers current hardware.
+// base size (probably 256 bytes).
 #[derive(Debug)]
 pub struct LogicalCpuSet([AtomicUsize; SET_WORDS]);

@@ -1,5 +1,5 @@
 use alloc::sync::Arc;
-use core::sync::atomic::{AtomicU64, AtomicUsize, Ordering};
+use core::sync::atomic::{AtomicUsize, Ordering};
 use hashbrown::{hash_map::DefaultHashBuilder, HashMap};
 use smallvec::SmallVec;
 use syscall::data::GlobalSchemes;
@@ -23,7 +23,6 @@ int_like!(EventQueueId, AtomicEventQueueId, usize, AtomicUsize);
 pub struct EventQueue {
    id: EventQueueId,
    queue: WaitQueue<Event>,
-    pub eventfd: Option<(AtomicU64, bool)>, // (counter, semaphore_mode)
 }

 impl EventQueue {
@@ -31,15 +30,6 @@ impl EventQueue {
        EventQueue {
            id,
            queue: WaitQueue::new(),
-            eventfd: None,
-        }
-    }
-
-    pub fn new_eventfd(id: EventQueueId, initval: u64, semaphore: bool) -> EventQueue {
-        EventQueue {
-            id,
-            queue: WaitQueue::new(),
-            eventfd: Some((AtomicU64::new(initval), semaphore)),
        }
    }

@@ -70,9 +70,6 @@ mod log;
 /// Memory management
 mod memory;

-/// NUMA topology
-mod numa;
-
 /// Panic
 mod panic;

@@ -1,81 +0,0 @@
-/// NUMA topology hints for the kernel scheduler.
-///
-/// NUMA discovery (SRAT/SLIT parsing) is performed during kernel ACPI init
-/// (`acpi::init()`). The kernel stores a lightweight copy for O(1) scheduling
-/// lookups. If no SRAT is found, `init_default()` creates a single-node topology.
-use crate::acpi::srat;
-use crate::cpu_set::{LogicalCpuId, LogicalCpuSet};
-use core::sync::atomic::{AtomicBool, Ordering};
-
-const MAX_NUMA_NODES: usize = 8;
-
-#[derive(Debug)]
-pub struct NumaHint {
-    pub node_id: u8,
-    pub cpus: LogicalCpuSet,
-}
-
-pub struct NumaTopology {
-    pub nodes: [Option<NumaHint>; MAX_NUMA_NODES],
-    pub initialized: AtomicBool,
-}
-
-impl NumaTopology {
-    pub const fn new() -> Self {
-        const NONE: Option<NumaHint> = None;
-        Self { nodes: [NONE; MAX_NUMA_NODES], initialized: AtomicBool::new(false) }
-    }
-
-    pub fn node_for_cpu(&self, cpu: LogicalCpuId) -> Option<u8> {
-        for node in self.nodes.iter().flatten() {
-            if node.cpus.contains(cpu) { return Some(node.node_id); }
-        }
-        None
-    }
-
-    pub fn same_node(&self, cpu1: LogicalCpuId, cpu2: LogicalCpuId) -> bool {
-        self.node_for_cpu(cpu1) == self.node_for_cpu(cpu2)
-    }
-}
-
-static mut NUMA_TOPOLOGY: NumaTopology = NumaTopology::new();
-
-pub fn topology() -> &'static NumaTopology { unsafe { &NUMA_TOPOLOGY } }
-
-/// Initialize NUMA topology from SRAT data parsed during ACPI init.
-pub fn init_from_srat(apic_ids: &[(u32, LogicalCpuId)]) {
-    let topo = topology();
-    if topo.initialized.swap(true, Ordering::AcqRel) { return; }
-    if !srat::is_available() { init_default_inner(); return; }
-    unsafe {
-        let topo_mut = &mut *core::ptr::addr_of_mut!(NUMA_TOPOLOGY);
-        for &(apic_id, cpu_id) in apic_ids {
-            if let Some(node) = srat::numa_node_for_apic(apic_id) {
-                let idx = node as usize;
-                if idx < MAX_NUMA_NODES {
-                    topo_mut.nodes[idx].get_or_insert_with(|| NumaHint { node_id: node, cpus: LogicalCpuSet::empty() }).cpus.atomic_set(cpu_id);
-                }
-            }
-        }
-        if topo_mut.nodes.iter().all(|n| n.is_none()) {
-            topo_mut.nodes[0] = Some(NumaHint { node_id: 0, cpus: LogicalCpuSet::all() });
-        }
-    }
-    let node_count = topology().nodes.iter().filter(|n| n.is_some()).count();
-    debug!("NUMA: {node_count} node(s) from SRAT");
-}
-
-/// Fallback: single-node topology.
-pub fn init_default() {
-    let topo = topology();
-    if topo.initialized.swap(true, Ordering::AcqRel) { return; }
-    init_default_inner();
-}
-
-fn init_default_inner() {
-    unsafe {
-        let topo_mut = &mut *core::ptr::addr_of_mut!(NUMA_TOPOLOGY);
-        topo_mut.nodes[0] = Some(NumaHint { node_id: 0, cpus: LogicalCpuSet::all() });
-    }
-    debug!("NUMA: single-node topology (no SRAT)");
-}
@@ -4,14 +4,9 @@ use alloc::{
 };
 use core::{
    cell::{Cell, RefCell},
-    hint,
-    sync::atomic::{AtomicBool, AtomicPtr, AtomicU32, AtomicU64, Ordering},
+    sync::atomic::{AtomicBool, AtomicPtr, Ordering},
 };

-/// Maximum number of pages to flush individually using INVLPG before falling
-/// back to a full TLB flush (CR3 reload).
-const TLB_RANGE_THRESHOLD: u32 = 32;
-
 use rmm::Arch;
 use syscall::PtraceFlags;

@@ -21,7 +16,7 @@ use crate::{
    cpu_set::{LogicalCpuId, MAX_CPU_COUNT},
    cpu_stats::{CpuStats, CpuStatsData},
    ptrace::Session,
-    sync::{mcs::McsNode, mcs::McsRawLock, CleanLockToken},
+    sync::CleanLockToken,
    syscall::debug::SyscallDebugInfo,
 };

@@ -39,38 +34,6 @@ pub struct PercpuBlock {
    pub balance: Cell<[usize; 40]>,
    pub last_queue: Cell<usize>,

-    /// Per-CPU MCS node for the scheduler run-queue lock (RUN_CONTEXTS).
-    pub mcs_sched_node: McsNode,
-
-    /// Counts how many times the scheduler MCS lock acquisition was contended.
-    pub mcs_contention_count: Cell<u64>,
-
-    /// TLB shootdown range: start virtual address (page-aligned).
-    /// Set to 0 for a full flush. Only valid when `wants_tlb_shootdown` is true.
-    pub tlb_flush_start: AtomicU64,
-    /// TLB shootdown range: number of pages to invalidate.
-    pub tlb_flush_count: AtomicU32,
-
-    /// Priority inheritance donation. When another CPU is blocked waiting on a
-    /// lock this CPU holds, the blocked CPU may donate its priority here.
-    /// `u32::MAX` means no donation; otherwise it's a priority level (0-39).
-    pub pi_donated_prio: AtomicU32,
-
-    /// Cached priority of the currently-running context on this CPU.
-    /// Set by the scheduler when selecting a new context. Read by the MCS
-    /// lock during priority donation — avoids acquiring the context RwLock
-    /// from the spin loop. Default 39 (lowest priority).
-    pub current_prio: Cell<usize>,
-
-    /// NUMA proximity domain for this CPU. Set during ACPI init from SRAT.
-    /// `u8::MAX` means unknown (no SRAT or APIC ID not listed).
-    pub numa_node: Cell<u8>,
-
-    /// Pointer to the MCS lock this CPU is currently spinning on (for transitive PI).
-    /// `null` when not waiting on any lock. Set in McsRawLock::acquire() before
-    /// entering the spin loop, cleared upon acquisition.
-    pub waiting_on_lock: AtomicPtr<McsRawLock>,
-
    // TODO: Put mailbox queues here, e.g. for TLB shootdown? Just be sure to 128-byte align it
    // first to avoid cache invalidation.
    pub profiling: Option<&'static crate::profiling::RingBuffer>,
@@ -94,15 +57,6 @@ pub unsafe fn init_tlb_shootdown(id: LogicalCpuId, block: *mut PercpuBlock) {
    ALL_PERCPU_BLOCKS[id.get() as usize].store(block, Ordering::Release)
 }

-/// Get a reference to another CPU's PercpuBlock by logical CPU ID.
-pub fn get_for_cpu(id: LogicalCpuId) -> Option<&'static PercpuBlock> {
-    unsafe {
-        ALL_PERCPU_BLOCKS[id.get() as usize]
-            .load(Ordering::Acquire)
-            .as_ref()
-    }
-}
-
 pub fn get_all_stats() -> Vec<(LogicalCpuId, CpuStatsData)> {
    let mut res = ALL_PERCPU_BLOCKS
        .iter()
@@ -147,148 +101,25 @@ pub fn shootdown_tlb_ipi(target: Option<LogicalCpuId>) {
                core::hint::spin_loop();
            }
        }
-        // Full flush — clear range info (Release ordering ensures the flag
-        // swap and these stores are visible to the handler before the IPI).
-        percpublock.tlb_flush_start.store(0, Ordering::Release);
-        percpublock.tlb_flush_count.store(0, Ordering::Release);

        crate::ipi::ipi_single(crate::ipi::IpiKind::Tlb, percpublock);
    } else {
-        // Broadcast TLB shootdown: set flag on all other CPUs, then send a single
-        // IPI with "all except self" destination shorthand instead of N individual IPIs.
-        let my_percpublock = PercpuBlock::current();
        for id in 0..crate::cpu_count() {
-            let target_id = LogicalCpuId::new(id);
-            if target_id == my_percpublock.cpu_id {
-                continue;
-            }
-            let Some(percpublock) = (unsafe {
-                ALL_PERCPU_BLOCKS[id as usize]
-                    .load(Ordering::Acquire)
-                    .as_ref()
-            }) else {
-                continue;
-            };
-            // Wait if this CPU still has a pending shootdown from a previous request
-            #[expect(clippy::bool_comparison)]
-            while percpublock
-                .wants_tlb_shootdown
-                .swap(true, Ordering::Release)
-                == true
-            {
-                while percpublock.wants_tlb_shootdown.load(Ordering::Relaxed) == true {
-                    my_percpublock.maybe_handle_tlb_shootdown();
-                    hint::spin_loop();
-                }
-            }
-            // Full flush — clear range info (Release ordering)
-            percpublock.tlb_flush_start.store(0, Ordering::Release);
-            percpublock.tlb_flush_count.store(0, Ordering::Release);
+            // TODO: Optimize: use global counter and percpu ack counters, send IPI using
+            // destination shorthand "all CPUs".
+            shootdown_tlb_ipi(Some(LogicalCpuId::new(id)));
        }
-        // Single broadcast IPI to all other CPUs using destination shorthand
-        crate::ipi::ipi(crate::ipi::IpiKind::Tlb, crate::ipi::IpiTarget::Other);
-    }
-}
-
-/// Range-based TLB shootdown IPI. Only invalidates the specified virtual address
-/// range using INVLPG per page for ranges up to TLB_RANGE_THRESHOLD pages.
-/// Falls back to full flush for larger ranges.
-pub fn shootdown_tlb_ipi_range(target: Option<LogicalCpuId>, start: usize, count: usize) {
-    if cfg!(not(feature = "multi_core")) {
-        return;
-    }
-
-    let start_aligned = start as u64 & !0xFFF;
-    let count_u32 = count as u32;
-    let use_range = count_u32 > 0 && count_u32 <= TLB_RANGE_THRESHOLD;
-
-    let set_range = |percpublock: &PercpuBlock| {
-        if use_range {
-            percpublock.tlb_flush_start.store(start_aligned, Ordering::Release);
-            percpublock.tlb_flush_count.store(count_u32, Ordering::Release);
-        } else {
-            percpublock.tlb_flush_start.store(0, Ordering::Release);
-            percpublock.tlb_flush_count.store(0, Ordering::Release);
-        }
-    };
-
-    if let Some(target) = target {
-        let my_percpublock = PercpuBlock::current();
-        assert_ne!(target, my_percpublock.cpu_id);
-
-        let Some(percpublock) = (unsafe {
-            ALL_PERCPU_BLOCKS[target.get() as usize]
-                .load(Ordering::Acquire)
-                .as_ref()
-        }) else {
-            return;
-        };
-        #[expect(clippy::bool_comparison)]
-        while percpublock.wants_tlb_shootdown.swap(true, Ordering::Release) == true {
-            while percpublock.wants_tlb_shootdown.load(Ordering::Relaxed) == true {
-                my_percpublock.maybe_handle_tlb_shootdown();
-                hint::spin_loop();
-            }
-        }
-        set_range(percpublock);
-        crate::ipi::ipi_single(crate::ipi::IpiKind::Tlb, percpublock);
-    } else {
-        let my_percpublock = PercpuBlock::current();
-        for id in 0..crate::cpu_count() {
-            let target_id = LogicalCpuId::new(id);
-            if target_id == my_percpublock.cpu_id {
-                continue;
-            }
-            let Some(percpublock) = (unsafe {
-                ALL_PERCPU_BLOCKS[id as usize]
-                    .load(Ordering::Acquire)
-                    .as_ref()
-            }) else {
-                continue;
-            };
-            #[expect(clippy::bool_comparison)]
-            while percpublock.wants_tlb_shootdown.swap(true, Ordering::Release) == true {
-                while percpublock.wants_tlb_shootdown.load(Ordering::Relaxed) == true {
-                    my_percpublock.maybe_handle_tlb_shootdown();
-                    hint::spin_loop();
-                }
-            }
-            set_range(percpublock);
-        }
-        crate::ipi::ipi(crate::ipi::IpiKind::Tlb, crate::ipi::IpiTarget::Other);
    }
 }
 impl PercpuBlock {
-    /// Return the effective scheduling priority, accounting for priority inheritance.
-    /// Lower number = higher priority (0-39 range).
-    pub fn effective_prio(&self, context_prio: usize) -> usize {
-        let donated = self.pi_donated_prio.load(Ordering::Relaxed);
-        if donated < context_prio as u32 {
-            donated as usize
-        } else {
-            context_prio
-        }
-    }
-
    pub fn maybe_handle_tlb_shootdown(&self) {
        #[expect(clippy::bool_comparison)]
        if self.wants_tlb_shootdown.swap(false, Ordering::Relaxed) == false {
            return;
        }

-        let start = self.tlb_flush_start.load(Ordering::Acquire);
-        let count = self.tlb_flush_count.load(Ordering::Acquire);
-
-        if start != 0 && count > 0 && count <= TLB_RANGE_THRESHOLD {
-            // Range-based flush using INVLPG per page — cheaper than full CR3 reload.
-            for i in 0..count {
-                let addr = start + (i as u64) * 4096;
-                crate::memory::RmmA::invalidate(rmm::VirtualAddress::new(addr as usize));
-            }
-        } else {
-            // Full TLB flush (CR3 reload) for large ranges or global shootdowns.
-            crate::memory::RmmA::invalidate_all();
-        }
+        // TODO: Finer-grained flush
+        crate::memory::RmmA::invalidate_all();

        if let Some(addrsp) = &*self.current_addrsp.borrow() {
            addrsp.tlb_ack.fetch_add(1, Ordering::Release);
@@ -358,14 +189,6 @@ impl PercpuBlock {
            wants_tlb_shootdown: AtomicBool::new(false),
            balance: Cell::new([0; 40]),
            last_queue: Cell::new(39),
-            mcs_sched_node: McsNode::new(),
-            mcs_contention_count: Cell::new(0),
-            tlb_flush_start: AtomicU64::new(0),
-            tlb_flush_count: AtomicU32::new(0),
-            pi_donated_prio: AtomicU32::new(u32::MAX),
-            current_prio: Cell::new(39),
-            numa_node: Cell::new(u8::MAX),
-            waiting_on_lock: AtomicPtr::new(core::ptr::null_mut()),
            ptrace_flags: Cell::new(PtraceFlags::empty()),
            ptrace_session: RefCell::new(None),
            inside_syscall: Cell::new(false),
@@ -10,7 +10,6 @@ use syscall::{

 use crate::{
    acpi::{RxsdtEnum, RXSDT_ENUM},
-    arch::sleep,
    context::file::InternalFlags,
    event,
    sync::{CleanLockToken, RwLock, WaitCondition, L1},
@@ -41,7 +40,6 @@ enum HandleKind {
    TopLevel,
    Rxsdt,
    ShutdownPipe,
-    SleepControl,
    SchemeRoot,
 }

@@ -148,11 +146,11 @@ impl KernelScheme for AcpiScheme {
        if flags & O_EXCL == O_EXCL || flags & O_SYMLINK == O_SYMLINK {
            return Err(Error::new(EINVAL));
        }
+        if flags & O_ACCMODE != O_RDONLY && flags & O_STAT != O_STAT {
+            return Err(Error::new(EROFS));
+        }
        let (handle_kind, int_flags) = match path {
            "" => {
-                if flags & O_ACCMODE != O_RDONLY && flags & O_STAT != O_STAT {
-                    return Err(Error::new(EROFS));
-                }
                if flags & O_DIRECTORY != O_DIRECTORY && flags & O_STAT != O_STAT {
                    return Err(Error::new(EISDIR));
                }
@@ -160,36 +158,17 @@ impl KernelScheme for AcpiScheme {
                (HandleKind::TopLevel, InternalFlags::POSITIONED)
            }
            "rxsdt" => {
-                if flags & O_ACCMODE != O_RDONLY && flags & O_STAT != O_STAT {
-                    return Err(Error::new(EROFS));
-                }
                if flags & O_DIRECTORY == O_DIRECTORY && flags & O_STAT != O_STAT {
                    return Err(Error::new(ENOTDIR));
                }
                (HandleKind::Rxsdt, InternalFlags::POSITIONED)
            }
            "kstop" => {
-                if flags & O_ACCMODE != O_RDONLY && flags & O_STAT != O_STAT {
-                    return Err(Error::new(EROFS));
-                }
                if flags & O_DIRECTORY == O_DIRECTORY && flags & O_STAT != O_STAT {
                    return Err(Error::new(ENOTDIR));
                }
                (HandleKind::ShutdownPipe, InternalFlags::empty())
            }
-            "sleep" => {
-                if flags & O_ACCMODE == O_RDONLY || flags & O_STAT == O_STAT {
-                    // allowed
-                } else if flags & O_ACCMODE != syscall::flag::O_WRONLY
-                    && flags & O_ACCMODE != syscall::flag::O_RDWR
-                {
-                    return Err(Error::new(EINVAL));
-                }
-                if flags & O_DIRECTORY == O_DIRECTORY && flags & O_STAT != O_STAT {
-                    return Err(Error::new(ENOTDIR));
-                }
-                (HandleKind::SleepControl, InternalFlags::POSITIONED)
-            }
            _ => return Err(Error::new(ENOENT)),
        };

@@ -212,7 +191,6 @@ impl KernelScheme for AcpiScheme {
        Ok(match handle.kind {
            HandleKind::Rxsdt => DATA.get().ok_or(Error::new(EBADFD))?.len() as u64,
            HandleKind::ShutdownPipe => 1,
-            HandleKind::SleepControl => sleep::available_sleep_states().len() as u64,
            HandleKind::TopLevel => 0,
            HandleKind::SchemeRoot => return Err(Error::new(EBADF))?,
        })
@@ -275,7 +253,6 @@ impl KernelScheme for AcpiScheme {

                return dst_buf.copy_exactly(&[0x42]).map(|()| 1);
            }
-            HandleKind::SleepControl => sleep::available_sleep_states(),
            HandleKind::Rxsdt => DATA.get().ok_or(Error::new(EBADFD))?,
            HandleKind::TopLevel => return Err(Error::new(EISDIR)),
            HandleKind::SchemeRoot => return Err(Error::new(EBADF)),
@@ -318,45 +295,11 @@ impl KernelScheme for AcpiScheme {
                kind: DirentKind::Socket,
                name: "kstop",
                inode: 0,
-                next_opaque_id: 2,
-            })?;
-        }
-        if opaque <= 2 {
-            buf.entry(DirEntry {
-                kind: DirentKind::Regular,
-                name: "sleep",
-                inode: 0,
                next_opaque_id: u64::MAX,
            })?;
        }
        Ok(buf.finalize())
    }
-    fn kwrite(
-        &self,
-        id: usize,
-        buf: crate::syscall::usercopy::UserSliceRo,
-        _flags: u32,
-        _stored_flags: u32,
-        token: &mut CleanLockToken,
-    ) -> Result<usize> {
-        let handle = *HANDLES.read(token.token()).get(id)?;
-
-        if handle.stat {
-            return Err(Error::new(EBADF));
-        }
-
-        match handle.kind {
-            HandleKind::SleepControl => {
-                let mut tmp = [0_u8; 16];
-                let len = buf.copy_common_bytes_to_slice(&mut tmp)?;
-                let request = core::str::from_utf8(&tmp[..len]).map_err(|_| Error::new(EINVAL))?;
-                sleep::trigger_sleep_request(request)?;
-                Ok(len)
-            }
-            HandleKind::SchemeRoot => Err(Error::new(EBADF)),
-            _ => Err(Error::new(EBADF)),
-        }
-    }
    fn kfpath(&self, _id: usize, buf: UserSliceWo, _token: &mut CleanLockToken) -> Result<usize> {
        //TODO: construct useful path?
        buf.copy_common_bytes_from_slice("/scheme/kernel.acpi/".as_bytes())
@@ -385,11 +328,6 @@ impl KernelScheme for AcpiScheme {
                st_size: 1,
                ..Default::default()
            },
-            HandleKind::SleepControl => Stat {
-                st_mode: MODE_FILE,
-                st_size: sleep::available_sleep_states().len().try_into().unwrap_or(u64::MAX),
-                ..Default::default()
-            },
            HandleKind::SchemeRoot => return Err(Error::new(EBADF)),
        })?;

@@ -22,10 +22,9 @@ struct Handle {

 static HANDLES: RwLock<L1, HandleMap<Handle>> = RwLock::new(HandleMap::new());

-/// Add to the input queue, translating CR to NL (ICRNL) for serial console compatibility.
+/// Add to the input queue
 pub fn debug_input(data: u8, token: &mut CleanLockToken) {
-    let translated = if data == b'\r' { b'\n' } else { data };
-    INPUT.send(translated, token);
+    INPUT.send(data, token);
 }

 // Notify readers of input updates
@@ -107,16 +106,12 @@ impl KernelScheme for DebugScheme {
    fn fevent(
        &self,
        id: usize,
-        flags: EventFlags,
+        _flags: EventFlags,
        token: &mut CleanLockToken,
    ) -> Result<EventFlags> {
        let _handle = *HANDLES.read(token.token()).get(id)?;

-        let mut ready = EventFlags::empty();
-        if flags.contains(EventFlags::EVENT_READ) {
-            ready |= EventFlags::EVENT_READ;
-        }
-        Ok(ready)
+        Ok(EventFlags::empty())
    }

    fn fsync(&self, id: usize, token: &mut CleanLockToken) -> Result<()> {
@@ -1,5 +1,4 @@
 use alloc::sync::Arc;
-use core::sync::atomic::Ordering;
 use syscall::{EventFlags, O_NONBLOCK};

 use crate::{
@@ -26,25 +25,12 @@ impl KernelScheme for EventScheme {
    fn kopenat(
        &self,
        id: usize,
-        user_buf: StrOrBytes,
+        _user_buf: StrOrBytes,
        _flags: usize,
        _fcntl_flags: u32,
        _ctx: CallerCtx,
        token: &mut CleanLockToken,
    ) -> Result<OpenResult> {
-        let path = match &user_buf {
-            StrOrBytes::Str(s) => s,
-            StrOrBytes::Bytes(b) => core::str::from_utf8(b).unwrap_or(""),
-        };
-        if path.starts_with("eventfd/") {
-            let rest = &path[8..]; // after "eventfd/"
-            let mut parts = rest.split('/');
-            let initval: u64 = parts.next().and_then(|s| s.parse().ok()).unwrap_or(0);
-            let sem: bool = parts.next().and_then(|s| s.parse().ok()).unwrap_or(false);
-            let id = next_queue_id();
-            queues_mut(token.token()).insert(id, Arc::new(EventQueue::new_eventfd(id, initval, sem)));
-            return Ok(OpenResult::SchemeLocal(id.get(), InternalFlags::empty()));
-        }
        if id != SCHEME_ROOT_ID {
            return Err(Error::new(EACCES));
        }
@@ -81,31 +67,6 @@ impl KernelScheme for EventScheme {
            handle.clone()
        };

-        if let Some((ref counter, semaphore)) = queue.eventfd {
-            let is_nonblock = flags & O_NONBLOCK as u32 != 0;
-            if semaphore {
-                let val = counter.load(Ordering::Acquire);
-                if val == 0 {
-                    if is_nonblock { return Err(Error::new(EAGAIN)); }
-                    // Blocking wait not implemented for eventfd in kernel
-                    return Err(Error::new(EAGAIN));
-                }
-                if counter.compare_exchange(val, val - 1, Ordering::AcqRel, Ordering::Relaxed).is_ok() {
-                    let one: u64 = 1;
-                    buf.copy_from_slice(unsafe { core::slice::from_raw_parts(&one as *const u64 as *const u8, 8) })?;
-                    return Ok(8);
-                }
-                return Err(Error::new(EAGAIN));
-            } else {
-                let val = counter.swap(0, Ordering::AcqRel);
-                if val == 0 && is_nonblock {
-                    return Err(Error::new(EAGAIN));
-                }
-                buf.copy_from_slice(unsafe { core::slice::from_raw_parts(&val as *const u64 as *const u8, 8) })?;
-                return Ok(8);
-            }
-        }
-
        queue.read(buf, flags & O_NONBLOCK as u32 == 0, token)
    }

@@ -124,19 +85,6 @@ impl KernelScheme for EventScheme {
            let handle = handles.get(&id).ok_or(Error::new(EBADF))?;
            handle.clone()
        };
-
-        if let Some((ref counter, _semaphore)) = queue.eventfd {
-            if buf.len() >= 8 {
-                let mut bytes = [0u8; 8];
-                buf.copy_to_slice(&mut bytes)?;
-                let val = u64::from_ne_bytes(bytes);
-                if val == u64::MAX { return Err(Error::new(EINVAL)); }
-                counter.fetch_add(val, Ordering::AcqRel);
-                return Ok(8);
-            }
-            return Err(Error::new(EINVAL));
-        }
-
        let mut events_written = 0;

        for chunk in buf.in_exact_chunks(size_of::<Event>()) {
@@ -18,9 +18,6 @@ use syscall::{
 use crate::context::file::InternalFlags;

 use super::{CallerCtx, HandleMap, OpenResult, SchemeExt, StrOrBytes};
-#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
-use crate::arch::device::{ioapic, local_apic::ApicId};
-
 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
 use crate::arch::interrupt::{available_irqs_iter, irq::acknowledge, is_reserved, set_reserved};
 #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))]
@@ -59,11 +56,8 @@ const INO_AVAIL: u64 = 0x8000_0000_0000_0000;
 const INO_BSP: u64 = 0x8001_0000_0000_0000;
 const INO_PHANDLE: u64 = 0x8003_0000_0000_0000;

-/// Add to the input queue, with iommu validation gate for MSI vectors
+/// Add to the input queue
 pub fn irq_trigger(irq: u8, token: &mut CleanLockToken) {
-    if irq >= 16 && !iommu_validate_msi_irq(irq) {
-        return;
-    }
    COUNTS.lock()[irq as usize] += 1;
    let fds: SmallVec<[usize; 8]> = {
        HANDLES
@@ -83,17 +77,16 @@ pub fn irq_trigger(irq: u8, token: &mut CleanLockToken) {
 #[allow(dead_code)]
 enum Handle {
    SchemeRoot,
-    Irq { ack: AtomicUsize, irq: u8, cpu_id: LogicalCpuId },
+    Irq { ack: AtomicUsize, irq: u8 },
    Avail(LogicalCpuId),
    TopLevel,
    Phandle(u8, Vec<u8>),
    Bsp,
-    IrqAffinity { irq: u8, mask: AtomicUsize },
 }
 impl Handle {
    fn as_irq_handle(&self) -> Option<(&AtomicUsize, u8)> {
        match self {
-            &Self::Irq { ref ack, irq, cpu_id: _ } => Some((ack, irq)),
+            &Self::Irq { ref ack, irq } => Some((ack, irq)),
            _ => None,
        }
    }
@@ -147,7 +140,6 @@ impl IrqScheme {
                    Handle::Irq {
                        ack: AtomicUsize::new(0),
                        irq: irq_number,
-                        cpu_id: LogicalCpuId::BSP,
                    },
                    InternalFlags::empty(),
                )
@@ -166,7 +158,6 @@ impl IrqScheme {
                    Handle::Irq {
                        ack: AtomicUsize::new(0),
                        irq: irq_number,
-                        cpu_id,
                    },
                    InternalFlags::empty(),
                )
@@ -208,7 +199,6 @@ impl IrqScheme {
                    Handle::Irq {
                        ack: AtomicUsize::new(0),
                        irq: irq_number as u8,
-                        cpu_id: LogicalCpuId::new(0),
                    },
                    InternalFlags::empty(),
                )
@@ -224,14 +214,6 @@ const fn vector_to_irq(vector: u8) -> u8 {
    vector - 32
 }

-const fn msi_vector_is_valid(vector: u8) -> bool {
-    vector >= 32 && vector < 0xEF
-}
-
-fn iommu_validate_msi_irq(_irq: u8) -> bool {
-    true
-}
-
 impl crate::scheme::KernelScheme for IrqScheme {
    fn scheme_root(&self, token: &mut CleanLockToken) -> Result<usize> {
        let id = HANDLES.write(token.token()).insert(Handle::SchemeRoot);
@@ -298,21 +280,7 @@ impl crate::scheme::KernelScheme for IrqScheme {
                    InternalFlags::POSITIONED,
                )
            } else if let Some(path_str) = path_str.strip_prefix('/') {
-                let (irq_str, affinity) = path_str
-                    .trim_end_matches('/')
-                    .rsplit_once('/')
-                    .map(|(a, b)| (a, Some(b)))
-                    .unwrap_or((path_str.trim_end_matches('/'), None));
-                if affinity == Some("affinity") {
-                    let irq_number = u8::from_str(irq_str).or(Err(Error::new(ENOENT)))?;
-                    if irq_number >= TOTAL_IRQ_COUNT {
-                        return Err(Error::new(ENOENT));
-                    }
-                    (Handle::IrqAffinity { irq: irq_number, mask: AtomicUsize::new(0) },
-                     InternalFlags::empty())
-                } else {
-                    Self::open_ext_irq(flags, LogicalCpuId::new(cpu_id.into()), path_str)?
-                }
+                Self::open_ext_irq(flags, LogicalCpuId::new(cpu_id.into()), path_str)?
            } else {
                return Err(Error::new(ENOENT));
            }
@@ -339,20 +307,12 @@ impl crate::scheme::KernelScheme for IrqScheme {
            }
            #[cfg(not(dtb))]
            panic!("")
-        } else if let Some(rest) = path_str.strip_suffix("/affinity") {
-            let irq_number = u8::from_str(rest).or(Err(Error::new(ENOENT)))?;
-            if irq_number >= TOTAL_IRQ_COUNT {
-                return Err(Error::new(ENOENT));
-            }
-            (Handle::IrqAffinity { irq: irq_number, mask: AtomicUsize::new(0) },
-             InternalFlags::empty())
        } else if let Ok(plain_irq_number) = u8::from_str(path_str) {
            if plain_irq_number < BASE_IRQ_COUNT {
                (
                    Handle::Irq {
                        ack: AtomicUsize::new(0),
                        irq: plain_irq_number,
-                        cpu_id: LogicalCpuId::BSP,
                    },
                    InternalFlags::empty(),
                )
@@ -408,7 +368,6 @@ impl crate::scheme::KernelScheme for IrqScheme {
                }
            }
            Handle::Avail(cpu_id) => {
-                let mut listed = 0;
                for vector in available_irqs_iter(cpu_id).skip(opaque) {
                    let irq = vector_to_irq(vector);
                    if cpu_id == LogicalCpuId::BSP && irq < BASE_IRQ_COUNT {
@@ -422,9 +381,7 @@ impl crate::scheme::KernelScheme for IrqScheme {
                        name: &intermediate,
                        next_opaque_id: u64::from(vector) + 1,
                    })?;
-                    listed += 1;
                }
-                info!("irq getdents Avail: cpu_id={} opaque={} listed={}", cpu_id.get(), opaque, listed);
            }
            _ => return Err(Error::new(ENOTDIR)),
        }
@@ -459,14 +416,11 @@ impl crate::scheme::KernelScheme for IrqScheme {
        let handle = handles_guard.get(id)?;

        if let &Handle::Irq {
-            irq: handle_irq,
-            cpu_id: handle_cpu_id,
-            ..
+            irq: handle_irq, ..
        } = handle
            && handle_irq > BASE_IRQ_COUNT
        {
-            info!("irq close: unreserving vector {} on cpu_id={}", irq_to_vector(handle_irq), handle_cpu_id.get());
-            set_reserved(handle_cpu_id, irq_to_vector(handle_irq), false);
+            set_reserved(LogicalCpuId::BSP, irq_to_vector(handle_irq), false);
        }
        Ok(())
    }
@@ -482,32 +436,9 @@ impl crate::scheme::KernelScheme for IrqScheme {
        let handle = handles_guard.get(file)?;

        match handle {
-            &Handle::IrqAffinity { irq: _handle_irq, ref mask } => {
-                if buffer.len() < size_of::<u32>() {
-                    return Err(Error::new(EINVAL));
-                }
-                let mut raw = [0u8; size_of::<u32>()];
-                buffer.copy_to_slice(&mut raw)?;
-                let cpu_id = u32::from_ne_bytes(raw);
-                let cpus = CPUS.get().ok_or(Error::new(EIO))?;
-                if !cpus.contains(&(cpu_id as u8)) {
-                    return Err(Error::new(EINVAL));
-                }
-                // Reprogram the IOAPIC redirection entry for x86 targets.
-                // Non-IOAPIC IRQs (e.g. MSI) will return false -> EIO.
-                #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
-                {
-                    if !unsafe { ioapic::set_affinity(_handle_irq, ApicId::new(cpu_id)) } {
-                        return Err(Error::new(EIO));
-                    }
-                }
-                mask.store(cpu_id as usize, Ordering::Release);
-                Ok(size_of::<u32>())
-            }
            &Handle::Irq {
                irq: handle_irq,
                ack: ref handle_ack,
-                cpu_id: _,
            } => {
                if buffer.len() < size_of::<usize>() {
                    return Err(Error::new(EINVAL));
@@ -544,15 +475,6 @@ impl crate::scheme::KernelScheme for IrqScheme {
                st_nlink: 1,
                ..Default::default()
            },
-            Handle::IrqAffinity { irq, .. } => Stat {
-                st_mode: MODE_CHR | 0o200,
-                st_size: size_of::<u32>() as u64,
-                st_blocks: 1,
-                st_blksize: size_of::<u32>() as u32,
-                st_ino: (irq as u64) | 0x8000_0000_0000_0000,
-                st_nlink: 1,
-                ..Default::default()
-            },
            Handle::Bsp => Stat {
                st_mode: MODE_CHR | 0o400,
                st_size: size_of::<usize>() as u64,
@@ -594,9 +516,8 @@ impl crate::scheme::KernelScheme for IrqScheme {

        let scheme_path = match handle {
            Handle::Irq { irq, .. } => format!("irq:{}", irq),
-            Handle::IrqAffinity { irq, .. } => format!("irq:{}/affinity", irq),
            Handle::Bsp => "irq:bsp".to_owned(),
-            Handle::Avail(cpu_id) => format!("irq:cpu-{:02x}", cpu_id.get()),
+            Handle::Avail(cpu_id) => format!("irq:cpu-{:2x}", cpu_id.get()),
            Handle::Phandle(phandle, _) => format!("irq:phandle-{}", phandle),
            Handle::TopLevel => "irq:".to_owned(),
            _ => return Err(Error::new(EBADF)),
@@ -622,7 +543,6 @@ impl crate::scheme::KernelScheme for IrqScheme {
            Handle::Irq {
                irq: handle_irq,
                ack: ref handle_ack,
-                cpu_id: _,
            } => {
                if buffer.len() < size_of::<usize>() {
                    return Err(Error::new(EINVAL));
@@ -642,7 +562,7 @@ impl crate::scheme::KernelScheme for IrqScheme {
                buffer.write_u32(LogicalCpuId::BSP.get())?;
                Ok(size_of::<usize>())
            }
-            Handle::Avail(_) | Handle::TopLevel | Handle::Phandle(_, _) | Handle::SchemeRoot | Handle::IrqAffinity { .. } => {
+            Handle::Avail(_) | Handle::TopLevel | Handle::Phandle(_, _) | Handle::SchemeRoot => {
                Err(Error::new(EISDIR))
            }
        }
@@ -14,7 +14,7 @@ use alloc::{
 };
 use core::{
    str,
-    sync::atomic::{AtomicU64, AtomicUsize, Ordering},
+    sync::atomic::{AtomicUsize, Ordering},
 };
 use hashbrown::hash_map::{self, DefaultHashBuilder, HashMap};
 use spin::Once;
@@ -169,7 +169,6 @@ enum Handle {

 /// Schemes list
 static HANDLES: Once<RwLock<L1, HashMap<SchemeId, Handle>>> = Once::new();
-static SCHEME_GENERATIONS: Once<RwLock<L1, HashMap<SchemeId, AtomicU64>>> = Once::new();
 static SCHEME_LIST_NEXT_ID: AtomicUsize = AtomicUsize::new(MAX_GLOBAL_SCHEMES);
 static SCHEME_LIST_ID: AtomicUsize = AtomicUsize::new(0);

@@ -205,10 +204,6 @@ fn init_schemes() -> RwLock<L1, HashMap<SchemeId, Handle>> {
    RwLock::new(handles)
 }

-fn init_scheme_generations() -> RwLock<L1, HashMap<SchemeId, AtomicU64>> {
-    RwLock::new(HashMap::new())
-}
-
 /// Get a handle to a scheme.
 pub fn get_scheme(token: LockToken<'_, L0>, scheme_id: SchemeId) -> Result<KernelSchemes> {
    match handles().read(token).get(&scheme_id) {
@@ -217,33 +212,10 @@ pub fn get_scheme(token: LockToken<'_, L0>, scheme_id: SchemeId) -> Result<Kerne
    }
 }

-pub fn current_scheme_generation(token: LockToken<'_, L0>, scheme_id: SchemeId) -> u64 {
-    scheme_generations()
-        .read(token)
-        .get(&scheme_id)
-        .map(|generation| generation.load(Ordering::Acquire))
-        .unwrap_or(0)
-}
-
 fn handles<'a>() -> &'a RwLock<L1, HashMap<SchemeId, Handle>> {
    HANDLES.call_once(init_schemes)
 }

-fn scheme_generations<'a>() -> &'a RwLock<L1, HashMap<SchemeId, AtomicU64>> {
-    SCHEME_GENERATIONS.call_once(init_scheme_generations)
-}
-
-fn increment_scheme_generation(scheme_id: SchemeId, token: &mut CleanLockToken) {
-    match scheme_generations().write(token.token()).entry(scheme_id) {
-        hash_map::Entry::Occupied(entry) => {
-            entry.get().fetch_add(1, Ordering::AcqRel);
-        }
-        hash_map::Entry::Vacant(entry) => {
-            entry.insert(AtomicU64::new(1));
-        }
-    }
-}
-
 /// Scheme list type
 pub struct SchemeList;

@@ -288,14 +260,9 @@ impl SchemeList {

    /// Remove a scheme
    fn remove(&self, id: usize, token: &mut CleanLockToken) {
-        let scheme_id = SchemeId(id);
-        let scheme = handles().write(token.token()).remove(&scheme_id);
+        let scheme = handles().write(token.token()).remove(&SchemeId(id));

        assert!(scheme.is_some());
-        if let Some(Handle::Scheme(KernelSchemes::User(user))) = scheme.as_ref() {
-            user.inner.fail_pending_calls(token);
-        }
-        increment_scheme_generation(scheme_id, token);
        if let Some(Handle::Scheme(KernelSchemes::User(user))) = scheme
            && let Some(user) = Arc::into_inner(user.inner)
        {
@@ -320,32 +287,32 @@ impl KernelScheme for SchemeList {
        token: &mut CleanLockToken,
    ) -> Result<OpenResult> {
        let scheme_id = SchemeId(scheme_id);
-        let maybe_inner = {
-            let handles = handles().read(token.token());
-            match handles.get(&scheme_id).ok_or(Error::new(EBADF))? {
-                Handle::Scheme(KernelSchemes::User(UserScheme { inner })) => Some(inner.clone()),
-                Handle::SchemeCreationCapability => None,
-                _ => return Err(Error::new(EBADF)),
+        match handles()
+            .read(token.token())
+            .get(&scheme_id)
+            .ok_or(Error::new(EBADF))?
+        {
+            Handle::Scheme(KernelSchemes::User(UserScheme { inner })) => {
+                let inner = inner.clone();
+                assert!(scheme_id == inner.scheme_id);
+                let scheme = scheme_id;
+                let params = unsafe { user_buf.read_exact::<NewFdParams>()? };
+
+                return Ok(OpenResult::External(Arc::new(RwLock::new(
+                    FileDescription {
+                        scheme,
+                        number: params.number,
+                        offset: params.offset,
+                        flags: params.flags as u32,
+                        internal_flags: InternalFlags::from_extra0(params.internal_flags)
+                            .ok_or(Error::new(EINVAL))?,
+                    },
+                ))));
            }
+            Handle::SchemeCreationCapability => (),
+            _ => return Err(Error::new(EBADF)),
        };

-        if let Some(inner) = maybe_inner {
-            assert!(scheme_id == inner.scheme_id);
-            let params = unsafe { user_buf.read_exact::<NewFdParams>()? };
-
-            return Ok(OpenResult::External(Arc::new(RwLock::new(
-                FileDescription::new(
-                    scheme_id,
-                    params.number,
-                    params.offset,
-                    params.flags as u32,
-                    InternalFlags::from_extra0(params.internal_flags)
-                        .ok_or(Error::new(EINVAL))?,
-                    token,
-                ),
-            ))));
-        }
-
        const EXPECTED: &[u8] = b"create-scheme";
        let mut buf = [0u8; EXPECTED.len()];

@@ -810,7 +777,6 @@ pub struct CallerCtx {
    pub pid: usize,
    pub uid: u32,
    pub gid: u32,
-    pub groups: alloc::vec::Vec<u32>,
 }
 impl CallerCtx {
    pub fn filter_uid_gid(self, euid: u32, egid: u32) -> Self {
@@ -819,7 +785,6 @@ impl CallerCtx {
                pid: self.pid,
                uid: euid,
                gid: egid,
-                groups: self.groups,
            }
        } else {
            self
@@ -1,10 +1,5 @@
-use alloc::{
-    collections::VecDeque,
-    string::{String, ToString},
-    sync::Arc,
-    vec::Vec,
-};
-use core::sync::atomic::{AtomicUsize, Ordering};
+use alloc::{collections::VecDeque, sync::Arc, vec::Vec};
+use core::sync::atomic::{AtomicBool, AtomicUsize, Ordering};

 use syscall::{data::GlobalSchemes, CallFlags};

@@ -19,228 +14,67 @@ use crate::{
    sync::{CleanLockToken, Mutex, RwLock, WaitCondition, L1},
    syscall::{
        data::Stat,
-        error::{
-            Error, Result, EAGAIN, EBADF, EEXIST, EINVAL, EINTR, ENOENT, ENOTDIR, EPIPE,
-        },
-        flag::{
-            EventFlags, EVENT_READ, EVENT_WRITE, MODE_FIFO, O_ACCMODE, O_DIRECTORY,
-            O_NONBLOCK, O_RDONLY, O_RDWR, O_STAT, O_WRONLY,
-        },
+        error::{Error, Result, EAGAIN, EBADF, EINTR, EINVAL, ENOENT, EPIPE},
+        flag::{EventFlags, EVENT_READ, EVENT_WRITE, MODE_FIFO, O_NONBLOCK},
        usercopy::{UserSliceRo, UserSliceRw, UserSliceWo},
    },
 };

 use super::{CallerCtx, KernelScheme, OpenResult, SchemeExt, StrOrBytes};

-static PIPE_NEXT_ID: AtomicUsize = AtomicUsize::new(1);
+// TODO: Preallocate a number of scheme IDs, since there can only be *one* root namespace, and
+// therefore only *one* pipe scheme.
+static PIPE_NEXT_ID: AtomicUsize = AtomicUsize::new(0);

-#[derive(Clone)]
 enum Handle {
-    Endpoint(EndpointHandle),
+    Pipe(Arc<Pipe>),
    SchemeRoot,
 }

-#[derive(Clone, Copy, Eq, PartialEq)]
-enum EndpointKind {
-    Read,
-    Write,
-    ReadWrite,
-}
-
-impl EndpointKind {
-    fn can_read(self) -> bool {
-        matches!(self, Self::Read | Self::ReadWrite)
-    }
-
-    fn can_write(self) -> bool {
-        matches!(self, Self::Write | Self::ReadWrite)
-    }
-}
-
-#[derive(Clone)]
-struct EndpointHandle {
-    pipe: Arc<Pipe>,
-    kind: EndpointKind,
-    named: Option<Arc<NamedPipe>>,
-}
-
-struct NamedPipe {
-    path: String,
-    mode: u16,
-    active: Mutex<L1, Option<Arc<Pipe>>>,
-}
-
-static HANDLES: RwLock<L1, HashMap<usize, Handle>> =
-    RwLock::new(HashMap::with_hasher(DefaultHashBuilder::new()));
-static NAMED_PIPES: RwLock<L1, HashMap<String, Arc<NamedPipe>>> =
+// TODO: SLOB?
+static PIPES: RwLock<L1, HashMap<usize, Handle>> =
    RwLock::new(HashMap::with_hasher(DefaultHashBuilder::new()));

 const MAX_QUEUE_SIZE: usize = 65536;

-fn next_id() -> usize {
-    PIPE_NEXT_ID.fetch_add(1, Ordering::Relaxed)
-}
+// In almost all places where Rust (and LLVM) uses pointers, they are limited to nonnegative isize,
+// so this is fine.
+const WRITE_NOT_READ_BIT: usize = 1;

-fn endpoint_kind_from_flags(flags: usize) -> Result<EndpointKind> {
-    match flags & O_ACCMODE {
-        O_RDONLY => Ok(EndpointKind::Read),
-        O_WRONLY => Ok(EndpointKind::Write),
-        O_RDWR => Ok(EndpointKind::ReadWrite),
-        _ => Err(Error::new(EINVAL)),
-    }
-}
-
-fn validate_named_fifo_open(flags: usize) -> Result<()> {
-    if flags & O_DIRECTORY == O_DIRECTORY && flags & O_STAT != O_STAT {
-        return Err(Error::new(ENOTDIR));
-    }
-
-    let _ = endpoint_kind_from_flags(flags)?;
-    Ok(())
-}
-
-fn trigger_matching(
-    pipe: &Arc<Pipe>,
-    require_read: bool,
-    require_write: bool,
-    flags: EventFlags,
-    token: &mut CleanLockToken,
-) {
-    let ids = {
-        let handles = HANDLES.read(token.token());
-        handles
-            .iter()
-            .filter_map(|(id, handle)| match handle {
-                Handle::Endpoint(endpoint)
-                    if Arc::ptr_eq(&endpoint.pipe, pipe)
-                        && (!require_read || endpoint.kind.can_read())
-                        && (!require_write || endpoint.kind.can_write()) =>
-                {
-                    Some(*id)
-                }
-                _ => None,
-            })
-            .collect::<Vec<_>>()
-    };
-
-    for id in ids {
-        event::trigger(GlobalSchemes::Pipe.scheme_id(), id, flags, token);
-    }
-}
-
-fn open_endpoint(
-    pipe: Arc<Pipe>,
-    kind: EndpointKind,
-    named: Option<Arc<NamedPipe>>,
-    token: &mut CleanLockToken,
-) -> usize {
-    if kind.can_read() {
-        pipe.reader_count.fetch_add(1, Ordering::SeqCst);
-    }
-    if kind.can_write() {
-        pipe.writer_count.fetch_add(1, Ordering::SeqCst);
-    }
-
-    let id = next_id();
-    HANDLES.write(token.token()).insert(
-        id,
-        Handle::Endpoint(EndpointHandle { pipe, kind, named }),
-    );
-    id
-}
-
-fn drop_wait_conditions_if_possible(pipe: Arc<Pipe>, token: &mut CleanLockToken) {
-    if let Some(pipe) = Arc::into_inner(pipe) {
-        {
-            pipe.read_condition.into_drop(token);
-        }
-        {
-            pipe.write_condition.into_drop(token);
-        }
-    }
+fn from_raw_id(id: usize) -> (bool, usize) {
+    (id & WRITE_NOT_READ_BIT != 0, id & !WRITE_NOT_READ_BIT)
 }

 pub fn pipe(token: &mut CleanLockToken) -> Result<(usize, usize)> {
-    let pipe = Arc::new(Pipe::new());
-    let read_id = open_endpoint(Arc::clone(&pipe), EndpointKind::Read, None, token);
-    let write_id = open_endpoint(pipe, EndpointKind::Write, None, token);
+    // Bit 0 is used for WRITE_NOT_READ_BIT
+    let id = PIPE_NEXT_ID.fetch_add(2, Ordering::Relaxed);

-    Ok((read_id, write_id))
-}
+    PIPES.write(token.token()).insert(
+        id,
+        Handle::Pipe(Arc::new(Pipe {
+            queue: Mutex::new(VecDeque::new()),
+            read_condition: WaitCondition::new(),
+            write_condition: WaitCondition::new(),
+            writer_is_alive: AtomicBool::new(true),
+            reader_is_alive: AtomicBool::new(true),
+            has_run_dup: AtomicBool::new(false),
+            fd_queue: Mutex::new(VecDeque::new()),
+        })),
+    );

-pub fn named_pipe_exists(path: &str, token: &mut CleanLockToken) -> bool {
-    NAMED_PIPES.read(token.token()).contains_key(path)
-}
-
-pub fn create_named_pipe(
-    path: &str,
-    display_path: &str,
-    mode: u16,
-    flags: usize,
-    token: &mut CleanLockToken,
-) -> Result<usize> {
-    validate_named_fifo_open(flags)?;
-
-    let named = {
-        let mut named_pipes = NAMED_PIPES.write(token.token());
-        if named_pipes.contains_key(path) {
-            return Err(Error::new(EEXIST));
-        }
-
-        let named = Arc::new(NamedPipe {
-            path: display_path.to_string(),
-            mode,
-            active: Mutex::new(None),
-        });
-        named_pipes.insert(path.to_string(), Arc::clone(&named));
-        named
-    };
-
-    let kind = endpoint_kind_from_flags(flags)?;
-    let pipe = Arc::new(Pipe::new());
-    *named.active.lock(token.token()) = Some(Arc::clone(&pipe));
-
-    Ok(open_endpoint(pipe, kind, Some(named), token))
-}
-
-pub fn open_named_pipe(path: &str, flags: usize, token: &mut CleanLockToken) -> Result<Option<usize>> {
-    validate_named_fifo_open(flags)?;
-
-    let named = match NAMED_PIPES.read(token.token()).get(path) {
-        Some(named) => Arc::clone(named),
-        None => return Ok(None),
-    };
-
-    let kind = endpoint_kind_from_flags(flags)?;
-    let pipe = {
-        let mut active = named.active.lock(token.token());
-        match active.as_ref() {
-            Some(pipe) => Arc::clone(pipe),
-            None => {
-                let pipe = Arc::new(Pipe::new());
-                *active = Some(Arc::clone(&pipe));
-                pipe
-            }
-        }
-    };
-
-    Ok(Some(open_endpoint(pipe, kind, Some(named), token)))
-}
-
-pub fn unlink_named_pipe(path: &str, token: &mut CleanLockToken) -> bool {
-    NAMED_PIPES.write(token.token()).remove(path).is_some()
+    Ok((id, id | WRITE_NOT_READ_BIT))
 }

 pub struct PipeScheme;

 impl PipeScheme {
-    fn get_endpoint(id: usize, token: &mut CleanLockToken) -> Result<EndpointHandle> {
-        HANDLES
+    fn get_pipe(key: usize, token: &mut CleanLockToken) -> Result<Arc<Pipe>> {
+        PIPES
            .read(token.token())
-            .get(&id)
+            .get(&key)
            .and_then(|handle| match handle {
-                Handle::Endpoint(endpoint) => Some(endpoint.clone()),
-                Handle::SchemeRoot => None,
+                Handle::Pipe(pipe) => Some(Arc::clone(pipe)),
+                _ => None,
            })
            .ok_or(Error::new(EBADF))
    }
@@ -248,33 +82,32 @@ impl PipeScheme {

 impl KernelScheme for PipeScheme {
    fn scheme_root(&self, token: &mut CleanLockToken) -> Result<usize> {
-        let id = next_id();
-        HANDLES.write(token.token()).insert(id, Handle::SchemeRoot);
+        let id = PIPE_NEXT_ID.fetch_add(2, Ordering::Relaxed);
+        PIPES.write(token.token()).insert(id, Handle::SchemeRoot);
        Ok(id)
    }
-
    fn fevent(
        &self,
        id: usize,
        flags: EventFlags,
        token: &mut CleanLockToken,
    ) -> Result<EventFlags> {
-        let endpoint = Self::get_endpoint(id, token)?;
+        let (is_writer_not_reader, key) = from_raw_id(id);
+        let pipe = Self::get_pipe(key, token)?;

        let mut ready = EventFlags::empty();

-        if endpoint.kind.can_write()
+        if is_writer_not_reader
            && flags.contains(EVENT_WRITE)
-            && (endpoint.pipe.queue.lock(token.token()).len() <= MAX_QUEUE_SIZE
-                || endpoint.pipe.reader_count.load(Ordering::Acquire) == 0)
+            && (pipe.queue.lock(token.token()).len() <= MAX_QUEUE_SIZE
+                || !pipe.reader_is_alive.load(Ordering::Acquire))
        {
            ready |= EventFlags::EVENT_WRITE;
        }
-
-        if endpoint.kind.can_read()
+        if !is_writer_not_reader
            && flags.contains(EVENT_READ)
-            && (!endpoint.pipe.queue.lock(token.token()).is_empty()
-                || endpoint.pipe.writer_count.load(Ordering::Acquire) == 0)
+            && (!pipe.queue.lock(token.token()).is_empty()
+                || !pipe.writer_is_alive.load(Ordering::Acquire))
        {
            ready |= EventFlags::EVENT_READ;
        }
@@ -283,48 +116,46 @@ impl KernelScheme for PipeScheme {
    }

    fn close(&self, id: usize, token: &mut CleanLockToken) -> Result<()> {
-        let handle = HANDLES
-            .write(token.token())
-            .remove(&id)
-            .ok_or(Error::new(EBADF))?;
+        let (is_write_not_read, key) = from_raw_id(id);

-        let Handle::Endpoint(endpoint) = handle else {
-            return Ok(());
+        let pipe = Self::get_pipe(key, token)?;
+        let scheme_id = GlobalSchemes::Pipe.scheme_id();
+
+        let can_remove = if is_write_not_read {
+            pipe.writer_is_alive.store(false, Ordering::SeqCst);
+            event::trigger(scheme_id, key, EVENT_READ, token);
+            pipe.read_condition.notify(token);
+
+            !pipe.reader_is_alive.load(Ordering::SeqCst)
+        } else {
+            pipe.reader_is_alive.store(false, Ordering::SeqCst);
+            event::trigger(scheme_id, key | WRITE_NOT_READ_BIT, EVENT_WRITE, token);
+            pipe.write_condition.notify(token);
+
+            !pipe.writer_is_alive.load(Ordering::SeqCst)
        };

-        let mut last_reader = false;
-        let mut last_writer = false;
-
-        if endpoint.kind.can_read() {
-            last_reader = endpoint.pipe.reader_count.fetch_sub(1, Ordering::SeqCst) == 1;
-        }
-        if endpoint.kind.can_write() {
-            last_writer = endpoint.pipe.writer_count.fetch_sub(1, Ordering::SeqCst) == 1;
-        }
-
-        if last_writer {
-            trigger_matching(&endpoint.pipe, true, false, EVENT_READ, token);
-            endpoint.pipe.read_condition.notify(token);
-        }
-        if last_reader {
-            trigger_matching(&endpoint.pipe, false, true, EVENT_WRITE, token);
-            endpoint.pipe.write_condition.notify(token);
-        }
-
-        let no_readers = endpoint.pipe.reader_count.load(Ordering::SeqCst) == 0;
-        let no_writers = endpoint.pipe.writer_count.load(Ordering::SeqCst) == 0;
-        if no_readers && no_writers {
-            if let Some(named) = endpoint.named {
-                let mut active = named.active.lock(token.token());
-                if active
-                    .as_ref()
-                    .is_some_and(|active_pipe| Arc::ptr_eq(active_pipe, &endpoint.pipe))
+        if can_remove {
+            let handle = PIPES.write(token.token()).remove(&key);
+            if let Some(Handle::Pipe(pipe)) = handle
+                && let Some(pipe) = Arc::into_inner(pipe)
+            {
                {
-                    *active = None;
+                    pipe.read_condition.into_drop(token);
+                }
+                {
+                    pipe.write_condition.into_drop(token);
                }
            }
+        }

-            drop_wait_conditions_if_possible(endpoint.pipe, token);
+        if let Some(pipe) = Arc::into_inner(pipe) {
+            {
+                pipe.read_condition.into_drop(token);
+            }
+            {
+                pipe.write_condition.into_drop(token);
+            }
        }

        Ok(())
@@ -337,9 +168,9 @@ impl KernelScheme for PipeScheme {
        _ctx: CallerCtx,
        token: &mut CleanLockToken,
    ) -> Result<OpenResult> {
-        let endpoint = Self::get_endpoint(old_id, token)?;
+        let (is_writer_not_reader, key) = from_raw_id(old_id);

-        if !endpoint.kind.can_read() {
+        if is_writer_not_reader {
            return Err(Error::new(EBADF));
        }

@@ -349,17 +180,17 @@ impl KernelScheme for PipeScheme {
            return Err(Error::new(EINVAL));
        }

+        let pipe = Self::get_pipe(key, token)?;
+
+        if pipe.has_run_dup.swap(true, Ordering::SeqCst) {
+            return Err(Error::new(EBADF));
+        }
+
        Ok(OpenResult::SchemeLocal(
-            open_endpoint(
-                Arc::clone(&endpoint.pipe),
-                EndpointKind::Write,
-                endpoint.named,
-                token,
-            ),
+            key | WRITE_NOT_READ_BIT,
            InternalFlags::empty(),
        ))
    }
-
    fn kopenat(
        &self,
        id: usize,
@@ -369,47 +200,40 @@ impl KernelScheme for PipeScheme {
        _ctx: CallerCtx,
        token: &mut CleanLockToken,
    ) -> Result<OpenResult> {
-        let is_scheme_root = {
-            let handles = HANDLES.read(token.token());
-            match handles.get(&id) {
-                Some(Handle::SchemeRoot) => true,
-                Some(Handle::Endpoint(_)) => false,
-                None => return Err(Error::new(EBADF)),
-            }
-        };
+        let (_, key) = from_raw_id(id);

-        if is_scheme_root {
-                let path = user_buf.as_str().or(Err(Error::new(EINVAL)))?;
-                if !path.trim_start_matches('/').is_empty() {
-                    return Err(Error::new(ENOENT));
+        {
+            let guard = PIPES.read(token.token());
+            if let Some(Handle::SchemeRoot) = guard.get(&key) {
+            } else if let Some(Handle::Pipe(pipe_arc)) = guard.get(&key) {
+                let pipe = Arc::clone(pipe_arc);
+                drop(guard);
+
+                if user_buf.as_bytes() == b"write" {
+                    return Err(Error::new(EINVAL));
+                }
+
+                if pipe.has_run_dup.swap(true, Ordering::SeqCst) {
+                    return Err(Error::new(EBADF));
                }

-                let pipe = Arc::new(Pipe::new());
                return Ok(OpenResult::SchemeLocal(
-                    open_endpoint(pipe, EndpointKind::Read, None, token),
+                    key | WRITE_NOT_READ_BIT,
                    InternalFlags::empty(),
                ));
+            } else {
+                return Err(Error::new(EBADF));
+            }
        }

-        let endpoint = Self::get_endpoint(id, token)?;
-        if !endpoint.kind.can_read() {
-            return Err(Error::new(EBADF));
+        let path = user_buf.as_str().or(Err(Error::new(EINVAL)))?;
+        if !path.trim_start_matches('/').is_empty() {
+            return Err(Error::new(ENOENT));
        }

-        let path = user_buf.as_bytes();
-        if !path.is_empty() && path != b"write" {
-            return Err(Error::new(EINVAL));
-        }
+        let (read_id, _) = pipe(token)?;

-        Ok(OpenResult::SchemeLocal(
-            open_endpoint(
-                Arc::clone(&endpoint.pipe),
-                EndpointKind::Write,
-                endpoint.named,
-                token,
-            ),
-            InternalFlags::empty(),
-        ))
+        Ok(OpenResult::SchemeLocal(read_id, InternalFlags::empty()))
    }

    fn kread(
@@ -420,15 +244,16 @@ impl KernelScheme for PipeScheme {
        _stored_flags: u32,
        token: &mut CleanLockToken,
    ) -> Result<usize> {
-        let endpoint = Self::get_endpoint(id, token)?;
+        let (is_write_not_read, key) = from_raw_id(id);

-        if !endpoint.kind.can_read() {
+        if is_write_not_read {
            return Err(Error::new(EBADF));
        }
+        let pipe = Self::get_pipe(key, token)?;

        loop {
-            let vec = endpoint.pipe.queue.lock(token.token());
-            let (mut vec, mut lock_token) = vec.into_split();
+            let vec = pipe.queue.lock(token.token());
+            let (mut vec, mut token) = vec.into_split();

            let (s1, s2) = vec.as_slices();
            let s1_count = core::cmp::min(user_buf.len(), s1.len());
@@ -448,34 +273,28 @@ impl KernelScheme for PipeScheme {
            let _ = vec.drain(..bytes_read);

            if bytes_read > 0 {
-                drop(vec);
-                drop(lock_token);
-                trigger_matching(&endpoint.pipe, false, true, EVENT_WRITE, token);
-                endpoint.pipe.write_condition.notify(token);
+                event::trigger_locked(
+                    GlobalSchemes::Pipe.scheme_id(),
+                    key | WRITE_NOT_READ_BIT,
+                    EVENT_WRITE,
+                    token.token(),
+                );
+                pipe.write_condition.notify_locked(token.token());

                return Ok(bytes_read);
-            }
-
-            if user_buf.is_empty() {
+            } else if user_buf.is_empty() {
                return Ok(0);
            }

-            if endpoint.pipe.writer_count.load(Ordering::SeqCst) == 0 {
+            if !pipe.writer_is_alive.load(Ordering::SeqCst) {
                return Ok(0);
-            }
-            if fcntl_flags & O_NONBLOCK as u32 != 0 {
+            } else if fcntl_flags & O_NONBLOCK as u32 != 0 {
                return Err(Error::new(EAGAIN));
-            }
-            if !endpoint
-                .pipe
-                .read_condition
-                .wait(vec, "PipeRead::read", &mut lock_token)
-            {
+            } else if !pipe.read_condition.wait(vec, "PipeRead::read", &mut token) {
                return Err(Error::new(EINTR));
            }
        }
    }
-
    fn kwrite(
        &self,
        id: usize,
@@ -484,17 +303,18 @@ impl KernelScheme for PipeScheme {
        _stored_flags: u32,
        token: &mut CleanLockToken,
    ) -> Result<usize> {
-        let endpoint = Self::get_endpoint(id, token)?;
+        let (is_write_not_read, key) = from_raw_id(id);

-        if !endpoint.kind.can_write() {
+        if !is_write_not_read {
            return Err(Error::new(EBADF));
        }
+        let pipe = Self::get_pipe(key, token)?;

        loop {
-            let vec = endpoint.pipe.queue.lock(token.token());
-            let (mut vec, mut lock_token) = vec.into_split();
+            let vec = pipe.queue.lock(token.token());
+            let (mut vec, mut token) = vec.into_split();

-            if endpoint.pipe.reader_count.load(Ordering::Relaxed) == 0 {
+            if !pipe.reader_is_alive.load(Ordering::Relaxed) {
                return Err(Error::new(EPIPE));
            }

@@ -509,6 +329,7 @@ impl KernelScheme for PipeScheme {

            let mut bytes_written = 0;

+            // TODO: Modify VecDeque so that the unwritten portions can be accessed directly?
            for (idx, chunk) in src_buf.in_variable_chunks(TMPBUF_SIZE).enumerate() {
                let chunk_byte_count = match chunk.copy_common_bytes_to_slice(&mut tmp_buf) {
                    Ok(c) => c,
@@ -520,52 +341,41 @@ impl KernelScheme for PipeScheme {
            }

            if bytes_written > 0 {
-                drop(vec);
-                drop(lock_token);
-                trigger_matching(&endpoint.pipe, true, false, EVENT_READ, token);
-                endpoint.pipe.read_condition.notify(token);
+                event::trigger_locked(
+                    GlobalSchemes::Pipe.scheme_id(),
+                    key,
+                    EVENT_READ,
+                    token.token(),
+                );
+                pipe.read_condition.notify_locked(token.token());

                return Ok(bytes_written);
-            }
-
-            if user_buf.is_empty() {
+            } else if user_buf.is_empty() {
                return Ok(0);
            }

            if fcntl_flags & O_NONBLOCK as u32 != 0 {
                return Err(Error::new(EAGAIN));
-            }
-            if !endpoint
-                .pipe
+            } else if !pipe
                .write_condition
-                .wait(vec, "PipeWrite::write", &mut lock_token)
+                .wait(vec, "PipeWrite::write", &mut token)
            {
                return Err(Error::new(EINTR));
            }
        }
    }
-
-    fn kfpath(&self, id: usize, buf: UserSliceWo, token: &mut CleanLockToken) -> Result<usize> {
-        let endpoint = Self::get_endpoint(id, token)?;
-        if let Some(named) = endpoint.named {
-            buf.copy_common_bytes_from_slice(named.path.as_bytes())
-        } else {
-            buf.copy_common_bytes_from_slice("/scheme/pipe/".as_bytes())
-        }
+    fn kfpath(&self, _id: usize, buf: UserSliceWo, _token: &mut CleanLockToken) -> Result<usize> {
+        //TODO: construct useful path?
+        buf.copy_common_bytes_from_slice("/scheme/pipe/".as_bytes())
    }
-
-    fn kfstat(&self, id: usize, buf: UserSliceWo, token: &mut CleanLockToken) -> Result<()> {
-        let endpoint = Self::get_endpoint(id, token)?;
-        let mode = endpoint.named.map_or(0o666, |named| named.mode);
-
+    fn kfstat(&self, _id: usize, buf: UserSliceWo, _token: &mut CleanLockToken) -> Result<()> {
        buf.copy_exactly(&Stat {
-            st_mode: MODE_FIFO | mode,
+            st_mode: MODE_FIFO | 0o666,
            ..Default::default()
        })?;

        Ok(())
    }
-
    fn kfdwrite(
        &self,
        id: usize,
@@ -575,17 +385,23 @@ impl KernelScheme for PipeScheme {
        _metadata: &[u64],
        token: &mut CleanLockToken,
    ) -> Result<usize> {
-        let endpoint = Self::get_endpoint(id, token)?;
+        let (is_write_not_read, key) = from_raw_id(id);

-        if !endpoint.kind.can_write() {
+        if !is_write_not_read {
            return Err(Error::new(EBADF));
        }
+        let pipe = match Self::get_pipe(key, token) {
+            Ok(p) => p,
+            Err(e) => {
+                return Err(e);
+            }
+        };

        loop {
-            let vec = endpoint.pipe.fd_queue.lock(token.token());
-            let (mut vec, mut lock_token) = vec.into_split();
+            let vec = pipe.fd_queue.lock(token.token());
+            let (mut vec, mut token) = vec.into_split();

-            if endpoint.pipe.reader_count.load(Ordering::Relaxed) == 0 {
+            if !pipe.reader_is_alive.load(Ordering::Relaxed) {
                return Err(Error::new(EPIPE));
            }
            if descs.is_empty() {
@@ -605,24 +421,25 @@ impl KernelScheme for PipeScheme {
            let fds_written = vec.len() - before_len;

            if fds_written > 0 {
-                drop(vec);
-                drop(lock_token);
-                trigger_matching(&endpoint.pipe, true, false, EVENT_READ, token);
-                endpoint.pipe.read_condition.notify(token);
+                event::trigger_locked(
+                    GlobalSchemes::Pipe.scheme_id(),
+                    key,
+                    EVENT_READ,
+                    token.token(),
+                );
+                pipe.read_condition.notify_locked(token.token());

                return Ok(fds_written);
            }

-            if !endpoint
-                .pipe
+            if !pipe
                .write_condition
-                .wait(vec, "PipeWrite::write", &mut lock_token)
+                .wait(vec, "PipeWrite::write", &mut token)
            {
                return Err(Error::new(EINTR));
            }
        }
    }
-
    fn kfdread(
        &self,
        id: usize,
@@ -631,19 +448,25 @@ impl KernelScheme for PipeScheme {
        _metadata: &[u64],
        token: &mut CleanLockToken,
    ) -> Result<usize> {
-        let endpoint = Self::get_endpoint(id, token)?;
+        let (is_write_not_read, key) = from_raw_id(id);

-        if !endpoint.kind.can_read() {
+        if is_write_not_read {
            return Err(Error::new(EBADF));
        }
+        let pipe = match Self::get_pipe(key, token) {
+            Ok(p) => p,
+            Err(e) => {
+                return Err(e);
+            }
+        };

        if payload.is_empty() {
            return Ok(0);
        }

        loop {
-            let vec = endpoint.pipe.fd_queue.lock(token.token());
-            let (mut vec, mut lock_token) = vec.into_split();
+            let vec = pipe.fd_queue.lock(token.token());
+            let (mut vec, mut token) = vec.into_split();

            let fds_available = vec.len();
            let max_fds_read = payload.len() / size_of::<usize>();
@@ -656,33 +479,31 @@ impl KernelScheme for PipeScheme {
                        fds_to_transfer,
                        payload,
                        flags.contains(CallFlags::FD_CLOEXEC),
-                        &mut lock_token,
+                        &mut token,
                    )?;
                } else {
                    bulk_add_fds(
                        fds_to_transfer,
                        payload,
                        flags.contains(CallFlags::FD_CLOEXEC),
-                        &mut lock_token,
+                        &mut token,
                    )?;
                }

-                drop(vec);
-                drop(lock_token);
-                trigger_matching(&endpoint.pipe, false, true, EVENT_WRITE, token);
-                endpoint.pipe.write_condition.notify(token);
+                event::trigger_locked(
+                    GlobalSchemes::Pipe.scheme_id(),
+                    key | WRITE_NOT_READ_BIT,
+                    EVENT_WRITE,
+                    token.token(),
+                );
+                pipe.write_condition.notify_locked(token.token());

                return Ok(fds_to_read);
            }

-            if endpoint.pipe.writer_count.load(Ordering::SeqCst) == 0 {
+            if !pipe.writer_is_alive.load(Ordering::SeqCst) {
                return Ok(0);
-            }
-            if !endpoint
-                .pipe
-                .read_condition
-                .wait(vec, "PipeRead::read", &mut lock_token)
-            {
+            } else if !pipe.read_condition.wait(vec, "PipeRead::read", &mut token) {
                return Err(Error::new(EINTR));
            }
        }
@@ -690,23 +511,11 @@ impl KernelScheme for PipeScheme {
 }

 pub struct Pipe {
-    read_condition: WaitCondition,
-    write_condition: WaitCondition,
+    read_condition: WaitCondition, // signals whether there are available bytes to read
+    write_condition: WaitCondition, // signals whether there is room for additional bytes
    queue: Mutex<L1, VecDeque<u8>>,
-    reader_count: AtomicUsize,
-    writer_count: AtomicUsize,
+    reader_is_alive: AtomicBool, // starts set, unset when reader closes
+    writer_is_alive: AtomicBool, // starts set, unset when writer closes
+    has_run_dup: AtomicBool,
    fd_queue: Mutex<L1, VecDeque<Arc<LockedFileDescription>>>,
 }
-
-impl Pipe {
-    fn new() -> Self {
-        Self {
-            read_condition: WaitCondition::new(),
-            write_condition: WaitCondition::new(),
-            queue: Mutex::new(VecDeque::new()),
-            reader_count: AtomicUsize::new(0),
-            writer_count: AtomicUsize::new(0),
-            fd_queue: Mutex::new(VecDeque::new()),
-        }
-    }
-}
@@ -105,7 +105,6 @@ enum ContextHandle {
    // Attr handles, to set ens/euid/egid/pid.
    Authority,
    Attr,
-    Groups,

    Status {
        privileged: bool,
@@ -262,7 +261,6 @@ impl ProcScheme {
                let handle = match actual_name {
                    "attrs" => ContextHandle::Attr,
                    "status" => ContextHandle::Status { privileged: true },
-                    "groups" => ContextHandle::Groups,
                    _ => return Err(Error::new(ENOENT)),
                };

@@ -308,11 +306,6 @@ impl ProcScheme {
                        let id = NonZeroUsize::new(NEXT_ID.fetch_add(1, Ordering::Relaxed))
                            .ok_or(Error::new(EMFILE))?;
                        let context = context::spawn(true, Some(id), ret, token)?;
-                        {
-                            let parent_groups =
-                                context::current().read(token.token()).groups.clone();
-                            context.write(token.token()).groups = parent_groups;
-                        }
                        HANDLES.write(token.token()).insert(
                            id.get(),
                            Handle {
@@ -432,7 +425,6 @@ impl KernelScheme for ProcScheme {
    }

    fn close(&self, id: usize, token: &mut CleanLockToken) -> Result<()> {
-        let mut inner_token = unsafe { CleanLockToken::new() };
        let handle = HANDLES
            .write(token.token())
            .remove(&id)
@@ -460,7 +452,9 @@ impl KernelScheme for ProcScheme {
                    ))]
                    regs.set_arg1(arg1);

-                    Ok(context.set_addr_space(Some(new), inner_token.downgrade()))
+                    // TODO: Lock ordering violation
+                    let mut token = unsafe { CleanLockToken::new() };
+                    Ok(context.set_addr_space(Some(new), token.downgrade()))
                })?;
                if let Some(old_ctx) = old_ctx
                    && let Some(addrspace) = Arc::into_inner(old_ctx)
@@ -499,7 +493,6 @@ impl KernelScheme for ProcScheme {
        consume: bool,
        token: &mut CleanLockToken,
    ) -> Result<usize> {
-        let mut inner_token = unsafe { CleanLockToken::new() };
        let handle = HANDLES
            .read(token.token())
            .get(&id)
@@ -590,7 +583,9 @@ impl KernelScheme for ProcScheme {
                };
                // TODO: Allocated or AllocatedShared?
                let addrsp = AddrSpace::current()?;
-                let page = addrsp.acquire_write(inner_token.downgrade()).mmap_anywhere(
+                // TODO: Lock ordering violation
+                let mut token = unsafe { CleanLockToken::new() };
+                let page = addrsp.acquire_write(token.downgrade()).mmap_anywhere(
                    &addrsp,
                    NonZeroUsize::new(1).unwrap(),
                    MapFlags::PROT_READ | MapFlags::PROT_WRITE,
@@ -854,17 +849,17 @@ impl KernelScheme for ProcScheme {
    }
 }
 fn extract_scheme_number(fd: usize, token: &mut CleanLockToken) -> Result<(KernelSchemes, usize)> {
-    let desc = {
+    let (scheme_id, number) = {
        let current_lock = context::current();
        let mut current = current_lock.read(token.token());
-        let (context, mut context_token) = current.token_split();
+        let (context, mut token) = current.token_split();
        let file_descriptor = context
-            .get_file(FileHandle::from(fd), &mut context_token)
+            .get_file(FileHandle::from(fd), &mut token)
            .ok_or(Error::new(EBADF))?;
-        *file_descriptor.description.read(context_token.token())
+        let desc = file_descriptor.description.read(token.token());
+        (desc.scheme, desc.number)
    };
-    let scheme = desc.get_scheme(token)?;
-    let number = desc.number;
+    let scheme = scheme::get_scheme(token.token(), scheme_id)?;

    Ok((scheme, number))
 }
@@ -1276,39 +1271,6 @@ impl ContextHandle {
                guard.prio = (info.prio as usize).min(39);
                Ok(size_of::<ProcSchemeAttrs>())
            }
-            Self::Groups => {
-                const NGROUPS_MAX: usize = 65536;
-                if buf.len() % size_of::<u32>() != 0 {
-                    return Err(Error::new(EINVAL));
-                }
-                let count = buf.len() / size_of::<u32>();
-                if count > NGROUPS_MAX {
-                    return Err(Error::new(EINVAL));
-                }
-                let mut groups = Vec::with_capacity(count);
-                for chunk in buf.in_exact_chunks(size_of::<u32>()).take(count) {
-                    groups.push(chunk.read_u32()?);
-                }
-                let proc_id = {
-                    let guard = context.read(token.token());
-                    guard.owner_proc_id
-                };
-                {
-                    let mut guard = context.write(token.token());
-                    guard.groups = groups.clone();
-                }
-                if let Some(pid) = proc_id {
-                    let mut contexts = context::contexts(token.downgrade());
-                    let (contexts, mut t) = contexts.token_split();
-                    for context_ref in contexts.iter() {
-                        let mut ctx = context_ref.write(t.token());
-                        if ctx.owner_proc_id == Some(pid) {
-                            ctx.groups = groups.clone();
-                        }
-                    }
-                }
-                Ok(count * size_of::<u32>())
-            }
            ContextHandle::OpenViaDup => {
                let mut args = buf.usizes();

@@ -1513,15 +1475,6 @@ impl ContextHandle {
                    debug_name,
                })
            }
-            Self::Groups => {
-                let c = &context.read(token.token());
-                let max = buf.len() / size_of::<u32>();
-                let count = c.groups.len().min(max);
-                for (chunk, gid) in buf.in_exact_chunks(size_of::<u32>()).zip(&c.groups).take(count) {
-                    chunk.copy_from_slice(&gid.to_ne_bytes())?;
-                }
-                Ok(count * size_of::<u32>())
-            }
            ContextHandle::Sighandler => {
                let data = match context.read(token.token()).sig {
                    Some(ref sig) => SetSighandlerData {
@@ -80,7 +80,6 @@ const ONE: NonZeroUsize = match NonZeroUsize::new(1) {
    Some(one) => one,
    None => unreachable!(),
 };
-const MAX_SPURIOUS_WAKEUPS: usize = 100;

 enum ParsedCqe {
    TriggerFevent {
@@ -210,8 +209,6 @@ impl UserInner {
        caller_responsible: &mut PageSpan,
        token: &mut CleanLockToken,
    ) -> Result<Response> {
-        let mut remaining_spurious_wakeups = MAX_SPURIOUS_WAKEUPS;
-
        {
            // Disable preemption to avoid context switches between setting the
            // process state and sending the scheme request. The process is made
@@ -264,10 +261,7 @@ impl UserInner {
                    };

                let states = self.states.lock(token.token());
-                let (mut states, mut state_token) = states.into_split();
-                let mut timed_out_descriptions = None;
-                let mut remove_state = false;
-                let mut timed_out = false;
+                let (mut states, mut token) = states.into_split();
                match states.get_mut(sqe.tag as usize) {
                    // invalid state
                    None => return Err(Error::new(EBADFD)),
@@ -280,35 +274,24 @@ impl UserInner {
                            fds,
                        } => {
                            let maybe_eintr =
-                                eintr_if_sigkill(&mut callee_responsible, &mut state_token.token());
-
-                            if maybe_eintr.is_ok() {
-                                remaining_spurious_wakeups =
-                                    remaining_spurious_wakeups.saturating_sub(1);
-                            }
-
-                            if maybe_eintr.is_ok() && remaining_spurious_wakeups == 0 {
-                                timed_out_descriptions = Some(Self::collect_descriptions_to_close(fds));
-                                remove_state = true;
-                            } else {
-                                *o = State::Waiting {
-                                    canceling: true,
-                                    callee_responsible,
-                                    context,
-                                    fds,
-                                };
-                            }
+                                eintr_if_sigkill(&mut callee_responsible, &mut token.token());
+                            *o = State::Waiting {
+                                canceling: true,
+                                callee_responsible,
+                                context,
+                                fds,
+                            };

                            maybe_eintr?;

-                            if remove_state {
-                                states.remove(sqe.tag as usize);
-                                timed_out = true;
-                            } else {
-                                context::current()
-                                    .write(state_token.token())
-                                    .block("UserInner::call (woken up after cancelation request)");
-                            }
+                            context::current()
+                                .write(token.token())
+                                .block("UserInner::call (woken up after cancelation request)");
+
+                            // We do not want to drop the lock before blocking
+                            // as if we get preempted in between we might miss a
+                            // wakeup.
+                            drop(states);
                        }
                        // spurious wakeup
                        State::Waiting {
@@ -317,76 +300,60 @@ impl UserInner {
                            context,
                            mut callee_responsible,
                        } => {
+                            let maybe_eintr = eintr_if_sigkill(&mut callee_responsible, &mut token);
                            let current_context = context::current();
-                            let maybe_eintr =
-                                eintr_if_sigkill(&mut callee_responsible, &mut state_token);

-                            if maybe_eintr.is_ok() {
-                                remaining_spurious_wakeups =
-                                    remaining_spurious_wakeups.saturating_sub(1);
-                            }
-
-                            if maybe_eintr.is_ok() && remaining_spurious_wakeups == 0 {
-                                timed_out_descriptions = Some(Self::collect_descriptions_to_close(fds));
-                                remove_state = true;
-                            } else {
-                                *o = State::Waiting {
-                                    // Currently we treat all spurious wakeups to have the same behavior
-                                    // as signals (i.e., we send a cancellation request). It is not something
-                                    // that should happen, but it certainly can happen, for example if a context
-                                    // is awoken through its thread handle without setting any sig bits, or if the
-                                    // caller clears its own sig bits. If it actually is a signal, then it is the
-                                    // intended behavior.
-                                    canceling: true,
-                                    fds,
-                                    context,
-                                    callee_responsible,
-                                };
-                            }
+                            *o = State::Waiting {
+                                // Currently we treat all spurious wakeups to have the same behavior
+                                // as signals (i.e., we send a cancellation request). It is not something
+                                // that should happen, but it certainly can happen, for example if a context
+                                // is awoken through its thread handle without setting any sig bits, or if the
+                                // caller clears its own sig bits. If it actually is a signal, then it is the
+                                // intended behavior.
+                                canceling: true,
+                                fds,
+                                context,
+                                callee_responsible,
+                            };

                            maybe_eintr?;

-                            if remove_state {
-                                states.remove(sqe.tag as usize);
-                                timed_out = true;
-                            } else {
-                                // We do not want to preempt between sending the
-                                // cancellation and blocking again where we might
-                                // miss a wakeup.
-                                let mut preempt =
-                                    PreemptGuardL1::new(&current_context, &mut state_token);
-                                let token = preempt.token();
+                            // We do not want to preempt between sending the
+                            // cancellation and blocking again where we might
+                            // miss a wakeup.
+                            let mut preempt = PreemptGuardL1::new(&current_context, &mut token);
+                            let token = preempt.token();

-                                self.todo.send_locked(
-                                    Sqe {
-                                        opcode: Opcode::Cancel as u8,
-                                        sqe_flags: SqeFlags::ONEWAY,
-                                        tag: sqe.tag,
-                                        ..Default::default()
-                                    },
-                                    token.token(),
-                                );
-                                event::trigger_locked(
-                                    self.root_id,
-                                    self.scheme_id.get(),
-                                    EVENT_READ,
-                                    token.token(),
-                                );
+                            self.todo.send_locked(
+                                Sqe {
+                                    opcode: Opcode::Cancel as u8,
+                                    sqe_flags: SqeFlags::ONEWAY,
+                                    tag: sqe.tag,
+                                    ..Default::default()
+                                },
+                                token.token(),
+                            );
+                            event::trigger_locked(
+                                self.root_id,
+                                self.scheme_id.get(),
+                                EVENT_READ,
+                                token.token(),
+                            );

-                                // 1. If cancellation was requested and arrived
-                                // before the scheme processed the request, an
-                                // acknowledgement will be sent back after the
-                                // cancellation is processed and we will be woken up
-                                // again. State will be State::Responded then.
-                                //
-                                // 2. If cancellation was requested but the scheme
-                                // already processed the request, we will receive
-                                // the actual response next and woken up again.
-                                // State will be State::Responded then.
-                                context::current()
-                                    .write(token.token())
-                                    .block("UserInner::call (spurious wakeup)");
-                            }
+                            // 1. If cancellation was requested and arrived
+                            // before the scheme processed the request, an
+                            // acknowledgement will be sent back after the
+                            // cancellation is processed and we will be woken up
+                            // again. State will be State::Responded then.
+                            //
+                            // 2. If cancellation was requested but the scheme
+                            // already processed the request, we will receive
+                            // the actual response next and woken up again.
+                            // State will be State::Responded then.
+                            context::current()
+                                .write(token.token())
+                                .block("UserInner::call (spurious wakeup)");
+                            drop(states);
                        }

                        // invalid state
@@ -401,70 +368,10 @@ impl UserInner {
                        }
                    },
                }
-
-                if let Some(descriptions) = timed_out_descriptions {
-                    drop(states);
-                    for desc in descriptions {
-                        let _ = desc.try_close(token);
-                    }
-                }
-
-                if timed_out {
-                    return Err(Error::new(ETIMEDOUT));
-                }
            }
        }
    }

-    fn collect_descriptions_to_close(
-        fds: Vec<Arc<LockedFileDescription>>,
-    ) -> Vec<FileDescription> {
-        fds.into_iter()
-            .filter_map(|fd| Arc::try_unwrap(fd).ok())
-            .map(RwLock::into_inner)
-            .collect()
-    }
-
-    pub fn fail_pending_calls(&self, token: &mut CleanLockToken) {
-        let descriptions_to_close = {
-            let mut states_lock = self.states.lock(token.token());
-            let (states, mut lock_token) = states_lock.token_split();
-            let mut descriptions_to_close = Vec::new();
-            let mut states_to_remove = Vec::new();
-
-            for (id, state) in states.iter_mut() {
-                match mem::replace(state, State::Placeholder) {
-                    State::Waiting { context, fds, .. } => {
-                        descriptions_to_close.extend(Self::collect_descriptions_to_close(fds));
-
-                        match context.upgrade() {
-                            Some(context) => {
-                                *state = State::Responded(Response::Regular(
-                                    Err(Error::new(ENODEV)),
-                                    0,
-                                    false,
-                                ));
-                                context.write(lock_token.token()).unblock();
-                            }
-                            None => states_to_remove.push(id),
-                        }
-                    }
-                    old_state => *state = old_state,
-                }
-            }
-
-            for id in states_to_remove {
-                states.remove(id);
-            }
-
-            descriptions_to_close
-        };
-
-        for desc in descriptions_to_close {
-            let _ = desc.try_close(token);
-        }
-    }
-
    /// Map a readable structure to the scheme's userspace and return the
    /// pointer
    #[must_use = "copying back to head/tail buffers can fail"]
@@ -1376,7 +1283,6 @@ impl UserInner {
    }

    pub fn into_drop(self, token: &mut CleanLockToken) {
-        self.fail_pending_calls(token);
        self.todo.condition.into_drop(token);
    }
 }
@@ -74,16 +74,14 @@ impl MemoryEntry {
 }

 struct MemoryMap {
-    entries: [MemoryEntry; 1024],
+    entries: [MemoryEntry; 512],
    size: usize,
 }

 impl MemoryMap {
    fn register(&mut self, base: usize, size: usize, kind: BootloaderMemoryKind) {
        if self.size >= self.entries.len() {
-            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-            unsafe { core::arch::asm!("out dx, al", in("dx") 0x3F8u16, in("al") b'!', options(nostack, preserves_flags)); }
-            panic!("Early memory map overflow at entry {} (max {})", self.size, self.entries.len());
+            panic!("Early memory map overflow!");
        }
        let start = if kind == BootloaderMemoryKind::Free {
            align_up(base)
@@ -136,7 +134,7 @@ static MEMORY_MAP: SyncUnsafeCell<MemoryMap> = SyncUnsafeCell::new(MemoryMap {
        start: 0,
        end: 0,
        kind: BootloaderMemoryKind::Null,
-    }; 1024],
+    }; 512],
    size: 0,
 });

@@ -325,16 +323,7 @@ unsafe fn map_memory<A: Arch>(areas: &[MemoryArea], mut bump_allocator: &mut Bum
            }
        }

-        let kernel_area = match (*MEMORY_MAP.get()).kernel() {
-            Some(area) => area,
-            None => {
-                println!("FATAL: kernel memory area not found in boot memory map");
-                println!("Cannot determine kernel base address. Halting.");
-                loop {
-                    core::hint::spin_loop();
-                }
-            }
-        };
+        let kernel_area = (*MEMORY_MAP.get()).kernel().unwrap();
        let kernel_base = kernel_area.start;
        let kernel_size = kernel_area.end.saturating_sub(kernel_area.start);
        // Map kernel at KERNEL_OFFSET
@@ -149,15 +149,6 @@ static BOOTSTRAP: spin::Once<Bootstrap> = spin::Once::new();
 pub(crate) static AP_READY: AtomicBool = AtomicBool::new(false);
 static BSP_READY: AtomicBool = AtomicBool::new(false);

-#[cold]
-fn halt_boot(message: &str) -> ! {
-    print!("{message}");
-    println!("Kernel boot cannot continue. Halting.");
-    loop {
-        hint::spin_loop();
-    }
-}
-
 /// This is the kernel entry point for the primary CPU. The arch crate is responsible for calling this
 pub(crate) fn kmain(bootstrap: Bootstrap) -> ! {
    let mut token = unsafe { CleanLockToken::new() };
@@ -189,7 +180,9 @@ pub(crate) fn kmain(bootstrap: Bootstrap) -> ! {
            context.euid = 0;
            context.egid = 0;
        }
-        Err(_err) => halt_boot("FATAL: failed to spawn first userspace process userspace_init\n"),
+        Err(err) => {
+            panic!("failed to spawn userspace_init: {:?}", err);
+        }
    }

    run_userspace(&mut token)
@@ -1,188 +0,0 @@
-//! MCS (Mellor-Crummey Scott) fair spinlock.
-//!
-//! Each waiter spins on its own local `locked` flag instead of a shared lock
-//! word, eliminating cache-line bouncing under contention. FIFO ordering
-//! guarantees fairness. O(1) cache-line transfers on unlock.
-//!
-//! Supports transitive priority inheritance: when CPU A waits on a lock held
-//! by CPU B, and CPU B waits on a lock held by CPU C, A's priority is
-//! propagated through the chain to C (up to MAX_PI_CHAIN_DEPTH hops).
-
-use core::sync::atomic::{AtomicBool, AtomicPtr, AtomicU32, Ordering};
-use core::{hint, ptr};
-
-use crate::percpu::PercpuBlock;
-
-/// Maximum depth for transitive priority inheritance chain following.
-/// Prevents infinite loops from theoretical lock cycles and bounds latency.
-/// Linux uses 20; 8 is conservative for a microkernel with fewer nesting levels.
-const MAX_PI_CHAIN_DEPTH: u32 = 8;
-
-/// A node in the MCS lock queue.
-pub struct McsNode {
-    pub next: AtomicPtr<McsNode>,
-    pub locked: AtomicBool,
-}
-
-impl McsNode {
-    pub const fn new() -> Self {
-        Self {
-            next: AtomicPtr::new(ptr::null_mut()),
-            locked: AtomicBool::new(false),
-        }
-    }
-}
-
-/// Raw MCS spinlock primitive.
-pub struct McsRawLock {
-    tail: AtomicPtr<McsNode>,
-    /// CPU ID of the current lock holder (for priority inheritance).
-    /// `u32::MAX` means no holder.
-    holder_cpu: AtomicU32,
-}
-
-impl McsRawLock {
-    pub const fn new() -> Self {
-        Self {
-            tail: AtomicPtr::new(ptr::null_mut()),
-            holder_cpu: AtomicU32::new(u32::MAX),
-        }
-    }
-
-    #[inline]
-    pub fn acquire(&self, node: &McsNode) -> bool {
-        node.next.store(ptr::null_mut(), Ordering::Relaxed);
-        node.locked.store(true, Ordering::Relaxed);
-        let prev = self.tail.swap((node as *const McsNode).cast_mut(), Ordering::AcqRel);
-        if prev.is_null() {
-            // Uncontended — record ourselves as holder
-            let cpu_id = PercpuBlock::current().cpu_id.get();
-            self.holder_cpu.store(cpu_id, Ordering::Release);
-            return false;
-        }
-        unsafe {
-            (*prev).next.store((node as *const McsNode).cast_mut(), Ordering::Release);
-        }
-        let percpu = PercpuBlock::current();
-        // Record which lock we're spinning on (for transitive PI chain following)
-        percpu.waiting_on_lock.store(
-            (self as *const McsRawLock).cast_mut(),
-            Ordering::Release,
-        );
-        let mut donated = false;
-        while node.locked.load(Ordering::Acquire) {
-            percpu.maybe_handle_tlb_shootdown();
-            // Donate priority to the lock holder (transitively) once per acquisition
-            if !donated {
-                self.maybe_donate_priority(percpu);
-                donated = true;
-            }
-            hint::spin_loop();
-        }
-        // Clear waiting_on_lock before proceeding — we now hold the lock
-        percpu.waiting_on_lock.store(ptr::null_mut(), Ordering::Release);
-        self.holder_cpu.store(percpu.cpu_id.get(), Ordering::Release);
-        true
-    }
-
-    #[inline]
-    pub fn release(&self, node: &McsNode) {
-        // Clear priority inheritance donation — we no longer hold the lock
-        PercpuBlock::current().pi_donated_prio.store(u32::MAX, Ordering::Release);
-        // Clear holder CPU
-        self.holder_cpu.store(u32::MAX, Ordering::Release);
-
-        let next = node.next.load(Ordering::Acquire);
-        if next.is_null() {
-            if self
-                .tail
-                .compare_exchange(
-                    (node as *const McsNode).cast_mut(),
-                    ptr::null_mut(),
-                    Ordering::AcqRel,
-                    Ordering::Acquire,
-                )
-                .is_ok()
-            {
-                return;
-            }
-            while node.next.load(Ordering::Acquire).is_null() {
-                hint::spin_loop();
-            }
-        }
-        unsafe {
-            (*node.next.load(Ordering::Acquire)).locked.store(false, Ordering::Release);
-        }
-    }
-
-    #[inline]
-    pub fn try_acquire(&self, node: &McsNode) -> bool {
-        node.next.store(ptr::null_mut(), Ordering::Relaxed);
-        node.locked.store(true, Ordering::Relaxed);
-        let ok = self
-            .tail
-            .compare_exchange(
-                ptr::null_mut(),
-                (node as *const McsNode).cast_mut(),
-                Ordering::AcqRel,
-                Ordering::Acquire,
-            )
-            .is_ok();
-        if ok {
-            let cpu_id = PercpuBlock::current().cpu_id.get();
-            self.holder_cpu.store(cpu_id, Ordering::Release);
-        }
-        ok
-    }
-
-    /// Donate current CPU's context priority to the lock holder's CPU,
-    /// following the PI chain transitively (A→B→C).
-    ///
-    /// Reads priority from PercpuBlock::current_prio (cached by the scheduler)
-    /// to avoid acquiring any lock in the MCS spin loop.
-    ///
-    /// Chain following: if the holder is itself waiting on another lock,
-    /// we propagate our priority to that lock's holder too, up to
-    /// MAX_PI_CHAIN_DEPTH hops.
-    fn maybe_donate_priority(&self, my_percpu: &PercpuBlock) {
-        let my_prio = my_percpu.current_prio.get() as u32;
-        let mut current_holder_cpu = self.holder_cpu.load(Ordering::Relaxed);
-
-        for _ in 0..MAX_PI_CHAIN_DEPTH {
-            if current_holder_cpu == u32::MAX {
-                return;
-            }
-            let holder_percpu = crate::percpu::get_for_cpu(
-                crate::cpu_set::LogicalCpuId::new(current_holder_cpu),
-            );
-            let Some(holder) = holder_percpu else {
-                return;
-            };
-
-            // Donate if our priority is higher (lower number) than current donation
-            let current_donated = holder.pi_donated_prio.load(Ordering::Relaxed);
-            if my_prio < current_donated {
-                holder.pi_donated_prio.store(my_prio, Ordering::Release);
-            }
-
-            // Follow the chain: is this holder also waiting on another lock?
-            let next_lock_ptr = holder.waiting_on_lock.load(Ordering::Relaxed);
-            if next_lock_ptr.is_null() {
-                return;
-            }
-            // SAFETY: The pointed-to McsRawLock is a long-lived struct field
-            // (e.g., part of the run queue). The holder is currently spinning
-            // in acquire(), so the pointer is valid. We only read holder_cpu
-            // (an atomic u32) — no mutable access needed.
-            let next_holder_cpu =
-                unsafe { (*next_lock_ptr).holder_cpu.load(Ordering::Relaxed) };
-
-            // Cycle detection: if the next holder is the same CPU we just visited, stop
-            if next_holder_cpu == current_holder_cpu {
-                return;
-            }
-            current_holder_cpu = next_holder_cpu;
-        }
-        // Chain depth exhausted — stop to bound latency
-    }
-}
@@ -1,6 +1,5 @@
 pub use self::{ordered::*, wait_condition::WaitCondition, wait_queue::WaitQueue};

-pub mod mcs;
 pub mod ordered;
 pub mod wait_condition;
 pub mod wait_queue;
@@ -52,9 +52,7 @@
 //! *g1 = 12;
 //! ```
 use alloc::sync::Arc;
-use core::cell::UnsafeCell;
 use core::marker::PhantomData;
-use core::ptr;

 use crate::percpu::PercpuBlock;

@@ -734,143 +732,3 @@ impl<L: Level, T> Drop for ArcRwLockWriteGuard<L, T> {
 /// This function can only be called if no lock is held by the calling thread/task
 #[inline]
 pub fn check_no_locks(_: LockToken<'_, L0>) {}
-
-// ---------------------------------------------------------------------------
-// MCS-based fair mutex (McsMutex)
-// ---------------------------------------------------------------------------
-
-/// A mutual exclusion lock using the MCS fair spinlock algorithm.
-///
-/// Unlike `Mutex<L, T>` which uses a simple spinlock (no fairness under
-/// contention), `McsMutex` uses Mellor-Crummey Scott queue-based spinning:
-///
-/// - Each waiter spins on its **own** local flag — no shared cache-line bouncing.
-/// - FIFO ordering prevents starvation.
-/// - O(1) cache-line transfers on unlock.
-///
-/// The MCS node is stored in [`crate::percpu::PercpuBlock::mcs_sched_node`], so
-/// this type is suitable for scheduler-internal locks where the holder is always
-/// the current CPU.
-pub struct McsMutex<L: Level, T> {
-    raw: crate::sync::mcs::McsRawLock,
-    data: UnsafeCell<T>,
-    _phantom: PhantomData<L>,
-}
-
-unsafe impl<L: Level, T: Send> Sync for McsMutex<L, T> {}
-unsafe impl<L: Level, T: Send> Send for McsMutex<L, T> {}
-
-impl<L: Level, T> McsMutex<L, T> {
-    pub const fn new(val: T) -> Self {
-        Self {
-            raw: crate::sync::mcs::McsRawLock::new(),
-            data: UnsafeCell::new(val),
-            _phantom: PhantomData,
-        }
-    }
-}
-
-impl<L: Level, T> McsMutex<L, T> {
-    pub fn lock<'a, LP: Lower<L> + 'a>(
-        &'a self,
-        lock_token: LockToken<'a, LP>,
-    ) -> McsMutexGuard<'a, L, T> {
-        let percpu = PercpuBlock::current();
-        let contended = self.raw.acquire(&percpu.mcs_sched_node);
-        if contended {
-            percpu
-                .mcs_contention_count
-                .set(percpu.mcs_contention_count.get() + 1);
-        }
-        McsMutexGuard {
-            lock: self,
-            lock_token: LockToken::downgraded(lock_token),
-        }
-    }
-
-    pub fn try_lock<'a, LP: Lower<L> + 'a>(
-        &'a self,
-        lock_token: LockToken<'a, LP>,
-    ) -> Option<McsMutexGuard<'a, L, T>> {
-        let percpu = PercpuBlock::current();
-        if self.raw.try_acquire(&percpu.mcs_sched_node) {
-            Some(McsMutexGuard {
-                lock: self,
-                lock_token: LockToken::downgraded(lock_token),
-            })
-        } else {
-            None
-        }
-    }
-}
-
-pub struct McsMutexGuard<'a, L: Level, T: 'a> {
-    lock: &'a McsMutex<L, T>,
-    lock_token: LockToken<'a, L>,
-}
-
-impl<'a, L: Level, T: 'a> McsMutexGuard<'a, L, T> {
-    pub fn token_split(&mut self) -> (&mut T, LockToken<'_, L>) {
-        unsafe { (&mut *self.lock.data.get(), self.lock_token.token()) }
-    }
-
-    pub fn into_split(self) -> (McsRawGuard<'a, L, T>, LockToken<'a, L>) {
-        let lock_ref = self.lock;
-        let token = unsafe { core::ptr::read(&self.lock_token) };
-        core::mem::forget(self);
-        (McsRawGuard { lock: lock_ref }, token)
-    }
-
-    pub fn from_split(raw: McsRawGuard<'a, L, T>, token: LockToken<'a, L>) -> Self {
-        let lock_ref = raw.lock;
-        core::mem::forget(raw);
-        Self {
-            lock: lock_ref,
-            lock_token: token,
-        }
-    }
-}
-
-impl<L: Level, T> core::ops::Deref for McsMutexGuard<'_, L, T> {
-    type Target = T;
-    fn deref(&self) -> &Self::Target {
-        unsafe { &*self.lock.data.get() }
-    }
-}
-
-impl<L: Level, T> core::ops::DerefMut for McsMutexGuard<'_, L, T> {
-    fn deref_mut(&mut self) -> &mut Self::Target {
-        unsafe { &mut *self.lock.data.get() }
-    }
-}
-
-impl<L: Level, T> Drop for McsMutexGuard<'_, L, T> {
-    fn drop(&mut self) {
-        let percpu = PercpuBlock::current();
-        self.lock.raw.release(&percpu.mcs_sched_node);
-    }
-}
-
-pub struct McsRawGuard<'a, L: Level, T: 'a> {
-    lock: &'a McsMutex<L, T>,
-}
-
-impl<L: Level, T> core::ops::Deref for McsRawGuard<'_, L, T> {
-    type Target = T;
-    fn deref(&self) -> &Self::Target {
-        unsafe { &*self.lock.data.get() }
-    }
-}
-
-impl<L: Level, T> core::ops::DerefMut for McsRawGuard<'_, L, T> {
-    fn deref_mut(&mut self) -> &mut Self::Target {
-        unsafe { &mut *self.lock.data.get() }
-    }
-}
-
-impl<L: Level, T> Drop for McsRawGuard<'_, L, T> {
-    fn drop(&mut self) {
-        let percpu = PercpuBlock::current();
-        self.lock.raw.release(&percpu.mcs_sched_node);
-    }
-}
@@ -2,7 +2,7 @@

 use core::num::NonZeroUsize;

-use alloc::{format, string::{String, ToString}, sync::Arc, vec::Vec};
+use alloc::{string::String, sync::Arc, vec::Vec};
 use redox_path::RedoxPath;

 use crate::{
@@ -12,9 +12,9 @@ use crate::{
        memory::{AddrSpace, GenericFlusher, Grant, PageSpan, TlbShootdownActions},
    },
    memory::{Page, VirtualAddress, PAGE_SIZE},
-    scheme::{self, pipe, FileHandle, KernelScheme, OpenResult, SchemeExt, StrOrBytes},
+    scheme::{self, FileHandle, KernelScheme, OpenResult, StrOrBytes},
    sync::{CleanLockToken, RwLock},
-    syscall::{data::{GlobalSchemes, Stat}, error::*, flag::*},
+    syscall::{data::Stat, error::*, flag::*},
 };

 use super::usercopy::{UserSlice, UserSliceRo, UserSliceRw, UserSliceWo};
@@ -45,7 +45,7 @@ pub fn file_op_generic_ext<T>(
        (file, desc)
    };

-    let scheme = desc.get_scheme(token)?;
+    let scheme = scheme::get_scheme(token.token(), desc.scheme)?;

    op(&*scheme, file.description, desc, token)
 }
@@ -62,32 +62,55 @@ pub fn copy_path_to_buf(raw_path: UserSliceRo, max_len: usize) -> Result<String>
 // TODO: Define elsewhere
 const PATH_MAX: usize = PAGE_SIZE;

-fn fifo_path_key(scheme_id: scheme::SchemeId, number: usize, path: &str) -> String {
-    if path.starts_with('/') {
-        path.to_string()
-    } else {
-        format!("@fifo:{}:{}:{}", scheme_id.get(), number, path)
-    }
-}
-
-fn install_open_result(
-    scheme_id: scheme::SchemeId,
+pub fn openat(
+    fh: FileHandle,
+    raw_path: UserSliceRo,
    flags: usize,
-    open_result: OpenResult,
+    fcntl_flags: u32,
+    euid: u32,
+    egid: u32,
    token: &mut CleanLockToken,
 ) -> Result<FileHandle> {
-    let new_description = match open_result {
-        OpenResult::SchemeLocal(number, internal_flags) => Arc::new(RwLock::new(
-            FileDescription::new(
-                scheme_id,
-                number,
-                0,
-                (flags & !O_CLOEXEC) as u32,
-                internal_flags,
-                token,
-            ),
-        )),
-        OpenResult::External(desc) => desc,
+    let path_buf = copy_path_to_buf(raw_path, PATH_MAX)?;
+
+    let (scheme_id, number) = {
+        let current_lock = context::current();
+        let mut current = current_lock.read(token.token());
+        let (context, mut token) = current.token_split();
+        let pipe = context.get_file(fh, &mut token).ok_or(Error::new(EBADF))?;
+        let desc = pipe.description.read(token.token());
+        (desc.scheme, desc.number)
+    };
+
+    let caller_ctx = context::current()
+        .read(token.token())
+        .caller_ctx()
+        .filter_uid_gid(euid, egid);
+
+    let new_description = {
+        let scheme = scheme::get_scheme(token.token(), scheme_id)?;
+
+        let res = scheme.kopenat(
+            number,
+            StrOrBytes::from_str(&path_buf),
+            flags,
+            fcntl_flags,
+            caller_ctx,
+            token,
+        );
+
+        match res? {
+            OpenResult::SchemeLocal(number, internal_flags) => {
+                Arc::new(RwLock::new(FileDescription {
+                    offset: 0,
+                    internal_flags,
+                    scheme: scheme_id,
+                    number,
+                    flags: (flags & !O_CLOEXEC) as u32,
+                }))
+            }
+            OpenResult::External(desc) => desc,
+        }
    };

    let current_lock = context::current();
@@ -103,102 +126,6 @@ fn install_open_result(
        )
        .ok_or(Error::new(EMFILE))
 }
-
-fn path_exists_in_scheme(
-    scheme: &dyn KernelScheme,
-    number: usize,
-    path: &str,
-    caller_ctx: scheme::CallerCtx,
-    token: &mut CleanLockToken,
-) -> Result<bool> {
-    match scheme.kopenat(number, StrOrBytes::from_str(path), O_STAT, 0, caller_ctx, token) {
-        Ok(OpenResult::SchemeLocal(number, _)) => {
-            let _ = scheme.close(number, token);
-            Ok(true)
-        }
-        Ok(OpenResult::External(_)) => Ok(true),
-        Err(err) if err.errno == ENOENT => Ok(false),
-        Err(err) => Err(err),
-    }
-}
-
-pub fn openat(
-    fh: FileHandle,
-    raw_path: UserSliceRo,
-    flags: usize,
-    fcntl_flags: u32,
-    euid: u32,
-    egid: u32,
-    token: &mut CleanLockToken,
-) -> Result<FileHandle> {
-    let path_buf = copy_path_to_buf(raw_path, PATH_MAX)?;
-
-    let desc = {
-        let current_lock = context::current();
-        let mut current = current_lock.read(token.token());
-        let (context, mut context_token) = current.token_split();
-        let pipe = context
-            .get_file(fh, &mut context_token)
-            .ok_or(Error::new(EBADF))?;
-        *pipe.description.read(context_token.token())
-    };
-    let scheme = desc.get_scheme(token)?;
-    let number = desc.number;
-    let scheme_id = desc.scheme;
-
-    let caller_ctx = context::current()
-        .read(token.token())
-        .caller_ctx()
-        .filter_uid_gid(euid, egid);
-
-    let fifo_mode_requested = flags & MODE_FIFO as usize == MODE_FIFO as usize;
-    let fifo_key = fifo_path_key(scheme_id, number, &path_buf);
-
-    if pipe::named_pipe_exists(&fifo_key, token) {
-        if flags & O_EXCL == O_EXCL && flags & O_CREAT == O_CREAT {
-            return Err(Error::new(EEXIST));
-        }
-        if fifo_mode_requested && flags & O_CREAT == O_CREAT {
-            return Err(Error::new(EEXIST));
-        }
-
-        let pipe_number = pipe::open_named_pipe(&fifo_key, flags, token)?
-            .ok_or(Error::new(ENOENT))?;
-        return install_open_result(
-            GlobalSchemes::Pipe.scheme_id(),
-            flags,
-            OpenResult::SchemeLocal(pipe_number, InternalFlags::empty()),
-            token,
-        );
-    }
-
-    if fifo_mode_requested && flags & O_CREAT == O_CREAT {
-        if path_exists_in_scheme(&*scheme, number, &path_buf, caller_ctx, token)? {
-            return Err(Error::new(EEXIST));
-        }
-
-        let mode = u16::try_from(flags & 0o7777).map_err(|_| Error::new(EINVAL))?;
-        let pipe_number = pipe::create_named_pipe(&fifo_key, &path_buf, mode, flags, token)?;
-
-        return install_open_result(
-            GlobalSchemes::Pipe.scheme_id(),
-            flags,
-            OpenResult::SchemeLocal(pipe_number, InternalFlags::empty()),
-            token,
-        );
-    }
-
-    let open_result = scheme.kopenat(
-        number,
-        StrOrBytes::from_str(&path_buf),
-        flags,
-        fcntl_flags,
-        caller_ctx,
-        token,
-    )?;
-
-    install_open_result(scheme_id, flags, open_result, token)
-}
 /// Unlinkat syscall
 pub fn unlinkat(
    fh: FileHandle,
@@ -210,27 +137,22 @@ pub fn unlinkat(
 ) -> Result<()> {
    let path_buf = copy_path_to_buf(raw_path, PATH_MAX)?;

-    let desc = {
+    let (number, scheme_id) = {
        let current_lock = context::current();
        let mut current = current_lock.read(token.token());
-        let (context, mut context_token) = current.token_split();
-        let pipe = context
-            .get_file(fh, &mut context_token)
-            .ok_or(Error::new(EBADF))?;
-        *pipe.description.read(context_token.token())
+        let (context, mut token) = current.token_split();
+        let pipe = context.get_file(fh, &mut token).ok_or(Error::new(EBADF))?;
+        let desc = pipe.description.read(token.token());
+        (desc.number, desc.scheme)
    };
-    let number = desc.number;
-    let scheme = desc.get_scheme(token)?;
+
+    let scheme = scheme::get_scheme(token.token(), scheme_id)?;

    let caller_ctx = context::current()
        .read(token.token())
        .caller_ctx()
        .filter_uid_gid(euid, egid);

-    if pipe::unlink_named_pipe(&fifo_path_key(desc.scheme, number, &path_buf), token) {
-        return Ok(());
-    }
-
    /*
    let mut path_buf = BorrowedHtBuf::head()?;
    let path = path_buf.use_for_string(raw_path)?;
@@ -277,18 +199,17 @@ fn duplicate_file(
        let description = { *file.description.read(token.token()) };

        let new_description = {
-            let scheme = description.get_scheme(token)?;
+            let scheme = scheme::get_scheme(token.token(), description.scheme)?;

            match scheme.kdup(description.number, user_buf, caller_ctx, token)? {
                OpenResult::SchemeLocal(number, internal_flags) => {
-                    Arc::new(RwLock::new(FileDescription::new(
-                        description.scheme,
-                        number,
-                        0,
-                        description.flags,
+                    Arc::new(RwLock::new(FileDescription {
+                        offset: 0,
                        internal_flags,
-                        token,
-                    )))
+                        scheme: description.scheme,
+                        number,
+                        flags: description.flags,
+                    }))
                }
                OpenResult::External(desc) => desc,
            }
@@ -375,10 +296,11 @@ fn call_normal(
    }
    .ok_or(Error::new(EBADF))?;

-    let (scheme, number) = {
-        let desc = *file.description.read(token.token());
-        (desc.get_scheme(token)?, desc.number)
+    let (scheme_id, number) = {
+        let desc = file.description.read(token.token());
+        (desc.scheme, desc.number)
    };
+    let scheme = scheme::get_scheme(token.token(), scheme_id)?;

    if flags.contains(CallFlags::STD_FS) {
        scheme.translate_std_fs_call(number, file.description, payload, flags, metadata, token)
@@ -419,28 +341,28 @@ fn fdwrite_inner(
 ) -> Result<usize> {
    // TODO: Ensure deadlocks can't happen
    let (scheme, number, descs_to_send) = {
-        let desc = {
+        let (scheme, number) = {
            let current_lock = context::current();
            let mut current = current_lock.read(token.token());
-            let (context, mut context_token) = current.token_split();
+            let (context, mut token) = current.token_split();
            let file_descriptor = context
-                .get_file(socket, &mut context_token)
+                .get_file(socket, &mut token)
                .ok_or(Error::new(EBADF))?;
-            *file_descriptor.description.read(context_token.token())
+            let desc = &file_descriptor.description.read(token.token());
+            (desc.scheme, desc.number)
        };
-        let scheme = desc.get_scheme(token)?;
-        let number = desc.number;
+        let scheme = scheme::get_scheme(token.token(), scheme)?;

        let current_lock = context::current();
        let mut current = current_lock.read(token.token());
-        let (context, mut context_token) = current.token_split();
+        let (context, mut token) = current.token_split();
        (
            scheme,
            number,
            if flags.contains(CallFlags::FD_CLONE) {
-                context.bulk_get_files(&target_fds, &mut context_token)
+                context.bulk_get_files(&target_fds, &mut token)
            } else {
-                context.bulk_remove_files(&target_fds, &mut context_token)
+                context.bulk_remove_files(&target_fds, &mut token)
            }?
            .into_iter()
            .map(|f| f.description)
@@ -473,22 +395,18 @@ fn call_fdread(
    metadata: &[u64],
    token: &mut CleanLockToken,
 ) -> Result<usize> {
-    let desc = {
-        let current_lock = context::current();
-        let mut current = current_lock.read(token.token());
-        let (context, mut context_token) = current.token_split();
-        let file_descriptor = context
-            .get_file(fd, &mut context_token)
-            .ok_or(Error::new(EBADF))?;
-        *file_descriptor.description.read(context_token.token())
-    };
    let (scheme, number) = {
-        let scheme = desc.get_scheme(token)?;
-        let number = desc.number;
-        (
-            scheme,
-            number,
-        )
+        let (scheme, number) = {
+            let current_lock = context::current();
+            let mut current = current_lock.read(token.token());
+            let (context, mut token) = current.token_split();
+            let file_descriptor = context.get_file(fd, &mut token).ok_or(Error::new(EBADF))?;
+            let desc = file_descriptor.description.read(token.token());
+            (desc.scheme, desc.number)
+        };
+        let scheme = scheme::get_scheme(token.token(), scheme)?;
+
+        (scheme, number)
    };

    scheme.kfdread(number, payload, flags, metadata, token)
@@ -522,9 +440,9 @@ pub fn fcntl(fd: FileHandle, cmd: usize, arg: usize, token: &mut CleanLockToken)
    }
    .ok_or(Error::new(EBADF))?;

-    let (number, flags, desc) = {
-        let desc = *file.description.read(token.token());
-        (desc.number, desc.flags, desc)
+    let (scheme_id, number, flags) = {
+        let desc = file.description.write(token.token());
+        (desc.scheme, desc.number, desc.flags)
    };

    if cmd == F_DUPFD || cmd == F_DUPFD_CLOEXEC {
@@ -542,7 +460,7 @@ pub fn fcntl(fd: FileHandle, cmd: usize, arg: usize, token: &mut CleanLockToken)

    // Communicate fcntl with scheme
    if cmd != F_GETFD && cmd != F_SETFD {
-        let scheme = desc.get_scheme(token)?;
+        let scheme = scheme::get_scheme(token.token(), scheme_id)?;

        scheme.fcntl(number, cmd, arg, token)?;
    };
@@ -600,11 +518,13 @@ pub fn flink(fd: FileHandle, raw_path: UserSliceRo, token: &mut CleanLockToken)
    let path = RedoxPath::from_absolute(&path_buf).ok_or(Error::new(EINVAL))?;
    let (_, reference) = path.as_parts().ok_or(Error::new(EINVAL))?;

-    let (number, scheme) = {
-        let desc = *file.description.read(token.token());
-        (desc.number, desc.get_scheme(token)?)
+    let (number, scheme_id) = {
+        let desc = file.description.read(token.token());
+        (desc.number, desc.scheme)
    };

+    let scheme = scheme::get_scheme(token.token(), scheme_id)?;
+
    // TODO: Check EXDEV.
    /*
    if scheme_id != description.scheme {
@@ -634,11 +554,13 @@ pub fn frename(fd: FileHandle, raw_path: UserSliceRo, token: &mut CleanLockToken
    let path = RedoxPath::from_absolute(&path_buf).ok_or(Error::new(EINVAL))?;
    let (_, reference) = path.as_parts().ok_or(Error::new(EINVAL))?;

-    let (number, scheme) = {
-        let desc = *file.description.read(token.token());
-        (desc.number, desc.get_scheme(token)?)
+    let (number, scheme_id) = {
+        let desc = file.description.read(token.token());
+        (desc.number, desc.scheme)
    };

+    let scheme = scheme::get_scheme(token.token(), scheme_id)?;
+
    // TODO: Check EXDEV.
    /*
    if scheme_id != description.scheme {
@@ -28,11 +28,6 @@ use crate::{
    sync::CleanLockToken,
 };

-/// Local syscall numbers not yet in the redox_syscall crate.
-/// These are allocated from the 987+ range to avoid collisions with crate numbers.
-pub const SYS_SCHED_SETAFFINITY: usize = 987;
-pub const SYS_SCHED_GETAFFINITY: usize = 988;
-
 /// Debug
 pub mod debug;

@@ -225,10 +220,6 @@ pub fn syscall(
                unlinkat(fd, UserSlice::ro(c, d)?, e, f as _, g as _, token).map(|()| 0)
            }
            SYS_YIELD => sched_yield(token).map(|()| 0),
-
-            // P17-3: CPU affinity syscalls. Numbers allocated locally (not yet in redox_syscall crate).
-            SYS_SCHED_SETAFFINITY => sched_setaffinity(b, UserSlice::ro(c, d)?, token),
-            SYS_SCHED_GETAFFINITY => sched_getaffinity(b, UserSlice::wo(c, d)?, token),
            SYS_NANOSLEEP => nanosleep(
                UserSlice::ro(b, size_of::<TimeSpec>())?,
                UserSlice::wo(c, size_of::<TimeSpec>())?.none_if_null(),
@@ -11,7 +11,6 @@ use crate::{
        memory::{AddrSpace, Grant, PageSpan},
        ContextRef,
    },
-    cpu_set::RawMask,
    event,
    sync::{CleanLockToken, RwLock},
    syscall::flag::{EventFlags, O_CREAT, O_RDWR},
@@ -272,95 +271,24 @@ unsafe fn bootstrap_mem(bootstrap: &crate::startup::Bootstrap) -> &'static [u8]
 }

 fn insert_fd(scheme: SchemeId, number: usize, cloexec: bool, token: &mut CleanLockToken) -> usize {
-    let description = Arc::new(RwLock::new(FileDescription::new(
-        scheme,
-        number,
-        0,
-        (O_CREAT | O_RDWR) as u32,
-        InternalFlags::empty(),
-        token,
-    )));
-
    let current_lock = context::current();
    let mut current = current_lock.read(token.token());
-    let (context, mut context_token) = current.token_split();
+    let (context, mut token) = current.token_split();
    context
        .add_file_min(
            FileDescriptor {
-                description,
+                description: Arc::new(RwLock::new(FileDescription {
+                    scheme,
+                    number,
+                    offset: 0,
+                    flags: (O_CREAT | O_RDWR) as u32,
+                    internal_flags: InternalFlags::empty(),
+                })),
                cloexec,
            },
            syscall::flag::UPPER_FDTBL_TAG + scheme.get(),
-            &mut context_token,
+            &mut token,
        )
        .expect("failed to insert fd to current context")
        .get()
 }
-
-/// Set CPU affinity mask for a process.
-///
-/// # Arguments (syscall ABI)
-/// - `pid`: Process ID (0 = current process; other PIDs not yet supported)
-/// - `mask_ptr`: Pointer to a `RawMask` (32 bytes on 64-bit, 256-bit bitmap)
-/// - `mask_len`: Length of mask in bytes (must equal `size_of::<RawMask>()`)
-pub fn sched_setaffinity(
-    pid: usize,
-    mask_ptr: super::usercopy::UserSliceRo,
-    token: &mut CleanLockToken,
-) -> Result<usize> {
-    // Validate mask size
-    if mask_ptr.len() != core::mem::size_of::<RawMask>() {
-        return Err(Error::new(super::error::EINVAL));
-    }
-
-    // pid == 0 means current process
-    let target = if pid == 0 {
-        context::current()
-    } else {
-        // TODO: Support PID-based lookup (requires context list iteration
-        // with lock token downgrades). For now, only pid=0 is supported.
-        return Err(Error::new(super::error::ESRCH));
-    };
-
-    // Read mask from userspace
-    let raw_mask: RawMask = unsafe { mask_ptr.read_exact() }?;
-
-    // Apply to context's affinity mask
-    let mut ctx = target.write(token.token());
-    ctx.sched_affinity.override_from(&raw_mask);
-
-    Ok(0)
-}
-
-/// Get CPU affinity mask for a process.
-///
-/// # Arguments (syscall ABI)
-/// - `pid`: Process ID (0 = current process; other PIDs not yet supported)
-/// - `mask_ptr`: Pointer to a `RawMask` buffer (32 bytes on 64-bit)
-/// - `mask_len`: Length of buffer in bytes (must equal `size_of::<RawMask>()`)
-///
-/// # Returns
-/// Number of bytes written to mask_ptr on success.
-pub fn sched_getaffinity(
-    pid: usize,
-    mask_ptr: super::usercopy::UserSliceWo,
-    token: &mut CleanLockToken,
-) -> Result<usize> {
-    // Validate mask size
-    if mask_ptr.len() != core::mem::size_of::<RawMask>() {
-        return Err(Error::new(super::error::EINVAL));
-    }
-
-    // pid == 0 means current process
-    let target = if pid == 0 {
-        context::current()
-    } else {
-        return Err(Error::new(super::error::ESRCH));
-    };
-
-    let ctx = target.read(token.token());
-    let raw_mask = ctx.sched_affinity.to_raw();
-    mask_ptr.copy_common_bytes_from_slice(crate::cpu_set::mask_as_bytes(&raw_mask))?;
-
-    Ok(core::mem::size_of::<RawMask>())
-}