chore: kernel source patches, local recipe updates, and build artifacts

Kernel source (ephemeral — changes durable in local/patches/kernel/):
- P20 x2apic ICR mode fix, P21 x2apic SMP fix applied
- ACPI MADT, RSDP, SDT improvements
- Context switch, percpu, event, IRQ scheme updates
- MSI/vector allocation, NUMA/SLIT/SRAT support

Local recipe source updates:
- redox-driver-acpi: bus/prt hardening
- redox-drm: Intel display, KMS connector improvements
- driver-manager: config/scheme hardening
- thermald: main.rs fix
- uutils-tar, ninja-build: source updates

Other:
- bootloader, installer, redoxfs, relibc, userutils source updates
- recipe.toml.backup, libxcvt source directory
2026-05-18 14:20:54 +03:00
parent 29ff1ea8fc
commit 0cbad35638
72 changed files with 5058 additions and 599 deletions
@@ -0,0 +1,260 @@
+[source]
+git = "https://gitlab.redox-os.org/redox-os/base.git"
+rev = "463f76b9608a896e6f6c9f63457f57f6409873c7"
+patches = [
+    "P0-daemon-fix-init-notify-unwrap.patch",
+    "P0-workspace-add-bootstrap.patch",
+    "P0-init-continuous-scheduling.patch",
+    "P0-dhcpd-auto-iface.patch",
+    "P0-procmgr-sigchld-debug.patch",
+    "P0-pcid-mcfg-diagnostics.patch",
+    "P0-ihdgd-intel-gpu-ids.patch",
+    "P0-acpid-dmar-fix.patch",
+    # P1: acpid EC runtime and AML physmem hardening (narrow ACPI runtime patches)
+    "P1-acpid-ec-runtime.patch",
+    "P1-acpid-runtime-hardening.patch",
+    # Stale patches needing recreation: P1-pcid-uevent-surface, P2-boot-runtime-fixes,
+    # P2-hwd-misc, P2-pcid-cfg-access, P3-xhci-device-hardening, P6-cpufreqd-real-impl
+    "P2-i2c-gpio-ucsi-drivers.patch",
+    "P0-i2c-control-response-empty.patch",
+    "P2-ihdad-graceful-init.patch",
+    "P2-boot-logging.patch",
+    "P2-init-acpid-wiring.patch",
+    "P2-hwd-remove-acpid-spawn.patch",
+    "P2-initfs-pcid-service.patch",
+    "P2-misc-daemon-fixes.patch",
+    "P9-fix-so-pecred.patch",
+    "P3-inputd-keymap-bridge.patch",
+    # P3: ps2d consolidated — LED feedback, mouse resend, fastfail, Intellimouse2, controller init robustness, non-x86 fallback
+    "P7-ps2d-intellimouse2-leds-controller-init.patch",
+    "P3-usbhidd-hardening.patch",
+    "P3-init-colored-output.patch",
+    "P4-logd-persistent-logging.patch",
+    "P4-acpi-shutdown-hardening.patch",
+    "P4-acpi-s3-sleep.patch",
+    "P4-pcid-public-client-channel.patch",
+    "P4-pcid-config-scheme.patch",
+    "P4-pcid-spawner-pci-coordinate-env.patch",
+    "P4-initfs-usb-drm-services.patch",
+    "P4-initfs-release-virtio-gpu.patch",
+    "P4-initfs-network-services.patch",
+    "P4-initfs-getty-services.patch",
+    "P4-initfs-dbus-services.patch",
+    "P4-fbcond-scrollback.patch",
+    # P4: ucsid graceful ESTALE/ENOENT handling — don't crash when /scheme/acpi/symbols unavailable
+    "P4-ucsid-estale-graceful.patch",
+    # P4: Extend ESTALE/ENOENT graceful handling to all ACPI-reading daemons
+    "P4-acpi-estale-graceful.patch",
+    # P4: hwd graceful ESTALE/ENOENT handling in probe()
+    "P4-hwd-estale-graceful.patch",
+    # P5: i2c-hidd + intel-thc-hidd: boot-time ESTALE/ENOENT retry with exponential backoff
+    "P5-i2c-hidd-estale-retry.patch",
+    # P5: acpid /scheme/acpi/dmi SMBIOS endpoint for quirk matching
+    "P5-acpid-dmi-endpoint.patch",
+    "P4-thermal-daemon.patch",
+    "P4-thermald-workspace.patch",
+    "P6-driver-main-fixes.patch",
+    "P6-driver-new-modules.patch",
+    "P9-init-scheduler-completed.patch",
+    "P2-pcid-acpid-graceful-fd.patch",
+    # P5: Graceful DRM ioctl error handling in fbbootlogd/fbcond (avoid ENOTTY crash)
+    "P5-fbbootlogd-fbcond-graceful-drm.patch",
+    # P6: Fix rtcd EEXIST by avoiding O_CREAT on kernel scheme resource
+    "P6-rtcd-no-ocreat.patch",
+    # P6: Init hard requires dependency — blocks startup if dependency missing
+    "P6-init-requires-hard-dep.patch",
+    # P6: Fix pcid→acpid FD transfer — pass FD in metadata array, not payload
+    "P6-pcid-acpid-fd-transfer.patch",
+    # P7: Fix acpid pci_fd startup race — shared RwLock between scheme and AML handler
+    "P7-acpid-shared-pcifd.patch",
+    # P15: Init service timeout — prevent boot hanging on unresponsive daemons (30s default)
+    "P15-7-init-service-timeout.patch",
+    # P15: Dependency cycle detection in unit loader — log and skip circular requires_weak
+    "P15-8-init-cycle-detection.patch",
+    # P18: Init daemon restart policy — supervise Notify/Scheme services with exponential backoff
+    "P18-1-daemon-restart.patch",
+    # P18: ACPID robustness — RSDP BIOS-area fallback, graceful physmem error handling
+    "P18-5-acpid-robustness.patch",
+    # P18: MSI/MSI-X enablement — skip legacy IRQ for MSI-capable devices
+    "P18-3-msi-msix-enablement.patch",
+    # P18: Bounded IPC queues — backlog limits for chan, UDS stream, UDS dgram
+    "P18-8-bounded-ipcd-queues.patch",
+    # P18: MSI/MSI-X allocation resilience — handle EEXIST, fallback chain MSI-X→MSI→legacy
+    "P18-9-msi-allocation-resilience.patch",
+]
+
+[package]
+installs = [
+    "/lib/pcid.d/ac97d.toml",
+    "/lib/pcid.d/e1000d.toml",
+    "/lib/pcid.d/ihdad.toml",
+    "/lib/pcid.d/ihdgd.toml",
+    "/lib/pcid.d/ixgbed.toml",
+    "/lib/pcid.d/rtl8139d.toml",
+    "/lib/pcid.d/rtl8168d.toml",
+    "/lib/pcid.d/vboxd.toml",
+    "/lib/pcid.d/virtio-netd.toml",
+    "/lib/pcid.d/xhcid.toml",
+    "/usr/bin/audiod",
+    "/usr/bin/dhcpd",
+    "/usr/bin/dw-acpi-i2cd",
+    "/usr/bin/gpiod",
+    "/usr/bin/i2cd",
+    "/usr/bin/i2c-gpio-expanderd",
+    "/usr/bin/i2c-hidd",
+    "/usr/bin/inputd",
+    "/usr/bin/intel-gpiod",
+    "/usr/bin/ipcd",
+    "/usr/bin/netstack",
+    "/usr/bin/pcid",
+    "/usr/bin/pcid-spawner",
+    "/usr/bin/ptyd",
+    "/usr/bin/redoxerd",
+    "/usr/bin/smolnetd",
+    "/usr/bin/ucsid",
+    "/usr/lib/drivers/ac97d",
+    "/usr/lib/drivers/ahcid",
+    "/usr/lib/drivers/amd-mp2-i2cd",
+    "/usr/lib/drivers/e1000d",
+    "/usr/lib/drivers/ihdad",
+    "/usr/lib/drivers/ihdgd",
+    "/usr/lib/drivers/ided",
+    "/usr/lib/drivers/intel-lpss-i2cd",
+    "/usr/lib/drivers/intel-thc-hidd",
+    "/usr/lib/drivers/ixgbed",
+    "/usr/lib/drivers/ps2d",
+    "/usr/lib/drivers/rtl8139d",
+    "/usr/lib/drivers/rtl8168d",
+    "/usr/lib/drivers/sb16d",
+    "/usr/lib/drivers/thermald",
+    "/usr/lib/drivers/usbctl",
+    "/usr/lib/drivers/usbhidd",
+    "/usr/lib/drivers/usbhubd",
+    "/usr/lib/drivers/usbscsid",
+    "/usr/lib/drivers/vboxd",
+    "/usr/lib/drivers/virtio-gpud",
+    "/usr/lib/drivers/virtio-netd",
+    "/usr/lib/drivers/xhcid",
+    "/usr/lib/init.d/00_base.target",
+    "/usr/lib/init.d/00_ipcd.service",
+    "/usr/lib/init.d/00_pcid-spawner.service",
+    "/usr/lib/init.d/00_ptyd.service",
+    "/usr/lib/init.d/00_sudo.service",
+    "/usr/lib/init.d/00_tmp",
+    "/usr/lib/init.d/05_boot_essential.target",
+    "/usr/lib/init.d/10_dhcpd.service",
+    "/usr/lib/init.d/10_net.target",
+    "/usr/lib/init.d/10_smolnetd.service",
+    "/usr/lib/init.d/12_boot_late.target",
+    "/usr/lib/init.d/12_dbus.service",
+    "/usr/lib/init.d/13_seatd.service",
+    "/usr/lib/init.d/13_sessiond.service",
+    "/usr/lib/init.d/20_audiod.service",
+    "/usr/lib/init.d/29_activate_console.service",
+    "/usr/lib/init.d/30_console.service",
+    "/usr/lib/init.d/30_thermald.service",
+    "/usr/lib/init.d/31_debug_console.service",
+]
+
+[build]
+template = "custom"
+script = """
+mkdir -pv "${COOKBOOK_STAGE}/usr/bin"
+for package in audiod ipcd ptyd dhcpd; do
+    "${COOKBOOK_CARGO}" build \
+        --manifest-path "${COOKBOOK_SOURCE}/${package}/Cargo.toml" \
+        --target "${TARGET}" \
+        ${build_flags}
+    cp -v \
+        "target/${TARGET}/${build_type}/${package}" \
+        "${COOKBOOK_STAGE}/usr/bin/${package}"
+done
+
+"${COOKBOOK_CARGO}" build \
+    --manifest-path "${COOKBOOK_SOURCE}/netstack/Cargo.toml" \
+    --target "${TARGET}" \
+    ${build_flags}
+cp -v \
+    "target/${TARGET}/${build_type}/netstack" \
+    "${COOKBOOK_STAGE}/usr/bin/netstack"
+cp -v \
+    "target/${TARGET}/${build_type}/netstack" \
+    "${COOKBOOK_STAGE}/usr/bin/smolnetd"
+
+# Drivers that are built on all architectures, and NOT in drivers-initfs
+BINS=(
+    gpiod
+    i2c-gpio-expanderd
+    intel-gpiod
+    amd-mp2-i2cd
+    dw-acpi-i2cd
+    e1000d
+    ihdad
+    ihdgd
+    i2c-hidd
+    intel-thc-hidd
+    intel-lpss-i2cd
+    ixgbed
+    pcid
+    pcid-spawner
+    rtl8139d
+    rtl8168d
+    usbctl
+    usbhidd
+    thermald
+    usbhubd
+    ucsid
+    usbscsid
+    virtio-gpud
+    virtio-netd
+    xhcid
+    i2cd
+    inputd
+    redoxerd
+)
+
+# Add additional drivers to the list to build, that are not in drivers-initfs
+# depending on the target architecture
+case "${TARGET}" in
+    i586-unknown-redox | i686-unknown-redox | x86_64-unknown-redox)
+        BINS+=(ac97d ahcid ided nvmed ps2d sb16d vboxd)
+        ;;
+    *)
+        ;;
+esac
+
+#Build each driver in the list
+mkdir -pv "${COOKBOOK_STAGE}/usr/bin" "${COOKBOOK_STAGE}/usr/lib/drivers"
+export CARGO_PROFILE_RELEASE_OPT_LEVEL=s
+export CARGO_PROFILE_RELEASE_PANIC=abort
+# Only build drivers that actually have source Cargo.toml entries
+EXISTING_BINS=()
+for bin in "${BINS[@]}"
+do
+    if grep -Rqs "^name = \\\"${bin}\\\"$" "${COOKBOOK_SOURCE}"; then
+        EXISTING_BINS+=("${bin}")
+    fi
+done
+"${COOKBOOK_CARGO}" build ${build_flags} \
+    --manifest-path "${COOKBOOK_SOURCE}/Cargo.toml" \
+    --target "${TARGET}" \
+    $(for bin in "${EXISTING_BINS[@]}"; do echo "-p" "${bin}"; done)
+for bin in "${EXISTING_BINS[@]}"
+do  # Binaries named in the first pattern install to usr/bin; all others to usr/lib/drivers.
+    case "${bin}" in
+        gpiod|i2c-gpio-expanderd|intel-gpiod|i2cd|dw-acpi-i2cd|i2c-hidd|inputd|pcid|pcid-spawner|redoxerd|ucsid) dest_dir="usr/bin" ;;
+        *) dest_dir="usr/lib/drivers" ;;
+    esac
+    cp -v "target/${TARGET}/${build_type}/${bin}" "${COOKBOOK_STAGE}/${dest_dir}"
+done
+
+mkdir -pv "${COOKBOOK_STAGE}/lib/pcid.d"
+find "${COOKBOOK_SOURCE}/drivers" -maxdepth 3 -type f -name 'config.toml' | while IFS= read -r conf
+do  # IFS= and -r keep each path verbatim (no whitespace trimming, no backslash processing).
+    driver="$(basename "$(dirname "$conf")")"
+    cp -v "$conf" "${COOKBOOK_STAGE}/lib/pcid.d/$driver.toml"
+done
+
+mkdir -pv "${COOKBOOK_STAGE}/usr/lib/init.d"
+cp -v "${COOKBOOK_SOURCE}/init.d"/* "${COOKBOOK_STAGE}/usr/lib/init.d/"
+"""
@@ -12,6 +12,7 @@ cc = "1.0"
 toml = "0.8"

 [dependencies]
+acpi_ext = { package = "acpi", git = "https://gitlab.redox-os.org/redox-os/acpi.git", branch = "redox-6.x" }
 arrayvec = { version = "0.7.4", default-features = false }
 bitfield = "0.13.2"
 bitflags = "2"
@@ -1,3 +1,4 @@
+# Red Bear OS kernel patches applied via individual patch files
 .PHONY: all check

 SOURCE:=$(dir $(realpath $(lastword $(MAKEFILE_LIST))))
@@ -77,6 +77,7 @@ fn main() {
        }
        "x86_64" => {
            println!("cargo::rerun-if-changed=src/asm/x86_64/trampoline.asm");
+            println!("cargo::rerun-if-changed=src/asm/x86_64/s3_wakeup.asm");

            let status = Command::new("nasm")
                .arg("-f")
@@ -89,6 +90,18 @@ fn main() {
            if !status.success() {
                panic!("nasm failed with exit status {}", status);
            }
+
+            let status = Command::new("nasm")
+                .arg("-f")
+                .arg("bin")
+                .arg("-o")
+                .arg(format!("{}/s3_wakeup", out_dir))
+                .arg("src/asm/x86_64/s3_wakeup.asm")
+                .status()
+                .expect("failed to run nasm");
+            if !status.success() {
+                panic!("nasm failed with exit status {}", status);
+            }
        }
        "riscv64" => {
            println!("cargo::rustc-cfg=dtb");
@@ -189,8 +189,18 @@ pub(super) fn init(madt: Madt) {
        let preliminary_cpu_count = madt
            .iter()
            .filter(|entry| match entry {
-                MadtEntry::LocalApic(local) => u32::from(local.id) == me.get() || local.flags & 1 == 1,
-                MadtEntry::LocalX2Apic(local) => local.x2apic_id == me.get() || local.flags & 1 == 1,
+                // When x2APIC is active, LocalApic entries use 8-bit IDs that don't
+                // match the BSP's 32-bit x2APIC ID. Use LocalX2Apic entries instead.
+                MadtEntry::LocalApic(local) if !local_apic.x2 => {
+                    u32::from(local.id) == me.get() || local.flags & 1 == 1
+                }
+                MadtEntry::LocalApic(_) => false,
+                // xAPIC mode: cannot use 32-bit x2APIC IDs via 8-bit ICR.
+                // Skip LocalX2Apic entries and use LocalApic exclusively.
+                MadtEntry::LocalX2Apic(local) if local_apic.x2 => {
+                    local.x2apic_id == me.get() || local.flags & 1 == 1
+                }
+                MadtEntry::LocalX2Apic(_) => false,
                _ => false,
            })
            .count();
@@ -205,18 +215,28 @@ pub(super) fn init(madt: Madt) {
        let _ = seen_apic_ids.insert(me.get()); // BSP
        for entry in madt.iter() {
            match entry {
-                MadtEntry::LocalApic(local) if local.flags & 1 == 1 => {
+                MadtEntry::LocalApic(local) if local.flags & 1 == 1 && !local_apic.x2 => {
                    let id = u32::from(local.id);
                    if !seen_apic_ids.insert(id) {
                        warn!("MADT: duplicate APIC ID {} in LocalApic entry, firmware bug", id);
                    }
                }
-                MadtEntry::LocalX2Apic(local) if local.flags & 1 == 1 => {
+                MadtEntry::LocalApic(local) if local.flags & 1 == 1 && local_apic.x2 => {
+                    // x2APIC mode: skip 8-bit LocalApic IDs; they conflict with
+                    // 32-bit x2APIC IDs. Dedup only among LocalX2Apic entries.
+                    debug!("MADT: ignoring 8-bit LocalApic ID {} in x2APIC mode", local.id);
+                }
+                MadtEntry::LocalX2Apic(local) if local.flags & 1 == 1 && local_apic.x2 => {
                    let id = local.x2apic_id;
                    if !seen_apic_ids.insert(id) {
                        warn!("MADT: duplicate x2APIC ID {} in LocalX2Apic entry, firmware bug", id);
                    }
                }
+                MadtEntry::LocalX2Apic(local) if local.flags & 1 == 1 && !local_apic.x2 => {
+                    // xAPIC mode: skip 32-bit x2APIC IDs; dedup only among LocalApic entries.
+                    let id = local.x2apic_id; // Copy from packed struct
+                    debug!("MADT: ignoring 32-bit x2APIC ID {} in xAPIC mode", id);
+                }
                _ => {}
            }
        }
@@ -225,7 +245,16 @@ pub(super) fn init(madt: Madt) {
    for madt_entry in madt.iter() {
        debug!("      {:x?}", madt_entry);
        if let MadtEntry::LocalApic(ap_local_apic) = madt_entry {
-            if u32::from(ap_local_apic.id) == me.get() {
+            // x2APIC mode: LocalApic entries carry 8-bit IDs, which this code assumes
+            // cannot match the BSP's 32-bit x2APIC ID; every entry would then look
+            // like an AP and SIPI would mis-target, so skip them and use LocalX2Apic only.
+            // NOTE(review): ACPI puts APIC IDs < 255 in LocalApic even under x2APIC — verify.
+            if local_apic.x2 {
+                debug!(
+                    "        Skipping 8-bit LocalApic id={} (x2APIC active, using LocalX2Apic entries)",
+                    ap_local_apic.id
+                );
+            } else if u32::from(ap_local_apic.id) == me.get() {
                debug!("        This is my local APIC");
            } else if ap_local_apic.flags & 1 == 1 {
                // Allocate a stack
@@ -383,14 +412,19 @@ pub(super) fn init(madt: Madt) {
                }

                RmmA::invalidate_all();
-            } else {
-                debug!("KERNEL AP: LAPIC CPU {} disabled in MADT, skipping", u32::from(ap_local_apic.id));
            }
        } else if let MadtEntry::LocalX2Apic(ap_x2apic) = madt_entry {
            let apic_id = ap_x2apic.x2apic_id;
            let flags = ap_x2apic.flags;

-            if apic_id == me.get() {
+            // xAPIC mode: cannot target 32-bit x2APIC IDs via 8-bit ICR.
+            // Skip LocalX2Apic entries; use LocalApic entries exclusively.
+            if !local_apic.x2 {
+                debug!(
+                    "        Skipping 32-bit x2APIC id={} (xAPIC mode, using LocalApic entries)",
+                    apic_id
+                );
+            } else if apic_id == me.get() {
                debug!("        This is my local x2APIC");
            } else if flags & 1 == 1 {
                let alloc = match allocate_p2frame(4) {
@@ -446,11 +480,7 @@ pub(super) fn init(madt: Madt) {
                // Send INIT IPI (Assert)
                {
                    let mut icr = 0x4500u64;
-                    if local_apic.x2 {
-                        icr |= u64::from(apic_id) << 32;
-                    } else {
-                        icr |= u64::from(apic_id as u8) << 56;
-                    }
+                    icr |= u64::from(apic_id) << 32;
                    local_apic.set_icr(icr);
                }

@@ -461,11 +491,7 @@ pub(super) fn init(madt: Madt) {
                {
                    let ap_segment = (TRAMPOLINE >> 12) & 0xFF;
                    let mut icr = 0x0600u64 | ap_segment as u64;
-                    if local_apic.x2 {
-                        icr |= u64::from(apic_id) << 32;
-                    } else {
-                        icr |= u64::from(apic_id as u8) << 56;
-                    }
+                    icr |= u64::from(apic_id) << 32;
                    local_apic.set_icr(icr);
                }

@@ -476,11 +502,7 @@ pub(super) fn init(madt: Madt) {
                {
                    let ap_segment = (TRAMPOLINE >> 12) & 0xFF;
                    let mut icr = 0x0600u64 | ap_segment as u64;
-                    if local_apic.x2 {
-                        icr |= u64::from(apic_id) << 32;
-                    } else {
-                        icr |= u64::from(apic_id as u8) << 56;
-                    }
+                    icr |= u64::from(apic_id) << 32;
                    local_apic.set_icr(icr);
                }

@@ -534,8 +556,6 @@ pub(super) fn init(madt: Madt) {
                }

                RmmA::invalidate_all();
-            } else {
-                debug!("KERNEL AP: x2APIC CPU {} disabled in MADT (flags={:#x}), skipping", apic_id, flags);
            }
        } else if let MadtEntry::LocalApicNmi(nmi) = madt_entry {
            let target_apic = nmi.processor;
@@ -34,6 +34,12 @@ impl Madt {
        let madt = Madt::new(find_one_sdt!("APIC"));

        if let Some(madt) = madt {
+            // Validate MADT checksum per ACPI 6.5 §5.2.2
+            if !madt.sdt.validate_checksum() {
+                error!("MADT checksum validation failed, skipping APIC initialization");
+                return;
+            }
+
            // safe because no APs have been started yet.
            unsafe { MADT.get().write(Some(madt)) };

@@ -146,6 +152,48 @@ pub struct MadtGicd {
    _reserved2: [u8; 3],
 }

+/// MADT Local x2APIC (entry type 0x9)
+#[derive(Clone, Copy, Debug)]
+#[repr(C, packed)]
+pub struct MadtLocalX2Apic {
+    _reserved: u16,
+    pub x2apic_id: u32,
+    pub flags: u32,
+    pub processor_uid: u32,
+}
+
+/// MADT Local APIC NMI (entry type 0x4)
+#[derive(Clone, Copy, Debug)]
+#[repr(C, packed)]
+pub struct MadtLocalApicNmi {
+    pub processor: u8,
+    pub flags: u16,
+    pub nmi_pin: u8,
+}
+
+/// MADT Local APIC address override (entry type 0x5)
+#[derive(Clone, Copy, Debug)]
+#[repr(C, packed)]
+pub struct MadtLapicAddressOverride {
+    _reserved: u16,
+    pub local_apic_address: u64,
+}
+
+/// MADT Local x2APIC NMI (entry type 0xA)
+/// Field order follows the ACPI layout: flags, processor UID, LINT#, 3 reserved bytes.
+#[derive(Clone, Copy, Debug)]
+#[repr(C, packed)]
+pub struct MadtLocalX2ApicNmi {
+    pub flags: u16,         // MPS INTI flags, entry bytes 2-3
+    pub processor_uid: u32, // ACPI processor UID, entry bytes 4-7
+    pub nmi_pin: u8,        // local x2APIC LINT#, entry byte 8
+    _reserved: [u8; 3],     // entry bytes 9-11
+}
+
+const _: () = assert!(size_of::<MadtLocalApicNmi>() == 4);
+const _: () = assert!(size_of::<MadtLapicAddressOverride>() == 10);
+const _: () = assert!(size_of::<MadtLocalX2ApicNmi>() == 10);
+
 /// MADT Entries
 #[derive(Debug)]
 #[allow(dead_code)]
@@ -156,10 +204,18 @@ pub enum MadtEntry {
    InvalidIoApic(usize),
    IntSrcOverride(&'static MadtIntSrcOverride),
    InvalidIntSrcOverride(usize),
+    LocalApicNmi(&'static MadtLocalApicNmi),
+    InvalidLocalApicNmi(usize),
+    LapicAddressOverride(&'static MadtLapicAddressOverride),
+    InvalidLapicAddressOverride(usize),
    Gicc(&'static MadtGicc),
    InvalidGicc(usize),
    Gicd(&'static MadtGicd),
    InvalidGicd(usize),
+    LocalX2Apic(&'static MadtLocalX2Apic),
+    InvalidLocalX2Apic(usize),
+    LocalX2ApicNmi(&'static MadtLocalX2ApicNmi),
+    InvalidLocalX2ApicNmi(usize),
    Unknown(u8),
 }

@@ -176,6 +232,10 @@ impl Iterator for MadtIter {
            let entry_len =
                unsafe { *(self.sdt.data_address() as *const u8).add(self.i + 1) } as usize;

+            if entry_len < 2 {
+                return None;
+            }
+
            if self.i + entry_len <= self.sdt.data_len() {
                let item = match entry_type {
                    0x0 => {
@@ -206,6 +266,46 @@ impl Iterator for MadtIter {
                            MadtEntry::InvalidIntSrcOverride(entry_len)
                        }
                    }
+                    0x4 => {
+                        if entry_len == size_of::<MadtLocalApicNmi>() + 2 {
+                            MadtEntry::LocalApicNmi(unsafe {
+                                &*((self.sdt.data_address() + self.i + 2)
+                                    as *const MadtLocalApicNmi)
+                            })
+                        } else {
+                            MadtEntry::InvalidLocalApicNmi(entry_len)
+                        }
+                    }
+                    0x5 => {
+                        if entry_len == size_of::<MadtLapicAddressOverride>() + 2 {
+                            MadtEntry::LapicAddressOverride(unsafe {
+                                &*((self.sdt.data_address() + self.i + 2)
+                                    as *const MadtLapicAddressOverride)
+                            })
+                        } else {
+                            MadtEntry::InvalidLapicAddressOverride(entry_len)
+                        }
+                    }
+                    0x9 => {
+                        if entry_len == size_of::<MadtLocalX2Apic>() + 2 {
+                            MadtEntry::LocalX2Apic(unsafe {
+                                &*((self.sdt.data_address() + self.i + 2)
+                                    as *const MadtLocalX2Apic)
+                            })
+                        } else {
+                            MadtEntry::InvalidLocalX2Apic(entry_len)
+                        }
+                    }
+                    0xA => {
+                        if entry_len == size_of::<MadtLocalX2ApicNmi>() + 2 {
+                            MadtEntry::LocalX2ApicNmi(unsafe {
+                                &*((self.sdt.data_address() + self.i + 2)
+                                    as *const MadtLocalX2ApicNmi)
+                            })
+                        } else {
+                            MadtEntry::InvalidLocalX2ApicNmi(entry_len)
+                        }
+                    }
                    0xB => {
                        if entry_len >= size_of::<MadtGicc>() + 2 {
                            MadtEntry::Gicc(unsafe {
@@ -20,6 +20,8 @@ mod rxsdt;
 pub mod sdt;
 #[cfg(target_arch = "aarch64")]
 mod spcr;
+pub mod slit;
+pub mod srat;
 mod xsdt;

 unsafe fn map_linearly(addr: PhysicalAddress, len: usize, mapper: &mut crate::memory::PageMapper) {
@@ -82,6 +84,14 @@ impl Rxsdt for RxsdtEnum {

 pub static RXSDT_ENUM: Once<RxsdtEnum> = Once::new();

+#[derive(Clone, Copy, Debug)]
+pub struct AcpiRootInfo {
+    pub revision: u8,
+    pub root_sdt_address: PhysicalAddress,
+}
+
+pub static ACPI_ROOT_INFO: Once<AcpiRootInfo> = Once::new();
+
 /// Parse the ACPI tables to gather CPU, interrupt, and timer information
 pub unsafe fn init(already_supplied_rsdp: Option<*const u8>) {
    unsafe {
@@ -94,6 +104,15 @@ pub unsafe fn init(already_supplied_rsdp: Option<*const u8>) {
        let rsdp_opt = Rsdp::get_rsdp(already_supplied_rsdp);

        if let Some(rsdp) = rsdp_opt {
+            let root_info = ACPI_ROOT_INFO.call_once(|| AcpiRootInfo {
+                revision: rsdp.revision(),
+                root_sdt_address: rsdp.sdt_address(),
+            });
+
+            if root_info.root_sdt_address != rsdp.sdt_address() || root_info.revision != rsdp.revision() {
+                error!("ACPI_ROOT_INFO already initialized with a different RSDP root");
+            }
+
            debug!("SDT address: {:#x}", rsdp.sdt_address().data());
            let rxsdt = get_sdt(rsdp.sdt_address(), &mut KernelMapper::lock_rw());

@@ -146,7 +165,14 @@ pub unsafe fn init(already_supplied_rsdp: Option<*const u8>) {

            // TODO: Enumerate processors in userspace, and then provide an ACPI-independent interface
            // to initialize enumerated processors to userspace?
+            // Parse SRAT BEFORE MADT so NUMA node mapping is available
+            // when APs are started and PercpuBlocks are created.
+            srat::init();
+
            Madt::init();
+
+            // Parse SLIT after MADT for the NUMA distance matrix.
+            slit::init();
            //TODO: support this on any arch
            // SPCR must be initialized after MADT for interrupt controllers
            #[cfg(target_arch = "aarch64")]
@@ -17,9 +17,33 @@ pub struct Rsdp {

 impl Rsdp {
    pub unsafe fn get_rsdp(already_supplied_rsdp: Option<*const u8>) -> Option<Rsdp> {
-        already_supplied_rsdp.map(|rsdp_ptr| {
-            // TODO: Validate
-            unsafe { *(rsdp_ptr as *const Rsdp) }
+        already_supplied_rsdp.and_then(|rsdp_ptr| {
+            let rsdp = unsafe { *(rsdp_ptr as *const Rsdp) };
+
+            // Validate signature "RSD PTR "
+            if &rsdp.signature != b"RSD PTR " {
+                return None;
+            }
+
+            // ACPI 1.0 checksum: sum of first 20 bytes must be zero
+            let bytes_v1 = unsafe { core::slice::from_raw_parts(rsdp_ptr, 20) };
+            if bytes_v1.iter().fold(0u8, |sum, &b| sum.wrapping_add(b)) != 0 {
+                return None;
+            }
+
+            // ACPI 2.0+ extended checksum: sum of entire table (length bytes) must be zero
+            if rsdp.revision >= 2 {
+                let full_len = rsdp._length as usize;
+                if full_len < 36 || full_len > 256 {
+                    return None;
+                }
+                let bytes_full = unsafe { core::slice::from_raw_parts(rsdp_ptr, full_len) };
+                if bytes_full.iter().fold(0u8, |sum, &b| sum.wrapping_add(b)) != 0 {
+                    return None;
+                }
+            }
+
+            Some(rsdp)
        })
    }

@@ -31,4 +55,8 @@ impl Rsdp {
            self.rsdt_address as usize
        })
    }
+
+    pub fn revision(&self) -> u8 {
+        self.revision
+    }
 }
@@ -24,4 +24,20 @@ impl Sdt {
        let header_size = size_of::<Sdt>();
        total_size.saturating_sub(header_size)
    }
+
+    /// Check that the whole table sums to zero modulo 256.
+    ///
+    /// ACPI 6.5 §5.2.2 requires every byte of the table, checksum field included,
+    /// to add up to 0 as unsigned 8-bit values. Returns `false` when the sum is
+    /// non-zero or when `length` is smaller than the SDT header itself.
+    pub fn validate_checksum(&self) -> bool {
+        let table_len = self.length as usize;
+        if table_len < size_of::<Sdt>() {
+            return false;
+        }
+        // View the table as raw bytes; `length` may cover more than the header in `Self`.
+        let base = self as *const Self as *const u8;
+        let table = unsafe { core::slice::from_raw_parts(base, table_len) };
+        table.iter().fold(0u8, |acc, byte| acc.wrapping_add(*byte)) == 0
+    }
 }
@@ -0,0 +1,45 @@
+//! SLIT (System Locality Information Table) parser.
+//!
+//! Parses the NUMA distance matrix for scheduler NUMA-aware work stealing.
+
+use super::sdt::Sdt;
+use crate::acpi::find_sdt;
+
+const MAX_NODES: usize = 8;
+
+static mut SLIT_MATRIX: [[u8; MAX_NODES]; MAX_NODES] = [[10u8; MAX_NODES]; MAX_NODES];
+static mut SLIT_NUM_NODES: usize = 0;
+static mut SLIT_AVAILABLE: bool = false;
+
+pub fn is_available() -> bool { unsafe { SLIT_AVAILABLE } }
+pub fn num_nodes() -> usize { unsafe { SLIT_NUM_NODES } }
+
+/// Distance from node `from` to node `to`; 10 when no SLIT is loaded or an index is out of range.
+pub fn distance(from: u8, to: u8) -> u8 {
+    let (row, col) = (usize::from(from), usize::from(to));
+    if !unsafe { SLIT_AVAILABLE } || row >= MAX_NODES || col >= MAX_NODES { return 10; }
+    unsafe { SLIT_MATRIX[row][col] }
+}
+
+pub fn same_socket(node1: u8, node2: u8) -> bool { distance(node1, node2) <= 20 }
+
+/// Load the SLIT distance matrix into `SLIT_MATRIX`; returns without changes if no usable table exists.
+pub fn init() {
+    let sdt = match find_sdt("SLIT").as_slice() {
+        [] => return,
+        [x] => *x,
+        xs => { println!("SLIT: {} tables found, expected 1", xs.len()); return; }
+    };
+    if &sdt.signature != b"SLIT" { return; }
+    let (data_addr, data_len) = (sdt.data_address(), sdt.data_len());
+    if data_len < 8 { return; }
+    // Unaligned read: the count follows the SDT header, so 8-byte alignment is not guaranteed.
+    let num_nodes = usize::try_from(unsafe { (data_addr as *const u64).read_unaligned() }).unwrap_or(usize::MAX);
+    if num_nodes == 0 || num_nodes > MAX_NODES { println!("SLIT: {num_nodes} nodes (max {MAX_NODES}), ignoring"); return; }
+    let (matrix_start, matrix_size) = (8, num_nodes * num_nodes);
+    if data_len < matrix_start + matrix_size { println!("SLIT: matrix truncated ({data_len} < {})", matrix_start + matrix_size); return; }
+    let matrix = unsafe { &mut SLIT_MATRIX };
+    for i in 0..num_nodes { for j in 0..num_nodes { matrix[i][j] = unsafe { *((data_addr + matrix_start + i * num_nodes + j) as *const u8) }; } }
+    unsafe { SLIT_NUM_NODES = num_nodes; SLIT_AVAILABLE = true; }
+    debug!("SLIT: {} nodes, distance matrix loaded", num_nodes);
+}
@@ -0,0 +1,102 @@
+//! SRAT (System Resource Affinity Table) parser.
+//!
+//! Parses CPU-to-NUMA-node and memory-to-NUMA-node affinity information.
+//! Called before MADT init so that NUMA data is available during AP startup.
+
+use super::sdt::Sdt;
+use crate::acpi::find_sdt;
+
+// Capacities of the fixed-size tables filled by `init`; entries past these limits are dropped.
+const MAX_CPU_ENTRIES: usize = 256;
+const MAX_MEM_ENTRIES: usize = 64;
+
+// One CPU affinity record: the APIC or x2APIC ID, its node, and the entry's enabled flag.
+#[derive(Clone, Copy)]
+struct SratCpuEntry { apic_id: u32, node: u8, enabled: bool }
+
+// One memory affinity record: node, 64-bit base and length, and the entry's enabled flag.
+#[derive(Clone, Copy)]
+struct SratMemEntry { node: u8, base: u64, length: u64, enabled: bool }
+
+// Filler values the tables hold before `init` overwrites them; both are marked disabled.
+const CPU_NONE: SratCpuEntry = SratCpuEntry { apic_id: u32::MAX, node: 0, enabled: false };
+const MEM_NONE: SratMemEntry = SratMemEntry { node: 0, base: 0, length: 0, enabled: false };
+
+// Parsed SRAT state. Only the first `*_COUNT` slots of each table hold parsed entries,
+// and `SRAT_AVAILABLE` is set by `init` once parsing has finished.
+static mut SRAT_CPU_ENTRIES: [SratCpuEntry; MAX_CPU_ENTRIES] = [CPU_NONE; MAX_CPU_ENTRIES];
+static mut SRAT_MEM_ENTRIES: [SratMemEntry; MAX_MEM_ENTRIES] = [MEM_NONE; MAX_MEM_ENTRIES];
+static mut SRAT_CPU_COUNT: usize = 0;
+static mut SRAT_MEM_COUNT: usize = 0;
+static mut SRAT_AVAILABLE: bool = false;
+
+/// Reports whether `init` has parsed an SRAT.
+pub fn is_available() -> bool {
+    unsafe { SRAT_AVAILABLE }
+}
+
+/// Returns the node of the first enabled SRAT CPU entry whose APIC ID equals `apic_id`.
+///
+/// Yields `None` when no SRAT has been parsed or no enabled entry matches.
+pub fn numa_node_for_apic(apic_id: u32) -> Option<u8> {
+    if !unsafe { SRAT_AVAILABLE } { return None; }
+    let used = unsafe { SRAT_CPU_COUNT };
+    let table = unsafe { &SRAT_CPU_ENTRIES };
+    table[..used]
+        .iter()
+        .find(|entry| entry.apic_id == apic_id && entry.enabled)
+        .map(|entry| entry.node)
+}
+
+/// Returns the highest node id among enabled SRAT CPU entries, plus one.
+///
+/// Reports 1 when no SRAT has been parsed or when no CPU entry is enabled.
+/// Memory affinity entries are not consulted.
+pub fn numa_node_count() -> usize {
+    if !unsafe { SRAT_AVAILABLE } { return 1; }
+    let used = unsafe { SRAT_CPU_COUNT };
+    let table = unsafe { &SRAT_CPU_ENTRIES };
+    let highest = table[..used]
+        .iter()
+        .filter(|entry| entry.enabled)
+        .map(|entry| entry.node)
+        .max()
+        .unwrap_or(0);
+    usize::from(highest) + 1
+}
+
+// Wire layouts of SRAT entry bodies as `init` reads them. Each struct starts two bytes
+// into the entry, after the one-byte type and one-byte length that `init` reads itself.
+
+// Body of a type 0x0 entry. The node id is split between `_proximity_lo` (low byte)
+// and `_proximity_hi` (upper three bytes); `init` treats bit 0 of `flags` as "enabled".
+#[repr(C, packed)]
+struct SratLocalApic { _proximity_lo: u8, apic_id: u8, flags: u32, _local_sapic_eid: u8, _proximity_hi: [u8; 3], _clock_domain: u32 }
+
+// Body of a type 0x1 entry. Base and length are each stored as two 32-bit halves
+// that `init` joins into 64-bit values.
+#[repr(C, packed)]
+struct SratMemoryAffinity { proximity_domain: u32, _reserved1: u16, base_address_lo: u32, base_address_hi: u32, length_lo: u32, length_hi: u32, _reserved2: u32, flags: u32, _reserved3: u64 }
+
+// Body of a type 0x2 entry, carrying a full 32-bit x2APIC ID and proximity domain.
+#[repr(C, packed)]
+struct SratLocalX2Apic { _reserved: u16, proximity_domain: u32, x2apic_id: u32, flags: u32, _clock_domain: u32, _reserved2: u32 }
+
+/// Converts a 32-bit SRAT proximity domain into the `u8` node id the entry tables store.
+///
+/// Domains above `u8::MAX` cannot be represented. They are reported and rejected rather
+/// than truncated, because truncation would alias them onto an unrelated low node id.
+fn srat_node_id(domain: u32) -> Option<u8> {
+    match u8::try_from(domain) {
+        Ok(node) => Some(node),
+        Err(_) => {
+            println!("SRAT: proximity domain {domain} exceeds supported range, entry ignored");
+            None
+        }
+    }
+}
+
+/// Locates the SRAT and records its CPU and memory affinity entries.
+///
+/// Returns without publishing anything when no table or more than one table is found,
+/// the signature does not match, or the table is shorter than 12 data bytes. Parsing
+/// stops at the first entry whose length is below 2 or runs past the end of the table.
+/// Entries beyond `MAX_CPU_ENTRIES` / `MAX_MEM_ENTRIES`, entries of unknown type, and
+/// entries whose proximity domain does not fit in a `u8` are skipped.
+pub fn init() {
+    let sdt = match find_sdt("SRAT").as_slice() {
+        [] => return,
+        [x] => *x,
+        xs => { println!("SRAT: {} tables found, expected 1", xs.len()); return; }
+    };
+    if &sdt.signature != b"SRAT" { return; }
+    let data_addr = sdt.data_address();
+    let data_len = sdt.data_len();
+    if data_len < 12 { println!("SRAT: table too short ({data_len} bytes)"); return; }
+    // The affinity entries start 12 bytes into the table data.
+    let mut offset: usize = 12;
+    let cpu_entries = unsafe { &mut *core::ptr::addr_of_mut!(SRAT_CPU_ENTRIES) };
+    let mem_entries = unsafe { &mut *core::ptr::addr_of_mut!(SRAT_MEM_ENTRIES) };
+    let mut cpu_count: usize = 0;
+    let mut mem_count: usize = 0;
+    while offset + 2 <= data_len {
+        // Every entry begins with a one-byte type and a one-byte total length.
+        let entry_type = unsafe { *((data_addr + offset) as *const u8) };
+        let entry_len = unsafe { *((data_addr + offset + 1) as *const u8) } as usize;
+        if entry_len < 2 || offset + entry_len > data_len { break; }
+        let entry_data = data_addr + offset + 2;
+        match entry_type {
+            0x0 if entry_len >= size_of::<SratLocalApic>() + 2 => {
+                let e = unsafe { &*(entry_data as *const SratLocalApic) };
+                let enabled = (e.flags & 1) == 1;
+                // Reassemble the 32-bit domain from its low byte and upper three bytes.
+                let domain = (e._proximity_lo as u32) | ((e._proximity_hi[0] as u32) << 8) | ((e._proximity_hi[1] as u32) << 16) | ((e._proximity_hi[2] as u32) << 24);
+                if let Some(node) = srat_node_id(domain) {
+                    if cpu_count < MAX_CPU_ENTRIES { cpu_entries[cpu_count] = SratCpuEntry { apic_id: e.apic_id as u32, node, enabled }; cpu_count += 1; }
+                }
+            }
+            0x1 if entry_len >= size_of::<SratMemoryAffinity>() + 2 => {
+                let e = unsafe { &*(entry_data as *const SratMemoryAffinity) };
+                let enabled = (e.flags & 1) == 1;
+                let base = (e.base_address_hi as u64) << 32 | e.base_address_lo as u64;
+                let length = (e.length_hi as u64) << 32 | e.length_lo as u64;
+                if let Some(node) = srat_node_id(e.proximity_domain) {
+                    if mem_count < MAX_MEM_ENTRIES { mem_entries[mem_count] = SratMemEntry { node, base, length, enabled }; mem_count += 1; }
+                }
+            }
+            0x2 if entry_len >= size_of::<SratLocalX2Apic>() + 2 => {
+                let e = unsafe { &*(entry_data as *const SratLocalX2Apic) };
+                let enabled = (e.flags & 1) == 1;
+                if let Some(node) = srat_node_id(e.proximity_domain) {
+                    if cpu_count < MAX_CPU_ENTRIES { cpu_entries[cpu_count] = SratCpuEntry { apic_id: e.x2apic_id, node, enabled }; cpu_count += 1; }
+                }
+            }
+            _ => {}
+        }
+        offset += entry_len;
+    }
+    // Publish the counts and the availability flag only after parsing has finished.
+    unsafe { SRAT_CPU_COUNT = cpu_count; SRAT_MEM_COUNT = mem_count; SRAT_AVAILABLE = true; }
+    debug!("SRAT: {} CPU entries, {} memory entries", cpu_count, mem_count);
+}
@@ -7,26 +7,40 @@ mod linked_list;
 /// Size of kernel heap
 const KERNEL_HEAP_SIZE: usize = ::rmm::MEGABYTE;

+/// Prints `message` followed by a fixed notice, then never returns.
+///
+/// No halt instruction is executed here; the CPU spins in `spin_loop` forever.
+#[cold]
+fn halt_kernel_heap_init(message: &str) -> ! {
+    print!("{message}");
+    println!("Kernel heap initialization cannot continue. Halting.");
+    loop { core::hint::spin_loop(); }
+}
+
 unsafe fn map_heap(mapper: &mut KernelMapper<true>, offset: usize, size: usize) {
    let mut flush_all = PageFlushAll::new();

    let heap_start_page = Page::containing_address(VirtualAddress::new(offset));
    let heap_end_page = Page::containing_address(VirtualAddress::new(offset + size - 1));
    for page in Page::range_inclusive(heap_start_page, heap_end_page) {
-        let phys = mapper
-            .allocator_mut()
-            .allocate_one()
-            .expect("failed to allocate kernel heap");
+        let phys = match mapper.allocator_mut().allocate_one() {
+            Some(phys) => phys,
+            None => halt_kernel_heap_init(
+                "FATAL: failed to allocate physical frame for kernel heap\n",
+            ),
+        };
        let flush = unsafe {
-            mapper
-                .map_phys(
-                    page.start_address(),
-                    phys,
-                    PageFlags::new()
-                        .write(true)
-                        .global(cfg!(not(feature = "pti"))),
-                )
-                .expect("failed to map kernel heap")
+            match mapper.map_phys(
+                page.start_address(),
+                phys,
+                PageFlags::new()
+                    .write(true)
+                    .global(cfg!(not(feature = "pti"))),
+            ) {
+                Some(flush) => flush,
+                None => halt_kernel_heap_init(
+                    "FATAL: failed to map kernel heap virtual page\n",
+                ),
+            }
        };
        flush_all.consume(flush);
    }
@@ -91,7 +91,7 @@ unsafe extern "C" fn start(args_ptr: *const KernelArgs) -> ! {
                dtb::serial::init_early(dtb);
            }

-            info!("Redox OS starting...");
+            info!("RedBear OS starting...");
            args.print();

            // Initialize RMM
@@ -97,7 +97,7 @@ unsafe extern "C" fn start(args_ptr: *const KernelArgs) -> ! {
                init_early(dtb);
            }

-            info!("Redox OS starting...");
+            info!("RedBear OS starting...");
            args.print();

            if let Some(dtb) = &dtb {
@@ -14,6 +14,10 @@ pub struct IoApicRegs {
    pointer: *const u32,
 }
 impl IoApicRegs {
+    /// Reports whether `idx` names a redirection table entry that can be accessed.
+    ///
+    /// An index is accepted only when it is no larger than the maximum entry index
+    /// reported by the hardware and its register selectors fit in a `u8`.
+    fn redirection_index_valid(&mut self, idx: u8) -> bool {
+        // Entry `idx` is accessed through selectors `0x10 + idx * 2` and
+        // `0x10 + idx * 2 + 1`, both computed in `u8`. 119 is the largest index for
+        // which they fit (0x10 + 2 * 119 + 1 == 0xFF); anything larger would overflow
+        // even if a bogus version register claimed that many entries.
+        const MAX_ADDRESSABLE_INDEX: u8 = 119;
+        idx <= MAX_ADDRESSABLE_INDEX && idx <= self.max_redirection_table_entries()
+    }
+
    fn ioregsel(&self) -> *const u32 {
        self.pointer
    }
@@ -44,21 +48,28 @@ impl IoApicRegs {
    pub fn read_ioapicver(&mut self) -> u32 {
        self.read_reg(0x01)
    }
-    pub fn read_ioredtbl(&mut self, idx: u8) -> u64 {
-        assert!(idx < 24);
+    pub fn read_ioredtbl(&mut self, idx: u8) -> Option<u64> {
+        if !self.redirection_index_valid(idx) {
+            warn!("IOAPIC read_ioredtbl index {} out of range", idx);
+            return None;
+        }
        let lo = self.read_reg(0x10 + idx * 2);
        let hi = self.read_reg(0x10 + idx * 2 + 1);

-        u64::from(lo) | (u64::from(hi) << 32)
+        Some(u64::from(lo) | (u64::from(hi) << 32))
    }
-    pub fn write_ioredtbl(&mut self, idx: u8, value: u64) {
-        assert!(idx < 24);
+    pub fn write_ioredtbl(&mut self, idx: u8, value: u64) -> bool {
+        if !self.redirection_index_valid(idx) {
+            warn!("IOAPIC write_ioredtbl index {} out of range", idx);
+            return false;
+        }

        let lo = value as u32;
        let hi = (value >> 32) as u32;

        self.write_reg(0x10 + idx * 2, lo);
        self.write_reg(0x10 + idx * 2 + 1, hi);
+        true
    }

    pub fn max_redirection_table_entries(&mut self) -> u8 {
@@ -92,17 +103,37 @@ impl IoApic {
    }
    /// Map an interrupt vector to a physical local APIC ID of a processor (thus physical mode).
    #[allow(dead_code)]
-    pub fn map(&self, idx: u8, info: MapInfo) {
-        self.regs.lock().write_ioredtbl(idx, info.as_raw())
+    pub fn map(&self, idx: u8, info: MapInfo) -> bool {
+        let Some(raw) = info.as_raw() else {
+            return false;
+        };
+        self.regs.lock().write_ioredtbl(idx, raw)
    }
    pub fn set_mask(&self, gsi: u32, mask: bool) {
        let idx = (gsi - self.gsi_start) as u8;
        let mut guard = self.regs.lock();

-        let mut reg = guard.read_ioredtbl(idx);
+        let Some(mut reg) = guard.read_ioredtbl(idx) else {
+            return;
+        };
        reg &= !(1 << 16);
        reg |= u64::from(mask) << 16;
-        guard.write_ioredtbl(idx, reg);
+        let _ = guard.write_ioredtbl(idx, reg);
+    }
+    /// Change the destination APIC for a GSI by reprogramming the redirection table entry.
+    /// Preserves all other fields (vector, polarity, trigger mode, delivery mode, mask).
+    ///
+    /// Returns true if the entry was successfully updated. Returns false without touching
+    /// the hardware when `gsi` lies below this I/O APIC's base, when the resulting table
+    /// index does not fit in a `u8`, or when `dest` does not fit the 8-bit destination field.
+    pub fn set_irq_affinity(&self, gsi: u32, dest: ApicId) -> bool {
+        // Derive the table index without wrapping or truncating.
+        let Some(idx) = gsi
+            .checked_sub(self.gsi_start)
+            .and_then(|rel| u8::try_from(rel).ok())
+        else {
+            return false;
+        };
+        // The destination field is bits 63:56. A wider id would be shifted out of the
+        // register and silently reroute the interrupt to a different APIC.
+        let dest_id = u64::from(dest.get());
+        if dest_id > 0xFF {
+            warn!(
+                "IOAPIC set_irq_affinity: destination APIC ID {} does not fit in 8 bits",
+                dest_id
+            );
+            return false;
+        }
+        let mut guard = self.regs.lock();
+        let Some(mut entry) = guard.read_ioredtbl(idx) else {
+            return false;
+        };
+        // Clear destination field (bits 63:56 for xAPIC physical mode)
+        // and set new destination APIC ID
+        entry &= !(0xFF_u64 << 56);
+        entry |= dest_id << 56;
+        guard.write_ioredtbl(idx, entry)
     }
 }

@@ -149,19 +180,26 @@ pub struct MapInfo {
 }

 impl MapInfo {
-    pub fn as_raw(&self) -> u64 {
-        assert!(self.vector >= 0x20);
-        assert!(self.vector <= 0xFE);
+    pub fn as_raw(&self) -> Option<u64> {
+        if !(0x20..=0xFE).contains(&self.vector) {
+            warn!(
+                "Refusing to map IOAPIC vector outside valid range: {:#x}",
+                self.vector
+            );
+            return None;
+        }

        // TODO: Check for reserved fields.

-        (u64::from(self.dest.get()) << 56)
+        Some(
+            (u64::from(self.dest.get()) << 56)
            | (u64::from(self.mask) << 16)
            | ((self.trigger_mode as u64) << 15)
            | ((self.polarity as u64) << 13)
            | ((self.dest_mode as u64) << 11)
            | ((self.delivery_mode as u64) << 8)
-            | u64::from(self.vector)
+            | u64::from(self.vector),
+        )
    }
 }

@@ -175,7 +213,7 @@ impl fmt::Debug for IoApic {

                let count = guard.max_redirection_table_entries();
                f.debug_list()
-                    .entries((0..count).map(|i| guard.read_ioredtbl(i)))
+                    .entries((0..=count).filter_map(|i| guard.read_ioredtbl(i)))
                    .finish()
            }
        }
@@ -237,11 +275,14 @@ pub unsafe fn handle_ioapic(madt_ioapic: &'static MadtIoApic) {
        let ioapic_registers = virt.data() as *const u32;
        let ioapic = IoApic::new(ioapic_registers, madt_ioapic.gsi_base);

-        assert_eq!(
-            ioapic.regs.lock().id(),
-            madt_ioapic.id,
-            "mismatched ACPI MADT I/O APIC ID, and the ID reported by the I/O APIC"
-        );
+        let detected_id = ioapic.regs.lock().id();
+        if detected_id != madt_ioapic.id {
+            warn!(
+                "mismatched ACPI MADT I/O APIC ID: MADT={}, IOAPIC={}; continuing with detected hardware",
+                madt_ioapic.id,
+                detected_id
+            );
+        }

        (*IOAPICS.get()).get_or_insert_with(Vec::new).push(ioapic);
    }
@@ -310,11 +351,11 @@ pub unsafe fn init() {
                }
            }
        }
-        println!(
-            "I/O APICs: {:?}, overrides: {:?}",
-            ioapics(),
-            src_overrides()
-        );
+        for ioapic in ioapics() {
+            for idx in 0..=ioapic.count {
+                ioapic.set_mask(ioapic.gsi_start + u32::from(idx), true);
+            }
+        }

        // map the legacy PC-compatible IRQs (0-15) to 32-47, just like we did with 8259 PIC (if it
        // wouldn't have been disabled due to this I/O APIC)
@@ -329,7 +370,6 @@ pub unsafe fn init() {
                            .iter()
                            .any(|over| over.bus_irq == legacy_irq)
                    {
-                        // there's an IRQ conflict, making this legacy IRQ inaccessible.
                        continue;
                    }
                    (
@@ -349,7 +389,6 @@ pub unsafe fn init() {
            let redir_tbl_index = (gsi - apic.gsi_start) as u8;

            let map_info = MapInfo {
-                // only send to the BSP
                dest: bsp_apic_id,
                dest_mode: DestinationMode::Physical,
                delivery_mode: DeliveryMode::Fixed,
@@ -366,7 +405,32 @@ pub unsafe fn init() {
                },
                vector: 32 + legacy_irq,
            };
-            apic.map(redir_tbl_index, map_info);
+            if !apic.map(redir_tbl_index, map_info) {
+                warn!(
+                    "Unable to map legacy IRQ {} (GSI {}) through IOAPIC index {}",
+                    legacy_irq,
+                    gsi,
+                    redir_tbl_index
+                );
+            }
+
+            if legacy_irq == 0 && gsi != u32::from(legacy_irq) {
+                if let Some(apic0) = find_ioapic(u32::from(legacy_irq)) {
+                    let idx0 = (u32::from(legacy_irq) - apic0.gsi_start) as u8;
+                    let _ = apic0.map(
+                        idx0,
+                        MapInfo {
+                            dest: bsp_apic_id,
+                            dest_mode: DestinationMode::Physical,
+                            delivery_mode: DeliveryMode::Fixed,
+                            mask: false,
+                            polarity: ApicPolarity::ActiveHigh,
+                            trigger_mode: ApicTriggerMode::Edge,
+                            vector: 32,
+                        },
+                    );
+                }
+            }
        }
        println!(
            "I/O APICs: {:?}, overrides: {:?}",
@@ -406,7 +470,7 @@ fn resolve(irq: u8) -> u32 {
 fn find_ioapic(gsi: u32) -> Option<&'static IoApic> {
    ioapics()
        .iter()
-        .find(|apic| gsi >= apic.gsi_start && gsi < apic.gsi_start + u32::from(apic.count))
+        .find(|apic| gsi >= apic.gsi_start && gsi <= apic.gsi_start + u32::from(apic.count))
 }

 pub unsafe fn mask(irq: u8) {
@@ -425,3 +489,14 @@ pub unsafe fn unmask(irq: u8) {
    };
    apic.set_mask(gsi, false);
 }
+
+/// Retargets an IRQ to a different CPU by rewriting its IOAPIC redirection entry.
+///
+/// The IRQ is translated to a GSI with `resolve`, the I/O APIC owning that GSI is looked
+/// up with `find_ioapic`, and the update is delegated to `IoApic::set_irq_affinity`.
+/// Returns false when no I/O APIC covers the GSI or when the update itself fails.
+pub unsafe fn set_affinity(irq: u8, dest: ApicId) -> bool {
+    let gsi = resolve(irq);
+    find_ioapic(gsi).map_or(false, |apic| apic.set_irq_affinity(gsi, dest))
+}
@@ -4,9 +4,11 @@ pub mod cpu;
 pub mod hpet;
 pub mod ioapic;
 pub mod local_apic;
+pub mod msi;
 pub mod pic;
 pub mod pit;
 pub mod serial;
+pub mod vector;
 #[cfg(feature = "system76_ec_debug")]
 pub mod system76_ec;

@@ -23,8 +25,7 @@ pub unsafe fn init() {
    }
 }
 pub unsafe fn init_after_acpi() {
-    // this will disable the IOAPIC if needed.
-    //ioapic::init(mapper);
+    unsafe { ioapic::init() };
 }

 unsafe fn init_hpet() -> bool {
@@ -0,0 +1,183 @@
+// MSI/MSI-X support for x86 — kernel-level message composition and validation
+// Cross-referenced from Linux 7.0: arch/x86/kernel/apic/msi.c (391 lines)
+
+use crate::arch::device::local_apic::ApicId;
+
+/// Fixed upper bits of every x86 MSI address: 0xFEE in bits 31:20.
+pub const MSI_ADDRESS_BASE: u64 = 0xFEE0_0000;
+/// Selects the address bits that must match `MSI_ADDRESS_BASE`: bit 20 and everything
+/// above it, including the upper 32 bits. The destination ID (bits 19:12) and the
+/// low control bits are deliberately left out, so addresses that target a non-zero
+/// APIC ID still compare equal to the base after masking.
+pub const MSI_ADDRESS_MASK: u64 = 0xFFFF_FFFF_FFF0_0000;
+/// Address bit 2: destination mode (set = logical).
+const MSI_DEST_MODE_LOGICAL: u64 = 1 << 2;
+/// Address bit 3: redirection hint.
+const MSI_REDIRECTION_HINT: u64 = 1 << 3;
+
+/// Raw MSI message address as built by `MsiAddress::new`.
+#[derive(Debug, Clone, Copy)]
+pub struct MsiAddress {
+    // Encoded address: base, destination ID at bits 19:12, and the two mode flags.
+    pub raw: u64,
+}
+
+/// Raw MSI message data as built by `MsiData::new`.
+#[derive(Debug, Clone, Copy)]
+pub struct MsiData {
+    // Encoded data: vector in bits 7:0, delivery mode from bit 8, trigger mode at bit 15.
+    pub raw: u32,
+}
+
+/// An address/data pair describing one MSI message.
+#[derive(Debug, Clone)]
+pub struct MsiMessage {
+    // Address half of the message.
+    pub address: MsiAddress,
+    // Data half of the message.
+    pub data: MsiData,
+}
+
+impl MsiAddress {
+    /// Builds an address from `MSI_ADDRESS_BASE`, placing `dest_apic_id` at bits 19:12
+    /// and OR-ing in the redirection-hint and logical-destination flags when requested.
+    pub fn new(dest_apic_id: u8, redirection_hint: bool, dest_mode_logical: bool) -> Self {
+        let hint_bits = if redirection_hint { MSI_REDIRECTION_HINT } else { 0 };
+        let mode_bits = if dest_mode_logical { MSI_DEST_MODE_LOGICAL } else { 0 };
+        let dest_bits = u64::from(dest_apic_id) << 12;
+        Self {
+            raw: MSI_ADDRESS_BASE | dest_bits | hint_bits | mode_bits,
+        }
+    }
+
+    /// Reports whether `addr`, masked with `MSI_ADDRESS_MASK`, equals `MSI_ADDRESS_BASE`.
+    pub fn validate(addr: u64) -> bool {
+        let fixed_bits = addr & MSI_ADDRESS_MASK;
+        fixed_bits == MSI_ADDRESS_BASE
+    }
+
+    /// Extracts the 8-bit destination ID stored at bits 19:12.
+    pub fn dest_apic_id(&self) -> u8 {
+        // The cast to `u8` keeps exactly the low eight bits of the shifted value.
+        (self.raw >> 12) as u8
+    }
+}
+
+impl MsiData {
+    /// Encodes a message data word: `vector` in bits 7:0, `delivery_mode` in bits 10:8
+    /// and `trigger_mode` in bit 15.
+    ///
+    /// Only the low three bits of `delivery_mode` and the low bit of `trigger_mode` are
+    /// used, so out-of-range arguments cannot corrupt neighbouring fields and the
+    /// getters below return exactly what was encoded.
+    pub fn new(vector: u8, delivery_mode: u8, trigger_mode: u8) -> Self {
+        let delivery = u32::from(delivery_mode & 0x7) << 8;
+        let trigger = u32::from(trigger_mode & 0x1) << 15;
+        Self {
+            raw: u32::from(vector) | delivery | trigger,
+        }
+    }
+
+    /// Returns the interrupt vector (bits 7:0).
+    pub fn vector(&self) -> u8 {
+        (self.raw & 0xFF) as u8
+    }
+
+    /// Returns the delivery mode (bits 10:8).
+    pub fn delivery_mode(&self) -> u8 {
+        ((self.raw >> 8) & 0x7) as u8
+    }
+
+    /// Returns the trigger mode (bit 15).
+    pub fn trigger_mode(&self) -> u8 {
+        ((self.raw >> 15) & 0x1) as u8
+    }
+}
+
+impl MsiMessage {
+    /// Assembles a message for `dest` with the redirection hint and logical destination
+    /// mode both off.
+    ///
+    /// The `as u8` cast keeps only the low eight bits of whatever `ApicId::get` returns.
+    pub fn compose(dest: ApicId, vector: u8, delivery_mode: u8, trigger_mode: u8) -> Self {
+        Self {
+            address: MsiAddress::new(dest.get() as u8, false, false),
+            data: MsiData::new(vector, delivery_mode, trigger_mode),
+        }
+    }
+
+    /// Reports whether the address passes `MsiAddress::validate` and the vector lies in 32..=254.
+    pub fn validate(&self) -> bool {
+        if !MsiAddress::validate(self.address.raw) {
+            return false;
+        }
+        (32..255).contains(&self.data.vector())
+    }
+}
+
+/// Free-function form of `MsiAddress::validate`.
+pub fn is_valid_msi_address(addr: u64) -> bool {
+    let accepted = MsiAddress::validate(addr);
+    accepted
+}
+
+/// Reports whether `vector` lies in the accepted range 32..=254.
+pub fn is_valid_msi_vector(vector: u8) -> bool {
+    (32..255).contains(&vector)
+}
+
+/// Decoded view of an MSI capability, produced by `MsiCapability::parse`.
+#[derive(Debug)]
+pub struct MsiCapability {
+    // Message control word exactly as passed to `parse`.
+    pub msg_ctl: u16,
+    // Low 32 bits of the message address.
+    pub msg_addr_lo: u32,
+    // High 32 bits of the message address; 0 when the capability is not 64-bit.
+    pub msg_addr_hi: u32,
+    // 16-bit message data.
+    pub msg_data: u16,
+    // Mask bits; 0 when the capability is not maskable.
+    pub mask_bits: u32,
+    // Pending bits; 0 when the capability is not maskable.
+    pub pending_bits: u32,
+    // Message control bit 7.
+    pub is_64bit: bool,
+    // Message control bit 8.
+    pub is_maskable: bool,
+    // Message control bits 3:1.
+    pub multiple_message_capable: u8,
+}
+
+impl MsiCapability {
+    /// Decodes the six capability dwords in `raw` according to `msg_ctl`.
+    ///
+    /// Dword 1 is always the low address. With 64-bit addressing (`msg_ctl` bit 7) the
+    /// high address occupies dword 2 and pushes every later register back by one dword:
+    ///
+    /// | register | 32-bit | 64-bit |
+    /// |----------|--------|--------|
+    /// | data     | raw[2] | raw[3] |
+    /// | mask     | raw[3] | raw[4] |
+    /// | pending  | raw[4] | raw[5] |
+    ///
+    /// Mask and pending bits exist only with per-vector masking (`msg_ctl` bit 8) and
+    /// are reported as 0 otherwise.
+    pub fn parse(raw: &[u32; 6], msg_ctl: u16) -> Self {
+        let is_64bit = msg_ctl & (1 << 7) != 0;
+        let is_maskable = msg_ctl & (1 << 8) != 0;
+
+        // Index of the dword holding the message data; mask and pending follow it.
+        let (msg_addr_hi, data_idx) = if is_64bit { (raw[2], 3) } else { (0, 2) };
+        let (mask_bits, pending_bits) = if is_maskable {
+            (raw[data_idx + 1], raw[data_idx + 2])
+        } else {
+            (0, 0)
+        };
+
+        Self {
+            msg_ctl,
+            msg_addr_lo: raw[1],
+            msg_addr_hi,
+            msg_data: (raw[data_idx] & 0xFFFF) as u16,
+            mask_bits,
+            pending_bits,
+            is_64bit,
+            is_maskable,
+            multiple_message_capable: ((msg_ctl >> 1) & 0x7) as u8,
+        }
+    }
+}
+
+/// Decoded view of an MSI-X capability, produced by `MsixCapability::parse`.
+#[derive(Debug)]
+pub struct MsixCapability {
+    // Message control word exactly as passed to `parse`.
+    pub msg_ctl: u16,
+    // Table offset: dword 1 with its low three bits cleared.
+    pub table_offset: u32,
+    // Low three bits of dword 1.
+    pub table_bar: u8,
+    // Pending-bit-array offset: dword 2 with its low three bits cleared.
+    pub pba_offset: u32,
+    // Low three bits of dword 2.
+    pub pba_bar: u8,
+    // Number of table entries derived from the message control word.
+    pub table_size: u16,
+}
+
+impl MsixCapability {
+    /// Decodes the three capability dwords in `raw` according to `msg_ctl`.
+    ///
+    /// Dwords 1 and 2 each pack a BAR indicator in their low three bits and an 8-byte
+    /// aligned offset in the remaining bits. The table size field is bits 10:0 of the
+    /// message control word and is encoded as N - 1, so the entry count is that field
+    /// plus one (1..=2048).
+    pub fn parse(raw: &[u32; 3], msg_ctl: u16) -> Self {
+        Self {
+            msg_ctl,
+            table_offset: raw[1] & !0x7,
+            table_bar: (raw[1] & 0x7) as u8,
+            pba_offset: raw[2] & !0x7,
+            pba_bar: (raw[2] & 0x7) as u8,
+            // The size field starts at bit 0, so no shift is applied before masking.
+            table_size: (msg_ctl & 0x7FF) + 1,
+        }
+    }
+}
+
+// Unit tests for message composition, address validation and capability parsing.
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    // A composed message must validate and return every field it was built from.
+    #[test]
+    fn test_compose_message() {
+        let msg = MsiMessage::compose(ApicId::new(3), 48, 0b101, 1);
+        assert!(msg.validate());
+        assert_eq!(msg.address.dest_apic_id(), 3);
+        assert_eq!(msg.data.vector(), 48);
+        assert_eq!(msg.data.delivery_mode(), 0b101);
+        assert_eq!(msg.data.trigger_mode(), 1);
+    }
+
+    // An address outside the MSI window is rejected; the bare base address is accepted.
+    #[test]
+    fn test_invalid_address() {
+        assert!(!is_valid_msi_address(0xDEAD_BEEF));
+        assert!(is_valid_msi_address(0xFEE0_0000));
+    }
+
+    // A zero message control word means neither 64-bit addressing nor masking.
+    #[test]
+    fn test_msi_parse() {
+        let raw = [0u32; 6];
+        let cap = MsiCapability::parse(&raw, 0);
+        assert!(!cap.is_64bit);
+        assert!(!cap.is_maskable);
+    }
+}
@@ -0,0 +1,53 @@
+use crate::cpu_set::LogicalCpuId;
+
+// First vector handed out by the allocator; bitmap index 0 corresponds to it.
+const FIRST_VECTOR: usize = 32;
+
+// Number of allocatable vectors, covering 32..=254. Vector 255 is deliberately left
+// out so the allocator never returns a vector that `is_valid_msi_vector`, the IOAPIC
+// `MapInfo` range check, or `available_irqs_iter` would reject.
+const VECTOR_COUNT: usize = 223;
+
+// Allocation bitmap, 32 vectors per word; a set bit marks the vector as in use.
+// Seven words provide 224 bits, of which only the first `VECTOR_COUNT` are used.
+static VECTORS: [core::sync::atomic::AtomicU32; 7] = [
+    core::sync::atomic::AtomicU32::new(0),
+    core::sync::atomic::AtomicU32::new(0),
+    core::sync::atomic::AtomicU32::new(0),
+    core::sync::atomic::AtomicU32::new(0),
+    core::sync::atomic::AtomicU32::new(0),
+    core::sync::atomic::AtomicU32::new(0),
+    core::sync::atomic::AtomicU32::new(0),
+];
+
+/// Claims the lowest free interrupt vector and returns it, or `None` when all
+/// vectors in 32..=254 are taken.
+///
+/// `_cpu` is currently unused: the bitmap is shared by all CPUs.
+pub fn allocate_vector(_cpu: LogicalCpuId) -> Option<u8> {
+    use core::sync::atomic::Ordering;
+
+    for (bank, slot) in VECTORS.iter().enumerate() {
+        let mut bits = slot.load(Ordering::Acquire);
+        loop {
+            // Position of the lowest clear bit; 32 means this word is full.
+            let free = bits.trailing_ones() as usize;
+            if free >= 32 {
+                break;
+            }
+            let index = bank * 32 + free;
+            // Banks are scanned in ascending order and `free` is the lowest clear bit,
+            // so once the candidate is out of range nothing usable remains. Checking
+            // before the compare-exchange avoids claiming a bit only to release it.
+            if index >= VECTOR_COUNT {
+                return None;
+            }
+            let bit = 1u32 << free;
+            match slot.compare_exchange_weak(
+                bits,
+                bits | bit,
+                Ordering::AcqRel,
+                Ordering::Acquire,
+            ) {
+                Ok(_) => return Some((index + FIRST_VECTOR) as u8),
+                // Lost a race or failed spuriously: retry with the value observed.
+                Err(current) => bits = current,
+            }
+        }
+    }
+    None
+}
+
+/// Releases `vector` so it can be allocated again.
+///
+/// Vectors outside 32..=254 are ignored, and releasing a vector that is already free
+/// has no effect. `_cpu` is currently unused.
+pub fn free_vector(_cpu: LogicalCpuId, vector: u8) {
+    let vector = usize::from(vector);
+    if vector < FIRST_VECTOR || vector >= FIRST_VECTOR + VECTOR_COUNT {
+        return;
+    }
+    let idx = vector - FIRST_VECTOR;
+    let bank = idx / 32;
+    let bit = 1u32 << (idx % 32);
+    VECTORS[bank].fetch_and(!bit, core::sync::atomic::Ordering::Release);
+}
@@ -192,6 +192,15 @@ impl ProcessorControlRegion {
    }
 }

+/// Reports a failed Processor Control Region allocation, then never returns.
+///
+/// No halt instruction is executed here; the CPU spins in `spin_loop` forever.
+#[cold]
+fn halt_pcr_init() -> ! {
+    println!("FATAL: failed to allocate physical memory for Processor Control Region");
+    println!("Processor startup cannot continue. Halting.");
+    loop { core::hint::spin_loop(); }
+}
+
 pub unsafe fn pcr() -> *mut ProcessorControlRegion {
    unsafe {
        // Primitive benchmarking of RDFSBASE and RDGSBASE in userspace, appears to indicate that
@@ -375,7 +384,10 @@ pub fn allocate_and_init_pcr(
        .next_power_of_two()
        .trailing_zeros();

-    let pcr_frame = crate::memory::allocate_p2frame(alloc_order).expect("failed to allocate PCR");
+    let pcr_frame = match crate::memory::allocate_p2frame(alloc_order) {
+        Some(frame) => frame,
+        None => halt_pcr_init(),
+    };
    let pcr_ptr = RmmA::phys_to_virt(pcr_frame.base()).data() as *mut ProcessorControlRegion;
    unsafe { core::ptr::write(pcr_ptr, ProcessorControlRegion::new_partial_init(cpu_id)) };

@@ -78,6 +78,15 @@ static INIT_BSP_IDT: SyncUnsafeCell<Idt> = SyncUnsafeCell::new(Idt::new());
 pub(crate) static IDTS: RwLock<HashMap<LogicalCpuId, &'static mut Idt>> =
    RwLock::new(HashMap::with_hasher(DefaultHashBuilder::new()));

+/// Reports a failed backup interrupt stack allocation, then never returns.
+///
+/// No halt instruction is executed here; the CPU spins in `spin_loop` forever.
+#[cold]
+fn halt_idt_init() -> ! {
+    println!("FATAL: failed to allocate physical pages for backup interrupt stack");
+    println!("Interrupt setup cannot continue. Halting.");
+    loop { core::hint::spin_loop(); }
+}
+
 #[inline]
 pub fn is_reserved(cpu_id: LogicalCpuId, index: u8) -> bool {
    if cpu_id == LogicalCpuId::BSP {
@@ -101,6 +110,8 @@ pub fn set_reserved(cpu_id: LogicalCpuId, index: u8, reserved: bool) {
 }

+/// Iterates over the vectors in 32..=254 for which `is_reserved(cpu_id, index)` is false.
+// NOTE(review): the count below walks the whole range an extra time and logs at info
+// level on every call — confirm this instrumentation is meant to stay.
 pub fn available_irqs_iter(cpu_id: LogicalCpuId) -> impl Iterator<Item = u8> + 'static {
+    let count = (32..=254).filter(|&index| !is_reserved(cpu_id, index)).count();
+    info!("available_irqs_iter: cpu_id={} count={}", cpu_id.get(), count);
     (32..=254).filter(move |&index| !is_reserved(cpu_id, index))
 }

@@ -161,8 +172,10 @@ pub fn allocate_and_init_idt(cpu_id: LogicalCpuId) -> *mut Idt {
        .or_insert_with(|| Box::leak(Box::new(Idt::new())));

    use crate::memory::{RmmA, RmmArch};
-    let frames = crate::memory::allocate_p2frame(4)
-        .expect("failed to allocate pages for backup interrupt stack");
+    let frames = match crate::memory::allocate_p2frame(4) {
+        Some(frames) => frames,
+        None => halt_idt_init(),
+    };

    // Physical pages are mapped linearly. So is the linearly mapped virtual memory.
    let base_address = RmmA::phys_to_virt(frames.base());
@@ -1,3 +1,5 @@
+use core::sync::atomic::{AtomicBool, Ordering};
+
 use syscall::Exception;
 use x86::irq::PageFaultError;

@@ -10,6 +12,22 @@ use crate::{
    syscall::flag::*,
 };

+// Re-entrancy flag for the NMI handler: set on entry, cleared on exit. It is a single
+// flag shared by all CPUs, not per-CPU state.
+static NMI_IN_PROGRESS: AtomicBool = AtomicBool::new(false);
+
+/// Writes `bytes` to the serial port at I/O port 0x3F8 using direct port I/O.
+///
+/// Before each byte the status port at base + 5 is polled until bit 5 is set.
+/// The wait is unbounded.
+unsafe fn nmi_raw_serial_write(bytes: &[u8]) {
+    use crate::syscall::io::{Io, Pio};
+
+    const COM1_BASE: u16 = 0x3F8;
+    // Bit 5 of the status port gates each write (presumably "transmit holding
+    // register empty" on a 16550-style UART — confirm against the serial driver).
+    const TX_READY: u8 = 1 << 5;
+
+    let mut data_port = Pio::<u8>::new(COM1_BASE);
+    let status_port = Pio::<u8>::new(COM1_BASE + 5);
+
+    for byte in bytes.iter().copied() {
+        while status_port.read() & TX_READY == 0 {
+            core::hint::spin_loop();
+        }
+        data_port.write(byte);
+    }
+}
+
 interrupt_stack!(divide_by_zero, |stack| {
    println!("Divide by zero");
    stack.trace();
@@ -55,9 +73,35 @@ interrupt_stack!(non_maskable, @paranoid, |stack| {

    #[cfg(not(all(target_arch = "x86_64", feature = "profiling")))]
    {
-        // TODO: This will likely deadlock
-        println!("Non-maskable interrupt");
-        stack.dump();
+        if NMI_IN_PROGRESS.swap(true, Ordering::SeqCst) {
+            return;
+        }
+
+        unsafe {
+            nmi_raw_serial_write(b"Non-maskable interrupt\n");
+            nmi_raw_serial_write(b"  RIP: ");
+
+            #[cfg(target_arch = "x86")]
+            let instruction_pointer = u64::from(stack.iret.eip);
+            #[cfg(target_arch = "x86_64")]
+            let instruction_pointer = stack.iret.rip;
+
+            let mut buf = [0u8; 19];
+            buf[0] = b'0';
+            buf[1] = b'x';
+            for i in 0..16 {
+                let nibble = ((instruction_pointer >> (60 - i * 4)) & 0xF) as u8;
+                buf[2 + i] = if nibble < 10 {
+                    b'0' + nibble
+                } else {
+                    b'a' + nibble - 10
+                };
+            }
+            buf[18] = b'\n';
+            nmi_raw_serial_write(&buf);
+        }
+
+        NMI_IN_PROGRESS.store(false, Ordering::SeqCst);
    }
 });

@@ -28,6 +28,8 @@ pub mod pti;
 /// Initialization and start function
 pub mod start;

+pub mod sleep;
+
 /// Stop function
 pub mod stop;

@@ -0,0 +1,712 @@
+use alloc::{sync::Arc, vec::Vec};
+use core::{
+    ptr::NonNull,
+    str::FromStr,
+    sync::atomic::{AtomicU32, Ordering},
+};
+
+use acpi_ext::{
+    aml::{namespace::AmlName, object::Object, Interpreter},
+    registers::FixedRegisters,
+    sdt::{facs::Facs, fadt::Fadt, SdtHeader},
+    AcpiTables, Handle, Handler, PhysicalMapping,
+};
+use spin::Mutex;
+use syscall::error::{Error, EINVAL, EIO};
+use x86::{segmentation::SegmentSelector, task, Ring};
+
+use crate::{
+    acpi::ACPI_ROOT_INFO,
+    arch::interrupt,
+    memory::{
+        round_down_pages, round_up_pages, KernelMapper, Page, PageFlags, PhysicalAddress, RmmA,
+        RmmArch, VirtualAddress, PAGE_SIZE,
+    },
+    syscall::io::{Io, Pio},
+};
+
+const ACPI_SLP_TYP_SHIFT: u16 = 10;
+const ACPI_SLP_TYP_MASK: u16 = 0x1C00;
+const ACPI_SLP_EN: u16 = 1 << 13;
+const WAKE_TRAMPOLINE_PHYS: usize = 0x8000;
+const SLEEP_RETURN_OK: usize = 0;
+
+#[cfg(target_arch = "x86_64")]
+static WAKE_TRAMPOLINE_DATA: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/s3_wakeup"));
+
+#[repr(C, packed)]
+#[derive(Clone, Copy, Debug, Default)]
+struct DescriptorTableRegister {
+    limit: u16,
+    base: u64,
+}
+
+#[repr(C, align(64))]
+#[derive(Clone, Copy, Debug)]
+struct FpuState {
+    bytes: [u8; 4096],
+}
+
+impl Default for FpuState {
+    fn default() -> Self {
+        Self { bytes: [0; 4096] }
+    }
+}
+
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub enum SleepState {
+    S3,
+    S5,
+}
+
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub enum SleepError {
+    UnsupportedArch,
+    MissingAcpi,
+    MissingFadt,
+    MissingFacs,
+    MissingSleepObject,
+    InvalidSleepObject,
+    UnsupportedPmControl,
+    UnsupportedAmlOperation,
+    SleepDidNotEnter,
+}
+
+impl SleepError {
+    /// Every variant exactly once; `code` and `from_code` translate through this table.
+    const ALL: [Self; 9] = [
+        Self::UnsupportedArch,
+        Self::MissingAcpi,
+        Self::MissingFadt,
+        Self::MissingFacs,
+        Self::MissingSleepObject,
+        Self::InvalidSleepObject,
+        Self::UnsupportedPmControl,
+        Self::UnsupportedAmlOperation,
+        Self::SleepDidNotEnter,
+    ];
+    /// Non-zero raw code unique to this variant (zero is reserved for `SLEEP_RETURN_OK`).
+    fn code(self) -> usize {
+        Self::ALL.iter().position(|candidate| *candidate == self).map_or(2, |index| index + 1)
+    }
+    /// Inverse of `code`; values outside the table decode to `MissingAcpi`, the previous catch-all.
+    fn from_code(code: usize) -> Self {
+        code.checked_sub(1).and_then(|index| Self::ALL.get(index).copied()).unwrap_or(Self::MissingAcpi)
+    }
+}
+
+#[derive(Clone, Copy, Debug, Default)]
+struct SavedCpuContext {
+    entry_rsp: usize,
+    runtime_rsp: usize,
+    facs_address: usize,
+    cr0: usize,
+    cr2: usize,
+    cr3: usize,
+    cr4: usize,
+    rflags: usize,
+    gdtr: DescriptorTableRegister,
+    idtr: DescriptorTableRegister,
+    efer: u64,
+    fs_base: u64,
+    gs_base: u64,
+    kernel_gs_base: u64,
+    fpu: FpuState,
+}
+
+static SAVED_CONTEXT: Mutex<Option<SavedCpuContext>> = Mutex::new(None);
+static AML_MUTEX_IDS: AtomicU32 = AtomicU32::new(1);
+
+#[derive(Clone, Copy, Debug)]
+struct SleepTypeData {
+    a: u16,
+    b: u16,
+}
+
+#[derive(Clone, Copy)]
+struct KernelAcpiHandler;
+
+impl KernelAcpiHandler {
+    /// Linearly maps every page covering `size` bytes at `physical_address`; returns (pointer, mapped bytes).
+    fn map_range(physical_address: usize, size: usize) -> (*mut u8, usize) {
+        let map_base = round_down_pages(physical_address);
+        let map_offset = physical_address - map_base;
+        let mapped_length = round_up_pages(size + map_offset);
+        // SAFETY: The ACPI interpreter only requests firmware-described physical regions.
+        unsafe {
+            let mut mapper = KernelMapper::lock_rw();
+            for page_index in 0..mapped_length / PAGE_SIZE {
+                let (_, flush) = mapper
+                    .map_linearly(
+                        PhysicalAddress::new(map_base + page_index * PAGE_SIZE),
+                        // Request write access explicitly: FACS fields are stored through these mappings.
+                        PageFlags::new().write(true),
+                    )
+                    .expect("failed to linearly map ACPI physical region");
+                flush.flush();
+            }
+        }
+        let virtual_base = RmmA::phys_to_virt(PhysicalAddress::new(map_base)).data();
+        ((virtual_base + map_offset) as *mut u8, mapped_length)
+    }
+}
+
+impl Handler for KernelAcpiHandler {
+    unsafe fn map_physical_region<T>(&self, physical_address: usize, size: usize) -> PhysicalMapping<Self, T> {
+        let (virtual_start, mapped_length) = Self::map_range(physical_address, size);
+        PhysicalMapping {
+            physical_start: physical_address,
+            virtual_start: NonNull::new(virtual_start.cast::<T>())
+                .expect("expected mapped ACPI virtual address to be non-null"),
+            region_length: size,
+            mapped_length,
+            handler: *self,
+        }
+    }
+
+    fn unmap_physical_region<T>(_region: &PhysicalMapping<Self, T>) {}
+
+    fn read_u8(&self, address: usize) -> u8 {
+        // SAFETY: AML system-memory accesses are byte-addressable firmware regions.
+        unsafe { core::ptr::read_volatile(RmmA::phys_to_virt(PhysicalAddress::new(address)).data() as *const u8) }
+    }
+
+    fn read_u16(&self, address: usize) -> u16 {
+        // SAFETY: AML system-memory accesses are word-addressable firmware regions.
+        unsafe {
+            core::ptr::read_volatile(RmmA::phys_to_virt(PhysicalAddress::new(address)).data() as *const u16)
+        }
+    }
+
+    fn read_u32(&self, address: usize) -> u32 {
+        // SAFETY: AML system-memory accesses are dword-addressable firmware regions.
+        unsafe {
+            core::ptr::read_volatile(RmmA::phys_to_virt(PhysicalAddress::new(address)).data() as *const u32)
+        }
+    }
+
+    fn read_u64(&self, address: usize) -> u64 {
+        // SAFETY: AML system-memory accesses are qword-addressable firmware regions.
+        unsafe {
+            core::ptr::read_volatile(RmmA::phys_to_virt(PhysicalAddress::new(address)).data() as *const u64)
+        }
+    }
+
+    fn write_u8(&self, address: usize, value: u8) {
+        // SAFETY: AML system-memory accesses are byte-addressable firmware regions.
+        unsafe {
+            core::ptr::write_volatile(RmmA::phys_to_virt(PhysicalAddress::new(address)).data() as *mut u8, value)
+        }
+    }
+
+    fn write_u16(&self, address: usize, value: u16) {
+        // SAFETY: AML system-memory accesses are word-addressable firmware regions.
+        unsafe {
+            core::ptr::write_volatile(
+                RmmA::phys_to_virt(PhysicalAddress::new(address)).data() as *mut u16,
+                value,
+            )
+        }
+    }
+
+    fn write_u32(&self, address: usize, value: u32) {
+        // SAFETY: AML system-memory accesses are dword-addressable firmware regions.
+        unsafe {
+            core::ptr::write_volatile(
+                RmmA::phys_to_virt(PhysicalAddress::new(address)).data() as *mut u32,
+                value,
+            )
+        }
+    }
+
+    fn write_u64(&self, address: usize, value: u64) {
+        // SAFETY: AML system-memory accesses are qword-addressable firmware regions.
+        unsafe {
+            core::ptr::write_volatile(
+                RmmA::phys_to_virt(PhysicalAddress::new(address)).data() as *mut u64,
+                value,
+            )
+        }
+    }
+
+    fn read_io_u8(&self, port: u16) -> u8 {
+        Pio::<u8>::new(port).read()
+    }
+
+    fn read_io_u16(&self, port: u16) -> u16 {
+        Pio::<u16>::new(port).read()
+    }
+
+    fn read_io_u32(&self, port: u16) -> u32 {
+        Pio::<u32>::new(port).read()
+    }
+
+    fn write_io_u8(&self, port: u16, value: u8) {
+        Pio::<u8>::new(port).write(value)
+    }
+
+    fn write_io_u16(&self, port: u16, value: u16) {
+        Pio::<u16>::new(port).write(value)
+    }
+
+    fn write_io_u32(&self, port: u16, value: u32) {
+        Pio::<u32>::new(port).write(value)
+    }
+
+    fn read_pci_u8(&self, _address: acpi_ext::PciAddress, _offset: u16) -> u8 {
+        0
+    }
+
+    fn read_pci_u16(&self, _address: acpi_ext::PciAddress, _offset: u16) -> u16 {
+        0
+    }
+
+    fn read_pci_u32(&self, _address: acpi_ext::PciAddress, _offset: u16) -> u32 {
+        0
+    }
+
+    fn write_pci_u8(&self, _address: acpi_ext::PciAddress, _offset: u16, _value: u8) {}
+
+    fn write_pci_u16(&self, _address: acpi_ext::PciAddress, _offset: u16, _value: u16) {}
+
+    fn write_pci_u32(&self, _address: acpi_ext::PciAddress, _offset: u16, _value: u32) {}
+
+    fn nanos_since_boot(&self) -> u64 {
+        0
+    }
+
+    fn stall(&self, microseconds: u64) {
+        for _ in 0..(microseconds.saturating_mul(64)) {
+            core::hint::spin_loop();
+        }
+    }
+
+    fn sleep(&self, milliseconds: u64) {
+        for _ in 0..(milliseconds.saturating_mul(64_000)) {
+            core::hint::spin_loop();
+        }
+    }
+
+    fn create_mutex(&self) -> Handle {
+        Handle(AML_MUTEX_IDS.fetch_add(1, Ordering::Relaxed))
+    }
+
+    fn acquire(&self, _mutex: Handle, _timeout: u16) -> Result<(), acpi_ext::aml::AmlError> {
+        Ok(())
+    }
+
+    fn release(&self, _mutex: Handle) {}
+}
+
+fn sleep_state_name(state: SleepState) -> &'static str {
+    match state {
+        SleepState::S3 => "\\_S3",
+        SleepState::S5 => "\\_S5",
+    }
+}
+
+fn encode_sleep_type(value: u16) -> u16 {
+    // Raw 3-bit SLP_TYP values are shifted into place; anything larger is masked to the SLP_TYP field.
+    match value {
+        0..=0x7 => value << ACPI_SLP_TYP_SHIFT,
+        _ => value & ACPI_SLP_TYP_MASK,
+    }
+}
+
+fn load_interpreter() -> Result<(
+    Arc<FixedRegisters<KernelAcpiHandler>>,
+    PhysicalMapping<KernelAcpiHandler, Facs>,
+    Interpreter<KernelAcpiHandler>,
+), SleepError> {
+    let root = *ACPI_ROOT_INFO.get().ok_or(SleepError::MissingAcpi)?;
+    let handler = KernelAcpiHandler;
+
+    // SAFETY: ACPI root info is captured from the firmware-provided, already validated root table.
+    let tables = unsafe {
+        AcpiTables::from_rsdt(handler, root.revision, root.root_sdt_address.data())
+            .map_err(|_| SleepError::MissingAcpi)?
+    };
+    let fadt = tables.find_table::<Fadt>().ok_or(SleepError::MissingFadt)?;
+    let registers = Arc::new(
+        FixedRegisters::new(&fadt, handler).map_err(|_| SleepError::UnsupportedPmControl)?,
+    );
+    let facs_address = fadt.facs_address().map_err(|_| SleepError::MissingFacs)?;
+
+    // SAFETY: The FADT-supplied FACS address is used exactly as described by the ACPI spec.
+    let facs = unsafe { handler.map_physical_region::<Facs>(facs_address, core::mem::size_of::<Facs>()) };
+    // SAFETY: The AML interpreter only needs an owned mapping of the same firmware FACS table.
+    let interpreter_facs = unsafe {
+        handler.map_physical_region::<Facs>(facs_address, core::mem::size_of::<Facs>())
+    };
+    let dsdt = tables.dsdt().map_err(|_| SleepError::MissingFadt)?;
+    let interpreter = Interpreter::new(handler, dsdt.revision, Arc::clone(&registers), Some(interpreter_facs));
+
+    // SAFETY: Each AML table stays mapped while it is loaded; stream lengths saturate at zero so a bogus length cannot wrap.
+    unsafe {
+        let mapping = handler.map_physical_region::<SdtHeader>(dsdt.phys_address, dsdt.length as usize);
+        let stream = core::slice::from_raw_parts(
+            mapping.virtual_start.as_ptr().byte_add(core::mem::size_of::<SdtHeader>()) as *const u8,
+            (dsdt.length as usize).saturating_sub(core::mem::size_of::<SdtHeader>()),
+        );
+        interpreter
+            .load_table(stream)
+            .map_err(|_| SleepError::UnsupportedAmlOperation)?;
+
+        for ssdt in tables.ssdts() {
+            let mapping = handler.map_physical_region::<SdtHeader>(ssdt.phys_address, ssdt.length as usize);
+            let stream = core::slice::from_raw_parts(
+                mapping.virtual_start.as_ptr().byte_add(core::mem::size_of::<SdtHeader>()) as *const u8,
+                (ssdt.length as usize).saturating_sub(core::mem::size_of::<SdtHeader>()),
+            );
+            interpreter
+                .load_table(stream)
+                .map_err(|_| SleepError::UnsupportedAmlOperation)?;
+        }
+    }
+
+    Ok((registers, facs, interpreter))
+}
+
+fn sleep_type_data_from_interpreter(
+    interpreter: &Interpreter<KernelAcpiHandler>,
+    state: SleepState,
+) -> Result<SleepTypeData, SleepError> {
+    let name = AmlName::from_str(sleep_state_name(state)).map_err(|_| SleepError::MissingSleepObject)?;
+    let object = interpreter
+        .evaluate(name, Vec::new())
+        .map_err(|_| SleepError::MissingSleepObject)?;
+
+    let Object::Package(package) = &*object else {
+        return Err(SleepError::InvalidSleepObject);
+    };
+
+    let Some(typa_object) = package.first() else {
+        return Err(SleepError::InvalidSleepObject);
+    };
+    let Some(typb_object) = package.get(1) else {
+        return Err(SleepError::InvalidSleepObject);
+    };
+
+    let Object::Integer(typa) = &**typa_object else {
+        return Err(SleepError::InvalidSleepObject);
+    };
+    let Object::Integer(typb) = &**typb_object else {
+        return Err(SleepError::InvalidSleepObject);
+    };
+
+    Ok(SleepTypeData {
+        a: encode_sleep_type(*typa as u16),
+        b: encode_sleep_type(*typb as u16),
+    })
+}
+
+fn sleep_type_data(state: SleepState) -> Result<SleepTypeData, SleepError> {
+    // Only the interpreter is consulted; the register block and FACS mapping are discarded afterwards.
+    load_interpreter().and_then(|(_, _, interpreter)| sleep_type_data_from_interpreter(&interpreter, state))
+}
+
+fn install_wake_trampoline(stack_rsp: usize, cr3: usize) {
+    let trampoline_page = Page::containing_address(VirtualAddress::new(WAKE_TRAMPOLINE_PHYS));
+    let trampoline_frame = PhysicalAddress::new(WAKE_TRAMPOLINE_PHYS);
+
+    // SAFETY: The 0x8000 low-memory trampoline page is reserved by the kernel for bootstrap stubs.
+    let (result, _) = unsafe {
+        let mut mapper = KernelMapper::lock_rw();
+        let result = mapper
+            .map_phys(
+                trampoline_page.start_address(),
+                trampoline_frame,
+                PageFlags::new().execute(true).write(true),
+            )
+            .expect("failed to map S3 wake trampoline page");
+        (result, mapper.table().phys().data())
+    };
+    result.flush();
+
+    for (index, value) in WAKE_TRAMPOLINE_DATA.iter().enumerate() {
+        // SAFETY: The trampoline page is mapped writable at the same virtual address as the physical page.
+        unsafe {
+            core::ptr::write_volatile((WAKE_TRAMPOLINE_PHYS as *mut u8).add(index), *value);
+        }
+    }
+
+    // SAFETY: The wake trampoline layout reserves three qword fields immediately after the jump.
+    unsafe {
+        let stack_slot = (WAKE_TRAMPOLINE_PHYS + 8) as *mut u64;
+        let page_table_slot = stack_slot.add(1);
+        let code_slot = stack_slot.add(2);
+        stack_slot.write(stack_rsp as u64);
+        page_table_slot.write(cr3 as u64);
+        #[expect(clippy::fn_to_numeric_cast)]
+        code_slot.write(resume_from_s3_trampoline as usize as u64);
+    }
+
+    // The identity mapping of the trampoline page is intentionally left in place. After wake,
+    // the trampoline loads this CR3 and sets CR0.PG while still executing at physical 0x8000,
+    // then reads its `.stack`/`.code` slots in long mode; without a 0x8000 -> 0x8000 mapping
+    // the first instruction fetch after paging is enabled would fault.
+    // TODO(review): unmap the page once the resume path is running on kernel mappings again,
+    // so a writable+executable low page does not outlive the wake sequence.
+    // NOTE(review): assumes the CR3 saved at suspend entry is the table mapped above - confirm.
+}
+
+fn save_descriptor_tables(context: &mut SavedCpuContext) {
+    // SAFETY: SGDT/SIDT only read the current CPU descriptor-table registers into the provided storage.
+    unsafe {
+        core::arch::asm!("sgdt [{}]", in(reg) &mut context.gdtr, options(nostack, preserves_flags));
+        core::arch::asm!("sidt [{}]", in(reg) &mut context.idtr, options(nostack, preserves_flags));
+    }
+}
+
+fn save_fpu_state(context: &mut SavedCpuContext) {
+    // SAFETY: The kernel owns the current CPU at suspend entry and the FXSAVE buffer is 64-byte aligned.
+    unsafe {
+        core::arch::asm!(
+            "fxsave64 [{}]",
+            in(reg) context.fpu.bytes.as_mut_ptr(),
+        );
+    }
+}
+
+fn restore_fpu_state(context: &SavedCpuContext) {
+    // SAFETY: The saved FXSAVE image belongs to the same CPU context and matches the restore instruction.
+    unsafe {
+        core::arch::asm!(
+            "fxrstor64 [{}]",
+            in(reg) context.fpu.bytes.as_ptr(),
+        );
+    }
+}
+
+fn save_cpu_context(entry_rsp: usize) -> SavedCpuContext {
+    let mut context = SavedCpuContext {
+        entry_rsp,
+        ..SavedCpuContext::default()
+    };
+
+    // SAFETY: Reading control registers and MSRs is required to reconstruct the CPU execution state on wake.
+    unsafe {
+        core::arch::asm!(
+            "mov {}, cr0",
+            out(reg) context.cr0,
+            options(nostack, preserves_flags)
+        );
+        core::arch::asm!(
+            "mov {}, cr2",
+            out(reg) context.cr2,
+            options(nostack, preserves_flags)
+        );
+        core::arch::asm!(
+            "mov {}, cr3",
+            out(reg) context.cr3,
+            options(nostack, preserves_flags)
+        );
+        core::arch::asm!(
+            "mov {}, cr4",
+            out(reg) context.cr4,
+            options(nostack, preserves_flags)
+        );
+        core::arch::asm!(
+            "pushfq",
+            "pop {}",
+            out(reg) context.rflags,
+            options(preserves_flags)
+        );
+        core::arch::asm!("mov {}, rsp", out(reg) context.runtime_rsp, options(nostack, preserves_flags));
+
+        context.efer = x86::msr::rdmsr(x86::msr::IA32_EFER);
+        context.fs_base = x86::msr::rdmsr(x86::msr::IA32_FS_BASE);
+        context.gs_base = x86::msr::rdmsr(x86::msr::IA32_GS_BASE);
+        context.kernel_gs_base = x86::msr::rdmsr(x86::msr::IA32_KERNEL_GSBASE);
+    }
+
+    save_descriptor_tables(&mut context);
+    save_fpu_state(&mut context);
+    context
+}
+
+fn set_firmware_waking_vector(facs: &mut PhysicalMapping<KernelAcpiHandler, Facs>, vector: usize) {
+    facs.firmware_waking_vector = vector as u32; // 32-bit vector: firmware resumes here in real mode
+    facs.x_firmware_waking_vector = 0; // keep 0: a non-zero X vector requests a protected-mode wake, but the stub is 16-bit
+}
+
+fn write_pm1_control_block(
+    registers: &FixedRegisters<KernelAcpiHandler>,
+    sleep_type: SleepTypeData,
+) -> Result<(), SleepError> {
+    // Arms SLP_TYPx on PM1a/PM1b, flushes caches, then asserts SLP_EN (PM1a first, then PM1b).
+    let pm1 = &registers.pm1_control_registers;
+    let preserved = !(ACPI_SLP_TYP_MASK | ACPI_SLP_EN);
+
+    let current_a = pm1.pm1a.read().map_err(|_| SleepError::UnsupportedPmControl)? as u16;
+    let armed_a = (current_a & preserved) | sleep_type.a;
+    let armed_b = match &pm1.pm1b {
+        Some(pm1b) => {
+            let current_b = pm1b.read().map_err(|_| SleepError::UnsupportedPmControl)? as u16;
+            Some((current_b & preserved) | sleep_type.b)
+        }
+        None => None,
+    };
+
+    // Write #1: program the sleep type on both blocks with SLP_EN still clear.
+    pm1.pm1a.write(u64::from(armed_a)).map_err(|_| SleepError::UnsupportedPmControl)?;
+    if let (Some(pm1b), Some(value)) = (&pm1.pm1b, armed_b) {
+        pm1b.write(u64::from(value)).map_err(|_| SleepError::UnsupportedPmControl)?;
+    }
+
+    // SAFETY: WBINVD must finish before any SLP_EN write, because either block may power the package down.
+    unsafe {
+        core::arch::asm!("wbinvd", options(nostack, preserves_flags));
+    }
+
+    // Write #2: assert SLP_EN, PM1a before PM1b.
+    pm1.pm1a
+        .write(u64::from(armed_a | ACPI_SLP_EN))
+        .map_err(|_| SleepError::UnsupportedPmControl)?;
+    if let (Some(pm1b), Some(value)) = (&pm1.pm1b, armed_b) {
+        pm1b.write(u64::from(value | ACPI_SLP_EN)).map_err(|_| SleepError::UnsupportedPmControl)?;
+    }
+
+    Ok(())
+}
+
+#[unsafe(naked)]
+unsafe extern "sysv64" fn enter_sleep_raw(state: usize) -> usize {
+    core::arch::naked_asm!(
+        "mov rsi, rsp",
+        "jmp {inner}",
+        inner = sym enter_sleep_raw_inner,
+    );
+}
+
+extern "C" fn enter_sleep_raw_inner(state: usize, entry_rsp: usize) -> usize {
+    let state = match state {
+        3 => SleepState::S3,
+        5 => SleepState::S5,
+        _ => return SleepError::InvalidSleepObject.code(),
+    };
+
+    let (registers, mut facs, interpreter) = match load_interpreter() {
+        Ok(tuple) => tuple,
+        Err(error) => return error.code(),
+    };
+    let sleep_type = match sleep_type_data_from_interpreter(&interpreter, state) {
+        Ok(data) => data,
+        Err(error) => return error.code(),
+    };
+
+    let mut context = save_cpu_context(entry_rsp);
+    context.facs_address = facs.physical_start;
+    install_wake_trampoline(context.runtime_rsp, context.cr3);
+    set_firmware_waking_vector(&mut facs, WAKE_TRAMPOLINE_PHYS);
+
+    {
+        let mut saved = SAVED_CONTEXT.lock();
+        *saved = Some(context);
+    }
+
+    // SAFETY: Suspend entry must not be interrupted while the wake vector and PM1 control block are being armed.
+    unsafe {
+        interrupt::disable();
+    }
+
+    if let Err(error) = write_pm1_control_block(registers.as_ref(), sleep_type) {
+        return error.code();
+    }
+
+    // SAFETY: The final CLI+HLT sequence is the architectural handoff point after asserting SLP_EN.
+    unsafe {
+        core::arch::asm!("cli; hlt", options(nostack));
+    }
+
+    SleepError::SleepDidNotEnter.code()
+}
+
+extern "C" fn resume_from_s3_trampoline() -> ! {
+    // Runs on the wake path in long mode: restores the saved CPU state, then returns 0 to the
+    // caller of `enter_sleep_raw` by reloading the stack pointer captured at suspend entry.
+    let context = SAVED_CONTEXT.lock().take().expect("S3 wake trampoline resumed without saved CPU context");
+
+    // SAFETY: The saved FACS physical address was captured from the validated FADT during suspend entry.
+    if context.facs_address != 0 {
+        let facs_size = core::mem::size_of::<Facs>();
+        let mut facs = unsafe { KernelAcpiHandler.map_physical_region::<Facs>(context.facs_address, facs_size) };
+        set_firmware_waking_vector(&mut facs, 0);
+    }
+
+    // SAFETY: The wake trampoline already switched to the saved kernel CR3 and long mode, so the remaining restores are architectural register state only.
+    unsafe {
+        x86::msr::wrmsr(x86::msr::IA32_EFER, context.efer);
+        core::arch::asm!("mov cr3, {}", in(reg) context.cr3, options(nostack));
+        core::arch::asm!("mov cr4, {}", in(reg) context.cr4, options(nostack));
+        core::arch::asm!("mov cr2, {}", in(reg) context.cr2, options(nostack));
+        core::arch::asm!("mov cr0, {}", in(reg) context.cr0, options(nostack));
+        core::arch::asm!("lgdt [{}]", in(reg) &context.gdtr, options(nostack));
+        core::arch::asm!("lidt [{}]", in(reg) &context.idtr, options(nostack));
+        // LTR raises #GP on a TSS descriptor that is already marked busy, and the GDT kept in memory
+        // still carries the busy bit from the pre-suspend load; clear it (descriptor bit 41) first.
+        let gdt_base = context.gdtr.base as usize;
+        let tss_descriptor = (gdt_base + crate::arch::gdt::GDT_TSS as usize * 8) as *mut u64;
+        tss_descriptor.write_volatile(tss_descriptor.read_volatile() & !(1u64 << 41));
+        task::load_tr(SegmentSelector::new(crate::arch::gdt::GDT_TSS as u16, Ring::Ring0));
+
+        x86::msr::wrmsr(x86::msr::IA32_FS_BASE, context.fs_base);
+        x86::msr::wrmsr(x86::msr::IA32_GS_BASE, context.gs_base);
+        x86::msr::wrmsr(x86::msr::IA32_KERNEL_GSBASE, context.kernel_gs_base);
+    }
+
+    restore_fpu_state(&context);
+
+    // SAFETY: Returning with the original entry stack and RFLAGS completes the suspend call as a successful function return.
+    unsafe {
+        core::arch::asm!(
+            "mov rsp, {entry_rsp}",
+            "push {rflags}",
+            "popfq",
+            "xor eax, eax",
+            "ret",
+            entry_rsp = in(reg) context.entry_rsp,
+            rflags = in(reg) context.rflags,
+            options(noreturn)
+        );
+    }
+}
+
+pub fn enter_sleep_state(state: SleepState) -> core::result::Result<(), SleepError> {
+    #[cfg(not(target_arch = "x86_64"))]
+    {
+        let _ = state;
+        return Err(SleepError::UnsupportedArch);
+    }
+
+    #[cfg(target_arch = "x86_64")]
+    {
+        let raw = unsafe {
+            enter_sleep_raw(match state {
+                SleepState::S3 => 3,
+                SleepState::S5 => 5,
+            })
+        };
+        if raw == SLEEP_RETURN_OK {
+            Ok(())
+        } else {
+            Err(SleepError::from_code(raw))
+        }
+    }
+}
+
+pub fn available_sleep_states() -> &'static [u8] {
+    // S5 is always listed; S3 is listed only when its sleep-type package can be evaluated.
+    match sleep_type_data(SleepState::S3) {
+        Ok(_) => b"S3\nS5\n",
+        Err(_) => b"S5\n",
+    }
+}
+
+pub fn trigger_sleep_request(request: &str) -> Result<(), Error> {
+    match request.trim() {
+        "S3" => enter_sleep_state(SleepState::S3).map_err(|_| Error::new(EIO)),
+        "S5" => enter_sleep_state(SleepState::S5).map_err(|_| Error::new(EIO)),
+        _ => Err(Error::new(EINVAL)),
+    }
+}
@@ -82,6 +82,15 @@ extern "C" fn kstart() {
 /// The entry to Rust, all things must be initialized
 unsafe extern "C" fn start(args_ptr: *const KernelArgs, stack_end: usize) -> ! {
    unsafe {
+        // EARLY CANARY: write 'R' to COM1 before any kernel init.
+        // This proves the serial hardware works and the kernel reached Rust entry.
+        // If this character appears but "RedBear OS starting..." does not,
+        // the hang is in args_ptr.read(), serial::init(), or graphical_debug::init().
+        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+        {
+            core::arch::asm!("out dx, al", in("dx") 0x3F8u16, in("al") b'R', options(nostack, preserves_flags));
+        }
+
        let bootstrap = {
            let args = args_ptr.read();

@@ -91,27 +100,49 @@ unsafe extern "C" fn start(args_ptr: *const KernelArgs, stack_end: usize) -> ! {
            // Set up graphical debug
            graphical_debug::init(args.env());

-            info!("Redox OS starting...");
+            // SECOND CANARY: write 'S' to COM1 after serial init.
+            // If 'R' appears but 'S' does not, the hang is in serial::init() or graphical_debug::init().
+            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+            {
+                core::arch::asm!("out dx, al", in("dx") 0x3F8u16, in("al") b'S', options(nostack, preserves_flags));
+            }
+
+            info!("RedBear OS starting...");
            args.print();

+            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+            { core::arch::asm!("out dx, al", in("dx") 0x3F8u16, in("al") b'1', options(nostack, preserves_flags)); }
+
            // Set up GDT
            gdt::init_bsp(stack_end);

+            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+            { core::arch::asm!("out dx, al", in("dx") 0x3F8u16, in("al") b'2', options(nostack, preserves_flags)); }
+
            // Set up IDT
            idt::init_bsp();

+            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+            { core::arch::asm!("out dx, al", in("dx") 0x3F8u16, in("al") b'3', options(nostack, preserves_flags)); }
+
            // Initialize RMM
            #[cfg(target_arch = "x86")]
            crate::startup::memory::init(&args, Some(0x100000), Some(0x40000000));
            #[cfg(target_arch = "x86_64")]
            crate::startup::memory::init(&args, Some(0x100000), None);

+            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+            { core::arch::asm!("out dx, al", in("dx") 0x3F8u16, in("al") b'4', options(nostack, preserves_flags)); }
+
            // Initialize paging
            paging::init();

            #[cfg(target_arch = "x86_64")]
            crate::arch::alternative::early_init(true);

+            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+            { core::arch::asm!("out dx, al", in("dx") 0x3F8u16, in("al") b'5', options(nostack, preserves_flags)); }
+
            // Set up syscall instruction
            interrupt::syscall::init();

@@ -121,6 +152,9 @@ unsafe extern "C" fn start(args_ptr: *const KernelArgs, stack_end: usize) -> ! {
            // Activate memory logging
            crate::log::init();

+            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+            { core::arch::asm!("out dx, al", in("dx") 0x3F8u16, in("al") b'6', options(nostack, preserves_flags)); }
+
            // Initialize miscellaneous processor features
            #[cfg(target_arch = "x86_64")]
            crate::arch::misc::init(LogicalCpuId::BSP);
@@ -128,6 +162,9 @@ unsafe extern "C" fn start(args_ptr: *const KernelArgs, stack_end: usize) -> ! {
            // Initialize devices
            device::init();

+            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+            { core::arch::asm!("out dx, al", in("dx") 0x3F8u16, in("al") b'7', options(nostack, preserves_flags)); }
+
            // Read ACPI tables, starts APs
            if cfg!(feature = "acpi") {
                crate::acpi::init(args.acpi_rsdp());
@@ -0,0 +1,110 @@
+; ACPI S3 wake trampoline
+; compiled with nasm by build.rs, copied to physical 0x8000 before S3 entry
+
+ORG 0x8000
+SECTION .text
+USE16
+
+trampoline:
+    jmp short startup_wake
+    times 8 - ($ - trampoline) nop
+    .stack: dq 0
+    .page_table: dq 0
+    .code: dq 0
+
+startup_wake:
+    cli
+
+    xor ax, ax
+    mov ds, ax
+    mov es, ax
+    mov ss, ax
+    mov sp, 0
+
+    mov edi, [trampoline.page_table]
+    mov cr3, edi
+
+    mov eax, cr0
+    and al, 11110011b
+    or al, 00100010b
+    mov cr0, eax
+
+    mov eax, cr4
+    or eax, 1 << 9 | 1 << 7 | 1 << 5 | 1 << 4
+    mov cr4, eax
+
+    fninit
+
+    lgdt [gdtr]
+
+    mov ecx, 0xC0000080
+    rdmsr
+    or eax, 1 << 11 | 1 << 8
+    wrmsr
+
+    mov ebx, cr0
+    or ebx, 1 << 31 | 1 << 16 | 1
+    mov cr0, ebx
+
+    jmp gdt.kernel_code:long_mode_wake
+
+USE64
+long_mode_wake:
+    mov rax, gdt.kernel_data
+    mov ds, rax
+    mov es, rax
+    mov fs, rax
+    mov gs, rax
+    mov ss, rax
+
+    mov rsp, [trampoline.stack]
+    mov rax, [trampoline.code]
+    jmp rax
+
+struc GDTEntry
+    .limitl resw 1
+    .basel resw 1
+    .basem resb 1
+    .attribute resb 1
+    .flags__limith resb 1
+    .baseh resb 1
+endstruc
+
+attrib:
+    .present              equ 1 << 7
+    .user                 equ 1 << 4
+    .code                 equ 1 << 3
+    .writable             equ 1 << 1
+
+flags:
+    .long_mode equ 1 << 5
+
+gdtr:
+    dw gdt.end - 1 ; limit is the offset of the last valid byte, i.e. table size minus one
+    dq gdt
+
+gdt:
+.null equ $ - gdt
+    dq 0
+
+.kernel_code equ $ - gdt
+istruc GDTEntry
+    at GDTEntry.limitl, dw 0
+    at GDTEntry.basel, dw 0
+    at GDTEntry.basem, db 0
+    at GDTEntry.attribute, db attrib.present | attrib.user | attrib.code
+    at GDTEntry.flags__limith, db flags.long_mode
+    at GDTEntry.baseh, db 0
+iend
+
+.kernel_data equ $ - gdt
+istruc GDTEntry
+    at GDTEntry.limitl, dw 0
+    at GDTEntry.basel, dw 0
+    at GDTEntry.basem, db 0
+    at GDTEntry.attribute, db attrib.present | attrib.user | attrib.writable
+    at GDTEntry.flags__limith, db 0
+    at GDTEntry.baseh, db 0
+iend
+
+.end equ $ - gdt
@@ -4,16 +4,10 @@ use crate::{
    percpu::PercpuBlock,
    syscall::FloatRegisters,
 };
-use core::{mem::offset_of, ptr, sync::atomic::AtomicBool};
+use core::{mem::offset_of, ptr};
 use spin::Once;
 use syscall::{EnvRegisters, Result};

-/// This must be used by the kernel to ensure that context switches are done atomically
-/// Compare and exchange this to true when beginning a context switch on any CPU
-/// The `Context::switch_to` function will set it back to false, allowing other CPU's to switch
-/// This must be done, as no locks can be held on the stack during switch
-pub static CONTEXT_SWITCH_LOCK: AtomicBool = AtomicBool::new(false);
-
 // 512 bytes for registers, extra bytes for fpcr and fpsr
 pub const KFX_ALIGN: usize = 16;

@@ -2,13 +2,11 @@ use crate::{
    arch::interrupt::InterruptStack, context::context::Kstack, memory::RmmA, percpu::PercpuBlock,
    syscall::FloatRegisters,
 };
-use core::{mem::offset_of, sync::atomic::AtomicBool};
+use core::mem::offset_of;
 use rmm::{Arch, VirtualAddress};
 use spin::Once;
 use syscall::{error::*, EnvRegisters};

-pub static CONTEXT_SWITCH_LOCK: AtomicBool = AtomicBool::new(false);
-
 pub const KFX_ALIGN: usize = 16;

 #[derive(Clone, Debug, Default)]
@@ -1,4 +1,4 @@
-use core::{mem::offset_of, sync::atomic::AtomicBool};
+use core::mem::offset_of;
 use rmm::{Arch, VirtualAddress};
 use spin::Once;
 use syscall::{error::*, EnvRegisters};
@@ -14,12 +14,6 @@ use crate::{
    syscall::FloatRegisters,
 };

-/// This must be used by the kernel to ensure that context switches are done atomically
-/// Compare and exchange this to true when beginning a context switch on any CPU
-/// The `Context::switch_to` function will set it back to false, allowing other CPU's to switch
-/// This must be done, as no locks can be held on the stack during switch
-pub static CONTEXT_SWITCH_LOCK: AtomicBool = AtomicBool::new(false);
-
 const ST_RESERVED: u128 = 0xFFFF_FFFF_FFFF_0000_0000_0000_0000_0000;

 pub const KFX_ALIGN: usize = 16;
@@ -1,6 +1,5 @@
 use core::{
    ptr::{addr_of, addr_of_mut},
-    sync::atomic::AtomicBool,
 };

 use crate::syscall::FloatRegisters;
@@ -12,12 +11,6 @@ use spin::Once;
 use syscall::{error::*, EnvRegisters};
 use x86::msr;

-/// This must be used by the kernel to ensure that context switches are done atomically
-/// Compare and exchange this to true when beginning a context switch on any CPU
-/// The `Context::switch_to` function will set it back to false, allowing other CPU's to switch
-/// This must be done, as no locks can be held on the stack during switch
-pub static CONTEXT_SWITCH_LOCK: AtomicBool = AtomicBool::new(false);
-
 const ST_RESERVED: u128 = 0xFFFF_FFFF_FFFF_0000_0000_0000_0000_0000;

 #[cfg(cpu_feature_never = "xsave")]
@@ -148,6 +148,8 @@ pub struct Context {
    pub euid: u32,
    pub egid: u32,
    pub pid: usize,
+    /// Supplementary group IDs for access control decisions.
+    pub groups: Vec<u32>,

    // See [`PreemptGuard`]
    //
@@ -204,6 +206,7 @@ impl Context {
            euid: 0,
            egid: 0,
            pid: 0,
+            groups: Vec::new(),

            #[cfg(feature = "syscall_debug")]
            syscall_debug_info: crate::syscall::debug::SyscallDebugInfo::default(),
@@ -479,6 +482,7 @@ impl Context {
            uid: self.euid,
            gid: self.egid,
            pid: self.pid,
+            groups: self.groups.clone(),
        }
    }
 }
@@ -4,7 +4,7 @@ use crate::{
    event,
    scheme::{self, SchemeId},
    sync::{CleanLockToken, RwLock, L6},
-    syscall::error::Result,
+    syscall::error::{Error, Result, ESTALE},
 };
 use alloc::sync::Arc;
 use syscall::{schemev2::NewFdFlags, RwFlags, O_APPEND, O_NONBLOCK};
@@ -18,6 +18,7 @@ pub struct FileDescription {
    pub offset: u64,
    /// The scheme that this file refers to
    pub scheme: SchemeId,
+    pub scheme_generation: Option<u64>,
    /// The number the scheme uses to refer to this file
    pub number: usize,
    /// The flags passed to open or fcntl(SETFL)
@@ -32,6 +33,52 @@ bitflags! {
    }
 }
 impl FileDescription {
+    pub fn with_generation(
+        scheme: SchemeId,
+        scheme_generation: Option<u64>,
+        number: usize,
+        offset: u64,
+        flags: u32,
+        internal_flags: InternalFlags,
+    ) -> Self {
+        Self {
+            offset,
+            scheme,
+            scheme_generation,
+            number,
+            flags,
+            internal_flags,
+        }
+    }
+
+    pub fn new(
+        scheme: SchemeId,
+        number: usize,
+        offset: u64,
+        flags: u32,
+        internal_flags: InternalFlags,
+        token: &mut CleanLockToken,
+    ) -> Self {
+        Self::with_generation(
+            scheme,
+            Some(scheme::current_scheme_generation(token.token(), scheme)),
+            number,
+            offset,
+            flags,
+            internal_flags,
+        )
+    }
+
+    /// Resolve the scheme; `ESTALE` if the recorded generation is out of date.
+    pub fn get_scheme(&self, token: &mut CleanLockToken) -> Result<scheme::KernelSchemes> {
+        match self.scheme_generation {
+            Some(g) if g != scheme::current_scheme_generation(token.token(), self.scheme) => {
+                Err(Error::new(ESTALE))
+            }
+            _ => scheme::get_scheme(token.token(), self.scheme),
+        }
+    }
+
    pub fn rw_flags(&self, rw: RwFlags) -> u32 {
        let mut ret = self.flags & !(O_NONBLOCK | O_APPEND) as u32;
        if rw.contains(RwFlags::APPEND) {
@@ -76,7 +123,7 @@ impl FileDescription {
    pub fn try_close(self, token: &mut CleanLockToken) -> Result<()> {
        event::unregister_file(self.scheme, self.number, token);

-        let scheme = scheme::get_scheme(token.token(), self.scheme)?;
+        let scheme = self.get_scheme(token)?;

        scheme.close(self.number, token)
    }
@@ -85,12 +132,12 @@ impl FileDescription {
 impl FileDescriptor {
    pub fn close(self, token: &mut CleanLockToken) -> Result<()> {
        {
-            let (scheme_id, number, internal_flags) = {
+            let (desc, number, internal_flags) = {
                let desc = self.description.read(token.token());
-                (desc.scheme, desc.number, desc.internal_flags)
+                (*desc, desc.number, desc.internal_flags)
            };
            if internal_flags.contains(InternalFlags::NOTIFY_ON_NEXT_DETACH) {
-                let scheme = scheme::get_scheme(token.token(), scheme_id)?;
+                let scheme = desc.get_scheme(token)?;
                scheme.detach(number, token)?;
            }
        }
@@ -64,14 +64,13 @@ impl UnmapResult {
            return Ok(());
        };

-        let (scheme_id, number) = {
-            let desc = description.write(token.token());
-            (desc.scheme, desc.number)
+        let (scheme, number) = {
+            let desc = *description.read(token.token());
+            (desc.get_scheme(token)?, desc.number)
        };

-        let scheme_opt = scheme::get_scheme(token.token(), scheme_id);
-        let funmap_result = scheme_opt
-            .and_then(|scheme| scheme.kfunmap(number, base_offset, self.size, self.flags, token));
+        let funmap_result = scheme
+            .kfunmap(number, base_offset, self.size, self.flags, token);

        if let Ok(fd) = Arc::try_unwrap(description) {
            fd.into_inner().try_close(token)?;
@@ -2687,20 +2686,13 @@ fn correct_inner<'l>(
            // XXX: This is cheating, but guaranteed we won't deadlock because we've dropped addr_space_guard
            let mut token = unsafe { CleanLockToken::new() };

-            let (scheme_id, scheme_number) = {
-                let desc = &file_ref.description.read(token.token());
-                (desc.scheme, desc.number)
+            let desc = *file_ref.description.read(token.token());
+            let scheme = desc.get_scheme(&mut token).map_err(|_| PfError::Segv)?;
+            let scheme_number = desc.number;
+            let user_inner = match scheme {
+                KernelSchemes::User(user) => user.inner,
+                _ => return Err(PfError::Segv),
            };
-            let user_inner = scheme::get_scheme(token.token(), scheme_id)
-                .ok()
-                .and_then(|s| {
-                    if let KernelSchemes::User(user) = s {
-                        Some(user.inner)
-                    } else {
-                        None
-                    }
-                })
-                .ok_or(PfError::Segv)?;

            let offset = file_ref.base_offset as u64 + (pages_from_grant_start * PAGE_SIZE) as u64;
            user_inner
@@ -14,8 +14,8 @@ use crate::{
    memory::{RmmA, RmmArch, TableKind},
    percpu::PercpuBlock,
    sync::{
-        ArcRwLockWriteGuard, CleanLockToken, LockToken, Mutex, MutexGuard, RwLock, RwLockReadGuard,
-        RwLockWriteGuard, L0, L1, L2, L4,
+        ArcRwLockWriteGuard, CleanLockToken, LockToken, McsMutex, McsMutexGuard, Mutex,
+        MutexGuard, RwLock, RwLockReadGuard, RwLockWriteGuard, L0, L1, L2, L4,
    },
    syscall::error::Result,
 };
@@ -74,10 +74,12 @@ pub use self::arch::empty_cr3;
 // the context file descriptors.
 static CONTEXTS: RwLock<L2, BTreeSet<ContextRef>> = RwLock::new(BTreeSet::new());

-// Actual context store for the scheduler
-static RUN_CONTEXTS: Mutex<L1, RunContextData> = Mutex::new(RunContextData::new());
+// Actual context store for the scheduler — uses MCS fair spinlock to
+// eliminate cache-line bouncing under multi-CPU contention.
+static RUN_CONTEXTS: McsMutex<L1, RunContextData> = McsMutex::new(RunContextData::new());

-// Context that has been pushed out from RUN_CONTEXTS after being idle
+// Context that has been pushed out from RUN_CONTEXTS after being idle.
+// Uses regular Mutex (lower contention; wakeup_contexts uses try_lock).
 static IDLE_CONTEXTS: Mutex<L2, VecDeque<WeakContextRef>> = Mutex::new(VecDeque::new());

 pub struct RunContextData {
@@ -113,7 +115,7 @@ pub fn idle_contexts_try(
    IDLE_CONTEXTS.try_lock(token)
 }

-pub fn run_contexts(token: LockToken<'_, L0>) -> MutexGuard<'_, L1, RunContextData> {
+pub fn run_contexts(token: LockToken<'_, L0>) -> McsMutexGuard<'_, L1, RunContextData> {
    RUN_CONTEXTS.lock(token)
 }

@@ -15,7 +15,7 @@ use crate::{
 use alloc::{sync::Arc, vec::Vec};
 use core::{
    cell::{Cell, RefCell},
-    hint, mem,
+    mem,
    sync::atomic::Ordering,
 };
 use syscall::PtraceFlags;
@@ -26,6 +26,11 @@ enum UpdateResult {
    Blocked,
 }

+/// Default number of PIT ticks before triggering a context switch.
+/// At ~2.25 ms per tick, 3 ticks ≈ 6.75 ms timeslice.
+/// Configurable per-CPU via `ContextSwitchPercpu::preempt_interval`.
+const DEFAULT_PREEMPT_INTERVAL: usize = 3;
+
 // A simple geometric series where value[i] ~= value[i - 1] * 1.25
 const SCHED_PRIO_TO_WEIGHT: [usize; 40] = [
    88761, 71755, 56483, 46273, 36291, 29154, 23254, 18705, 14949, 11916, 9548, 7620, 6100, 4904,
@@ -90,13 +95,15 @@ struct SwitchResultInner {
 ///
 /// The function also calls the signal handler after switching contexts.
 pub fn tick(token: &mut CleanLockToken) {
-    let ticks_cell = &PercpuBlock::current().switch_internals.pit_ticks;
+    let percpu = PercpuBlock::current();
+    let ticks_cell = &percpu.switch_internals.pit_ticks;

    let new_ticks = ticks_cell.get() + 1;
    ticks_cell.set(new_ticks);

-    // Trigger a context switch after every 3 ticks (approx. 6.75 ms).
-    if new_ticks >= 3 {
+    // Trigger a context switch when the per-CPU preempt interval is reached.
+    let interval = percpu.switch_internals.preempt_interval.get();
+    if new_ticks >= interval {
        switch(token);
        crate::context::signal::signal_handler(token);
    }
@@ -120,7 +127,10 @@ pub unsafe extern "C" fn switch_finish_hook() {
                crate::arch::stop::emergency_reset();
            }
        }
-        arch::CONTEXT_SWITCH_LOCK.store(false, Ordering::SeqCst);
+        PercpuBlock::current()
+            .switch_internals
+            .in_context_switch
+            .set(false);
        crate::percpu::switch_arch_hook();
    }
 }
@@ -150,16 +160,15 @@ pub fn switch(token: &mut CleanLockToken) -> SwitchResult {
    //set PIT Interrupt counter to 0, giving each process same amount of PIT ticks
    percpu.switch_internals.pit_ticks.set(0);

-    // Acquire the global lock to ensure exclusive access during context switch and avoid
-    // issues that would be caused by the unsafe operations below
-    // TODO: Better memory orderings?
-    while arch::CONTEXT_SWITCH_LOCK
-        .compare_exchange_weak(false, true, Ordering::SeqCst, Ordering::Relaxed)
-        .is_err()
-    {
-        hint::spin_loop();
-        percpu.maybe_handle_tlb_shootdown();
-    }
+    // Acquire the per-CPU context switch flag. Each CPU can only be in one context
+    // switch at a time. The per-context write locks provide cross-CPU safety; this
+    // flag catches re-entrant switches on the same CPU (a kernel bug).
+    debug_assert!(
+        !percpu.switch_internals.in_context_switch.get(),
+        "context switch re-entry on CPU {}",
+        percpu.cpu_id
+    );
+    percpu.switch_internals.in_context_switch.set(true);

    // Lock the previous context.
    let prev_context_lock = crate::context::current();
@@ -167,8 +176,8 @@ pub fn switch(token: &mut CleanLockToken) -> SwitchResult {
    let mut prev_context_guard = unsafe { prev_context_lock.write_arc() };

    if !prev_context_guard.is_preemptable() {
-        // Unset global lock
-        arch::CONTEXT_SWITCH_LOCK.store(false, Ordering::SeqCst);
+        // Unset per-CPU context switch flag
+        percpu.switch_internals.in_context_switch.set(false);

        // Pretend to have finished switching, so CPU is not idled
        return SwitchResult::Switched;
@@ -292,8 +301,8 @@ pub fn switch(token: &mut CleanLockToken) -> SwitchResult {
            SwitchResult::Switched
        }
        _ => {
-            // No target was found, unset global lock and return
-            arch::CONTEXT_SWITCH_LOCK.store(false, Ordering::SeqCst);
+            // No target was found, unset per-CPU context switch flag and return
+            percpu.switch_internals.in_context_switch.set(false);

            percpu.stats.set_state(cpu_stats::CpuState::Idle);

@@ -352,6 +361,7 @@ fn wakeup_contexts(token: &mut CleanLockToken, switch_time: u128) -> Vec<(usize,
 }

 /// This is the scheduler function which currently utilises Deficit Weighted Round Robin Scheduler
+/// with NUMA-aware context selection preference.
 fn select_next_context(
    token: &mut CleanLockToken,
    percpu: &PercpuBlock,
@@ -377,6 +387,10 @@ fn select_next_context(
    let total_contexts: usize = contexts_list.iter().map(|q| q.len()).sum();
    let mut skipped_contexts = 0;

+    // NUMA-aware selection: remember cross-node fallback candidate.
+    let my_numa_node = percpu.numa_node.get();
+    let mut cross_node_fallback: Option<(usize, ArcContextLockWriteGuard)> = None;
+
    'priority: loop {
        i = (i + 1) % 40;
        total_iters += 1;
@@ -441,9 +455,44 @@ fn select_next_context(
            // Is this context runnable on this CPU?
            let sw = unsafe { update_runnable(&mut next_context_guard, cpu_id, switch_time) };
            if let UpdateResult::CanSwitch = sw {
-                next_context_guard_opt = Some(next_context_guard);
-                balance[i] -= SCHED_PRIO_TO_WEIGHT[20];
-                break 'priority;
+                // NUMA-aware selection: compare the node of the context's recorded CPU with ours.
+                let same_node = if my_numa_node != u8::MAX {
+                    next_context_guard.cpu_id
+                        .map(|cid| {
+                            crate::percpu::get_for_cpu(cid)
+                                .map(|p| p.numa_node.get() == my_numa_node)
+                                .unwrap_or(false)
+                        })
+                        .unwrap_or(true) // `cpu_id` is None — treat as same node
+                } else {
+                    true // No NUMA info — treat all as same node
+                };
+
+                if same_node {
+                    // Same node (or no NUMA info): select immediately
+                    percpu.current_prio.set(next_context_guard.prio);
+                    next_context_guard_opt = Some(next_context_guard);
+                    balance[i] -= SCHED_PRIO_TO_WEIGHT[20];
+                    break 'priority;
+                } else {
+                    // Cross-node: keep the first candidate as a fallback; keep scanning.
+                    if cross_node_fallback.is_none() {
+                        // NOTE(review): `next_context_ref` is not re-queued here — verify.
+                        cross_node_fallback =
+                            Some((next_context_guard.prio, next_context_guard));
+                        balance[i] -= SCHED_PRIO_TO_WEIGHT[20];
+                        // NOTE(review): balance stays charged if a same-node one wins — verify.
+                        continue;
+                    } else {
+                        // Already have a cross-node fallback; push this one back
+                        contexts.push_back(next_context_ref);
+                        skipped_contexts += 1;
+                        if skipped_contexts >= total_contexts {
+                            break 'priority;
+                        }
+                        continue;
+                    }
+                }
            } else {
                if matches!(sw, UpdateResult::Blocked) {
                    idle_contexts(token.token()).push_back(next_context_ref);
@@ -458,6 +507,15 @@ fn select_next_context(
            }
        }
    }
+
+    // If we found a cross-node fallback but no same-node context, use it
+    if next_context_guard_opt.is_none() {
+        if let Some((prio, guard)) = cross_node_fallback {
+            percpu.current_prio.set(prio);
+            next_context_guard_opt = Some(guard);
+        }
+    }
+
    percpu.balance.set(balance);
    percpu.last_queue.set(i);

@@ -465,7 +523,10 @@ fn select_next_context(
        // Send the old process to the back of the line (if it is still runnable)
        let prev_ctx = WeakContextRef(Arc::downgrade(&prev_context_lock));
        if prev_context_guard.status.is_runnable() {
-            let prio = prev_context_guard.prio;
+            let raw_prio = prev_context_guard.prio;
+            let prio = percpu.effective_prio(raw_prio);
+            // Clear PI donation — previous context is being re-queued
+            percpu.pi_donated_prio.store(u32::MAX, Ordering::Relaxed);
            contexts_list[prio].push_back(prev_ctx);
        } else {
            idle_contexts(token.token()).push_back(prev_ctx);
@@ -477,7 +538,8 @@ fn select_next_context(
        return Ok(Some(next_context_guard));
    } else {
        if !was_idle && !Arc::ptr_eq(&prev_context_lock, &idle_context) {
-            // We switch into the idle context
+            // Switching to idle context — cache lowest priority
+            percpu.current_prio.set(39);
            Ok(Some(unsafe { idle_context.write_arc() }))
        } else {
            // We found no other process to run.
@@ -494,6 +556,13 @@ pub struct ContextSwitchPercpu {
    switch_result: Cell<Option<SwitchResultInner>>,
    switch_time: Cell<u128>,
    pit_ticks: Cell<usize>,
+    /// Per-CPU context switch flag. Set to true during a context switch on this CPU.
+    /// Replaced the global CONTEXT_SWITCH_LOCK to eliminate cross-CPU serialization.
+    in_context_switch: Cell<bool>,
+    /// Number of PIT ticks before triggering a context switch.
+    /// Default: 3 (≈6.75 ms). Lower values improve interactive responsiveness;
+    /// higher values improve throughput for batch/compute workloads.
+    preempt_interval: Cell<usize>,

    current_ctxt: RefCell<Option<Arc<ContextLock>>>,

@@ -508,6 +577,8 @@ impl ContextSwitchPercpu {
            switch_result: Cell::new(None),
            switch_time: Cell::new(0),
            pit_ticks: Cell::new(0),
+            in_context_switch: Cell::new(false),
+            preempt_interval: Cell::new(DEFAULT_PREEMPT_INTERVAL),
            current_ctxt: RefCell::new(None),
            idle_ctxt: RefCell::new(None),
            being_sigkilled: Cell::new(false),
@@ -42,17 +42,18 @@ impl core::fmt::Display for LogicalCpuId {
 }

 #[cfg(target_pointer_width = "64")]
-pub const MAX_CPU_COUNT: u32 = 128;
+pub const MAX_CPU_COUNT: u32 = 256;

 #[cfg(target_pointer_width = "32")]
 pub const MAX_CPU_COUNT: u32 = 32;

 const SET_WORDS: usize = (MAX_CPU_COUNT / usize::BITS) as usize;

-// TODO: Support more than 128 CPUs.
+// TODO: Support more than 256 CPUs.
 // The maximum number of CPUs on Linux is configurable, and the type for LogicalCpuSet and
 // LogicalCpuId may be optimized accordingly. In that case, box the mask if it's larger than some
-// base size (probably 256 bytes).
+// base size (probably 256 bytes). 256 covers single-socket parts such as AMD EPYC (128C/256T)
+// and Threadripper PRO (96C/192T); multi-socket systems can exceed it.
 #[derive(Debug)]
 pub struct LogicalCpuSet([AtomicUsize; SET_WORDS]);

@@ -1,5 +1,5 @@
 use alloc::sync::Arc;
-use core::sync::atomic::{AtomicUsize, Ordering};
+use core::sync::atomic::{AtomicU64, AtomicUsize, Ordering};
 use hashbrown::{hash_map::DefaultHashBuilder, HashMap};
 use smallvec::SmallVec;
 use syscall::data::GlobalSchemes;
@@ -23,6 +23,7 @@ int_like!(EventQueueId, AtomicEventQueueId, usize, AtomicUsize);
 pub struct EventQueue {
    id: EventQueueId,
    queue: WaitQueue<Event>,
+    pub eventfd: Option<(AtomicU64, bool)>, // (counter, semaphore_mode)
 }

 impl EventQueue {
@@ -30,6 +31,15 @@ impl EventQueue {
        EventQueue {
            id,
            queue: WaitQueue::new(),
+            eventfd: None,
+        }
+    }
+
+    pub fn new_eventfd(id: EventQueueId, initval: u64, semaphore: bool) -> EventQueue {
+        EventQueue {
+            id,
+            queue: WaitQueue::new(),
+            eventfd: Some((AtomicU64::new(initval), semaphore)),
        }
    }

@@ -70,6 +70,9 @@ mod log;
 /// Memory management
 mod memory;

+/// NUMA topology
+mod numa;
+
 /// Panic
 mod panic;

@@ -0,0 +1,122 @@
+//! NUMA topology hints for the kernel scheduler.
+//!
+//! NUMA discovery (SRAT/SLIT parsing) is performed during kernel ACPI init
+//! (`acpi::init()`). The kernel stores a lightweight copy for cheap scheduling
+//! lookups. If no SRAT is found, `init_default()` creates a single-node topology.
+//!
+//! The topology is built locally and published exactly once through a
+//! `spin::Once`, so readers never see a partially filled table and no
+//! `static mut` is required.
+use crate::acpi::srat;
+use crate::cpu_set::{LogicalCpuId, LogicalCpuSet};
+use core::sync::atomic::AtomicBool;
+use spin::Once;
+
+/// Number of node slots; SRAT nodes with a larger index are ignored.
+const MAX_NUMA_NODES: usize = 8;
+
+/// CPUs belonging to one NUMA node.
+#[derive(Debug)]
+pub struct NumaHint {
+    pub node_id: u8,
+    pub cpus: LogicalCpuSet,
+}
+
+/// Per-node CPU sets, indexed by node ID.
+pub struct NumaTopology {
+    pub nodes: [Option<NumaHint>; MAX_NUMA_NODES],
+    /// `true` once one of the `init_*` functions has published the topology.
+    pub initialized: AtomicBool,
+}
+
+impl NumaTopology {
+    /// Create an empty, uninitialized topology.
+    pub const fn new() -> Self {
+        const NONE: Option<NumaHint> = None;
+        Self {
+            nodes: [NONE; MAX_NUMA_NODES],
+            initialized: AtomicBool::new(false),
+        }
+    }
+
+    /// Return the ID of the first node whose CPU set contains `cpu`, if any.
+    pub fn node_for_cpu(&self, cpu: LogicalCpuId) -> Option<u8> {
+        self.nodes
+            .iter()
+            .flatten()
+            .find(|node| node.cpus.contains(cpu))
+            .map(|node| node.node_id)
+    }
+
+    /// Whether both CPUs map to the same node. Two CPUs that are both absent
+    /// from every node compare equal (`None == None`).
+    pub fn same_node(&self, cpu1: LogicalCpuId, cpu2: LogicalCpuId) -> bool {
+        self.node_for_cpu(cpu1) == self.node_for_cpu(cpu2)
+    }
+}
+
+/// Placeholder returned by `topology()` before initialization has run.
+static EMPTY_TOPOLOGY: NumaTopology = NumaTopology::new();
+
+/// The published topology; written at most once.
+static NUMA_TOPOLOGY: Once<NumaTopology> = Once::new();
+
+/// Return the current topology, or an empty uninitialized one if neither
+/// `init_from_srat` nor `init_default` has completed yet.
+pub fn topology() -> &'static NumaTopology {
+    NUMA_TOPOLOGY.get().unwrap_or(&EMPTY_TOPOLOGY)
+}
+
+/// Build the fallback topology: node 0 containing every CPU.
+fn single_node() -> NumaTopology {
+    let mut topo = NumaTopology::new();
+    topo.nodes[0] = Some(NumaHint { node_id: 0, cpus: LogicalCpuSet::all() });
+    topo.initialized = AtomicBool::new(true);
+    topo
+}
+
+/// Initialize NUMA topology from SRAT data parsed during ACPI init.
+///
+/// `apic_ids` pairs each APIC ID with its logical CPU ID. Does nothing if a
+/// topology has already been published. Falls back to a single node when no
+/// SRAT is available or when no CPU could be placed on a node.
+pub fn init_from_srat(apic_ids: &[(u32, LogicalCpuId)]) {
+    NUMA_TOPOLOGY.call_once(|| {
+        if !srat::is_available() {
+            debug!("NUMA: single-node topology (no SRAT)");
+            return single_node();
+        }
+
+        let mut topo = NumaTopology::new();
+        for &(apic_id, cpu_id) in apic_ids {
+            let Some(node) = srat::numa_node_for_apic(apic_id) else {
+                continue;
+            };
+            // Nodes beyond the fixed table are silently skipped.
+            if let Some(slot) = topo.nodes.get_mut(node as usize) {
+                slot.get_or_insert_with(|| NumaHint {
+                    node_id: node,
+                    cpus: LogicalCpuSet::empty(),
+                })
+                .cpus
+                .atomic_set(cpu_id);
+            }
+        }
+        if topo.nodes.iter().all(Option::is_none) {
+            topo = single_node();
+        }
+        topo.initialized = AtomicBool::new(true);
+
+        let node_count = topo.nodes.iter().flatten().count();
+        debug!("NUMA: {node_count} node(s) from SRAT");
+        topo
+    });
+}
+
+/// Fallback: single-node topology. Does nothing if already initialized.
+pub fn init_default() {
+    NUMA_TOPOLOGY.call_once(|| {
+        debug!("NUMA: single-node topology (no SRAT)");
+        single_node()
+    });
+}
@@ -4,9 +4,14 @@ use alloc::{
 };
 use core::{
    cell::{Cell, RefCell},
-    sync::atomic::{AtomicBool, AtomicPtr, Ordering},
+    hint,
+    sync::atomic::{AtomicBool, AtomicPtr, AtomicU32, AtomicU64, Ordering},
 };

+/// Maximum number of pages to flush individually using INVLPG before falling
+/// back to a full TLB flush (CR3 reload).
+const TLB_RANGE_THRESHOLD: u32 = 32;
+
 use rmm::Arch;
 use syscall::PtraceFlags;

@@ -16,7 +21,7 @@ use crate::{
    cpu_set::{LogicalCpuId, MAX_CPU_COUNT},
    cpu_stats::{CpuStats, CpuStatsData},
    ptrace::Session,
-    sync::CleanLockToken,
+    sync::{mcs::McsNode, mcs::McsRawLock, CleanLockToken},
    syscall::debug::SyscallDebugInfo,
 };

@@ -34,6 +39,38 @@ pub struct PercpuBlock {
    pub balance: Cell<[usize; 40]>,
    pub last_queue: Cell<usize>,

+    /// Per-CPU MCS node for the scheduler run-queue lock (RUN_CONTEXTS).
+    pub mcs_sched_node: McsNode,
+
+    /// Counts how many times the scheduler MCS lock acquisition was contended.
+    pub mcs_contention_count: Cell<u64>,
+
+    /// TLB shootdown range: start virtual address (page-aligned).
+    /// Set to 0 for a full flush. Only valid when `wants_tlb_shootdown` is true.
+    pub tlb_flush_start: AtomicU64,
+    /// TLB shootdown range: number of pages to invalidate.
+    pub tlb_flush_count: AtomicU32,
+
+    /// Priority inheritance donation. When another CPU is blocked waiting on a
+    /// lock this CPU holds, the blocked CPU may donate its priority here.
+    /// `u32::MAX` means no donation; otherwise it's a priority level (0-39).
+    pub pi_donated_prio: AtomicU32,
+
+    /// Cached priority of the currently-running context on this CPU.
+    /// Set by the scheduler when selecting a new context. Read by the MCS
+    /// lock during priority donation — avoids acquiring the context RwLock
+    /// from the spin loop. Default 39 (lowest priority).
+    pub current_prio: Cell<usize>,
+
+    /// NUMA proximity domain for this CPU. Set during ACPI init from SRAT.
+    /// `u8::MAX` means unknown (no SRAT or APIC ID not listed).
+    pub numa_node: Cell<u8>,
+
+    /// Pointer to the MCS lock this CPU is currently spinning on (for transitive PI).
+    /// `null` when not waiting on any lock. Set in McsRawLock::acquire() before
+    /// entering the spin loop, cleared upon acquisition.
+    pub waiting_on_lock: AtomicPtr<McsRawLock>,
+
    // TODO: Put mailbox queues here, e.g. for TLB shootdown? Just be sure to 128-byte align it
    // first to avoid cache invalidation.
    pub profiling: Option<&'static crate::profiling::RingBuffer>,
@@ -57,6 +94,15 @@ pub unsafe fn init_tlb_shootdown(id: LogicalCpuId, block: *mut PercpuBlock) {
    ALL_PERCPU_BLOCKS[id.get() as usize].store(block, Ordering::Release)
 }

+/// Look up another CPU's `PercpuBlock` by logical CPU ID.
+/// Returns `None` if the ID is out of range or no block has been registered.
+pub fn get_for_cpu(id: LogicalCpuId) -> Option<&'static PercpuBlock> {
+    let slot = ALL_PERCPU_BLOCKS.get(id.get() as usize)?;
+    // SAFETY: assumes a registered pointer stays valid for the kernel's
+    // lifetime, mirroring the other `as_ref` readers of this table — TODO confirm.
+    unsafe { slot.load(Ordering::Acquire).as_ref() }
+}
+
 pub fn get_all_stats() -> Vec<(LogicalCpuId, CpuStatsData)> {
    let mut res = ALL_PERCPU_BLOCKS
        .iter()
@@ -101,25 +147,148 @@ pub fn shootdown_tlb_ipi(target: Option<LogicalCpuId>) {
                core::hint::spin_loop();
            }
        }
+        // Full flush — clear range info. NOTE(review): these stores follow the flag
+        // swap, so a handler polling the flag may still see the previous range — confirm.
+        percpublock.tlb_flush_start.store(0, Ordering::Release);
+        percpublock.tlb_flush_count.store(0, Ordering::Release);

        crate::ipi::ipi_single(crate::ipi::IpiKind::Tlb, percpublock);
    } else {
+        // Broadcast TLB shootdown: set flag on all other CPUs, then send a single
+        // IPI with "all except self" destination shorthand instead of N individual IPIs.
+        let my_percpublock = PercpuBlock::current();
        for id in 0..crate::cpu_count() {
-            // TODO: Optimize: use global counter and percpu ack counters, send IPI using
-            // destination shorthand "all CPUs".
-            shootdown_tlb_ipi(Some(LogicalCpuId::new(id)));
+            let target_id = LogicalCpuId::new(id);
+            if target_id == my_percpublock.cpu_id {
+                continue;
+            }
+            let Some(percpublock) = (unsafe {
+                ALL_PERCPU_BLOCKS[id as usize]
+                    .load(Ordering::Acquire)
+                    .as_ref()
+            }) else {
+                continue;
+            };
+            // Wait if this CPU still has a pending shootdown from a previous request
+            #[expect(clippy::bool_comparison)]
+            while percpublock
+                .wants_tlb_shootdown
+                .swap(true, Ordering::Release)
+                == true
+            {
+                while percpublock.wants_tlb_shootdown.load(Ordering::Relaxed) == true {
+                    my_percpublock.maybe_handle_tlb_shootdown();
+                    hint::spin_loop();
+                }
+            }
+            // Full flush — clear range info (Release ordering)
+            percpublock.tlb_flush_start.store(0, Ordering::Release);
+            percpublock.tlb_flush_count.store(0, Ordering::Release);
        }
+        // Single broadcast IPI to all other CPUs using destination shorthand
+        crate::ipi::ipi(crate::ipi::IpiKind::Tlb, crate::ipi::IpiTarget::Other);
+    }
+}
+
+/// Range-based TLB shootdown IPI: asks the target CPU(s) to invalidate `count` pages starting
+/// at `start` (rounded down to 4 KiB) one by one, or to do a full flush when `count` is 0 or
+/// above `TLB_RANGE_THRESHOLD`. `None` addresses every other CPU; `Some` must not be this CPU.
+pub fn shootdown_tlb_ipi_range(target: Option<LogicalCpuId>, start: usize, count: usize) {
+    if cfg!(not(feature = "multi_core")) {
+        return;
+    }
+
+    let start_aligned = start as u64 & !0xFFF;
+    // Decide on the full-width `count`: truncating to u32 before the check would let a
+    // huge count whose low 32 bits are small be treated as a short range.
+    let use_range = count > 0 && count <= TLB_RANGE_THRESHOLD as usize;
+    // Zero start/count is what the handler treats as a full flush.
+    let range_start = if use_range { start_aligned } else { 0 };
+    let range_count = if use_range { count as u32 } else { 0 };
+    // NOTE(review): the range is stored only after `wants_tlb_shootdown` has been raised, so
+    // a target that polls the flag in between could act on a stale range — confirm/fix.
+    let set_range = |percpublock: &PercpuBlock| {
+        percpublock.tlb_flush_start.store(range_start, Ordering::Release);
+        percpublock.tlb_flush_count.store(range_count, Ordering::Release);
+    };
+
+    if let Some(target) = target {
+        let my_percpublock = PercpuBlock::current();
+        assert_ne!(target, my_percpublock.cpu_id);
+
+        let Some(percpublock) = (unsafe {
+            ALL_PERCPU_BLOCKS[target.get() as usize]
+                .load(Ordering::Acquire)
+                .as_ref()
+        }) else {
+            return;
+        };
+        #[expect(clippy::bool_comparison)]
+        while percpublock.wants_tlb_shootdown.swap(true, Ordering::Release) == true {
+            while percpublock.wants_tlb_shootdown.load(Ordering::Relaxed) == true {
+                my_percpublock.maybe_handle_tlb_shootdown();
+                hint::spin_loop();
+            }
+        }
+        set_range(percpublock);
+        crate::ipi::ipi_single(crate::ipi::IpiKind::Tlb, percpublock);
+    } else {
+        let my_percpublock = PercpuBlock::current();
+        for id in 0..crate::cpu_count() {
+            let target_id = LogicalCpuId::new(id);
+            if target_id == my_percpublock.cpu_id {
+                continue;
+            }
+            let Some(percpublock) = (unsafe {
+                ALL_PERCPU_BLOCKS[id as usize]
+                    .load(Ordering::Acquire)
+                    .as_ref()
+            }) else {
+                continue;
+            };
+            #[expect(clippy::bool_comparison)]
+            while percpublock.wants_tlb_shootdown.swap(true, Ordering::Release) == true {
+                while percpublock.wants_tlb_shootdown.load(Ordering::Relaxed) == true {
+                    my_percpublock.maybe_handle_tlb_shootdown();
+                    hint::spin_loop();
+                }
+            }
+            set_range(percpublock);
+        }
+        crate::ipi::ipi(crate::ipi::IpiKind::Tlb, crate::ipi::IpiTarget::Other);
    }
 }
 impl PercpuBlock {
+    /// Scheduling priority to use for a context, after priority inheritance.
+    /// Priorities are 0-39 with lower meaning more urgent. A donated value is
+    /// used only when it is numerically below `context_prio`; the "no donation"
+    /// marker `u32::MAX` can never be, so it leaves `context_prio` unchanged.
+    pub fn effective_prio(&self, context_prio: usize) -> usize {
+        match self.pi_donated_prio.load(Ordering::Relaxed) {
+            boosted if boosted < context_prio as u32 => boosted as usize,
+            _ => context_prio,
+        }
+    }
+
    pub fn maybe_handle_tlb_shootdown(&self) {
        #[expect(clippy::bool_comparison)]
        if self.wants_tlb_shootdown.swap(false, Ordering::Relaxed) == false {
            return;
        }

-        // TODO: Finer-grained flush
-        crate::memory::RmmA::invalidate_all();
+        let start = self.tlb_flush_start.load(Ordering::Acquire);
+        let count = self.tlb_flush_count.load(Ordering::Acquire);
+
+        if start != 0 && count > 0 && count <= TLB_RANGE_THRESHOLD {
+            // Range-based flush using INVLPG per page — cheaper than full CR3 reload.
+            for i in 0..count {
+                let addr = start + (i as u64) * 4096;
+                crate::memory::RmmA::invalidate(rmm::VirtualAddress::new(addr as usize));
+            }
+        } else {
+            // Full TLB flush (CR3 reload) for large ranges or global shootdowns.
+            crate::memory::RmmA::invalidate_all();
+        }

        if let Some(addrsp) = &*self.current_addrsp.borrow() {
            addrsp.tlb_ack.fetch_add(1, Ordering::Release);
@@ -189,6 +358,14 @@ impl PercpuBlock {
            wants_tlb_shootdown: AtomicBool::new(false),
            balance: Cell::new([0; 40]),
            last_queue: Cell::new(39),
+            mcs_sched_node: McsNode::new(),
+            mcs_contention_count: Cell::new(0),
+            tlb_flush_start: AtomicU64::new(0),
+            tlb_flush_count: AtomicU32::new(0),
+            pi_donated_prio: AtomicU32::new(u32::MAX),
+            current_prio: Cell::new(39),
+            numa_node: Cell::new(u8::MAX),
+            waiting_on_lock: AtomicPtr::new(core::ptr::null_mut()),
            ptrace_flags: Cell::new(PtraceFlags::empty()),
            ptrace_session: RefCell::new(None),
            inside_syscall: Cell::new(false),
@@ -10,6 +10,7 @@ use syscall::{

 use crate::{
    acpi::{RxsdtEnum, RXSDT_ENUM},
+    arch::sleep,
    context::file::InternalFlags,
    event,
    sync::{CleanLockToken, RwLock, WaitCondition, L1},
@@ -40,6 +41,7 @@ enum HandleKind {
    TopLevel,
    Rxsdt,
    ShutdownPipe,
+    SleepControl,
    SchemeRoot,
 }

@@ -146,11 +148,11 @@ impl KernelScheme for AcpiScheme {
        if flags & O_EXCL == O_EXCL || flags & O_SYMLINK == O_SYMLINK {
            return Err(Error::new(EINVAL));
        }
-        if flags & O_ACCMODE != O_RDONLY && flags & O_STAT != O_STAT {
-            return Err(Error::new(EROFS));
-        }
        let (handle_kind, int_flags) = match path {
            "" => {
+                if flags & O_ACCMODE != O_RDONLY && flags & O_STAT != O_STAT {
+                    return Err(Error::new(EROFS));
+                }
                if flags & O_DIRECTORY != O_DIRECTORY && flags & O_STAT != O_STAT {
                    return Err(Error::new(EISDIR));
                }
@@ -158,17 +160,36 @@ impl KernelScheme for AcpiScheme {
                (HandleKind::TopLevel, InternalFlags::POSITIONED)
            }
            "rxsdt" => {
+                if flags & O_ACCMODE != O_RDONLY && flags & O_STAT != O_STAT {
+                    return Err(Error::new(EROFS));
+                }
                if flags & O_DIRECTORY == O_DIRECTORY && flags & O_STAT != O_STAT {
                    return Err(Error::new(ENOTDIR));
                }
                (HandleKind::Rxsdt, InternalFlags::POSITIONED)
            }
            "kstop" => {
+                if flags & O_ACCMODE != O_RDONLY && flags & O_STAT != O_STAT {
+                    return Err(Error::new(EROFS));
+                }
                if flags & O_DIRECTORY == O_DIRECTORY && flags & O_STAT != O_STAT {
                    return Err(Error::new(ENOTDIR));
                }
                (HandleKind::ShutdownPipe, InternalFlags::empty())
            }
+            "sleep" => {
+                // Unlike the read-only entries above, `sleep` may also be opened for
+                // writing; only an unrecognised access mode is rejected.
+                let access = flags & O_ACCMODE;
+                let stat_only = flags & O_STAT == O_STAT;
+                let known_access = access == O_RDONLY
+                    || access == syscall::flag::O_WRONLY
+                    || access == syscall::flag::O_RDWR;
+                if !stat_only && !known_access {
+                    return Err(Error::new(EINVAL));
+                }
+                if flags & O_DIRECTORY == O_DIRECTORY && !stat_only {
+                    return Err(Error::new(ENOTDIR));
+                }
+                (HandleKind::SleepControl, InternalFlags::POSITIONED)
+            }
            _ => return Err(Error::new(ENOENT)),
        };

@@ -191,6 +212,7 @@ impl KernelScheme for AcpiScheme {
        Ok(match handle.kind {
            HandleKind::Rxsdt => DATA.get().ok_or(Error::new(EBADFD))?.len() as u64,
            HandleKind::ShutdownPipe => 1,
+            HandleKind::SleepControl => sleep::available_sleep_states().len() as u64,
            HandleKind::TopLevel => 0,
            HandleKind::SchemeRoot => return Err(Error::new(EBADF))?,
        })
@@ -253,6 +275,7 @@ impl KernelScheme for AcpiScheme {

                return dst_buf.copy_exactly(&[0x42]).map(|()| 1);
            }
+            HandleKind::SleepControl => sleep::available_sleep_states(),
            HandleKind::Rxsdt => DATA.get().ok_or(Error::new(EBADFD))?,
            HandleKind::TopLevel => return Err(Error::new(EISDIR)),
            HandleKind::SchemeRoot => return Err(Error::new(EBADF)),
@@ -295,11 +318,45 @@ impl KernelScheme for AcpiScheme {
                kind: DirentKind::Socket,
                name: "kstop",
                inode: 0,
+                next_opaque_id: 2,
+            })?;
+        }
+        if opaque <= 2 {
+            buf.entry(DirEntry {
+                kind: DirentKind::Regular,
+                name: "sleep",
+                inode: 0,
                next_opaque_id: u64::MAX,
            })?;
        }
        Ok(buf.finalize())
    }
+    /// Write handler: only non-stat `sleep` handles are writable (else `EBADF`).
+    /// At most 16 bytes are read, decoded as UTF-8 (`EINVAL` on failure) and
+    /// passed to `sleep::trigger_sleep_request`; returns the bytes consumed.
+    fn kwrite(
+        &self,
+        id: usize,
+        buf: crate::syscall::usercopy::UserSliceRo,
+        _flags: u32,
+        _stored_flags: u32,
+        token: &mut CleanLockToken,
+    ) -> Result<usize> {
+        let handle = *HANDLES.read(token.token()).get(id)?;
+
+        // Stat-only handles and every kind other than the sleep control file
+        // are rejected with the same error.
+        if handle.stat || !matches!(handle.kind, HandleKind::SleepControl) {
+            return Err(Error::new(EBADF));
+        }
+
+        let mut request_bytes = [0_u8; 16];
+        let copied = buf.copy_common_bytes_to_slice(&mut request_bytes)?;
+        let request = core::str::from_utf8(&request_bytes[..copied])
+            .map_err(|_| Error::new(EINVAL))?;
+        sleep::trigger_sleep_request(request)?;
+        Ok(copied)
+    }
    fn kfpath(&self, _id: usize, buf: UserSliceWo, _token: &mut CleanLockToken) -> Result<usize> {
        //TODO: construct useful path?
        buf.copy_common_bytes_from_slice("/scheme/kernel.acpi/".as_bytes())
@@ -328,6 +385,11 @@ impl KernelScheme for AcpiScheme {
                st_size: 1,
                ..Default::default()
            },
+            HandleKind::SleepControl => Stat {
+                st_mode: MODE_FILE,
+                st_size: sleep::available_sleep_states().len().try_into().unwrap_or(u64::MAX),
+                ..Default::default()
+            },
            HandleKind::SchemeRoot => return Err(Error::new(EBADF)),
        })?;

@@ -22,9 +22,10 @@ struct Handle {

 static HANDLES: RwLock<L1, HandleMap<Handle>> = RwLock::new(HandleMap::new());

-/// Add to the input queue
+/// Queue one input byte, mapping CR to NL (ICRNL) for serial console compatibility.
 pub fn debug_input(data: u8, token: &mut CleanLockToken) {
-    INPUT.send(data, token);
+    let byte = match data {
+        b'\r' => b'\n',
+        other => other,
+    };
+    INPUT.send(byte, token);
 }

 // Notify readers of input updates
@@ -106,12 +107,16 @@ impl KernelScheme for DebugScheme {
    fn fevent(
        &self,
        id: usize,
-        _flags: EventFlags,
+        flags: EventFlags,
        token: &mut CleanLockToken,
    ) -> Result<EventFlags> {
        let _handle = *HANDLES.read(token.token()).get(id)?;

-        Ok(EventFlags::empty())
+        // NOTE(review): EVENT_READ is reported as ready whenever it is requested,
+        // without consulting the input queue — confirm this is intended.
+        let wants_read = flags.contains(EventFlags::EVENT_READ);
+        let ready = if wants_read {
+            EventFlags::EVENT_READ
+        } else {
+            EventFlags::empty()
+        };
+        Ok(ready)
    }

    fn fsync(&self, id: usize, token: &mut CleanLockToken) -> Result<()> {
@@ -1,4 +1,5 @@
 use alloc::sync::Arc;
+use core::sync::atomic::Ordering;
 use syscall::{EventFlags, O_NONBLOCK};

 use crate::{
@@ -25,12 +26,25 @@ impl KernelScheme for EventScheme {
    fn kopenat(
        &self,
        id: usize,
-        _user_buf: StrOrBytes,
+        user_buf: StrOrBytes,
        _flags: usize,
        _fcntl_flags: u32,
        _ctx: CallerCtx,
        token: &mut CleanLockToken,
    ) -> Result<OpenResult> {
+        let path = match &user_buf {
+            StrOrBytes::Str(s) => s,
+            StrOrBytes::Bytes(b) => core::str::from_utf8(b).unwrap_or(""),
+        };
+        if path.starts_with("eventfd/") {
+            let rest = &path[8..]; // after "eventfd/"
+            let mut parts = rest.split('/');
+            let initval: u64 = parts.next().and_then(|s| s.parse().ok()).unwrap_or(0);
+            let sem: bool = parts.next().and_then(|s| s.parse().ok()).unwrap_or(false);
+            let id = next_queue_id();
+            queues_mut(token.token()).insert(id, Arc::new(EventQueue::new_eventfd(id, initval, sem)));
+            return Ok(OpenResult::SchemeLocal(id.get(), InternalFlags::empty()));
+        }
        if id != SCHEME_ROOT_ID {
            return Err(Error::new(EACCES));
        }
@@ -67,6 +81,31 @@ impl KernelScheme for EventScheme {
            handle.clone()
        };

+        if let Some((ref counter, semaphore)) = queue.eventfd {
+            // Semaphore mode takes one unit per read and reports 1; otherwise the
+            // whole counter is drained and its previous value is reported.
+            // `fetch_update` retries on contention, so a racing reader or writer
+            // cannot cause a spurious EAGAIN the way a single compare_exchange could.
+            let taken = counter
+                .fetch_update(Ordering::AcqRel, Ordering::Acquire, |current| match current {
+                    0 => None,
+                    _ if semaphore => Some(current - 1),
+                    _ => Some(0),
+                })
+                .map(|previous| if semaphore { 1 } else { previous });
+            return match taken {
+                Ok(value) => {
+                    // Safe byte view of the value; no raw-pointer slice needed.
+                    buf.copy_from_slice(&u64::to_ne_bytes(value))?;
+                    Ok(8)
+                }
+                // Blocking waits are not implemented for eventfd in the kernel, so an
+                // empty counter reports EAGAIN whether or not O_NONBLOCK is set,
+                // instead of handing a zero count to a blocking reader.
+                Err(_) => Err(Error::new(EAGAIN)),
+            };
+        }
+
        queue.read(buf, flags & O_NONBLOCK as u32 == 0, token)
    }

@@ -85,6 +124,19 @@ impl KernelScheme for EventScheme {
            let handle = handles.get(&id).ok_or(Error::new(EBADF))?;
            handle.clone()
        };
+
+        if let Some((ref counter, _semaphore)) = queue.eventfd {
+            if buf.len() < 8 {
+                return Err(Error::new(EINVAL));
+            }
+            let mut bytes = [0u8; 8];
+            buf.copy_to_slice(&mut bytes)?;
+            let val = u64::from_ne_bytes(bytes);
+            if val == u64::MAX {
+                return Err(Error::new(EINVAL));
+            }
+            // A plain fetch_add would wrap the counter on overflow and lose pending
+            // events. Keep the counter below u64::MAX and report EAGAIN when the
+            // addition does not fit (blocking waits are not implemented here).
+            return counter
+                .fetch_update(Ordering::AcqRel, Ordering::Acquire, |current| {
+                    current.checked_add(val).filter(|&sum| sum < u64::MAX)
+                })
+                .map(|_| 8)
+                .map_err(|_| Error::new(EAGAIN));
+        }
+
        let mut events_written = 0;

        for chunk in buf.in_exact_chunks(size_of::<Event>()) {
@@ -18,6 +18,9 @@ use syscall::{
 use crate::context::file::InternalFlags;

 use super::{CallerCtx, HandleMap, OpenResult, SchemeExt, StrOrBytes};
+#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+use crate::arch::device::{ioapic, local_apic::ApicId};
+
 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
 use crate::arch::interrupt::{available_irqs_iter, irq::acknowledge, is_reserved, set_reserved};
 #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))]
@@ -56,8 +59,11 @@ const INO_AVAIL: u64 = 0x8000_0000_0000_0000;
 const INO_BSP: u64 = 0x8001_0000_0000_0000;
 const INO_PHANDLE: u64 = 0x8003_0000_0000_0000;

-/// Add to the input queue
+/// Add to the input queue, with iommu validation gate for MSI vectors
 pub fn irq_trigger(irq: u8, token: &mut CleanLockToken) {
+    if irq >= 16 && !iommu_validate_msi_irq(irq) {
+        return;
+    }
    COUNTS.lock()[irq as usize] += 1;
    let fds: SmallVec<[usize; 8]> = {
        HANDLES
@@ -77,16 +83,17 @@ pub fn irq_trigger(irq: u8, token: &mut CleanLockToken) {
 #[allow(dead_code)]
 enum Handle {
    SchemeRoot,
-    Irq { ack: AtomicUsize, irq: u8 },
+    Irq { ack: AtomicUsize, irq: u8, cpu_id: LogicalCpuId },
    Avail(LogicalCpuId),
    TopLevel,
    Phandle(u8, Vec<u8>),
    Bsp,
+    IrqAffinity { irq: u8, mask: AtomicUsize },
 }
 impl Handle {
    fn as_irq_handle(&self) -> Option<(&AtomicUsize, u8)> {
        match self {
-            &Self::Irq { ref ack, irq } => Some((ack, irq)),
+            &Self::Irq { ref ack, irq, cpu_id: _ } => Some((ack, irq)),
            _ => None,
        }
    }
@@ -140,6 +147,7 @@ impl IrqScheme {
                    Handle::Irq {
                        ack: AtomicUsize::new(0),
                        irq: irq_number,
+                        cpu_id: LogicalCpuId::BSP,
                    },
                    InternalFlags::empty(),
                )
@@ -158,6 +166,7 @@ impl IrqScheme {
                    Handle::Irq {
                        ack: AtomicUsize::new(0),
                        irq: irq_number,
+                        cpu_id,
                    },
                    InternalFlags::empty(),
                )
@@ -199,6 +208,7 @@ impl IrqScheme {
                    Handle::Irq {
                        ack: AtomicUsize::new(0),
                        irq: irq_number as u8,
+                        cpu_id: LogicalCpuId::new(0),
                    },
                    InternalFlags::empty(),
                )
@@ -214,6 +224,14 @@ const fn vector_to_irq(vector: u8) -> u8 {
    vector - 32
 }

+/// Whether `vector` lies in the range 32..=0xEE.
+const fn msi_vector_is_valid(vector: u8) -> bool {
+    matches!(vector, 32..=0xEE)
+}
+
+/// Gate consulted by `irq_trigger` for IRQ numbers >= 16.
+///
+/// Currently accepts every IRQ unconditionally.
+/// NOTE(review): looks like a placeholder for real IOMMU validation — confirm.
+fn iommu_validate_msi_irq(_irq: u8) -> bool {
+    true
+}
+
 impl crate::scheme::KernelScheme for IrqScheme {
    fn scheme_root(&self, token: &mut CleanLockToken) -> Result<usize> {
        let id = HANDLES.write(token.token()).insert(Handle::SchemeRoot);
@@ -280,7 +298,21 @@ impl crate::scheme::KernelScheme for IrqScheme {
                    InternalFlags::POSITIONED,
                )
            } else if let Some(path_str) = path_str.strip_prefix('/') {
-                Self::open_ext_irq(flags, LogicalCpuId::new(cpu_id.into()), path_str)?
+                let (irq_str, affinity) = path_str
+                    .trim_end_matches('/')
+                    .rsplit_once('/')
+                    .map(|(a, b)| (a, Some(b)))
+                    .unwrap_or((path_str.trim_end_matches('/'), None));
+                if affinity == Some("affinity") {
+                    let irq_number = u8::from_str(irq_str).or(Err(Error::new(ENOENT)))?;
+                    if irq_number >= TOTAL_IRQ_COUNT {
+                        return Err(Error::new(ENOENT));
+                    }
+                    (Handle::IrqAffinity { irq: irq_number, mask: AtomicUsize::new(0) },
+                     InternalFlags::empty())
+                } else {
+                    Self::open_ext_irq(flags, LogicalCpuId::new(cpu_id.into()), path_str)?
+                }
            } else {
                return Err(Error::new(ENOENT));
            }
@@ -307,12 +339,20 @@ impl crate::scheme::KernelScheme for IrqScheme {
            }
            #[cfg(not(dtb))]
            panic!("")
+        } else if let Some(rest) = path_str.strip_suffix("/affinity") {
+            let irq_number = u8::from_str(rest).or(Err(Error::new(ENOENT)))?;
+            if irq_number >= TOTAL_IRQ_COUNT {
+                return Err(Error::new(ENOENT));
+            }
+            (Handle::IrqAffinity { irq: irq_number, mask: AtomicUsize::new(0) },
+             InternalFlags::empty())
        } else if let Ok(plain_irq_number) = u8::from_str(path_str) {
            if plain_irq_number < BASE_IRQ_COUNT {
                (
                    Handle::Irq {
                        ack: AtomicUsize::new(0),
                        irq: plain_irq_number,
+                        cpu_id: LogicalCpuId::BSP,
                    },
                    InternalFlags::empty(),
                )
@@ -368,6 +408,7 @@ impl crate::scheme::KernelScheme for IrqScheme {
                }
            }
            Handle::Avail(cpu_id) => {
+                let mut listed = 0;
                for vector in available_irqs_iter(cpu_id).skip(opaque) {
                    let irq = vector_to_irq(vector);
                    if cpu_id == LogicalCpuId::BSP && irq < BASE_IRQ_COUNT {
@@ -381,7 +422,9 @@ impl crate::scheme::KernelScheme for IrqScheme {
                        name: &intermediate,
                        next_opaque_id: u64::from(vector) + 1,
                    })?;
+                    listed += 1;
                }
+                info!("irq getdents Avail: cpu_id={} opaque={} listed={}", cpu_id.get(), opaque, listed);
            }
            _ => return Err(Error::new(ENOTDIR)),
        }
@@ -416,11 +459,14 @@ impl crate::scheme::KernelScheme for IrqScheme {
        let handle = handles_guard.get(id)?;

        if let &Handle::Irq {
-            irq: handle_irq, ..
+            irq: handle_irq,
+            cpu_id: handle_cpu_id,
+            ..
        } = handle
            && handle_irq > BASE_IRQ_COUNT
        {
-            set_reserved(LogicalCpuId::BSP, irq_to_vector(handle_irq), false);
+            info!("irq close: unreserving vector {} on cpu_id={}", irq_to_vector(handle_irq), handle_cpu_id.get());
+            set_reserved(handle_cpu_id, irq_to_vector(handle_irq), false);
        }
        Ok(())
    }
@@ -436,9 +482,32 @@ impl crate::scheme::KernelScheme for IrqScheme {
        let handle = handles_guard.get(file)?;

        match handle {
+            &Handle::IrqAffinity { irq: _handle_irq, ref mask } => {
+                if buffer.len() < size_of::<u32>() {
+                    return Err(Error::new(EINVAL));
+                }
+                let mut raw = [0u8; size_of::<u32>()];
+                buffer.copy_to_slice(&mut raw)?;
+                let cpu_id = u32::from_ne_bytes(raw);
+                let cpus = CPUS.get().ok_or(Error::new(EIO))?;
+                // Range-checked conversion: a plain `as u8` cast wraps, which let a
+                // value such as 256 pass validation as CPU 0 while the unwrapped
+                // value was still the one programmed below.
+                let known_cpu = u8::try_from(cpu_id).is_ok_and(|cpu| cpus.contains(&cpu));
+                if !known_cpu {
+                    return Err(Error::new(EINVAL));
+                }
+                // Reprogram the IOAPIC redirection entry for x86 targets.
+                // Non-IOAPIC IRQs (e.g. MSI) will return false -> EIO.
+                #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
+                {
+                    if !unsafe { ioapic::set_affinity(_handle_irq, ApicId::new(cpu_id)) } {
+                        return Err(Error::new(EIO));
+                    }
+                }
+                // Remember the CPU most recently written to this handle.
+                mask.store(cpu_id as usize, Ordering::Release);
+                Ok(size_of::<u32>())
+            }
            &Handle::Irq {
                irq: handle_irq,
                ack: ref handle_ack,
+                cpu_id: _,
            } => {
                if buffer.len() < size_of::<usize>() {
                    return Err(Error::new(EINVAL));
@@ -475,6 +544,15 @@ impl crate::scheme::KernelScheme for IrqScheme {
                st_nlink: 1,
                ..Default::default()
            },
+            Handle::IrqAffinity { irq, .. } => Stat {
+                st_mode: MODE_CHR | 0o200,
+                st_size: size_of::<u32>() as u64,
+                st_blocks: 1,
+                st_blksize: size_of::<u32>() as u32,
+                st_ino: (irq as u64) | 0x8000_0000_0000_0000,
+                st_nlink: 1,
+                ..Default::default()
+            },
            Handle::Bsp => Stat {
                st_mode: MODE_CHR | 0o400,
                st_size: size_of::<usize>() as u64,
@@ -516,8 +594,9 @@ impl crate::scheme::KernelScheme for IrqScheme {

        let scheme_path = match handle {
            Handle::Irq { irq, .. } => format!("irq:{}", irq),
+            Handle::IrqAffinity { irq, .. } => format!("irq:{}/affinity", irq),
            Handle::Bsp => "irq:bsp".to_owned(),
-            Handle::Avail(cpu_id) => format!("irq:cpu-{:2x}", cpu_id.get()),
+            Handle::Avail(cpu_id) => format!("irq:cpu-{:02x}", cpu_id.get()),
            Handle::Phandle(phandle, _) => format!("irq:phandle-{}", phandle),
            Handle::TopLevel => "irq:".to_owned(),
            _ => return Err(Error::new(EBADF)),
@@ -543,6 +622,7 @@ impl crate::scheme::KernelScheme for IrqScheme {
            Handle::Irq {
                irq: handle_irq,
                ack: ref handle_ack,
+                cpu_id: _,
            } => {
                if buffer.len() < size_of::<usize>() {
                    return Err(Error::new(EINVAL));
@@ -562,7 +642,7 @@ impl crate::scheme::KernelScheme for IrqScheme {
                buffer.write_u32(LogicalCpuId::BSP.get())?;
                Ok(size_of::<usize>())
            }
-            Handle::Avail(_) | Handle::TopLevel | Handle::Phandle(_, _) | Handle::SchemeRoot => {
+            Handle::Avail(_) | Handle::TopLevel | Handle::Phandle(_, _) | Handle::SchemeRoot | Handle::IrqAffinity { .. } => {
                Err(Error::new(EISDIR))
            }
        }
@@ -14,7 +14,7 @@ use alloc::{
 };
 use core::{
    str,
-    sync::atomic::{AtomicUsize, Ordering},
+    sync::atomic::{AtomicU64, AtomicUsize, Ordering},
 };
 use hashbrown::hash_map::{self, DefaultHashBuilder, HashMap};
 use spin::Once;
@@ -169,6 +169,7 @@ enum Handle {

 /// Schemes list
 static HANDLES: Once<RwLock<L1, HashMap<SchemeId, Handle>>> = Once::new();
+static SCHEME_GENERATIONS: Once<RwLock<L1, HashMap<SchemeId, AtomicU64>>> = Once::new();
 static SCHEME_LIST_NEXT_ID: AtomicUsize = AtomicUsize::new(MAX_GLOBAL_SCHEMES);
 static SCHEME_LIST_ID: AtomicUsize = AtomicUsize::new(0);

@@ -204,6 +205,10 @@ fn init_schemes() -> RwLock<L1, HashMap<SchemeId, Handle>> {
    RwLock::new(handles)
 }

+/// Build the initially empty scheme-generation table used to initialise
+/// `SCHEME_GENERATIONS`.
+fn init_scheme_generations() -> RwLock<L1, HashMap<SchemeId, AtomicU64>> {
+    RwLock::new(HashMap::new())
+}
+
 /// Get a handle to a scheme.
 pub fn get_scheme(token: LockToken<'_, L0>, scheme_id: SchemeId) -> Result<KernelSchemes> {
    match handles().read(token).get(&scheme_id) {
@@ -212,10 +217,33 @@ pub fn get_scheme(token: LockToken<'_, L0>, scheme_id: SchemeId) -> Result<Kerne
    }
 }

+/// Return the generation counter recorded for `scheme_id`, or 0 when the
+/// scheme has no entry in the generation table.
+pub fn current_scheme_generation(token: LockToken<'_, L0>, scheme_id: SchemeId) -> u64 {
+    let generations = scheme_generations().read(token);
+    match generations.get(&scheme_id) {
+        Some(generation) => generation.load(Ordering::Acquire),
+        None => 0,
+    }
+}
+
 fn handles<'a>() -> &'a RwLock<L1, HashMap<SchemeId, Handle>> {
    HANDLES.call_once(init_schemes)
 }

+/// Access the global scheme-generation table, initialising it on first use.
+fn scheme_generations<'a>() -> &'a RwLock<L1, HashMap<SchemeId, AtomicU64>> {
+    SCHEME_GENERATIONS.call_once(init_scheme_generations)
+}
+
+/// Bump the generation counter of `scheme_id`, creating it with the value 1
+/// when the scheme has no entry yet.
+fn increment_scheme_generation(scheme_id: SchemeId, token: &mut CleanLockToken) {
+    scheme_generations()
+        .write(token.token())
+        .entry(scheme_id)
+        .and_modify(|generation| {
+            generation.fetch_add(1, Ordering::AcqRel);
+        })
+        .or_insert_with(|| AtomicU64::new(1));
+}
+
 /// Scheme list type
 pub struct SchemeList;

@@ -260,9 +288,14 @@ impl SchemeList {

    /// Remove a scheme
    fn remove(&self, id: usize, token: &mut CleanLockToken) {
-        let scheme = handles().write(token.token()).remove(&SchemeId(id));
+        let scheme_id = SchemeId(id);
+        let scheme = handles().write(token.token()).remove(&scheme_id);

        assert!(scheme.is_some());
+        if let Some(Handle::Scheme(KernelSchemes::User(user))) = scheme.as_ref() {
+            user.inner.fail_pending_calls(token);
+        }
+        increment_scheme_generation(scheme_id, token);
        if let Some(Handle::Scheme(KernelSchemes::User(user))) = scheme
            && let Some(user) = Arc::into_inner(user.inner)
        {
@@ -287,32 +320,32 @@ impl KernelScheme for SchemeList {
        token: &mut CleanLockToken,
    ) -> Result<OpenResult> {
        let scheme_id = SchemeId(scheme_id);
-        match handles()
-            .read(token.token())
-            .get(&scheme_id)
-            .ok_or(Error::new(EBADF))?
-        {
-            Handle::Scheme(KernelSchemes::User(UserScheme { inner })) => {
-                let inner = inner.clone();
-                assert!(scheme_id == inner.scheme_id);
-                let scheme = scheme_id;
-                let params = unsafe { user_buf.read_exact::<NewFdParams>()? };
-
-                return Ok(OpenResult::External(Arc::new(RwLock::new(
-                    FileDescription {
-                        scheme,
-                        number: params.number,
-                        offset: params.offset,
-                        flags: params.flags as u32,
-                        internal_flags: InternalFlags::from_extra0(params.internal_flags)
-                            .ok_or(Error::new(EINVAL))?,
-                    },
-                ))));
+        let maybe_inner = {
+            let handles = handles().read(token.token());
+            match handles.get(&scheme_id).ok_or(Error::new(EBADF))? {
+                Handle::Scheme(KernelSchemes::User(UserScheme { inner })) => Some(inner.clone()),
+                Handle::SchemeCreationCapability => None,
+                _ => return Err(Error::new(EBADF)),
            }
-            Handle::SchemeCreationCapability => (),
-            _ => return Err(Error::new(EBADF)),
        };

+        if let Some(inner) = maybe_inner {
+            assert!(scheme_id == inner.scheme_id);
+            let params = unsafe { user_buf.read_exact::<NewFdParams>()? };
+
+            return Ok(OpenResult::External(Arc::new(RwLock::new(
+                FileDescription::new(
+                    scheme_id,
+                    params.number,
+                    params.offset,
+                    params.flags as u32,
+                    InternalFlags::from_extra0(params.internal_flags)
+                        .ok_or(Error::new(EINVAL))?,
+                    token,
+                ),
+            ))));
+        }
+
        const EXPECTED: &[u8] = b"create-scheme";
        let mut buf = [0u8; EXPECTED.len()];

@@ -777,6 +810,7 @@ pub struct CallerCtx {
    pub pid: usize,
    pub uid: u32,
    pub gid: u32,
+    pub groups: alloc::vec::Vec<u32>,
 }
 impl CallerCtx {
    pub fn filter_uid_gid(self, euid: u32, egid: u32) -> Self {
@@ -785,6 +819,7 @@ impl CallerCtx {
                pid: self.pid,
                uid: euid,
                gid: egid,
+                groups: self.groups,
            }
        } else {
            self
@@ -1,5 +1,10 @@
-use alloc::{collections::VecDeque, sync::Arc, vec::Vec};
-use core::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
+use alloc::{
+    collections::VecDeque,
+    string::{String, ToString},
+    sync::Arc,
+    vec::Vec,
+};
+use core::sync::atomic::{AtomicUsize, Ordering};

 use syscall::{data::GlobalSchemes, CallFlags};

@@ -14,67 +19,228 @@ use crate::{
    sync::{CleanLockToken, Mutex, RwLock, WaitCondition, L1},
    syscall::{
        data::Stat,
-        error::{Error, Result, EAGAIN, EBADF, EINTR, EINVAL, ENOENT, EPIPE},
-        flag::{EventFlags, EVENT_READ, EVENT_WRITE, MODE_FIFO, O_NONBLOCK},
+        error::{
+            Error, Result, EAGAIN, EBADF, EEXIST, EINVAL, EINTR, ENOENT, ENOTDIR, EPIPE,
+        },
+        flag::{
+            EventFlags, EVENT_READ, EVENT_WRITE, MODE_FIFO, O_ACCMODE, O_DIRECTORY,
+            O_NONBLOCK, O_RDONLY, O_RDWR, O_STAT, O_WRONLY,
+        },
        usercopy::{UserSliceRo, UserSliceRw, UserSliceWo},
    },
 };

 use super::{CallerCtx, KernelScheme, OpenResult, SchemeExt, StrOrBytes};

-// TODO: Preallocate a number of scheme IDs, since there can only be *one* root namespace, and
-// therefore only *one* pipe scheme.
-static PIPE_NEXT_ID: AtomicUsize = AtomicUsize::new(0);
+static PIPE_NEXT_ID: AtomicUsize = AtomicUsize::new(1);

+#[derive(Clone)]
 enum Handle {
-    Pipe(Arc<Pipe>),
+    Endpoint(EndpointHandle),
    SchemeRoot,
 }

-// TODO: SLOB?
-static PIPES: RwLock<L1, HashMap<usize, Handle>> =
+/// Access direction of one pipe endpoint.
+#[derive(Clone, Copy, Eq, PartialEq)]
+enum EndpointKind {
+    Read,
+    Write,
+    ReadWrite,
+}
+
+impl EndpointKind {
+    /// True for every kind except the write-only one.
+    fn can_read(self) -> bool {
+        !matches!(self, Self::Write)
+    }
+
+    /// True for every kind except the read-only one.
+    fn can_write(self) -> bool {
+        !matches!(self, Self::Read)
+    }
+}
+
+/// One open endpoint of a pipe, as stored in `HANDLES`.
+#[derive(Clone)]
+struct EndpointHandle {
+    /// The pipe this endpoint is attached to (shared between its endpoints).
+    pipe: Arc<Pipe>,
+    /// Whether this endpoint may read, write, or both.
+    kind: EndpointKind,
+    /// The named pipe this endpoint was opened through; `None` for endpoints
+    /// created by `pipe()`.
+    named: Option<Arc<NamedPipe>>,
+}
+
+/// Registry entry for a named pipe (FIFO), stored in `NAMED_PIPES`.
+struct NamedPipe {
+    /// Display path recorded when the pipe was created.
+    path: String,
+    /// Mode value recorded when the pipe was created.
+    mode: u16,
+    /// Pipe currently backing this name; `open_named_pipe` creates one when it
+    /// finds `None`.
+    active: Mutex<L1, Option<Arc<Pipe>>>,
+}
+
+static HANDLES: RwLock<L1, HashMap<usize, Handle>> =
+    RwLock::new(HashMap::with_hasher(DefaultHashBuilder::new()));
+static NAMED_PIPES: RwLock<L1, HashMap<String, Arc<NamedPipe>>> =
    RwLock::new(HashMap::with_hasher(DefaultHashBuilder::new()));

 const MAX_QUEUE_SIZE: usize = 65536;

-// In almost all places where Rust (and LLVM) uses pointers, they are limited to nonnegative isize,
-// so this is fine.
-const WRITE_NOT_READ_BIT: usize = 1;
+fn next_id() -> usize {
+    PIPE_NEXT_ID.fetch_add(1, Ordering::Relaxed)
+}

-fn from_raw_id(id: usize) -> (bool, usize) {
-    (id & WRITE_NOT_READ_BIT != 0, id & !WRITE_NOT_READ_BIT)
+/// Translate the access-mode bits of `flags` into an `EndpointKind`;
+/// any other access mode yields `EINVAL`.
+fn endpoint_kind_from_flags(flags: usize) -> Result<EndpointKind> {
+    let access = flags & O_ACCMODE;
+    if access == O_RDONLY {
+        Ok(EndpointKind::Read)
+    } else if access == O_WRONLY {
+        Ok(EndpointKind::Write)
+    } else if access == O_RDWR {
+        Ok(EndpointKind::ReadWrite)
+    } else {
+        Err(Error::new(EINVAL))
+    }
+}
+
+/// Check open flags for a named FIFO: `ENOTDIR` when a directory is requested
+/// without `O_STAT`, otherwise whatever `endpoint_kind_from_flags` reports.
+fn validate_named_fifo_open(flags: usize) -> Result<()> {
+    let wants_directory = flags & O_DIRECTORY == O_DIRECTORY;
+    let stat_only = flags & O_STAT == O_STAT;
+    if wants_directory && !stat_only {
+        return Err(Error::new(ENOTDIR));
+    }
+
+    endpoint_kind_from_flags(flags).map(|_| ())
+}
+
+/// Trigger `flags` on every open endpoint of `pipe` that satisfies the
+/// requested capabilities (`require_read` / `require_write`).
+fn trigger_matching(
+    pipe: &Arc<Pipe>,
+    require_read: bool,
+    require_write: bool,
+    flags: EventFlags,
+    token: &mut CleanLockToken,
+) {
+    // Collect the ids first so the HANDLES read guard is released before any
+    // event is triggered.
+    let mut matching_ids = Vec::new();
+    {
+        let handles = HANDLES.read(token.token());
+        for (id, handle) in handles.iter() {
+            let Handle::Endpoint(endpoint) = handle else {
+                continue;
+            };
+            let same_pipe = Arc::ptr_eq(&endpoint.pipe, pipe);
+            let read_ok = !require_read || endpoint.kind.can_read();
+            let write_ok = !require_write || endpoint.kind.can_write();
+            if same_pipe && read_ok && write_ok {
+                matching_ids.push(*id);
+            }
+        }
+    }
+
+    for id in matching_ids {
+        event::trigger(GlobalSchemes::Pipe.scheme_id(), id, flags, token);
+    }
+}
+
+/// Register a new endpoint of `pipe` in `HANDLES` and return its handle id.
+///
+/// The pipe's reader/writer counters are incremented according to `kind`
+/// before the handle is inserted.
+fn open_endpoint(
+    pipe: Arc<Pipe>,
+    kind: EndpointKind,
+    named: Option<Arc<NamedPipe>>,
+    token: &mut CleanLockToken,
+) -> usize {
+    // A read-write endpoint counts as both a reader and a writer.
+    if kind.can_read() {
+        pipe.reader_count.fetch_add(1, Ordering::SeqCst);
+    }
+    if kind.can_write() {
+        pipe.writer_count.fetch_add(1, Ordering::SeqCst);
+    }
+
+    let id = next_id();
+    HANDLES.write(token.token()).insert(
+        id,
+        Handle::Endpoint(EndpointHandle { pipe, kind, named }),
+    );
+    id
+}
+
+/// If `pipe` is the last reference to its `Pipe`, consume the pipe and drop
+/// both of its wait conditions via `into_drop`; otherwise do nothing.
+fn drop_wait_conditions_if_possible(pipe: Arc<Pipe>, token: &mut CleanLockToken) {
+    let Some(sole_owner) = Arc::into_inner(pipe) else {
+        return;
+    };
+    sole_owner.read_condition.into_drop(token);
+    sole_owner.write_condition.into_drop(token);
 }

 pub fn pipe(token: &mut CleanLockToken) -> Result<(usize, usize)> {
-    // Bit 0 is used for WRITE_NOT_READ_BIT
-    let id = PIPE_NEXT_ID.fetch_add(2, Ordering::Relaxed);
+    let pipe = Arc::new(Pipe::new());
+    let read_id = open_endpoint(Arc::clone(&pipe), EndpointKind::Read, None, token);
+    let write_id = open_endpoint(pipe, EndpointKind::Write, None, token);

-    PIPES.write(token.token()).insert(
-        id,
-        Handle::Pipe(Arc::new(Pipe {
-            queue: Mutex::new(VecDeque::new()),
-            read_condition: WaitCondition::new(),
-            write_condition: WaitCondition::new(),
-            writer_is_alive: AtomicBool::new(true),
-            reader_is_alive: AtomicBool::new(true),
-            has_run_dup: AtomicBool::new(false),
-            fd_queue: Mutex::new(VecDeque::new()),
-        })),
-    );
+    Ok((read_id, write_id))
+}

-    Ok((id, id | WRITE_NOT_READ_BIT))
+/// Report whether a named pipe is currently registered under `path`.
+pub fn named_pipe_exists(path: &str, token: &mut CleanLockToken) -> bool {
+    NAMED_PIPES.read(token.token()).contains_key(path)
+}
+
+/// Create a named pipe registered under `path` and open its first endpoint.
+///
+/// `display_path` and `mode` are recorded in the registry entry. Returns the
+/// handle id of the new endpoint.
+///
+/// Errors: `ENOTDIR`/`EINVAL` for invalid open flags, `EEXIST` when `path` is
+/// already registered.
+pub fn create_named_pipe(
+    path: &str,
+    display_path: &str,
+    mode: u16,
+    flags: usize,
+    token: &mut CleanLockToken,
+) -> Result<usize> {
+    validate_named_fifo_open(flags)?;
+    // Cannot fail after the validation above; resolved up front so nothing can
+    // error out once the name has been registered.
+    let kind = endpoint_kind_from_flags(flags)?;
+
+    let (pipe, named) = {
+        let mut named_pipes = NAMED_PIPES.write(token.token());
+        if named_pipes.contains_key(path) {
+            return Err(Error::new(EEXIST));
+        }
+
+        // Attach the backing pipe before the entry becomes visible. Publishing
+        // the entry with `active == None` and filling it in after the lock is
+        // released would let a concurrent `open_named_pipe` install its own
+        // pipe, leaving the two openers on different pipes.
+        let pipe = Arc::new(Pipe::new());
+        let named = Arc::new(NamedPipe {
+            path: display_path.to_string(),
+            mode,
+            active: Mutex::new(Some(Arc::clone(&pipe))),
+        });
+        named_pipes.insert(path.to_string(), Arc::clone(&named));
+        (pipe, named)
+    };
+
+    Ok(open_endpoint(pipe, kind, Some(named), token))
+}
+
+/// Open an endpoint on the named pipe registered under `path`.
+///
+/// Returns `Ok(None)` when no such named pipe exists. A backing pipe is
+/// created on demand when the entry currently has none.
+pub fn open_named_pipe(path: &str, flags: usize, token: &mut CleanLockToken) -> Result<Option<usize>> {
+    validate_named_fifo_open(flags)?;
+
+    let lookup = NAMED_PIPES.read(token.token()).get(path).cloned();
+    let Some(named) = lookup else {
+        return Ok(None);
+    };
+
+    let kind = endpoint_kind_from_flags(flags)?;
+    // Reuse the active pipe, installing a fresh one if the slot is empty; the
+    // lock guard is released at the end of this statement.
+    let pipe = Arc::clone(
+        named
+            .active
+            .lock(token.token())
+            .get_or_insert_with(|| Arc::new(Pipe::new())),
+    );
+
+    Ok(Some(open_endpoint(pipe, kind, Some(named), token)))
+}
+
+/// Remove the named pipe registered under `path` from the registry.
+/// Returns `true` if an entry was removed.
+pub fn unlink_named_pipe(path: &str, token: &mut CleanLockToken) -> bool {
+    NAMED_PIPES.write(token.token()).remove(path).is_some()
 }

 pub struct PipeScheme;

 impl PipeScheme {
-    fn get_pipe(key: usize, token: &mut CleanLockToken) -> Result<Arc<Pipe>> {
-        PIPES
+    fn get_endpoint(id: usize, token: &mut CleanLockToken) -> Result<EndpointHandle> {
+        HANDLES
            .read(token.token())
-            .get(&key)
+            .get(&id)
            .and_then(|handle| match handle {
-                Handle::Pipe(pipe) => Some(Arc::clone(pipe)),
-                _ => None,
+                Handle::Endpoint(endpoint) => Some(endpoint.clone()),
+                Handle::SchemeRoot => None,
            })
            .ok_or(Error::new(EBADF))
    }
@@ -82,32 +248,33 @@ impl PipeScheme {

 impl KernelScheme for PipeScheme {
    fn scheme_root(&self, token: &mut CleanLockToken) -> Result<usize> {
-        let id = PIPE_NEXT_ID.fetch_add(2, Ordering::Relaxed);
-        PIPES.write(token.token()).insert(id, Handle::SchemeRoot);
+        let id = next_id(); // ids now come from next_id() instead of stepping PIPE_NEXT_ID by 2
+        HANDLES.write(token.token()).insert(id, Handle::SchemeRoot); // register the new id as a scheme-root handle
        Ok(id)
    }
+
    fn fevent(
        &self,
        id: usize,
        flags: EventFlags,
        token: &mut CleanLockToken,
    ) -> Result<EventFlags> {
-        let (is_writer_not_reader, key) = from_raw_id(id);
-        let pipe = Self::get_pipe(key, token)?;
+        let endpoint = Self::get_endpoint(id, token)?; // read/write role now comes from the endpoint's kind, not from a bit in the id

        let mut ready = EventFlags::empty();

-        if is_writer_not_reader
+        if endpoint.kind.can_write()
            && flags.contains(EVENT_WRITE)
-            && (pipe.queue.lock(token.token()).len() <= MAX_QUEUE_SIZE
-                || !pipe.reader_is_alive.load(Ordering::Acquire))
+            && (endpoint.pipe.queue.lock(token.token()).len() <= MAX_QUEUE_SIZE // NOTE(review): `<=` also reports writable when the queue is exactly at MAX_QUEUE_SIZE; confirm a write can still make progress then
+                || endpoint.pipe.reader_count.load(Ordering::Acquire) == 0) // no readers left: still report writable (kwrite returns EPIPE when reader_count is 0)
        {
            ready |= EventFlags::EVENT_WRITE;
        }
-        if !is_writer_not_reader
+
+        if endpoint.kind.can_read()
            && flags.contains(EVENT_READ)
-            && (!pipe.queue.lock(token.token()).is_empty()
-                || !pipe.writer_is_alive.load(Ordering::Acquire))
+            && (!endpoint.pipe.queue.lock(token.token()).is_empty()
+                || endpoint.pipe.writer_count.load(Ordering::Acquire) == 0) // no writers left: report readable (kread returns 0 when writer_count is 0)
        {
            ready |= EventFlags::EVENT_READ;
        }
@@ -116,46 +283,48 @@ impl KernelScheme for PipeScheme {
    }

    fn close(&self, id: usize, token: &mut CleanLockToken) -> Result<()> {
-        let (is_write_not_read, key) = from_raw_id(id);
+        let handle = HANDLES // take the handle out of the table up front; an unknown id yields EBADF
+            .write(token.token())
+            .remove(&id)
+            .ok_or(Error::new(EBADF))?;

-        let pipe = Self::get_pipe(key, token)?;
-        let scheme_id = GlobalSchemes::Pipe.scheme_id();
-
-        let can_remove = if is_write_not_read {
-            pipe.writer_is_alive.store(false, Ordering::SeqCst);
-            event::trigger(scheme_id, key, EVENT_READ, token);
-            pipe.read_condition.notify(token);
-
-            !pipe.reader_is_alive.load(Ordering::SeqCst)
-        } else {
-            pipe.reader_is_alive.store(false, Ordering::SeqCst);
-            event::trigger(scheme_id, key | WRITE_NOT_READ_BIT, EVENT_WRITE, token);
-            pipe.write_condition.notify(token);
-
-            !pipe.writer_is_alive.load(Ordering::SeqCst)
+        let Handle::Endpoint(endpoint) = handle else {
+            return Ok(()); // a scheme-root handle has no pipe bookkeeping to undo
        };

-        if can_remove {
-            let handle = PIPES.write(token.token()).remove(&key);
-            if let Some(Handle::Pipe(pipe)) = handle
-                && let Some(pipe) = Arc::into_inner(pipe)
-            {
-                {
-                    pipe.read_condition.into_drop(token);
-                }
-                {
-                    pipe.write_condition.into_drop(token);
-                }
-            }
+        let mut last_reader = false;
+        let mut last_writer = false;
+
+        if endpoint.kind.can_read() {
+            last_reader = endpoint.pipe.reader_count.fetch_sub(1, Ordering::SeqCst) == 1; // fetch_sub returns the previous value, so 1 means this was the last reader
+        }
+        if endpoint.kind.can_write() {
+            last_writer = endpoint.pipe.writer_count.fetch_sub(1, Ordering::SeqCst) == 1; // likewise for the last writer
        }

-        if let Some(pipe) = Arc::into_inner(pipe) {
-            {
-                pipe.read_condition.into_drop(token);
-            }
-            {
-                pipe.write_condition.into_drop(token);
+        if last_writer { // last writer gone: raise EVENT_READ and wake waiters on read_condition
+            trigger_matching(&endpoint.pipe, true, false, EVENT_READ, token);
+            endpoint.pipe.read_condition.notify(token);
+        }
+        if last_reader { // last reader gone: raise EVENT_WRITE and wake waiters on write_condition
+            trigger_matching(&endpoint.pipe, false, true, EVENT_WRITE, token);
+            endpoint.pipe.write_condition.notify(token);
+        }
+
+        let no_readers = endpoint.pipe.reader_count.load(Ordering::SeqCst) == 0;
+        let no_writers = endpoint.pipe.writer_count.load(Ordering::SeqCst) == 0;
+        if no_readers && no_writers { // both counts are zero: detach the pipe from its FIFO entry (if any) and release it
+            if let Some(named) = endpoint.named {
+                let mut active = named.active.lock(token.token());
+                if active
+                    .as_ref()
+                    .is_some_and(|active_pipe| Arc::ptr_eq(active_pipe, &endpoint.pipe)) // only clear the slot if it still points at this very pipe
+                {
+                    *active = None;
+                }
            }
+
+            drop_wait_conditions_if_possible(endpoint.pipe, token); // NOTE(review): presumably tears down the WaitConditions once this is the last Arc, as the removed Arc::into_inner code did - confirm in the helper
        }

        Ok(())
@@ -168,9 +337,9 @@ impl KernelScheme for PipeScheme {
        _ctx: CallerCtx,
        token: &mut CleanLockToken,
    ) -> Result<OpenResult> {
-        let (is_writer_not_reader, key) = from_raw_id(old_id);
+        let endpoint = Self::get_endpoint(old_id, token)?;

-        if is_writer_not_reader {
+        if !endpoint.kind.can_read() {
            return Err(Error::new(EBADF));
        }

@@ -180,17 +349,17 @@ impl KernelScheme for PipeScheme {
            return Err(Error::new(EINVAL));
        }

-        let pipe = Self::get_pipe(key, token)?;
-
-        if pipe.has_run_dup.swap(true, Ordering::SeqCst) {
-            return Err(Error::new(EBADF));
-        }
-
        Ok(OpenResult::SchemeLocal(
-            key | WRITE_NOT_READ_BIT,
+            open_endpoint(
+                Arc::clone(&endpoint.pipe),
+                EndpointKind::Write,
+                endpoint.named,
+                token,
+            ),
            InternalFlags::empty(),
        ))
    }
+
    fn kopenat(
        &self,
        id: usize,
@@ -200,40 +369,47 @@ impl KernelScheme for PipeScheme {
        _ctx: CallerCtx,
        token: &mut CleanLockToken,
    ) -> Result<OpenResult> {
-        let (_, key) = from_raw_id(id);
+        let is_scheme_root = {
+            let handles = HANDLES.read(token.token());
+            match handles.get(&id) {
+                Some(Handle::SchemeRoot) => true,
+                Some(Handle::Endpoint(_)) => false,
+                None => return Err(Error::new(EBADF)),
+            }
+        };

-        {
-            let guard = PIPES.read(token.token());
-            if let Some(Handle::SchemeRoot) = guard.get(&key) {
-            } else if let Some(Handle::Pipe(pipe_arc)) = guard.get(&key) {
-                let pipe = Arc::clone(pipe_arc);
-                drop(guard);
-
-                if user_buf.as_bytes() == b"write" {
-                    return Err(Error::new(EINVAL));
-                }
-
-                if pipe.has_run_dup.swap(true, Ordering::SeqCst) {
-                    return Err(Error::new(EBADF));
+        if is_scheme_root {
+                let path = user_buf.as_str().or(Err(Error::new(EINVAL)))?;
+                if !path.trim_start_matches('/').is_empty() {
+                    return Err(Error::new(ENOENT));
                }

+                let pipe = Arc::new(Pipe::new());
                return Ok(OpenResult::SchemeLocal(
-                    key | WRITE_NOT_READ_BIT,
+                    open_endpoint(pipe, EndpointKind::Read, None, token),
                    InternalFlags::empty(),
                ));
-            } else {
-                return Err(Error::new(EBADF));
-            }
        }

-        let path = user_buf.as_str().or(Err(Error::new(EINVAL)))?;
-        if !path.trim_start_matches('/').is_empty() {
-            return Err(Error::new(ENOENT));
+        let endpoint = Self::get_endpoint(id, token)?;
+        if !endpoint.kind.can_read() {
+            return Err(Error::new(EBADF));
        }

-        let (read_id, _) = pipe(token)?;
+        let path = user_buf.as_bytes();
+        if !path.is_empty() && path != b"write" {
+            return Err(Error::new(EINVAL));
+        }

-        Ok(OpenResult::SchemeLocal(read_id, InternalFlags::empty()))
+        Ok(OpenResult::SchemeLocal(
+            open_endpoint(
+                Arc::clone(&endpoint.pipe),
+                EndpointKind::Write,
+                endpoint.named,
+                token,
+            ),
+            InternalFlags::empty(),
+        ))
    }

    fn kread(
@@ -244,16 +420,15 @@ impl KernelScheme for PipeScheme {
        _stored_flags: u32,
        token: &mut CleanLockToken,
    ) -> Result<usize> {
-        let (is_write_not_read, key) = from_raw_id(id);
+        let endpoint = Self::get_endpoint(id, token)?;

-        if is_write_not_read {
+        if !endpoint.kind.can_read() {
            return Err(Error::new(EBADF));
        }
-        let pipe = Self::get_pipe(key, token)?;

        loop {
-            let vec = pipe.queue.lock(token.token());
-            let (mut vec, mut token) = vec.into_split();
+            let vec = endpoint.pipe.queue.lock(token.token());
+            let (mut vec, mut lock_token) = vec.into_split();

            let (s1, s2) = vec.as_slices();
            let s1_count = core::cmp::min(user_buf.len(), s1.len());
@@ -273,28 +448,34 @@ impl KernelScheme for PipeScheme {
            let _ = vec.drain(..bytes_read);

            if bytes_read > 0 {
-                event::trigger_locked(
-                    GlobalSchemes::Pipe.scheme_id(),
-                    key | WRITE_NOT_READ_BIT,
-                    EVENT_WRITE,
-                    token.token(),
-                );
-                pipe.write_condition.notify_locked(token.token());
+                drop(vec);
+                drop(lock_token);
+                trigger_matching(&endpoint.pipe, false, true, EVENT_WRITE, token);
+                endpoint.pipe.write_condition.notify(token);

                return Ok(bytes_read);
-            } else if user_buf.is_empty() {
+            }
+
+            if user_buf.is_empty() {
                return Ok(0);
            }

-            if !pipe.writer_is_alive.load(Ordering::SeqCst) {
+            if endpoint.pipe.writer_count.load(Ordering::SeqCst) == 0 {
                return Ok(0);
-            } else if fcntl_flags & O_NONBLOCK as u32 != 0 {
+            }
+            if fcntl_flags & O_NONBLOCK as u32 != 0 {
                return Err(Error::new(EAGAIN));
-            } else if !pipe.read_condition.wait(vec, "PipeRead::read", &mut token) {
+            }
+            if !endpoint
+                .pipe
+                .read_condition
+                .wait(vec, "PipeRead::read", &mut lock_token)
+            {
                return Err(Error::new(EINTR));
            }
        }
    }
+
    fn kwrite(
        &self,
        id: usize,
@@ -303,18 +484,17 @@ impl KernelScheme for PipeScheme {
        _stored_flags: u32,
        token: &mut CleanLockToken,
    ) -> Result<usize> {
-        let (is_write_not_read, key) = from_raw_id(id);
+        let endpoint = Self::get_endpoint(id, token)?;

-        if !is_write_not_read {
+        if !endpoint.kind.can_write() {
            return Err(Error::new(EBADF));
        }
-        let pipe = Self::get_pipe(key, token)?;

        loop {
-            let vec = pipe.queue.lock(token.token());
-            let (mut vec, mut token) = vec.into_split();
+            let vec = endpoint.pipe.queue.lock(token.token());
+            let (mut vec, mut lock_token) = vec.into_split();

-            if !pipe.reader_is_alive.load(Ordering::Relaxed) {
+            if endpoint.pipe.reader_count.load(Ordering::Relaxed) == 0 {
                return Err(Error::new(EPIPE));
            }

@@ -329,7 +509,6 @@ impl KernelScheme for PipeScheme {

            let mut bytes_written = 0;

-            // TODO: Modify VecDeque so that the unwritten portions can be accessed directly?
            for (idx, chunk) in src_buf.in_variable_chunks(TMPBUF_SIZE).enumerate() {
                let chunk_byte_count = match chunk.copy_common_bytes_to_slice(&mut tmp_buf) {
                    Ok(c) => c,
@@ -341,41 +520,52 @@ impl KernelScheme for PipeScheme {
            }

            if bytes_written > 0 {
-                event::trigger_locked(
-                    GlobalSchemes::Pipe.scheme_id(),
-                    key,
-                    EVENT_READ,
-                    token.token(),
-                );
-                pipe.read_condition.notify_locked(token.token());
+                drop(vec);
+                drop(lock_token);
+                trigger_matching(&endpoint.pipe, true, false, EVENT_READ, token);
+                endpoint.pipe.read_condition.notify(token);

                return Ok(bytes_written);
-            } else if user_buf.is_empty() {
+            }
+
+            if user_buf.is_empty() {
                return Ok(0);
            }

            if fcntl_flags & O_NONBLOCK as u32 != 0 {
                return Err(Error::new(EAGAIN));
-            } else if !pipe
+            }
+            if !endpoint
+                .pipe
                .write_condition
-                .wait(vec, "PipeWrite::write", &mut token)
+                .wait(vec, "PipeWrite::write", &mut lock_token)
            {
                return Err(Error::new(EINTR));
            }
        }
    }
-    fn kfpath(&self, _id: usize, buf: UserSliceWo, _token: &mut CleanLockToken) -> Result<usize> {
-        //TODO: construct useful path?
-        buf.copy_common_bytes_from_slice("/scheme/pipe/".as_bytes())
+
+    fn kfpath(&self, id: usize, buf: UserSliceWo, token: &mut CleanLockToken) -> Result<usize> { // Write the handle's path into `buf`; fails with EBADF (via get_endpoint) if `id` is not an endpoint.
+        let endpoint = Self::get_endpoint(id, token)?;
+        if let Some(named) = endpoint.named {
+            buf.copy_common_bytes_from_slice(named.path.as_bytes()) // named FIFO: report the path stored in its NamedPipe entry
+        } else {
+            buf.copy_common_bytes_from_slice("/scheme/pipe/".as_bytes()) // anonymous pipe: same fixed path as before this change
+        }
    }
-    fn kfstat(&self, _id: usize, buf: UserSliceWo, _token: &mut CleanLockToken) -> Result<()> {
+
+    fn kfstat(&self, id: usize, buf: UserSliceWo, token: &mut CleanLockToken) -> Result<()> { // Fill `buf` with a Stat whose st_mode is MODE_FIFO plus the handle's permission bits.
+        let endpoint = Self::get_endpoint(id, token)?;
+        let mode = endpoint.named.map_or(0o666, |named| named.mode); // named FIFOs report their stored mode; anonymous pipes keep the previous 0o666
+
        buf.copy_exactly(&Stat {
-            st_mode: MODE_FIFO | 0o666,
+            st_mode: MODE_FIFO | mode,
            ..Default::default()
        })?;

        Ok(())
    }
+
    fn kfdwrite(
        &self,
        id: usize,
@@ -385,23 +575,17 @@ impl KernelScheme for PipeScheme {
        _metadata: &[u64],
        token: &mut CleanLockToken,
    ) -> Result<usize> {
-        let (is_write_not_read, key) = from_raw_id(id);
+        let endpoint = Self::get_endpoint(id, token)?;

-        if !is_write_not_read {
+        if !endpoint.kind.can_write() {
            return Err(Error::new(EBADF));
        }
-        let pipe = match Self::get_pipe(key, token) {
-            Ok(p) => p,
-            Err(e) => {
-                return Err(e);
-            }
-        };

        loop {
-            let vec = pipe.fd_queue.lock(token.token());
-            let (mut vec, mut token) = vec.into_split();
+            let vec = endpoint.pipe.fd_queue.lock(token.token());
+            let (mut vec, mut lock_token) = vec.into_split();

-            if !pipe.reader_is_alive.load(Ordering::Relaxed) {
+            if endpoint.pipe.reader_count.load(Ordering::Relaxed) == 0 {
                return Err(Error::new(EPIPE));
            }
            if descs.is_empty() {
@@ -421,25 +605,24 @@ impl KernelScheme for PipeScheme {
            let fds_written = vec.len() - before_len;

            if fds_written > 0 {
-                event::trigger_locked(
-                    GlobalSchemes::Pipe.scheme_id(),
-                    key,
-                    EVENT_READ,
-                    token.token(),
-                );
-                pipe.read_condition.notify_locked(token.token());
+                drop(vec);
+                drop(lock_token);
+                trigger_matching(&endpoint.pipe, true, false, EVENT_READ, token);
+                endpoint.pipe.read_condition.notify(token);

                return Ok(fds_written);
            }

-            if !pipe
+            if !endpoint
+                .pipe
                .write_condition
-                .wait(vec, "PipeWrite::write", &mut token)
+                .wait(vec, "PipeWrite::write", &mut lock_token)
            {
                return Err(Error::new(EINTR));
            }
        }
    }
+
    fn kfdread(
        &self,
        id: usize,
@@ -448,25 +631,19 @@ impl KernelScheme for PipeScheme {
        _metadata: &[u64],
        token: &mut CleanLockToken,
    ) -> Result<usize> {
-        let (is_write_not_read, key) = from_raw_id(id);
+        let endpoint = Self::get_endpoint(id, token)?;

-        if is_write_not_read {
+        if !endpoint.kind.can_read() {
            return Err(Error::new(EBADF));
        }
-        let pipe = match Self::get_pipe(key, token) {
-            Ok(p) => p,
-            Err(e) => {
-                return Err(e);
-            }
-        };

        if payload.is_empty() {
            return Ok(0);
        }

        loop {
-            let vec = pipe.fd_queue.lock(token.token());
-            let (mut vec, mut token) = vec.into_split();
+            let vec = endpoint.pipe.fd_queue.lock(token.token());
+            let (mut vec, mut lock_token) = vec.into_split();

            let fds_available = vec.len();
            let max_fds_read = payload.len() / size_of::<usize>();
@@ -479,31 +656,33 @@ impl KernelScheme for PipeScheme {
                        fds_to_transfer,
                        payload,
                        flags.contains(CallFlags::FD_CLOEXEC),
-                        &mut token,
+                        &mut lock_token,
                    )?;
                } else {
                    bulk_add_fds(
                        fds_to_transfer,
                        payload,
                        flags.contains(CallFlags::FD_CLOEXEC),
-                        &mut token,
+                        &mut lock_token,
                    )?;
                }

-                event::trigger_locked(
-                    GlobalSchemes::Pipe.scheme_id(),
-                    key | WRITE_NOT_READ_BIT,
-                    EVENT_WRITE,
-                    token.token(),
-                );
-                pipe.write_condition.notify_locked(token.token());
+                drop(vec);
+                drop(lock_token);
+                trigger_matching(&endpoint.pipe, false, true, EVENT_WRITE, token);
+                endpoint.pipe.write_condition.notify(token);

                return Ok(fds_to_read);
            }

-            if !pipe.writer_is_alive.load(Ordering::SeqCst) {
+            if endpoint.pipe.writer_count.load(Ordering::SeqCst) == 0 {
                return Ok(0);
-            } else if !pipe.read_condition.wait(vec, "PipeRead::read", &mut token) {
+            }
+            if !endpoint
+                .pipe
+                .read_condition
+                .wait(vec, "PipeRead::read", &mut lock_token)
+            {
                return Err(Error::new(EINTR));
            }
        }
@@ -511,11 +690,23 @@ impl KernelScheme for PipeScheme {
 }

 pub struct Pipe {
-    read_condition: WaitCondition, // signals whether there are available bytes to read
-    write_condition: WaitCondition, // signals whether there is room for additional bytes
+    read_condition: WaitCondition, // signals whether there are available bytes to read
+    write_condition: WaitCondition, // signals whether there is room for additional bytes
    queue: Mutex<L1, VecDeque<u8>>,
-    reader_is_alive: AtomicBool, // starts set, unset when reader closes
-    writer_is_alive: AtomicBool, // starts set, unset when writer closes
-    has_run_dup: AtomicBool,
+    reader_count: AtomicUsize, // starts at 0; decremented when a read-capable endpoint closes; 0 makes kwrite fail with EPIPE
+    writer_count: AtomicUsize, // starts at 0; decremented when a write-capable endpoint closes; 0 makes kread return 0 once the queue is empty
    fd_queue: Mutex<L1, VecDeque<Arc<LockedFileDescription>>>,
 }
+
+impl Pipe {
+    fn new() -> Self { // Build an empty pipe: empty byte and fd queues, zero readers and writers.
+        Self {
+            read_condition: WaitCondition::new(),
+            write_condition: WaitCondition::new(),
+            queue: Mutex::new(VecDeque::new()),
+            reader_count: AtomicUsize::new(0), // NOTE(review): presumably incremented by open_endpoint when an endpoint is attached - confirm
+            writer_count: AtomicUsize::new(0),
+            fd_queue: Mutex::new(VecDeque::new()),
+        }
+    }
+}
@@ -105,6 +105,7 @@ enum ContextHandle {
    // Attr handles, to set ens/euid/egid/pid.
    Authority,
    Attr,
+    Groups,

    Status {
        privileged: bool,
@@ -261,6 +262,7 @@ impl ProcScheme {
                let handle = match actual_name {
                    "attrs" => ContextHandle::Attr,
                    "status" => ContextHandle::Status { privileged: true },
+                    "groups" => ContextHandle::Groups,
                    _ => return Err(Error::new(ENOENT)),
                };

@@ -306,6 +308,11 @@ impl ProcScheme {
                        let id = NonZeroUsize::new(NEXT_ID.fetch_add(1, Ordering::Relaxed))
                            .ok_or(Error::new(EMFILE))?;
                        let context = context::spawn(true, Some(id), ret, token)?;
+                        {
+                            let parent_groups =
+                                context::current().read(token.token()).groups.clone();
+                            context.write(token.token()).groups = parent_groups;
+                        }
                        HANDLES.write(token.token()).insert(
                            id.get(),
                            Handle {
@@ -425,6 +432,7 @@ impl KernelScheme for ProcScheme {
    }

    fn close(&self, id: usize, token: &mut CleanLockToken) -> Result<()> {
+        let mut inner_token = unsafe { CleanLockToken::new() }; // TODO: lock ordering violation - a second CleanLockToken is created while `token` is still live (this TODO was dropped when the token was hoisted here)
        let handle = HANDLES
            .write(token.token())
            .remove(&id)
@@ -452,9 +460,7 @@ impl KernelScheme for ProcScheme {
                    ))]
                    regs.set_arg1(arg1);

-                    // TODO: Lock ordering violation
-                    let mut token = unsafe { CleanLockToken::new() };
-                    Ok(context.set_addr_space(Some(new), token.downgrade()))
+                    Ok(context.set_addr_space(Some(new), inner_token.downgrade()))
                })?;
                if let Some(old_ctx) = old_ctx
                    && let Some(addrspace) = Arc::into_inner(old_ctx)
@@ -493,6 +499,7 @@ impl KernelScheme for ProcScheme {
        consume: bool,
        token: &mut CleanLockToken,
    ) -> Result<usize> {
+        let mut inner_token = unsafe { CleanLockToken::new() }; // TODO: lock ordering violation - a second CleanLockToken is created while `token` is still live (this TODO was dropped when the token was hoisted here)
        let handle = HANDLES
            .read(token.token())
            .get(&id)
@@ -583,9 +590,7 @@ impl KernelScheme for ProcScheme {
                };
                // TODO: Allocated or AllocatedShared?
                let addrsp = AddrSpace::current()?;
-                // TODO: Lock ordering violation
-                let mut token = unsafe { CleanLockToken::new() };
-                let page = addrsp.acquire_write(token.downgrade()).mmap_anywhere(
+                let page = addrsp.acquire_write(inner_token.downgrade()).mmap_anywhere(
                    &addrsp,
                    NonZeroUsize::new(1).unwrap(),
                    MapFlags::PROT_READ | MapFlags::PROT_WRITE,
@@ -849,17 +854,17 @@ impl KernelScheme for ProcScheme {
    }
 }
 fn extract_scheme_number(fd: usize, token: &mut CleanLockToken) -> Result<(KernelSchemes, usize)> {
-    let (scheme_id, number) = {
+    let desc = { // take the whole file description by value so the context read guard ends with this block
        let current_lock = context::current();
        let mut current = current_lock.read(token.token());
-        let (context, mut token) = current.token_split();
+        let (context, mut context_token) = current.token_split(); // renamed so the split token no longer shadows the outer `token`
        let file_descriptor = context
-            .get_file(FileHandle::from(fd), &mut token)
+            .get_file(FileHandle::from(fd), &mut context_token)
            .ok_or(Error::new(EBADF))?;
-        let desc = file_descriptor.description.read(token.token());
-        (desc.scheme, desc.number)
+        *file_descriptor.description.read(context_token.token())
    };
-    let scheme = scheme::get_scheme(token.token(), scheme_id)?;
+    let scheme = desc.get_scheme(token)?; // resolved after the block above, using the outer token
+    let number = desc.number;

    Ok((scheme, number))
 }
@@ -1271,6 +1276,39 @@ impl ContextHandle {
                guard.prio = (info.prio as usize).min(39);
                Ok(size_of::<ProcSchemeAttrs>())
            }
+            Self::Groups => { // Replace the context's `groups` from a buffer of packed u32 values; returns the number of bytes consumed.
+                const NGROUPS_MAX: usize = 65536; // upper bound on the number of entries accepted in one write
+                if buf.len() % size_of::<u32>() != 0 { // the buffer must hold a whole number of u32 entries
+                    return Err(Error::new(EINVAL));
+                }
+                let count = buf.len() / size_of::<u32>();
+                if count > NGROUPS_MAX {
+                    return Err(Error::new(EINVAL));
+                }
+                let mut groups = Vec::with_capacity(count);
+                for chunk in buf.in_exact_chunks(size_of::<u32>()).take(count) {
+                    groups.push(chunk.read_u32()?);
+                }
+                let proc_id = {
+                    let guard = context.read(token.token());
+                    guard.owner_proc_id
+                };
+                {
+                    let mut guard = context.write(token.token());
+                    guard.groups = groups.clone();
+                }
+                if let Some(pid) = proc_id { // propagate the new list to every context that shares this owner_proc_id
+                    let mut contexts = context::contexts(token.downgrade());
+                    let (contexts, mut t) = contexts.token_split();
+                    for context_ref in contexts.iter() {
+                        let mut ctx = context_ref.write(t.token()); // NOTE(review): takes the write lock on every context, matching or not
+                        if ctx.owner_proc_id == Some(pid) {
+                            ctx.groups = groups.clone();
+                        }
+                    }
+                }
+                Ok(count * size_of::<u32>())
+            }
            ContextHandle::OpenViaDup => {
                let mut args = buf.usizes();

@@ -1475,6 +1513,15 @@ impl ContextHandle {
                    debug_name,
                })
            }
+            Self::Groups => { // Copy the context's `groups` out as native-endian u32 values; returns the number of bytes written.
+                let c = &context.read(token.token());
+                let max = buf.len() / size_of::<u32>();
+                let count = c.groups.len().min(max); // a buffer shorter than the list yields a silently truncated copy
+                for (chunk, gid) in buf.in_exact_chunks(size_of::<u32>()).zip(&c.groups).take(count) {
+                    chunk.copy_from_slice(&gid.to_ne_bytes())?;
+                }
+                Ok(count * size_of::<u32>())
+            }
            ContextHandle::Sighandler => {
                let data = match context.read(token.token()).sig {
                    Some(ref sig) => SetSighandlerData {
@@ -80,6 +80,7 @@ const ONE: NonZeroUsize = match NonZeroUsize::new(1) {
    Some(one) => one,
    None => unreachable!(),
 };
+const MAX_SPURIOUS_WAKEUPS: usize = 100; // per-call budget in UserInner::call: decremented on each wakeup where eintr_if_sigkill returns Ok; at 0 the call removes its pending state instead of blocking again

 enum ParsedCqe {
    TriggerFevent {
@@ -209,6 +210,8 @@ impl UserInner {
        caller_responsible: &mut PageSpan,
        token: &mut CleanLockToken,
    ) -> Result<Response> {
+        let mut remaining_spurious_wakeups = MAX_SPURIOUS_WAKEUPS;
+
        {
            // Disable preemption to avoid context switches between setting the
            // process state and sending the scheme request. The process is made
@@ -261,7 +264,10 @@ impl UserInner {
                    };

                let states = self.states.lock(token.token());
-                let (mut states, mut token) = states.into_split();
+                let (mut states, mut state_token) = states.into_split();
+                let mut timed_out_descriptions = None;
+                let mut remove_state = false;
+                let mut timed_out = false;
                match states.get_mut(sqe.tag as usize) {
                    // invalid state
                    None => return Err(Error::new(EBADFD)),
@@ -274,24 +280,35 @@ impl UserInner {
                            fds,
                        } => {
                            let maybe_eintr =
-                                eintr_if_sigkill(&mut callee_responsible, &mut token.token());
-                            *o = State::Waiting {
-                                canceling: true,
-                                callee_responsible,
-                                context,
-                                fds,
-                            };
+                                eintr_if_sigkill(&mut callee_responsible, &mut state_token.token());
+
+                            if maybe_eintr.is_ok() {
+                                remaining_spurious_wakeups =
+                                    remaining_spurious_wakeups.saturating_sub(1);
+                            }
+
+                            if maybe_eintr.is_ok() && remaining_spurious_wakeups == 0 {
+                                timed_out_descriptions = Some(Self::collect_descriptions_to_close(fds));
+                                remove_state = true;
+                            } else {
+                                *o = State::Waiting {
+                                    canceling: true,
+                                    callee_responsible,
+                                    context,
+                                    fds,
+                                };
+                            }

                            maybe_eintr?;

-                            context::current()
-                                .write(token.token())
-                                .block("UserInner::call (woken up after cancelation request)");
-
-                            // We do not want to drop the lock before blocking
-                            // as if we get preempted in between we might miss a
-                            // wakeup.
-                            drop(states);
+                            if remove_state {
+                                states.remove(sqe.tag as usize);
+                                timed_out = true;
+                            } else {
+                                context::current()
+                                    .write(state_token.token())
+                                    .block("UserInner::call (woken up after cancelation request)");
+                            }
                        }
                        // spurious wakeup
                        State::Waiting {
@@ -300,60 +317,76 @@ impl UserInner {
                            context,
                            mut callee_responsible,
                        } => {
-                            let maybe_eintr = eintr_if_sigkill(&mut callee_responsible, &mut token);
                            let current_context = context::current();
+                            let maybe_eintr =
+                                eintr_if_sigkill(&mut callee_responsible, &mut state_token);

-                            *o = State::Waiting {
-                                // Currently we treat all spurious wakeups to have the same behavior
-                                // as signals (i.e., we send a cancellation request). It is not something
-                                // that should happen, but it certainly can happen, for example if a context
-                                // is awoken through its thread handle without setting any sig bits, or if the
-                                // caller clears its own sig bits. If it actually is a signal, then it is the
-                                // intended behavior.
-                                canceling: true,
-                                fds,
-                                context,
-                                callee_responsible,
-                            };
+                            if maybe_eintr.is_ok() {
+                                remaining_spurious_wakeups =
+                                    remaining_spurious_wakeups.saturating_sub(1);
+                            }
+
+                            if maybe_eintr.is_ok() && remaining_spurious_wakeups == 0 {
+                                timed_out_descriptions = Some(Self::collect_descriptions_to_close(fds));
+                                remove_state = true;
+                            } else {
+                                *o = State::Waiting {
+                                    // Currently we treat all spurious wakeups to have the same behavior
+                                    // as signals (i.e., we send a cancellation request). It is not something
+                                    // that should happen, but it certainly can happen, for example if a context
+                                    // is awoken through its thread handle without setting any sig bits, or if the
+                                    // caller clears its own sig bits. If it actually is a signal, then it is the
+                                    // intended behavior.
+                                    canceling: true,
+                                    fds,
+                                    context,
+                                    callee_responsible,
+                                };
+                            }

                            maybe_eintr?;

-                            // We do not want to preempt between sending the
-                            // cancellation and blocking again where we might
-                            // miss a wakeup.
-                            let mut preempt = PreemptGuardL1::new(&current_context, &mut token);
-                            let token = preempt.token();
+                            if remove_state {
+                                states.remove(sqe.tag as usize);
+                                timed_out = true;
+                            } else {
+                                // We do not want to preempt between sending the
+                                // cancellation and blocking again where we might
+                                // miss a wakeup.
+                                let mut preempt =
+                                    PreemptGuardL1::new(&current_context, &mut state_token);
+                                let token = preempt.token();

-                            self.todo.send_locked(
-                                Sqe {
-                                    opcode: Opcode::Cancel as u8,
-                                    sqe_flags: SqeFlags::ONEWAY,
-                                    tag: sqe.tag,
-                                    ..Default::default()
-                                },
-                                token.token(),
-                            );
-                            event::trigger_locked(
-                                self.root_id,
-                                self.scheme_id.get(),
-                                EVENT_READ,
-                                token.token(),
-                            );
+                                self.todo.send_locked(
+                                    Sqe {
+                                        opcode: Opcode::Cancel as u8,
+                                        sqe_flags: SqeFlags::ONEWAY,
+                                        tag: sqe.tag,
+                                        ..Default::default()
+                                    },
+                                    token.token(),
+                                );
+                                event::trigger_locked(
+                                    self.root_id,
+                                    self.scheme_id.get(),
+                                    EVENT_READ,
+                                    token.token(),
+                                );

-                            // 1. If cancellation was requested and arrived
-                            // before the scheme processed the request, an
-                            // acknowledgement will be sent back after the
-                            // cancellation is processed and we will be woken up
-                            // again. State will be State::Responded then.
-                            //
-                            // 2. If cancellation was requested but the scheme
-                            // already processed the request, we will receive
-                            // the actual response next and woken up again.
-                            // State will be State::Responded then.
-                            context::current()
-                                .write(token.token())
-                                .block("UserInner::call (spurious wakeup)");
-                            drop(states);
+                                // 1. If cancellation was requested and arrived
+                                // before the scheme processed the request, an
+                                // acknowledgement will be sent back after the
+                                // cancellation is processed and we will be woken up
+                                // again. State will be State::Responded then.
+                                //
+                                // 2. If cancellation was requested but the scheme
+                                // already processed the request, we will receive
+                                // the actual response next and woken up again.
+                                // State will be State::Responded then.
+                                context::current()
+                                    .write(token.token())
+                                    .block("UserInner::call (spurious wakeup)");
+                            }
                        }

                        // invalid state
@@ -368,10 +401,70 @@ impl UserInner {
                        }
                    },
                }
+
+                if let Some(descriptions) = timed_out_descriptions {
+                    drop(states);
+                    for desc in descriptions {
+                        let _ = desc.try_close(token);
+                    }
+                }
+
+                if timed_out {
+                    return Err(Error::new(ETIMEDOUT));
+                }
            }
        }
    }

+    /// Turn the `fds` carried by a call into owned descriptions: an entry is
+    /// kept only when `Arc::try_unwrap` succeeds, the others are just dropped.
+    fn collect_descriptions_to_close(
+        fds: Vec<Arc<LockedFileDescription>>,
+    ) -> Vec<FileDescription> {
+        let sole_owned = fds.into_iter().filter_map(|fd| Arc::try_unwrap(fd).ok());
+        sole_owned.map(RwLock::into_inner).collect()
+    }
+
+    /// Fail every call still in `State::Waiting`: if its context can be upgraded
+    /// it gets an `ENODEV` response and is unblocked, otherwise its entry is removed.
+    pub fn fail_pending_calls(&self, token: &mut CleanLockToken) {
+        let mut pending_close = Vec::new();
+        {
+            let mut guard = self.states.lock(token.token());
+            let (states, mut lock_token) = guard.token_split();
+            let mut dead_ids = Vec::new();
+
+            for (id, state) in states.iter_mut() {
+                match mem::replace(state, State::Placeholder) {
+                    State::Waiting { context, fds, .. } => {
+                        pending_close.extend(Self::collect_descriptions_to_close(fds));
+                        let Some(caller) = context.upgrade() else {
+                            dead_ids.push(id);
+                            continue;
+                        };
+                        *state = State::Responded(Response::Regular(
+                            Err(Error::new(ENODEV)),
+                            0,
+                            false,
+                        ));
+                        caller.write(lock_token.token()).unblock();
+                    }
+                    // Any other state is put back untouched.
+                    other => *state = other,
+                }
+            }
+            // Removal is deferred: `states` is borrowed while iterating above.
+            for id in dead_ids {
+                states.remove(id);
+            }
+        }
+
+        // Close only after the scope holding the states lock has ended.
+        for desc in pending_close {
+            let _ = desc.try_close(token);
+        }
+    }
+
    /// Map a readable structure to the scheme's userspace and return the
    /// pointer
    #[must_use = "copying back to head/tail buffers can fail"]
@@ -1283,6 +1376,7 @@ impl UserInner {
    }

    pub fn into_drop(self, token: &mut CleanLockToken) {
+        self.fail_pending_calls(token);
        self.todo.condition.into_drop(token);
    }
 }
@@ -74,14 +74,16 @@ impl MemoryEntry {
 }

 struct MemoryMap {
-    entries: [MemoryEntry; 512],
+    entries: [MemoryEntry; 1024],
    size: usize,
 }

 impl MemoryMap {
    fn register(&mut self, base: usize, size: usize, kind: BootloaderMemoryKind) {
        if self.size >= self.entries.len() {
-            panic!("Early memory map overflow!");
+            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+            unsafe { core::arch::asm!("out dx, al", in("dx") 0x3F8u16, in("al") b'!', options(nostack, preserves_flags)); }
+            panic!("Early memory map overflow at entry {} (max {})", self.size, self.entries.len());
        }
        let start = if kind == BootloaderMemoryKind::Free {
            align_up(base)
@@ -134,7 +136,7 @@ static MEMORY_MAP: SyncUnsafeCell<MemoryMap> = SyncUnsafeCell::new(MemoryMap {
        start: 0,
        end: 0,
        kind: BootloaderMemoryKind::Null,
-    }; 512],
+    }; 1024],
    size: 0,
 });

@@ -323,7 +325,16 @@ unsafe fn map_memory<A: Arch>(areas: &[MemoryArea], mut bump_allocator: &mut Bum
            }
        }

-        let kernel_area = (*MEMORY_MAP.get()).kernel().unwrap();
+        let kernel_area = match (*MEMORY_MAP.get()).kernel() {
+            Some(area) => area,
+            None => {
+                println!("FATAL: kernel memory area not found in boot memory map");
+                println!("Cannot determine kernel base address. Halting.");
+                loop {
+                    core::hint::spin_loop();
+                }
+            }
+        };
        let kernel_base = kernel_area.start;
        let kernel_size = kernel_area.end.saturating_sub(kernel_area.start);
        // Map kernel at KERNEL_OFFSET
@@ -149,6 +149,15 @@ static BOOTSTRAP: spin::Once<Bootstrap> = spin::Once::new();
 pub(crate) static AP_READY: AtomicBool = AtomicBool::new(false);
 static BSP_READY: AtomicBool = AtomicBool::new(false);

+#[cold]
+fn halt_boot(message: &str) -> ! {
+    print!("{message}"); // printed verbatim; no newline is appended here
+    println!("Kernel boot cannot continue. Halting.");
+    loop {
+        hint::spin_loop(); // never breaks out, which is what satisfies the `!` return type
+    }
+}
+
 /// This is the kernel entry point for the primary CPU. The arch crate is responsible for calling this
 pub(crate) fn kmain(bootstrap: Bootstrap) -> ! {
    let mut token = unsafe { CleanLockToken::new() };
@@ -180,9 +189,7 @@ pub(crate) fn kmain(bootstrap: Bootstrap) -> ! {
            context.euid = 0;
            context.egid = 0;
        }
-        Err(err) => {
-            panic!("failed to spawn userspace_init: {:?}", err);
-        }
+        Err(_err) => halt_boot("FATAL: failed to spawn first userspace process userspace_init\n"),
    }

    run_userspace(&mut token)
@@ -0,0 +1,188 @@
+//! MCS (Mellor-Crummey Scott) fair spinlock.
+//!
+//! Each waiter spins on its own local `locked` flag instead of a shared lock
+//! word, eliminating cache-line bouncing under contention. FIFO ordering
+//! guarantees fairness. O(1) cache-line transfers on unlock.
+//!
+//! Supports transitive priority inheritance: when CPU A waits on a lock held
+//! by CPU B, and CPU B waits on a lock held by CPU C, A's priority is
+//! propagated through the chain to C (up to MAX_PI_CHAIN_DEPTH hops).
+
+use core::sync::atomic::{AtomicBool, AtomicPtr, AtomicU32, Ordering};
+use core::{hint, ptr};
+
+use crate::percpu::PercpuBlock;
+
+/// Maximum depth for transitive priority inheritance chain following.
+/// Prevents infinite loops from theoretical lock cycles and bounds latency.
+/// Linux uses 20; 8 is conservative for a microkernel with fewer nesting levels.
+const MAX_PI_CHAIN_DEPTH: u32 = 8;
+
+/// A node in the MCS lock queue.
+pub struct McsNode {
+    pub next: AtomicPtr<McsNode>,
+    pub locked: AtomicBool,
+}
+
+impl McsNode {
+    pub const fn new() -> Self {
+        Self {
+            next: AtomicPtr::new(ptr::null_mut()),
+            locked: AtomicBool::new(false),
+        }
+    }
+}
+
+/// Raw MCS spinlock primitive.
+pub struct McsRawLock {
+    tail: AtomicPtr<McsNode>,
+    /// CPU ID of the current lock holder (for priority inheritance).
+    /// `u32::MAX` means no holder.
+    holder_cpu: AtomicU32,
+}
+
+impl McsRawLock {
+    pub const fn new() -> Self {
+        Self {
+            tail: AtomicPtr::new(ptr::null_mut()),
+            holder_cpu: AtomicU32::new(u32::MAX),
+        }
+    }
+
+    #[inline]
+    pub fn acquire(&self, node: &McsNode) -> bool {
+        node.next.store(ptr::null_mut(), Ordering::Relaxed); // reset the node before publishing it
+        node.locked.store(true, Ordering::Relaxed);
+        let prev = self.tail.swap((node as *const McsNode).cast_mut(), Ordering::AcqRel);
+        if prev.is_null() {
+            // Uncontended — record ourselves as holder
+            let cpu_id = PercpuBlock::current().cpu_id.get();
+            self.holder_cpu.store(cpu_id, Ordering::Release);
+            return false; // false: the queue was empty, nothing to wait for
+        }
+        unsafe {
+            (*prev).next.store((node as *const McsNode).cast_mut(), Ordering::Release);
+        }
+        let percpu = PercpuBlock::current();
+        // Record which lock we're spinning on (for transitive PI chain following)
+        percpu.waiting_on_lock.store(
+            (self as *const McsRawLock).cast_mut(),
+            Ordering::Release,
+        );
+        let mut donated = false;
+        while node.locked.load(Ordering::Acquire) {
+            percpu.maybe_handle_tlb_shootdown();
+            // Donate priority to the lock holder (transitively) once per acquisition,
+            // i.e. only on the first pass through this loop
+            if !donated {
+                self.maybe_donate_priority(percpu);
+                donated = true;
+            }
+            hint::spin_loop();
+        }
+        // Clear waiting_on_lock before proceeding — we now hold the lock
+        percpu.waiting_on_lock.store(ptr::null_mut(), Ordering::Release);
+        self.holder_cpu.store(percpu.cpu_id.get(), Ordering::Release);
+        true // true: we queued behind a previous tail and spun until `locked` was cleared
+    }
+
+    #[inline]
+    pub fn release(&self, node: &McsNode) {
+        // Drop any donated priority (u32::MAX = none); the slot is per-CPU, not per-lock
+        PercpuBlock::current().pi_donated_prio.store(u32::MAX, Ordering::Release);
+        // Clear holder CPU (u32::MAX = no holder)
+        self.holder_cpu.store(u32::MAX, Ordering::Release);
+
+        let next = node.next.load(Ordering::Acquire);
+        if next.is_null() {
+            if self
+                .tail
+                .compare_exchange(
+                    (node as *const McsNode).cast_mut(),
+                    ptr::null_mut(),
+                    Ordering::AcqRel,
+                    Ordering::Acquire,
+                )
+                .is_ok()
+            {
+                return; // tail was still our node: no successor, the queue is empty again
+            }
+            while node.next.load(Ordering::Acquire).is_null() {
+                hint::spin_loop(); // tail already moved on, but the successor has not linked itself yet
+            }
+        }
+        unsafe {
+            (*node.next.load(Ordering::Acquire)).locked.store(false, Ordering::Release); // hand over
+        }
+    }
+
+    #[inline]
+    pub fn try_acquire(&self, node: &McsNode) -> bool {
+        node.next.store(ptr::null_mut(), Ordering::Relaxed); // node is reset even if the attempt fails
+        node.locked.store(true, Ordering::Relaxed);
+        let ok = self
+            .tail
+            .compare_exchange(
+                ptr::null_mut(), // succeeds only while the queue is empty
+                (node as *const McsNode).cast_mut(),
+                Ordering::AcqRel,
+                Ordering::Acquire,
+            )
+            .is_ok();
+        if ok {
+            let cpu_id = PercpuBlock::current().cpu_id.get();
+            self.holder_cpu.store(cpu_id, Ordering::Release);
+        }
+        ok // true = lock taken; acquire() differs: its bool reports whether it had to wait
+    }
+
+    /// Donate current CPU's context priority to the lock holder's CPU,
+    /// following the PI chain transitively (A→B→C).
+    ///
+    /// Reads priority from PercpuBlock::current_prio (cached by the scheduler)
+    /// to avoid acquiring any lock in the MCS spin loop.
+    ///
+    /// Chain following: if the holder is itself waiting on another lock,
+    /// we propagate our priority to that lock's holder too, up to
+    /// MAX_PI_CHAIN_DEPTH hops.
+    fn maybe_donate_priority(&self, my_percpu: &PercpuBlock) {
+        let my_prio = my_percpu.current_prio.get() as u32;
+        let mut current_holder_cpu = self.holder_cpu.load(Ordering::Relaxed);
+
+        for _ in 0..MAX_PI_CHAIN_DEPTH {
+            if current_holder_cpu == u32::MAX {
+                return;
+            }
+            let holder_percpu = crate::percpu::get_for_cpu(
+                crate::cpu_set::LogicalCpuId::new(current_holder_cpu),
+            );
+            let Some(holder) = holder_percpu else {
+                return;
+            };
+
+            // Donate if our priority is higher (lower number) than the current
+            // donation. `fetch_min` does the compare and the update as a single
+            // atomic step, so a racing waiter with a weaker priority can never
+            // overwrite a stronger donation that landed in between.
+            holder.pi_donated_prio.fetch_min(my_prio, Ordering::Release);
+
+            // Follow the chain: is this holder also waiting on another lock?
+            let next_lock_ptr = holder.waiting_on_lock.load(Ordering::Relaxed);
+            if next_lock_ptr.is_null() {
+                return;
+            }
+            // SAFETY: The pointed-to McsRawLock is a long-lived struct field
+            // (e.g., part of the run queue). The holder is currently spinning
+            // in acquire(), so the pointer is valid. We only read holder_cpu
+            // (an atomic u32) — no mutable access needed.
+            let next_holder_cpu =
+                unsafe { (*next_lock_ptr).holder_cpu.load(Ordering::Relaxed) };
+
+            // Cycle detection: if the next holder is the same CPU we just visited, stop
+            if next_holder_cpu == current_holder_cpu {
+                return;
+            }
+            current_holder_cpu = next_holder_cpu;
+        }
+        // Chain depth exhausted — stop to bound latency
+    }
+}
@@ -1,5 +1,6 @@
 pub use self::{ordered::*, wait_condition::WaitCondition, wait_queue::WaitQueue};

+pub mod mcs;
 pub mod ordered;
 pub mod wait_condition;
 pub mod wait_queue;
@@ -52,7 +52,9 @@
 //! *g1 = 12;
 //! ```
 use alloc::sync::Arc;
+use core::cell::UnsafeCell;
 use core::marker::PhantomData;
+use core::ptr;

 use crate::percpu::PercpuBlock;

@@ -732,3 +734,143 @@ impl<L: Level, T> Drop for ArcRwLockWriteGuard<L, T> {
 /// This function can only be called if no lock is held by the calling thread/task
 #[inline]
 pub fn check_no_locks(_: LockToken<'_, L0>) {}
+
+// ---------------------------------------------------------------------------
+// MCS-based fair mutex (McsMutex)
+// ---------------------------------------------------------------------------
+
+/// A mutual exclusion lock using the MCS fair spinlock algorithm.
+///
+/// Unlike `Mutex<L, T>` which uses a simple spinlock (no fairness under
+/// contention), `McsMutex` uses Mellor-Crummey Scott queue-based spinning:
+///
+/// - Each waiter spins on its **own** local flag — no shared cache-line bouncing.
+/// - FIFO ordering prevents starvation.
+/// - O(1) cache-line transfers on unlock.
+///
+/// The MCS node is stored in [`crate::percpu::PercpuBlock::mcs_sched_node`], so
+/// this type is suitable for scheduler-internal locks where the holder is always
+/// the current CPU.
+pub struct McsMutex<L: Level, T> {
+    raw: crate::sync::mcs::McsRawLock,
+    data: UnsafeCell<T>,
+    _phantom: PhantomData<L>,
+}
+
+unsafe impl<L: Level, T: Send> Sync for McsMutex<L, T> {}
+unsafe impl<L: Level, T: Send> Send for McsMutex<L, T> {}
+
+impl<L: Level, T> McsMutex<L, T> {
+    pub const fn new(val: T) -> Self {
+        Self {
+            raw: crate::sync::mcs::McsRawLock::new(),
+            data: UnsafeCell::new(val),
+            _phantom: PhantomData,
+        }
+    }
+}
+
+impl<L: Level, T> McsMutex<L, T> {
+    /// Spin until the lock is held, queueing on this CPU's `mcs_sched_node`.
+    /// NOTE(review): that one node per CPU is shared by every `McsMutex`, and
+    /// `acquire` resets it — confirm no CPU ever nests two of these locks.
+    pub fn lock<'a, LP: Lower<L> + 'a>(
+        &'a self,
+        lock_token: LockToken<'a, LP>,
+    ) -> McsMutexGuard<'a, L, T> {
+        let percpu = PercpuBlock::current();
+        if self.raw.acquire(&percpu.mcs_sched_node) {
+            // acquire() returned true: the queue was not empty and we had to spin.
+            let waits = &percpu.mcs_contention_count;
+            waits.set(waits.get() + 1);
+        }
+        let lock_token = LockToken::downgraded(lock_token);
+        McsMutexGuard { lock: self, lock_token }
+    }
+
+    pub fn try_lock<'a, LP: Lower<L> + 'a>(
+        &'a self,
+        lock_token: LockToken<'a, LP>,
+    ) -> Option<McsMutexGuard<'a, L, T>> {
+        let percpu = PercpuBlock::current();
+        if self.raw.try_acquire(&percpu.mcs_sched_node) {
+            Some(McsMutexGuard {
+                lock: self,
+                lock_token: LockToken::downgraded(lock_token),
+            })
+        } else {
+            None
+        }
+    }
+}
+
+pub struct McsMutexGuard<'a, L: Level, T: 'a> {
+    lock: &'a McsMutex<L, T>,
+    lock_token: LockToken<'a, L>,
+}
+
+impl<'a, L: Level, T: 'a> McsMutexGuard<'a, L, T> {
+    pub fn token_split(&mut self) -> (&mut T, LockToken<'_, L>) {
+        unsafe { (&mut *self.lock.data.get(), self.lock_token.token()) }
+    }
+
+    pub fn into_split(self) -> (McsRawGuard<'a, L, T>, LockToken<'a, L>) {
+        let lock_ref = self.lock;
+        let token = unsafe { core::ptr::read(&self.lock_token) };
+        core::mem::forget(self);
+        (McsRawGuard { lock: lock_ref }, token)
+    }
+
+    pub fn from_split(raw: McsRawGuard<'a, L, T>, token: LockToken<'a, L>) -> Self {
+        let lock_ref = raw.lock;
+        core::mem::forget(raw);
+        Self {
+            lock: lock_ref,
+            lock_token: token,
+        }
+    }
+}
+
+impl<L: Level, T> core::ops::Deref for McsMutexGuard<'_, L, T> {
+    type Target = T;
+    fn deref(&self) -> &Self::Target {
+        unsafe { &*self.lock.data.get() }
+    }
+}
+
+impl<L: Level, T> core::ops::DerefMut for McsMutexGuard<'_, L, T> {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        unsafe { &mut *self.lock.data.get() }
+    }
+}
+
+impl<L: Level, T> Drop for McsMutexGuard<'_, L, T> {
+    fn drop(&mut self) {
+        let percpu = PercpuBlock::current();
+        self.lock.raw.release(&percpu.mcs_sched_node);
+    }
+}
+
+pub struct McsRawGuard<'a, L: Level, T: 'a> {
+    lock: &'a McsMutex<L, T>,
+}
+
+impl<L: Level, T> core::ops::Deref for McsRawGuard<'_, L, T> {
+    type Target = T;
+    fn deref(&self) -> &Self::Target {
+        unsafe { &*self.lock.data.get() }
+    }
+}
+
+impl<L: Level, T> core::ops::DerefMut for McsRawGuard<'_, L, T> {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        unsafe { &mut *self.lock.data.get() }
+    }
+}
+
+impl<L: Level, T> Drop for McsRawGuard<'_, L, T> {
+    fn drop(&mut self) {
+        let percpu = PercpuBlock::current();
+        self.lock.raw.release(&percpu.mcs_sched_node);
+    }
+}
@@ -2,7 +2,7 @@

 use core::num::NonZeroUsize;

-use alloc::{string::String, sync::Arc, vec::Vec};
+use alloc::{format, string::{String, ToString}, sync::Arc, vec::Vec};
 use redox_path::RedoxPath;

 use crate::{
@@ -12,9 +12,9 @@ use crate::{
        memory::{AddrSpace, GenericFlusher, Grant, PageSpan, TlbShootdownActions},
    },
    memory::{Page, VirtualAddress, PAGE_SIZE},
-    scheme::{self, FileHandle, KernelScheme, OpenResult, StrOrBytes},
+    scheme::{self, pipe, FileHandle, KernelScheme, OpenResult, SchemeExt, StrOrBytes},
    sync::{CleanLockToken, RwLock},
-    syscall::{data::Stat, error::*, flag::*},
+    syscall::{data::{GlobalSchemes, Stat}, error::*, flag::*},
 };

 use super::usercopy::{UserSlice, UserSliceRo, UserSliceRw, UserSliceWo};
@@ -45,7 +45,7 @@ pub fn file_op_generic_ext<T>(
        (file, desc)
    };

-    let scheme = scheme::get_scheme(token.token(), desc.scheme)?;
+    let scheme = desc.get_scheme(token)?;

    op(&*scheme, file.description, desc, token)
 }
@@ -62,55 +62,32 @@ pub fn copy_path_to_buf(raw_path: UserSliceRo, max_len: usize) -> Result<String>
 // TODO: Define elsewhere
 const PATH_MAX: usize = PAGE_SIZE;

-pub fn openat(
-    fh: FileHandle,
-    raw_path: UserSliceRo,
+/// FIFO lookup key: `/`-rooted paths as is, others qualified by scheme id and handle number.
+fn fifo_path_key(scheme_id: scheme::SchemeId, number: usize, path: &str) -> String {
+    match path.strip_prefix('/') {
+        Some(_) => path.to_string(),
+        None => format!("@fifo:{}:{}:{}", scheme_id.get(), number, path),
+    }
+}
+
+fn install_open_result(
+    scheme_id: scheme::SchemeId,
    flags: usize,
-    fcntl_flags: u32,
-    euid: u32,
-    egid: u32,
+    open_result: OpenResult,
    token: &mut CleanLockToken,
 ) -> Result<FileHandle> {
-    let path_buf = copy_path_to_buf(raw_path, PATH_MAX)?;
-
-    let (scheme_id, number) = {
-        let current_lock = context::current();
-        let mut current = current_lock.read(token.token());
-        let (context, mut token) = current.token_split();
-        let pipe = context.get_file(fh, &mut token).ok_or(Error::new(EBADF))?;
-        let desc = pipe.description.read(token.token());
-        (desc.scheme, desc.number)
-    };
-
-    let caller_ctx = context::current()
-        .read(token.token())
-        .caller_ctx()
-        .filter_uid_gid(euid, egid);
-
-    let new_description = {
-        let scheme = scheme::get_scheme(token.token(), scheme_id)?;
-
-        let res = scheme.kopenat(
-            number,
-            StrOrBytes::from_str(&path_buf),
-            flags,
-            fcntl_flags,
-            caller_ctx,
-            token,
-        );
-
-        match res? {
-            OpenResult::SchemeLocal(number, internal_flags) => {
-                Arc::new(RwLock::new(FileDescription {
-                    offset: 0,
-                    internal_flags,
-                    scheme: scheme_id,
-                    number,
-                    flags: (flags & !O_CLOEXEC) as u32,
-                }))
-            }
-            OpenResult::External(desc) => desc,
-        }
+    let new_description = match open_result {
+        OpenResult::SchemeLocal(number, internal_flags) => Arc::new(RwLock::new(
+            FileDescription::new(
+                scheme_id,
+                number,
+                0,
+                (flags & !O_CLOEXEC) as u32,
+                internal_flags,
+                token,
+            ),
+        )),
+        OpenResult::External(desc) => desc,
    };

    let current_lock = context::current();
@@ -126,6 +103,102 @@ pub fn openat(
        )
        .ok_or(Error::new(EMFILE))
 }
+
+/// Probe `path` with an `O_STAT` open; only `ENOENT` yields `Ok(false)`, any
+/// other error is returned. A scheme-local probe handle is closed again.
+fn path_exists_in_scheme(
+    scheme: &dyn KernelScheme,
+    number: usize,
+    path: &str,
+    caller_ctx: scheme::CallerCtx,
+    token: &mut CleanLockToken,
+) -> Result<bool> {
+    let probe = scheme.kopenat(number, StrOrBytes::from_str(path), O_STAT, 0, caller_ctx, token);
+    if let Ok(OpenResult::SchemeLocal(opened, _)) = probe {
+        let _ = scheme.close(opened, token);
+    }
+    probe
+        .map(|_| true)
+        .or_else(|err| if err.errno == ENOENT { Ok(false) } else { Err(err) })
+}
+
+/// Openat syscall, with named pipes from `pipe` checked before the scheme.
+///
+/// Order of resolution for `raw_path`, relative to the description behind `fh`:
+/// 1. a FIFO already registered under the path's key is opened directly;
+/// 2. `MODE_FIFO` together with `O_CREAT` registers a new FIFO, unless the
+///    scheme itself already has something at that path;
+/// 3. everything else is forwarded to the scheme's `kopenat`.
+///
+/// Returns `EBADF` for an unknown `fh` and `EEXIST` when creation was requested
+/// for a path that is already taken. `fcntl_flags` is only used in case 3.
+pub fn openat(
+    fh: FileHandle,
+    raw_path: UserSliceRo,
+    flags: usize,
+    fcntl_flags: u32,
+    euid: u32,
+    egid: u32,
+    token: &mut CleanLockToken,
+) -> Result<FileHandle> {
+    let path_buf = copy_path_to_buf(raw_path, PATH_MAX)?;
+
+    let desc = {
+        let current_lock = context::current();
+        let mut current = current_lock.read(token.token());
+        let (context, mut context_token) = current.token_split();
+        let pipe = context
+            .get_file(fh, &mut context_token)
+            .ok_or(Error::new(EBADF))?;
+        *pipe.description.read(context_token.token())
+    };
+    let scheme = desc.get_scheme(token)?;
+    let (scheme_id, number) = (desc.scheme, desc.number);
+
+    let caller_ctx = context::current()
+        .read(token.token())
+        .caller_ctx()
+        .filter_uid_gid(euid, egid);
+
+    let creating = flags & O_CREAT == O_CREAT;
+    let exclusive = flags & O_EXCL == O_EXCL;
+    let wants_fifo = flags & MODE_FIFO as usize == MODE_FIFO as usize;
+    let fifo_key = fifo_path_key(scheme_id, number, &path_buf);
+
+    // `Some(n)` when the request resolved to a named pipe with handle `n`.
+    let pipe_number = if pipe::named_pipe_exists(&fifo_key, token) {
+        // Both O_CREAT|O_EXCL and a FIFO-creating open must fail on an existing FIFO.
+        if creating && (exclusive || wants_fifo) {
+            return Err(Error::new(EEXIST));
+        }
+        let opened = pipe::open_named_pipe(&fifo_key, flags, token)?;
+        Some(opened.ok_or(Error::new(ENOENT))?)
+    } else if wants_fifo && creating {
+        if path_exists_in_scheme(&*scheme, number, &path_buf, caller_ctx, token)? {
+            return Err(Error::new(EEXIST));
+        }
+        let mode = u16::try_from(flags & 0o7777).map_err(|_| Error::new(EINVAL))?;
+        Some(pipe::create_named_pipe(&fifo_key, &path_buf, mode, flags, token)?)
+    } else {
+        None
+    };
+
+    let Some(pipe_number) = pipe_number else {
+        // Not a named pipe: let the scheme handle the open.
+        let open_result = scheme.kopenat(
+            number,
+            StrOrBytes::from_str(&path_buf),
+            flags,
+            fcntl_flags,
+            caller_ctx,
+            token,
+        )?;
+        return install_open_result(scheme_id, flags, open_result, token);
+    };
+
+    let pipe_result = OpenResult::SchemeLocal(pipe_number, InternalFlags::empty());
+    install_open_result(GlobalSchemes::Pipe.scheme_id(), flags, pipe_result, token)
+}
 /// Unlinkat syscall
 pub fn unlinkat(
    fh: FileHandle,
@@ -137,22 +210,27 @@ pub fn unlinkat(
 ) -> Result<()> {
    let path_buf = copy_path_to_buf(raw_path, PATH_MAX)?;

-    let (number, scheme_id) = {
+    let desc = {
        let current_lock = context::current();
        let mut current = current_lock.read(token.token());
-        let (context, mut token) = current.token_split();
-        let pipe = context.get_file(fh, &mut token).ok_or(Error::new(EBADF))?;
-        let desc = pipe.description.read(token.token());
-        (desc.number, desc.scheme)
+        let (context, mut context_token) = current.token_split();
+        let pipe = context
+            .get_file(fh, &mut context_token)
+            .ok_or(Error::new(EBADF))?;
+        *pipe.description.read(context_token.token())
    };
-
-    let scheme = scheme::get_scheme(token.token(), scheme_id)?;
+    let number = desc.number;
+    let scheme = desc.get_scheme(token)?;

    let caller_ctx = context::current()
        .read(token.token())
        .caller_ctx()
        .filter_uid_gid(euid, egid);

+    if pipe::unlink_named_pipe(&fifo_path_key(desc.scheme, number, &path_buf), token) {
+        return Ok(());
+    }
+
    /*
    let mut path_buf = BorrowedHtBuf::head()?;
    let path = path_buf.use_for_string(raw_path)?;
@@ -199,17 +277,18 @@ fn duplicate_file(
        let description = { *file.description.read(token.token()) };

        let new_description = {
-            let scheme = scheme::get_scheme(token.token(), description.scheme)?;
+            let scheme = description.get_scheme(token)?;

            match scheme.kdup(description.number, user_buf, caller_ctx, token)? {
                OpenResult::SchemeLocal(number, internal_flags) => {
-                    Arc::new(RwLock::new(FileDescription {
-                        offset: 0,
-                        internal_flags,
-                        scheme: description.scheme,
+                    Arc::new(RwLock::new(FileDescription::new(
+                        description.scheme,
                        number,
-                        flags: description.flags,
-                    }))
+                        0,
+                        description.flags,
+                        internal_flags,
+                        token,
+                    )))
                }
                OpenResult::External(desc) => desc,
            }
@@ -296,11 +375,10 @@ fn call_normal(
    }
    .ok_or(Error::new(EBADF))?;

-    let (scheme_id, number) = {
-        let desc = file.description.read(token.token());
-        (desc.scheme, desc.number)
+    let (scheme, number) = {
+        let desc = *file.description.read(token.token());
+        (desc.get_scheme(token)?, desc.number)
    };
-    let scheme = scheme::get_scheme(token.token(), scheme_id)?;

    if flags.contains(CallFlags::STD_FS) {
        scheme.translate_std_fs_call(number, file.description, payload, flags, metadata, token)
@@ -341,28 +419,28 @@ fn fdwrite_inner(
 ) -> Result<usize> {
    // TODO: Ensure deadlocks can't happen
    let (scheme, number, descs_to_send) = {
-        let (scheme, number) = {
+        let desc = {
            let current_lock = context::current();
            let mut current = current_lock.read(token.token());
-            let (context, mut token) = current.token_split();
+            let (context, mut context_token) = current.token_split();
            let file_descriptor = context
-                .get_file(socket, &mut token)
+                .get_file(socket, &mut context_token)
                .ok_or(Error::new(EBADF))?;
-            let desc = &file_descriptor.description.read(token.token());
-            (desc.scheme, desc.number)
+            *file_descriptor.description.read(context_token.token())
        };
-        let scheme = scheme::get_scheme(token.token(), scheme)?;
+        let scheme = desc.get_scheme(token)?;
+        let number = desc.number;

        let current_lock = context::current();
        let mut current = current_lock.read(token.token());
-        let (context, mut token) = current.token_split();
+        let (context, mut context_token) = current.token_split();
        (
            scheme,
            number,
            if flags.contains(CallFlags::FD_CLONE) {
-                context.bulk_get_files(&target_fds, &mut token)
+                context.bulk_get_files(&target_fds, &mut context_token)
            } else {
-                context.bulk_remove_files(&target_fds, &mut token)
+                context.bulk_remove_files(&target_fds, &mut context_token)
            }?
            .into_iter()
            .map(|f| f.description)
@@ -395,18 +473,22 @@ fn call_fdread(
    metadata: &[u64],
    token: &mut CleanLockToken,
 ) -> Result<usize> {
+    let desc = {
+        let current_lock = context::current();
+        let mut current = current_lock.read(token.token());
+        let (context, mut context_token) = current.token_split();
+        let file_descriptor = context
+            .get_file(fd, &mut context_token)
+            .ok_or(Error::new(EBADF))?;
+        *file_descriptor.description.read(context_token.token())
+    };
    let (scheme, number) = {
-        let (scheme, number) = {
-            let current_lock = context::current();
-            let mut current = current_lock.read(token.token());
-            let (context, mut token) = current.token_split();
-            let file_descriptor = context.get_file(fd, &mut token).ok_or(Error::new(EBADF))?;
-            let desc = file_descriptor.description.read(token.token());
-            (desc.scheme, desc.number)
-        };
-        let scheme = scheme::get_scheme(token.token(), scheme)?;
-
-        (scheme, number)
+        let scheme = desc.get_scheme(token)?;
+        let number = desc.number;
+        (
+            scheme,
+            number,
+        )
    };

    scheme.kfdread(number, payload, flags, metadata, token)
@@ -440,9 +522,9 @@ pub fn fcntl(fd: FileHandle, cmd: usize, arg: usize, token: &mut CleanLockToken)
    }
    .ok_or(Error::new(EBADF))?;

-    let (scheme_id, number, flags) = {
-        let desc = file.description.write(token.token());
-        (desc.scheme, desc.number, desc.flags)
+    let (number, flags, desc) = {
+        let desc = *file.description.read(token.token());
+        (desc.number, desc.flags, desc)
    };

    if cmd == F_DUPFD || cmd == F_DUPFD_CLOEXEC {
@@ -460,7 +542,7 @@ pub fn fcntl(fd: FileHandle, cmd: usize, arg: usize, token: &mut CleanLockToken)

    // Communicate fcntl with scheme
    if cmd != F_GETFD && cmd != F_SETFD {
-        let scheme = scheme::get_scheme(token.token(), scheme_id)?;
+        let scheme = desc.get_scheme(token)?;

        scheme.fcntl(number, cmd, arg, token)?;
    };
@@ -518,13 +600,11 @@ pub fn flink(fd: FileHandle, raw_path: UserSliceRo, token: &mut CleanLockToken)
    let path = RedoxPath::from_absolute(&path_buf).ok_or(Error::new(EINVAL))?;
    let (_, reference) = path.as_parts().ok_or(Error::new(EINVAL))?;

-    let (number, scheme_id) = {
-        let desc = file.description.read(token.token());
-        (desc.number, desc.scheme)
+    let (number, scheme) = {
+        let desc = *file.description.read(token.token());
+        (desc.number, desc.get_scheme(token)?)
    };

-    let scheme = scheme::get_scheme(token.token(), scheme_id)?;
-
    // TODO: Check EXDEV.
    /*
    if scheme_id != description.scheme {
@@ -554,13 +634,11 @@ pub fn frename(fd: FileHandle, raw_path: UserSliceRo, token: &mut CleanLockToken
    let path = RedoxPath::from_absolute(&path_buf).ok_or(Error::new(EINVAL))?;
    let (_, reference) = path.as_parts().ok_or(Error::new(EINVAL))?;

-    let (number, scheme_id) = {
-        let desc = file.description.read(token.token());
-        (desc.number, desc.scheme)
+    let (number, scheme) = {
+        let desc = *file.description.read(token.token());
+        (desc.number, desc.get_scheme(token)?)
    };

-    let scheme = scheme::get_scheme(token.token(), scheme_id)?;
-
    // TODO: Check EXDEV.
    /*
    if scheme_id != description.scheme {
@@ -28,6 +28,11 @@ use crate::{
    sync::CleanLockToken,
 };

+/// Local syscall numbers not yet in the redox_syscall crate.
+/// These are allocated from the 987+ range to avoid collisions with crate numbers.
+pub const SYS_SCHED_SETAFFINITY: usize = 987;
+pub const SYS_SCHED_GETAFFINITY: usize = 988;
+
 /// Debug
 pub mod debug;

@@ -220,6 +225,10 @@ pub fn syscall(
                unlinkat(fd, UserSlice::ro(c, d)?, e, f as _, g as _, token).map(|()| 0)
            }
            SYS_YIELD => sched_yield(token).map(|()| 0),
+
+            // P17-3: CPU affinity syscalls. Numbers allocated locally (not yet in redox_syscall crate).
+            SYS_SCHED_SETAFFINITY => sched_setaffinity(b, UserSlice::ro(c, d)?, token),
+            SYS_SCHED_GETAFFINITY => sched_getaffinity(b, UserSlice::wo(c, d)?, token),
            SYS_NANOSLEEP => nanosleep(
                UserSlice::ro(b, size_of::<TimeSpec>())?,
                UserSlice::wo(c, size_of::<TimeSpec>())?.none_if_null(),
@@ -11,6 +11,7 @@ use crate::{
        memory::{AddrSpace, Grant, PageSpan},
        ContextRef,
    },
+    cpu_set::RawMask,
    event,
    sync::{CleanLockToken, RwLock},
    syscall::flag::{EventFlags, O_CREAT, O_RDWR},
@@ -271,24 +272,95 @@ unsafe fn bootstrap_mem(bootstrap: &crate::startup::Bootstrap) -> &'static [u8]
 }

 fn insert_fd(scheme: SchemeId, number: usize, cloexec: bool, token: &mut CleanLockToken) -> usize {
+    let description = Arc::new(RwLock::new(FileDescription::new(
+        scheme,
+        number,
+        0,
+        (O_CREAT | O_RDWR) as u32,
+        InternalFlags::empty(),
+        token,
+    )));
+
    let current_lock = context::current();
    let mut current = current_lock.read(token.token());
-    let (context, mut token) = current.token_split();
+    let (context, mut context_token) = current.token_split();
    context
        .add_file_min(
            FileDescriptor {
-                description: Arc::new(RwLock::new(FileDescription {
-                    scheme,
-                    number,
-                    offset: 0,
-                    flags: (O_CREAT | O_RDWR) as u32,
-                    internal_flags: InternalFlags::empty(),
-                })),
+                description,
                cloexec,
            },
            syscall::flag::UPPER_FDTBL_TAG + scheme.get(),
-            &mut token,
+            &mut context_token,
        )
        .expect("failed to insert fd to current context")
        .get()
 }
+
+/// Set the CPU affinity mask of a process (syscall handler).
+///
+/// # Arguments (syscall ABI)
+/// - `pid`: Process ID; only 0 (the current process) is accepted, any other
+///   value is rejected with `ESRCH`
+/// - `mask_ptr`: User slice holding a `RawMask`; its length must equal
+///   `size_of::<RawMask>()`, otherwise `EINVAL` is returned
+///
+/// Returns `Ok(0)` once the mask has been applied.
+pub fn sched_setaffinity(
+    pid: usize,
+    mask_ptr: super::usercopy::UserSliceRo,
+    token: &mut CleanLockToken,
+) -> Result<usize> {
+    let expected_len = core::mem::size_of::<RawMask>();
+    if mask_ptr.len() != expected_len {
+        return Err(Error::new(super::error::EINVAL));
+    }
+
+    // TODO: Support PID-based lookup (requires context list iteration
+    // with lock token downgrades). For now, only pid=0 is supported.
+    if pid != 0 {
+        return Err(Error::new(super::error::ESRCH));
+    }
+    let target = context::current();
+
+    // Copy the requested mask in from userspace before locking the context.
+    let requested: RawMask = unsafe { mask_ptr.read_exact() }?;
+
+    let mut guard = target.write(token.token());
+    guard.sched_affinity.override_from(&requested);
+
+    Ok(0)
+}
+
+/// Get CPU affinity mask for a process.
+///
+/// # Arguments (syscall ABI)
+/// - `pid`: Process ID (0 = current process; other PIDs fail with `ESRCH`)
+/// - `mask_ptr`: User buffer receiving a `RawMask`; its length must equal
+///   `size_of::<RawMask>()`, otherwise `EINVAL` is returned
+///
+/// # Returns
+/// Number of bytes written to mask_ptr on success.
+pub fn sched_getaffinity(
+    pid: usize,
+    mask_ptr: super::usercopy::UserSliceWo,
+    token: &mut CleanLockToken,
+) -> Result<usize> {
+    // Validate mask size
+    if mask_ptr.len() != core::mem::size_of::<RawMask>() {
+        return Err(Error::new(super::error::EINVAL));
+    }
+
+    // pid == 0 means current process
+    let target = if pid == 0 {
+        context::current()
+    } else {
+        return Err(Error::new(super::error::ESRCH));
+    };
+
+    // Snapshot the mask so the context lock is dropped before the usercopy.
+    let raw_mask = target.read(token.token()).sched_affinity.to_raw();
+    mask_ptr.copy_common_bytes_from_slice(crate::cpu_set::mask_as_bytes(&raw_mask))?;
+
+    Ok(core::mem::size_of::<RawMask>())
+}
@@ -0,0 +1,112 @@
+
+#####################################################
+#                                                   #
+#       THIS FILE IS GENERATED, DO NOT EDIT!        #
+#                                                   #
+# Generated with "ci-fairy generate-template", edit #
+# .gitlab-ci/ci.template and .gitlab-ci/config.yml  #
+# and rerun "ci-fairy generate-template" to change  #
+# this file.                                        #
+#                                                   #
+#####################################################
+
+.templates_sha: &template_sha 3d03cccd770c04e63b40325b42223495274d6a1d
+
+include:
+  - project: 'freedesktop/ci-templates'
+    ref: *template_sha
+    file:
+      - '/templates/ci-fairy.yml'
+      - '/templates/fedora.yml'
+  - template: Security/SAST.gitlab-ci.yml
+
+stages:
+  - sanity check
+  - prep
+  - build
+  - test
+
+variables:
+  FDO_UPSTREAM_REPO: xorg/lib/libxcvt
+  MESON_BUILDDIR: "builddir"
+  NINJA_ARGS: ''
+  MESON_ARGS: ''
+  MESON_TEST_ARGS: ''
+  GIT_DEPTH: 1
+
+.policy:
+  retry:
+    max: 2
+    when:
+      - runner_system_failure
+      - stuck_or_timeout_failure
+  # cancel run when a newer version is pushed to the branch
+  interruptible: true
+
+
+# Re-generate the CI script and make sure it's the one currently checked in
+# If this job fails, re-generate the gitlab-ci.yml script, see
+# $SRCDIR/.gitlab-ci/generate-gitlab-ci.py
+#
+check-ci-script:
+  extends:
+    - .fdo.ci-fairy
+  stage: sanity check
+  script:
+    - ci-fairy generate-template --verify && exit 0 || true
+    - echo "Committed gitlab-ci.yml differs from generated gitlab-ci.yml. Please verify"
+    - exit 1
+
+#
+# Verify that commit messages are as expected, signed-off, etc.
+#
+check-commit:
+  extends:
+    - .fdo.ci-fairy
+  stage: sanity check
+  script:
+    - ci-fairy check-commits --signed-off-by --junit-xml=results.xml
+  except:
+    - master@xorg/lib/libxcvt
+  variables:
+    GIT_DEPTH: 100
+  artifacts:
+    reports:
+      junit: results.xml
+
+#
+# Verify that merge request has the "allow collaboration" checkbox ticked
+#
+check-merge-request:
+  extends:
+    - .fdo.ci-fairy
+  stage: sanity check
+  script:
+    - ci-fairy check-merge-request --require-allow-collaboration --junit-xml=results.xml
+  artifacts:
+    when: on_failure
+    reports:
+      junit: results.xml
+  allow_failure: true
+
+
+.fedora.34:
+  variables:
+    FDO_DISTRIBUTION_VERSION: '34'
+    FDO_DISTRIBUTION_TAG: '2022-08-03.0'
+
+prep-fedora-34:
+  extends:
+    - .fdo.container-build@fedora
+    - .fedora.34
+  stage: prep
+  variables:
+    FDO_DISTRIBUTION_PACKAGES: "meson gcc"
+
+build-fedora-34:
+  extends:
+    - .fdo.distribution-image@fedora
+    - .fedora.34
+  stage: build
+  script:
+    - .gitlab-ci/meson-build.sh
@@ -0,0 +1,118 @@
+{# You're looking at the template here, so you can ignore the below
+   warning. This is the right file to edit #}
+
+#####################################################
+#                                                   #
+#       THIS FILE IS GENERATED, DO NOT EDIT!        #
+#                                                   #
+# Generated with "ci-fairy generate-template", edit #
+# .gitlab-ci/ci.template and .gitlab-ci/config.yml  #
+# and rerun "ci-fairy generate-template" to change  #
+# this file.                                        #
+#                                                   #
+#####################################################
+
+.templates_sha: &template_sha 3d03cccd770c04e63b40325b42223495274d6a1d
+
+include:
+  - project: 'freedesktop/ci-templates'
+    ref: *template_sha
+    file:
+      - '/templates/ci-fairy.yml'
+    {% for d in distributions %}
+      - '/templates/{{d.name}}.yml'
+    {% endfor %}
+  - template: Security/SAST.gitlab-ci.yml
+
+stages:
+  - sanity check
+  - prep
+  - build
+  - test
+
+variables:
+  FDO_UPSTREAM_REPO: xorg/lib/libxcvt
+  MESON_BUILDDIR: "builddir"
+  NINJA_ARGS: ''
+  MESON_ARGS: ''
+  MESON_TEST_ARGS: ''
+  GIT_DEPTH: 1
+
+.policy:
+  retry:
+    max: 2
+    when:
+      - runner_system_failure
+      - stuck_or_timeout_failure
+  # cancel run when a newer version is pushed to the branch
+  interruptible: true
+
+
+# Re-generate the CI script and make sure it's the one currently checked in
+# If this job fails, re-generate the gitlab-ci.yml script, see
+# $SRCDIR/.gitlab-ci/generate-gitlab-ci.py
+#
+check-ci-script:
+  extends:
+    - .fdo.ci-fairy
+  stage: sanity check
+  script:
+    - ci-fairy generate-template --verify && exit 0 || true
+    - echo "Committed gitlab-ci.yml differs from generated gitlab-ci.yml. Please verify"
+    - exit 1
+
+#
+# Verify that commit messages are as expected, signed-off, etc.
+#
+check-commit:
+  extends:
+    - .fdo.ci-fairy
+  stage: sanity check
+  script:
+    - ci-fairy check-commits --signed-off-by --junit-xml=results.xml
+  except:
+    - master@xorg/lib/libxcvt
+  variables:
+    GIT_DEPTH: 100
+  artifacts:
+    reports:
+      junit: results.xml
+
+#
+# Verify that merge request has the "allow collaboration" checkbox ticked
+#
+check-merge-request:
+  extends:
+    - .fdo.ci-fairy
+  stage: sanity check
+  script:
+    - ci-fairy check-merge-request --require-allow-collaboration --junit-xml=results.xml
+  artifacts:
+    when: on_failure
+    reports:
+      junit: results.xml
+  allow_failure: true
+
+{% for d in distributions %}
+
+.{{d.name}}.{{d.version}}:
+  variables:
+    FDO_DISTRIBUTION_VERSION: '{{d.version}}'
+    FDO_DISTRIBUTION_TAG: '{{d.tag}}'
+
+prep-{{d.name}}-{{d.version}}:
+  extends:
+    - .fdo.container-build@{{d.name}}
+    - .{{d.name}}.{{d.version}}
+  stage: prep
+  variables:
+    FDO_DISTRIBUTION_PACKAGES: "{{' '.join(d.packages)}}"
+
+build-{{d.name}}-{{d.version}}:
+  extends:
+    - .fdo.distribution-image@{{d.name}}
+    - .{{d.name}}.{{d.version}}
+  stage: build
+  script:
+    - .gitlab-ci/meson-build.sh
+{% endfor %}
@@ -0,0 +1,9 @@
+.default_tag: &default_tag '2022-08-03.0'
+
+distributions:
+  - name: fedora
+    tag: *default_tag
+    version: 34
+    packages:
+      - meson
+      - gcc
@@ -0,0 +1,48 @@
+#!/bin/bash
+
+if [[ -f .meson_environment ]]; then
+	. .meson_environment
+fi
+
+if [[ -z "$MESON_BUILDDIR" ]]; then
+	echo "\$MESON_BUILDDIR undefined."
+	exit 1
+fi
+
+# emulate a few gitlab variables to make it easier to
+# run and debug locally.
+if [[ -z "$CI_JOB_ID" ]] || [[ -z "$CI_JOB_NAME" ]] || [[ -z "$CI_PROJECT_NAME" ]]; then
+	echo "Missing \$CI_JOB_ID or \$CI_JOB_NAME".
+	CI_PROJECT_NAME=$(basename $PWD)
+	CI_JOB_ID=$(date +%s)
+	CI_JOB_NAME='${CI_PROJECT_NAME}-job-local'
+	echo "Simulating gitlab environment: "
+	echo " CI_JOB_ID=$CI_JOB_ID"
+	echo " CI_JOB_NAME=$CI_JOB_NAME"
+	echo " CI_PROJECT_NAME=$CI_PROJECT_NAME"
+fi
+
+
+echo "*************************************************"
+echo "builddir: $MESON_BUILDDIR"
+echo "meson args: $MESON_ARGS"
+echo "ninja args: $NINJA_ARGS"
+echo "meson test args: $MESON_TEST_ARGS"
+echo "*************************************************"
+
+set -e
+
+rm -rf "$MESON_BUILDDIR"
+meson "$MESON_BUILDDIR" $MESON_ARGS
+meson configure "$MESON_BUILDDIR"
+ninja -C "$MESON_BUILDDIR" $NINJA_ARGS
+
+if [[ -z "$MESON_TEST_ARGS" ]]; then
+    exit 0
+fi
+
+# we still want to generate the reports, even if meson test fails
+meson test -C "$MESON_BUILDDIR" $MESON_TEST_ARGS --print-errorlogs
+exit_code=$?
+
+exit $exit_code
@@ -0,0 +1,67 @@
+Copyright 2005-2006 Luc Verhaegen.
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the "Software"),
+to deal in the Software without restriction, including without limitation
+the rights to use, copy, modify, merge, publish, distribute, sublicense,
+and/or sell copies of the Software, and to permit persons to whom the
+Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+OTHER DEALINGS IN THE SOFTWARE.
+
+
+
+Copyright 2005-2006 Luc Verhaegen.
+Copyright © 2021 Red Hat, Inc.
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the "Software"),
+to deal in the Software without restriction, including without limitation
+the rights to use, copy, modify, merge, publish, distribute, sublicense,
+and/or sell copies of the Software, and to permit persons to whom the
+Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+OTHER DEALINGS IN THE SOFTWARE.
+
+
+
+Copyright © 2000 Compaq Computer Corporation
+Copyright © 2002 Hewlett Packard Company
+Copyright © 2006 Intel Corporation
+Copyright © 2008, 2021 Red Hat, Inc.
+
+Permission to use, copy, modify, distribute, and sell this software and its
+documentation for any purpose is hereby granted without fee, provided that
+the above copyright notice appear in all copies and that both that copyright
+notice and this permission notice appear in supporting documentation, and
+that the name of the copyright holders not be used in advertising or
+publicity pertaining to distribution of the software without specific,
+written prior permission.  The copyright holders make no representations
+about the suitability of this software for any purpose.  It is provided "as
+is" without express or implied warranty.
+
+THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+OF THIS SOFTWARE.
@@ -0,0 +1,36 @@
+libxcvt
+=======
+
+`libxcvt` is a library providing a standalone version of the X server
+implementation of the VESA CVT standard timing modelines generator.
+
+`libxcvt` also provides a standalone version of the command line tool
+`cvt`, copied from the Xorg implementation, which is meant to be a drop-in
+replacement for the version provided by the `Xorg` server.
+
+An example output is:
+
+```
+$ cvt --verbose 1920 1200 75
+# 1920x1200 74.93 Hz (CVT 2.30MA) hsync: 94.04 kHz; pclk: 245.25 MHz
+Modeline "1920x1200_75.00"  245.25  1920 2064 2264 2608  1200 1203 1209 1255 -hsync +vsync
+```
+
+Building
+========
+
+`libxcvt` is built using [Meson](https://mesonbuild.com/):
+
+	$ git clone https://gitlab.freedesktop.org/xorg/lib/libxcvt.git
+	$ cd libxcvt
+	$ meson build/ --prefix=...
+	$ ninja -C build/ install
+	$ cd ..
+
+Credit
+======
+
+The code base of `libxcvt` is identical to `xf86CVTMode()`; therefore,
+all credit for `libxcvt` goes to the author (Luc Verhaegen) and
+contributors of `xf86CVTMode()` and the `cvt` utility as found in the
+[xserver](https://gitlab.freedesktop.org/xorg/xserver/) repository.
@@ -0,0 +1,257 @@
+/*
+ * Copyright 2005-2006 Luc Verhaegen.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+/* Standalone VESA CVT standard timing modelines generator. */
+
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+
+#include <libxcvt/libxcvt.h>
+
+/*
+ * Report whether the mode uses a CVT standard aspect ratio and refresh rate.
+ */
+static bool
+cvt_is_standard(int hdisplay, int vdisplay, float vrefresh, bool reduced, bool verbose)
+{
+    /* Aspect ratios accepted here: 4:3, 16:9, 16:10, 5:4 and 15:9. */
+    const bool aspect_ok =
+        ((vdisplay % 3) == 0 && (vdisplay * 4 / 3) == hdisplay) ||
+        ((vdisplay % 9) == 0 && (vdisplay * 16 / 9) == hdisplay) ||
+        ((vdisplay % 10) == 0 && (vdisplay * 16 / 10) == hdisplay) ||
+        ((vdisplay % 4) == 0 && (vdisplay * 5 / 4) == hdisplay) ||
+        ((vdisplay % 9) == 0 && (vdisplay * 15 / 9) == hdisplay);
+    /* Refresh rates accepted here: exactly 50, 60, 75 or 85 Hz. */
+    const bool refresh_ok = (vrefresh == 50.0) || (vrefresh == 60.0) ||
+                            (vrefresh == 75.0) || (vrefresh == 85.0);
+
+    if (verbose && !aspect_ok)
+        fprintf(stderr, "Warning: Aspect Ratio is not CVT standard.\n");
+
+    if (verbose && !refresh_ok)
+        fprintf(stderr, "Warning: Refresh Rate %.2f is not CVT standard "
+                "(50, 60, 75 or 85Hz).\n", vrefresh);
+
+    return aspect_ok && refresh_ok;
+}
+/*
+ * I'm not documenting --interlaced for obvious reasons, even though I did
+ * implement it. I also can't deny having looked at gtf here.
+ */
+static void
+print_usage(char *Name)
+{
+    /* The whole help text goes to stderr; Name fills in the program name. */
+    fputs("\n", stderr);
+    fprintf(stderr, "usage: %s [-v|--verbose] [-r|--reduced] X Y [refresh]\n",
+            Name);
+    fputs("\n", stderr);
+    fputs(" -v|--verbose : Warn about CVT standard adherence.\n", stderr);
+    fputs(" -r|--reduced : Create a mode with reduced blanking "
+          "(default: normal blanking).\n", stderr);
+    fputs("            X : Desired horizontal resolution "
+          "(multiple of 8, required).\n", stderr);
+    fputs("            Y : Desired vertical resolution (required).\n", stderr);
+    fputs("      refresh : Desired refresh rate (default: 60.0Hz).\n", stderr);
+    fputs("\n", stderr);
+
+    /* Closing one-line description of the tool. */
+    fputs("Calculates VESA CVT (Coordinated Video Timing) modelines"
+          " for use with X.\n", stderr);
+}
+
+/*
+ * Print the "# WxH rate Hz (CVT ...) hsync: ...; pclk: ..." comment line.
+ */
+static void
+print_comment(struct libxcvt_mode_info *mode_info, bool is_cvt, bool reduced)
+{
+    const char *aspect = "";
+
+    printf("# %dx%d %.2f Hz ", mode_info->hdisplay, mode_info->vdisplay, mode_info->vrefresh);
+
+    if (!is_cvt) {
+        printf("(CVT) ");
+    }
+    else {
+        printf("(CVT %.2fM",
+               ((float) mode_info->hdisplay * mode_info->vdisplay) / 1000000.0);
+        if (!(mode_info->vdisplay % 3) &&
+            ((mode_info->vdisplay * 4 / 3) == mode_info->hdisplay))
+            aspect = "3";
+        else if (!(mode_info->vdisplay % 9) &&
+                 ((mode_info->vdisplay * 16 / 9) == mode_info->hdisplay))
+            aspect = "9";
+        else if (!(mode_info->vdisplay % 10) &&
+                 ((mode_info->vdisplay * 16 / 10) == mode_info->hdisplay))
+            aspect = "A";
+        else if (!(mode_info->vdisplay % 4) &&
+                 ((mode_info->vdisplay * 5 / 4) == mode_info->hdisplay))
+            aspect = "4";
+        else if (!(mode_info->vdisplay % 9) &&
+                 ((mode_info->vdisplay * 15 / 9) == mode_info->hdisplay))
+            aspect = "9";
+        fputs(aspect, stdout);
+        if (reduced)
+            printf("-R");
+        printf(") ");
+    }
+
+    printf("hsync: %.2f kHz; ", mode_info->hsync);
+    printf("pclk: %.2f MHz", ((float) mode_info->dot_clock) / 1000.0);
+    printf("\n");
+}
+
+/*
+ * Originally grabbed from xf86Mode.c.
+ *
+ * Ignoring the actual mode_info->name, as the user will want something solid
+ * to grab hold of.
+ */
+static void
+print_mode_line(struct libxcvt_mode_info *mode_info, int hdisplay, int vdisplay, float vrefresh,
+              bool reduced)
+{
+    static const struct { int flag; const char *suffix; } suffixes[] = {
+        { LIBXCVT_MODE_FLAG_INTERLACE, " interlace" },
+        { LIBXCVT_MODE_FLAG_HSYNC_POSITIVE, " +hsync" },
+        { LIBXCVT_MODE_FLAG_HSYNC_NEGATIVE, " -hsync" },
+        { LIBXCVT_MODE_FLAG_VSYNC_POSITIVE, " +vsync" },
+        { LIBXCVT_MODE_FLAG_VSYNC_NEGATIVE, " -vsync" },
+    };
+    size_t i;
+
+    if (!reduced)
+        printf("Modeline \"%dx%d_%.2f\"  ", hdisplay, vdisplay, vrefresh);
+    else
+        printf("Modeline \"%dx%dR\"  ", hdisplay, vdisplay);
+    printf("%6.2f  %i %i %i %i  %i %i %i %i", mode_info->dot_clock / 1000.,
+           mode_info->hdisplay, mode_info->hsync_start, mode_info->hsync_end, mode_info->htotal,
+           mode_info->vdisplay, mode_info->vsync_start, mode_info->vsync_end, mode_info->vtotal);
+
+    for (i = 0; i < sizeof(suffixes) / sizeof(suffixes[0]); i++)
+        if (mode_info->mode_flags & suffixes[i].flag)
+            fputs(suffixes[i].suffix, stdout);
+    printf("\n");
+}
+
+/*
+ * Parse the command line, print the CVT comment and modeline; returns 1 on error.
+ */
+int
+main(int argc, char *argv[])
+{
+    struct libxcvt_mode_info *mode_info;
+    int hdisplay = 0, vdisplay = 0;
+    float vrefresh = 0.0;
+    bool reduced = false, verbose = false, is_cvt;
+    bool interlaced = false;
+    int n;
+
+    if ((argc < 3) || (argc > 7)) {
+        print_usage(argv[0]);
+        return 1;
+    }
+
+    /* Every argument that is not a known flag is taken positionally as X, Y
+     * and refresh. Unknown flags reach atoi/atof too; they parse to 0 or to a
+     * negative number and are rejected below with the usage text.
+     */
+    for (n = 1; n < argc; n++) {
+        if (!strcmp(argv[n], "-r") || !strcmp(argv[n], "--reduced"))
+            reduced = true;
+        else if (!strcmp(argv[n], "-i") || !strcmp(argv[n], "--interlaced"))
+            interlaced = true;
+        else if (!strcmp(argv[n], "-v") || !strcmp(argv[n], "--verbose"))
+            verbose = true;
+        else if (!strcmp(argv[n], "-h") || !strcmp(argv[n], "--help")) {
+            print_usage(argv[0]);
+            return 0;
+        }
+        else if (!hdisplay) {
+            hdisplay = atoi(argv[n]);
+            if (hdisplay <= 0) {
+                print_usage(argv[0]);
+                return 1;
+            }
+        }
+        else if (!vdisplay) {
+            vdisplay = atoi(argv[n]);
+            if (vdisplay <= 0) {
+                print_usage(argv[0]);
+                return 1;
+            }
+        }
+        else if (!vrefresh) {
+            vrefresh = atof(argv[n]);
+            if (!(vrefresh > 0.0)) {
+                print_usage(argv[0]);
+                return 1;
+            }
+        }
+        else {
+            print_usage(argv[0]);
+            return 1;
+        }
+    }
+
+    if (!hdisplay || !vdisplay) {
+        print_usage(argv[0]);
+        return 1;
+    }
+
+    /* Default to 60.0Hz */
+    if (!vrefresh)
+        vrefresh = 60.0;
+
+    /* Horizontal timing is always a multiple of 8: round up. */
+    if (hdisplay & 0x07) {
+        hdisplay &= ~0x07;
+        hdisplay += 8;
+    }
+
+    if (reduced) {
+        if ((vrefresh / 60.0) != floor(vrefresh / 60.0)) {
+            fprintf(stderr,
+                    "\nERROR: Multiple of 60Hz refresh rate required for "
+                    " reduced blanking.\n");
+            print_usage(argv[0]);
+            return 1;
+        }
+    }
+
+    mode_info = libxcvt_gen_mode_info(hdisplay, vdisplay, vrefresh, reduced, interlaced);
+    if (!mode_info) {
+        fprintf(stderr, "Out of memory!\n");
+        return 1;
+    }
+
+    is_cvt = cvt_is_standard(hdisplay, vdisplay, vrefresh, reduced, verbose);
+    print_comment(mode_info, is_cvt, reduced);
+    print_mode_line(mode_info, hdisplay, vdisplay, vrefresh, reduced);
+    free(mode_info);
+
+    return 0;
+}
@@ -0,0 +1,10 @@
+cvt_src = [  # sources of the cvt command-line tool
+	'cvt.c',
+]
+
+executable('cvt',
+	   cvt_src,
+	   include_directories : inc,  # 'inc' comes from the top-level meson.build
+	   link_with : libxcvt,  # the shared_library('xcvt', ...) target
+	   dependencies: mdep,  # optional libm; cvt.c calls floor()
+	   install : true)
@@ -0,0 +1,46 @@
+/*
+ * Copyright 2005-2006 Luc Verhaegen.
+ * Copyright © 2021 Red Hat, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef _LIBCVT_H_
+#define _LIBCVT_H_
+
+#include <stdbool.h>
+#include <libxcvt/libxcvt_mode.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct libxcvt_mode_info *                /* calloc()ed mode, release with free(); NULL if the allocation fails */
+libxcvt_gen_mode_info(int   hdisplay,     /* requested width in pixels; rounded down to a multiple of 8 */
+                      int   vdisplay,     /* requested height in lines */
+                      float vrefresh,     /* refresh rate in Hz; 0 selects the 60.0Hz default */
+                      bool  reduced,      /* true: use the reduced-blanking timings */
+                      bool  interlaced);  /* true: generate an interlaced mode */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _LIBCVT_H_ */
@@ -0,0 +1,56 @@
+/*
+ * Copyright © 2000 Compaq Computer Corporation
+ * Copyright © 2002 Hewlett Packard Company
+ * Copyright © 2006 Intel Corporation
+ * Copyright © 2008, 2021 Red Hat, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that copyright
+ * notice and this permission notice appear in supporting documentation, and
+ * that the name of the copyright holders not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission.  The copyright holders make no representations
+ * about the suitability of this software for any purpose.  It is provided "as
+ * is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+ * EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+ * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+ * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THIS SOFTWARE.
+ *
+ */
+
+#ifndef _LIBXCVT_MODE_H_
+#define _LIBXCVT_MODE_H_
+
+#include <stdint.h>
+
+/* Bits OR-ed into libxcvt_mode_info.mode_flags; conveniently chosen to match the RandR definitions */
+enum  libxcvt_mode_flags {
+    LIBXCVT_MODE_FLAG_HSYNC_POSITIVE    = (1 << 0),   /* set for reduced-blanking modes */
+    LIBXCVT_MODE_FLAG_HSYNC_NEGATIVE    = (1 << 1),   /* set for normal-blanking modes */
+    LIBXCVT_MODE_FLAG_VSYNC_POSITIVE    = (1 << 2),   /* set for normal-blanking modes */
+    LIBXCVT_MODE_FLAG_VSYNC_NEGATIVE    = (1 << 3),   /* set for reduced-blanking modes */
+    LIBXCVT_MODE_FLAG_INTERLACE         = (1 << 4),   /* set when an interlaced mode was requested */
+};
+
+struct libxcvt_mode_info {                  /* one generated CVT mode, filled in by libxcvt_gen_mode_info() */
+    uint32_t                hdisplay;       /* active pixels per line (multiple of 8, except the 1366 FWXGA case) */
+    uint32_t                vdisplay;       /* active lines per frame */
+    float                   vrefresh;       /* actual field rate (Hz) after pixel clock rounding */
+    float                   hsync;          /* actual horizontal frequency (kHz) */
+    uint64_t                dot_clock;      /* pixel clock (kHz), rounded down to the 250 kHz clock step */
+    uint16_t                hsync_start;    /* horizontal sync start position (pixels) */
+    uint16_t                hsync_end;      /* horizontal sync end position (pixels) */
+    uint16_t                htotal;         /* total pixels per line, blanking included */
+    uint16_t                vsync_start;    /* vertical sync start position (lines) */
+    uint16_t                vsync_end;      /* vertical sync end position (lines) */
+    uint16_t                vtotal;         /* total lines, blanking included; per-field count doubled when interlaced */
+    enum libxcvt_mode_flags mode_flags;     /* OR of LIBXCVT_MODE_FLAG_* bits */
+};
+
+#endif /* _LIBXCVT_MODE_H_ */
@@ -0,0 +1 @@
+install_headers('libxcvt.h','libxcvt_mode.h', subdir: 'libxcvt')  # public API, included as <libxcvt/...>
@@ -0,0 +1 @@
+subdir('libxcvt')  # process libxcvt/meson.build
@@ -0,0 +1,301 @@
+/*
+ * Copyright 2005-2006 Luc Verhaegen.
+ * Copyright © 2021 Red Hat, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+/* Standalone VESA CVT standard timing modelines generator. */
+
+
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <libxcvt/libxcvt.h>
+
+/*
+ * Generate a CVT standard mode from hdisplay, vdisplay and vrefresh.
+ *
+ * hdisplay is rounded down to a multiple of CVT_H_GRANULARITY; a vrefresh of
+ * 0 selects the 60.0Hz default. reduced picks the reduced-blanking branch
+ * instead of the simplified GTF one; interlaced computes per-field timings
+ * and doubles vtotal at the end. Returns a calloc()ed mode the caller must
+ * free(), or NULL when the allocation fails. Inputs are not range-checked.
+ *
+ * These calculations are taken from the CVT calculation spreadsheet written
+ * by Graham Loveridge, who seems to claim no copyright and attach no license;
+ * he apparently just wants to see his name mentioned. The spreadsheet can be
+ * found at http://www.vesa.org/Public/CVT/CVTd6r1.xls
+ *
+ * Comments and structure correspond to the comments and structure of the xls.
+ * This should ease importing of future changes to the standard (not very
+ * likely though).
+ *
+ * About margins: I'm sure that they are meant to be the bit between HDisplay
+ * and HBlankStart, HBlankEnd and HTotal, VDisplay and VBlankStart, VBlankEnd
+ * and VTotal, where the overscan colour is shown. Since we prefer proper
+ * blanking over the overscan colour, and margins would plainly expand
+ * H/VDisplay here, they are disabled altogether (margins = false). -- libv
+ */
+struct libxcvt_mode_info *
+libxcvt_gen_mode_info(int hdisplay, int vdisplay, float vrefresh, bool reduced, bool interlaced)
+{
+    bool margins = false;
+    float vfield_rate, hperiod;
+    int hdisplay_rnd, hmargin;
+    int vdisplay_rnd, vmargin, vsync;
+    float interlace;            /* extra half line per field: 0.5 if interlaced, else 0.0 */
+    struct libxcvt_mode_info *mode_info;
+
+    mode_info = calloc(1, sizeof *mode_info);
+    if (!mode_info)
+        return NULL;
+
+    mode_info->hdisplay = hdisplay;
+    mode_info->vdisplay = vdisplay;
+    mode_info->vrefresh = vrefresh;
+
+    /* 1) top/bottom margin size (% of height) - default: 1.8 */
+#define CVT_MARGIN_PERCENTAGE 1.8
+
+    /* 2) character cell horizontal granularity (pixels) - default 8 */
+#define CVT_H_GRANULARITY 8
+
+    /* 4) Minimum vertical front porch (lines) - default 3 */
+#define CVT_MIN_V_PORCH_RND 3
+
+    /* 4) Minimum number of vertical back porch lines - default 6 */
+#define CVT_MIN_V_BPORCH 6
+
+    /* Pixel Clock step (kHz) */
+#define CVT_CLOCK_STEP 250
+
+    /* CVT default is 60.0Hz */
+    if (!mode_info->vrefresh)
+        mode_info->vrefresh = 60.0;
+
+    /* 1. Required field rate */
+    if (interlaced)
+        vfield_rate = mode_info->vrefresh * 2;
+    else
+        vfield_rate = mode_info->vrefresh;
+
+    /* 2. Horizontal pixels, rounded down to the character cell granularity */
+    hdisplay_rnd = mode_info->hdisplay - (mode_info->hdisplay % CVT_H_GRANULARITY);
+
+    /* 3. Determine left and right borders */
+    if (margins) {
+        /* right margin is actually exactly the same as left */
+        hmargin = (((float) hdisplay_rnd) * CVT_MARGIN_PERCENTAGE / 100.0);
+        hmargin -= hmargin % CVT_H_GRANULARITY;
+    }
+    else {
+        hmargin = 0;
+    }
+
+    /* 4. Find total active pixels */
+    mode_info->hdisplay = hdisplay_rnd + 2 * hmargin;
+
+    /* 5. Find number of lines per field */
+    if (interlaced)
+        vdisplay_rnd = mode_info->vdisplay / 2;
+    else
+        vdisplay_rnd = mode_info->vdisplay;
+
+    /* 6. Find top and bottom margins */
+    /* nope: margins are disabled above, so vmargin ends up 0. */
+    if (margins)
+        /* top and bottom margins are equal again. */
+        vmargin = (((float) vdisplay_rnd) * CVT_MARGIN_PERCENTAGE / 100.0);
+    else
+        vmargin = 0;
+
+    mode_info->vdisplay = mode_info->vdisplay + 2 * vmargin;
+
+    /* 7. Interlace: extra half line added to each field */
+    if (interlaced)
+        interlace = 0.5;
+    else
+        interlace = 0.0;
+
+    /* Determine vsync width from aspect ratio: 4:3, 16:9, 16:10, 5:4, 15:9, else custom */
+    if (!(mode_info->vdisplay % 3) && ((mode_info->vdisplay * 4 / 3) == mode_info->hdisplay))
+        vsync = 4;
+    else if (!(mode_info->vdisplay % 9) && ((mode_info->vdisplay * 16 / 9) == mode_info->hdisplay))
+        vsync = 5;
+    else if (!(mode_info->vdisplay % 10) && ((mode_info->vdisplay * 16 / 10) == mode_info->hdisplay))
+        vsync = 6;
+    else if (!(mode_info->vdisplay % 4) && ((mode_info->vdisplay * 5 / 4) == mode_info->hdisplay))
+        vsync = 7;
+    else if (!(mode_info->vdisplay % 9) && ((mode_info->vdisplay * 15 / 9) == mode_info->hdisplay))
+        vsync = 7;
+    else                        /* Custom */
+        vsync = 10;
+
+    if (!reduced) {             /* simplified GTF calculation */
+
+        /* 4) Minimum time of vertical sync + back porch interval (µs)
+         * default 550.0 */
+#define CVT_MIN_VSYNC_BP 550.0
+
+        /* 3) Nominal HSync width (% of line period) - default 8 */
+#define CVT_HSYNC_PERCENTAGE 8
+
+        float hblank_percentage;
+        int vsync_and_back_porch, vback_porch;
+        int hblank, hsync_w;
+
+        /* 8. Estimated Horizontal period */
+        hperiod = ((float) (1000000.0 / vfield_rate - CVT_MIN_VSYNC_BP)) /
+            (vdisplay_rnd + 2 * vmargin + CVT_MIN_V_PORCH_RND + interlace);
+
+        /* 9. Find number of lines in sync + backporch */
+        if (((int) (CVT_MIN_VSYNC_BP / hperiod) + 1) <
+            (vsync + CVT_MIN_V_BPORCH))
+            vsync_and_back_porch = vsync + CVT_MIN_V_BPORCH;
+        else
+            vsync_and_back_porch = (int) (CVT_MIN_VSYNC_BP / hperiod) + 1;
+
+        /* 10. Find number of lines in back porch */
+        vback_porch = vsync_and_back_porch - vsync;
+        (void) vback_porch;     /* value is not used below */
+
+        /* 11. Find total number of lines in vertical field */
+        mode_info->vtotal =
+            vdisplay_rnd + 2 * vmargin + vsync_and_back_porch + interlace +
+            CVT_MIN_V_PORCH_RND;
+
+        /* 5) Definition of Horizontal blanking time limitation */
+        /* Gradient (%/kHz) - default 600 */
+#define CVT_M_FACTOR 600
+
+        /* Offset (%) - default 40 */
+#define CVT_C_FACTOR 40
+
+        /* Blanking time scaling factor - default 128 */
+#define CVT_K_FACTOR 128
+
+        /* Scaling factor weighting - default 20 */
+#define CVT_J_FACTOR 20
+
+#define CVT_M_PRIME CVT_M_FACTOR * CVT_K_FACTOR / 256
+#define CVT_C_PRIME (CVT_C_FACTOR - CVT_J_FACTOR) * CVT_K_FACTOR / 256 + \
+        CVT_J_FACTOR
+
+        /* 12. Find ideal blanking duty cycle from formula (the *_PRIME macros are unparenthesized: mind precedence) */
+        hblank_percentage = CVT_C_PRIME - CVT_M_PRIME * hperiod / 1000.0;
+
+        /* 13. Blanking time: clamp the duty cycle to at least 20%, then round down to 2 character cells */
+        if (hblank_percentage < 20)
+            hblank_percentage = 20;
+
+        hblank = mode_info->hdisplay * hblank_percentage / (100.0 - hblank_percentage);
+        hblank -= hblank % (2 * CVT_H_GRANULARITY);
+
+        /* 14. Find total number of pixels in a line. */
+        mode_info->htotal = mode_info->hdisplay + hblank;
+
+        /* Fill in HSync values: sync ends at the middle of the blanking interval */
+        mode_info->hsync_end = mode_info->hdisplay + hblank / 2;
+
+        hsync_w = (mode_info->htotal * CVT_HSYNC_PERCENTAGE) / 100;
+        hsync_w -= hsync_w % CVT_H_GRANULARITY;
+        mode_info->hsync_start = mode_info->hsync_end - hsync_w;
+
+        /* Fill in vsync values */
+        mode_info->vsync_start = mode_info->vdisplay + CVT_MIN_V_PORCH_RND;
+        mode_info->vsync_end = mode_info->vsync_start + vsync;
+
+    }
+    else {                      /* reduced blanking */
+        /* Minimum vertical blanking interval time (µs) - default 460 */
+#define CVT_RB_MIN_VBLANK 460.0
+
+        /* Fixed number of clocks for horizontal sync */
+#define CVT_RB_H_SYNC 32.0
+
+        /* Fixed number of clocks for horizontal blanking */
+#define CVT_RB_H_BLANK 160.0
+
+        /* Fixed number of lines for vertical front porch - default 3 */
+#define CVT_RB_VFPORCH 3
+
+        int vblank_interval_lines;
+
+        /* 8. Estimate Horizontal period. */
+        hperiod = ((float) (1000000.0 / vfield_rate - CVT_RB_MIN_VBLANK)) /
+            (vdisplay_rnd + 2 * vmargin);
+
+        /* 9. Find number of lines in vertical blanking */
+        vblank_interval_lines = ((float) CVT_RB_MIN_VBLANK) / hperiod + 1;
+
+        /* 10. Check if vertical blanking is sufficient */
+        if (vblank_interval_lines < (CVT_RB_VFPORCH + vsync + CVT_MIN_V_BPORCH))
+            vblank_interval_lines = CVT_RB_VFPORCH + vsync + CVT_MIN_V_BPORCH;
+
+        /* 11. Find total number of lines in vertical field */
+        mode_info->vtotal = vdisplay_rnd + 2 * vmargin + interlace + vblank_interval_lines;
+
+        /* 12. Find total number of pixels in a line */
+        mode_info->htotal = mode_info->hdisplay + CVT_RB_H_BLANK;
+
+        /* Fill in HSync values */
+        mode_info->hsync_end = mode_info->hdisplay + CVT_RB_H_BLANK / 2;
+        mode_info->hsync_start = mode_info->hsync_end - CVT_RB_H_SYNC;
+
+        /* Fill in vsync values */
+        mode_info->vsync_start = mode_info->vdisplay + CVT_RB_VFPORCH;
+        mode_info->vsync_end = mode_info->vsync_start + vsync;
+    }
+
+    /* 15/13. Find pixel clock frequency (kHz for xf86), rounded down to a CVT_CLOCK_STEP multiple */
+    mode_info->dot_clock = mode_info->htotal * 1000.0 / hperiod;
+    mode_info->dot_clock -= mode_info->dot_clock % CVT_CLOCK_STEP;
+
+    /* 16/14. Find actual Horizontal Frequency (kHz) */
+    mode_info->hsync = ((float) mode_info->dot_clock) / ((float) mode_info->htotal);
+
+    /* 17/15. Find actual Field rate */
+    mode_info->vrefresh = (1000.0 * ((float) mode_info->dot_clock)) /
+        ((float) (mode_info->htotal * mode_info->vtotal));
+
+    /* 18/16. Find actual vertical frame frequency */
+    /* ignore - just double the per-field vtotal; the interlace flag is set below */
+    if (interlaced)
+        mode_info->vtotal *= 2;
+
+    if (reduced)
+        mode_info->mode_flags |= LIBXCVT_MODE_FLAG_HSYNC_POSITIVE | LIBXCVT_MODE_FLAG_VSYNC_NEGATIVE;
+    else
+        mode_info->mode_flags |= LIBXCVT_MODE_FLAG_HSYNC_NEGATIVE | LIBXCVT_MODE_FLAG_VSYNC_POSITIVE;
+
+    if (interlaced)
+        mode_info->mode_flags |= LIBXCVT_MODE_FLAG_INTERLACE;
+
+    /* FWXGA hack adapted from hw/xfree86/modes/xf86EdidModes.c, because you can't say 1366 */
+    if (mode_info->hdisplay == 1360 && mode_info->vdisplay == 768) {
+         mode_info->hdisplay = 1366;
+         mode_info->hsync_start--;
+         mode_info->hsync_end--;
+    }
+
+    return mode_info;
+}
@@ -0,0 +1,7 @@
+libxcvt_sources = ['libxcvt.c']
+libxcvt = shared_library('xcvt',  # target is also used by the cvt executable and the pkg-config generator
+                         libxcvt_sources,
+                         include_directories : inc,
+                         version: meson.project_version(),  # library version follows the project version
+                         darwin_versions : ['1.0.0', '1.0.0' ],  # this keyword requires meson >= 0.48
+                         install : true)
@@ -0,0 +1,41 @@
+.TH CVT 1 @vendorversion@
+.SH NAME
+cvt - calculate VESA CVT mode lines
+.SH SYNOPSIS
+.B cvt
+.RB [ \-v | \-\-verbose ]
+.RB [ \-r | \-\-reduced ]
+.I h-resolution
+.I v-resolution
+.RB [ refresh ]
+.SH DESCRIPTION
+.I Cvt
+is a utility for calculating VESA Coordinated Video Timing modes.  Given the
+desired horizontal and vertical resolutions, a modeline adhering to the CVT
+standard is printed. This modeline can be included in the Xorg
+configuration file
+.BR xorg.conf (@filemansuffix@).
+
+.SH OPTIONS
+.TP 8
+.BR refresh
+Provide a vertical refresh rate in Hz.  The CVT standard prefers either 50.0,
+60.0, 75.0 or 85.0Hz.  The default is 60.0Hz.
+.TP 8
+.BR \-v | \-\-verbose
+Warn verbosely when a given mode does not completely correspond with CVT
+standards.
+.TP 8
+.BR \-r | \-\-reduced
+Create a mode with reduced blanking.  This allows for higher frequency signals,
+with a lower or equal dotclock. Not for Cathode Ray Tube based displays though.
+
+.SH "SEE ALSO"
+xorg.conf(@filemansuffix@), gtf(@appmansuffix@)
+.SH AUTHOR
+Luc Verhaegen.
+.PP
+This program is based on the Coordinated Video Timing sample
+implementation written by Graham Loveridge. This file is publicly
+available at <http://www.vesa.org/Public/CVT/CVTd6r1.xls>. CVT is a
+VESA trademark.
@@ -0,0 +1,12 @@
+man_conf = configuration_data()  # values substituted for the @...@ placeholders in cvt.man
+man_conf.set('appmansuffix', '1')
+man_conf.set('filemansuffix', '5')
+man_conf.set('vendorversion',
+	'"libxcvt @0@" "X Version 11"'.format(meson.project_version()))
+
+configure_file(  # generates cvt.1 from cvt.man and installs it into the man1 directory
+	input: 'cvt.man',
+	output: 'cvt.1',
+	install_dir: join_paths(man, 'man1'),
+	configuration: man_conf
+)
@@ -0,0 +1,28 @@
+project('libxcvt', 'c',
+        version: '0.1.3',
+        meson_version: '>= 0.48.0',  # shared_library(darwin_versions: ...) in lib/ needs 0.48
+        default_options: ['warning_level=1',
+                          'buildtype=debugoptimized'])
+
+libcvt_version = meson.project_version().split('.')
+
+cc = meson.get_compiler('c')
+mdep = cc.find_library('m', required : false)  # optional libm; skipped when not found
+
+prefix  = get_option('prefix')
+inc = include_directories('include')  # shared by the library and the cvt tool
+man = join_paths(prefix, get_option('mandir'))  # base directory for installed man pages
+
+subdir('include')
+subdir('lib')  # must precede cvt/: defines the 'libxcvt' target it links against
+subdir('cvt')
+subdir('man')
+
+pkg_mod = import('pkgconfig')
+pkg_mod.generate(libraries : libxcvt,
+                 version : meson.project_version(),
+                 name : 'libxcvt',
+                 description : 'A Library to generate VESA CVT standard timing modelines.')
+
+libxcvt_dep = declare_dependency(link_with: libxcvt,
+                                 include_directories: inc)
				`@@ -0,0 +1 @@`
				`install_headers('libxcvt.h','libxcvt_mode.h', subdir: 'libxcvt')`