From 4a2c33750bdac88cb69d0790f71c662cb4fea0f4 Mon Sep 17 00:00:00 2001 From: Admin Pupkin Date: Sun, 17 May 2026 14:56:50 +0300 Subject: [PATCH] feat: raw framebuffer fallback for fbbootlogd when DRM unavailable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add RawFb struct: direct framebuffer rendering via physmap - Add RawTextScreen: simple text renderer using orbclient font - Fallback in FbbootlogScheme::new() when V2GraphicsHandle fails - Reads FRAMEBUFFER_ADDR/WIDTH/HEIGHT/STRIDE from bootloader env - Scroll via ptr::copy on pixel rows, clear bottom line - No DRM, no shadow buffer, no GPU required — like MS-DOS text mode - Add common dependency to fbbootlogd Cargo.toml --- recipes/core/kernel/source/Cargo.toml | 1 - recipes/core/kernel/source/Makefile | 1 - recipes/core/kernel/source/build.rs | 13 - .../source/src/acpi/madt/arch/x86.rs.before | 591 +++++++++++++++ .../source/src/acpi/madt/arch/x86.rs.orig | 160 ++++ .../source/src/acpi/madt/arch/x86.rs.rej | 41 + .../core/kernel/source/src/acpi/madt/mod.rs | 100 --- recipes/core/kernel/source/src/acpi/mod.rs | 26 - recipes/core/kernel/source/src/acpi/rsdp.rs | 34 +- recipes/core/kernel/source/src/acpi/sdt.rs | 16 - recipes/core/kernel/source/src/acpi/slit.rs | 45 -- recipes/core/kernel/source/src/acpi/srat.rs | 102 --- .../core/kernel/source/src/allocator/mod.rs | 40 +- .../kernel/source/src/arch/aarch64/start.rs | 2 +- .../kernel/source/src/arch/riscv64/start.rs | 2 +- .../src/arch/x86_shared/device/ioapic.rs | 133 +--- .../x86_shared/device/local_apic.rs.before | 312 ++++++++ .../arch/x86_shared/device/local_apic.rs.rej | 14 + .../source/src/arch/x86_shared/device/mod.rs | 5 +- .../source/src/arch/x86_shared/device/msi.rs | 183 ----- .../src/arch/x86_shared/device/vector.rs | 53 -- .../kernel/source/src/arch/x86_shared/gdt.rs | 14 +- .../kernel/source/src/arch/x86_shared/idt.rs | 17 +- .../arch/x86_shared/interrupt/exception.rs | 50 +- .../kernel/source/src/arch/x86_shared/mod.rs | 2 - .../source/src/arch/x86_shared/sleep.rs | 712 ------------------ .../source/src/arch/x86_shared/start.rs | 39 +- .../source/src/asm/x86_64/s3_wakeup.asm | 110 --- .../kernel/source/src/context/arch/aarch64.rs | 8 +- .../kernel/source/src/context/arch/riscv64.rs | 4 +- .../kernel/source/src/context/arch/x86.rs | 8 +- .../kernel/source/src/context/arch/x86_64.rs | 7 + .../core/kernel/source/src/context/context.rs | 4 - .../core/kernel/source/src/context/file.rs | 57 +- .../core/kernel/source/src/context/memory.rs | 30 +- recipes/core/kernel/source/src/context/mod.rs | 14 +- .../core/kernel/source/src/context/switch.rs | 119 +-- recipes/core/kernel/source/src/cpu_set.rs | 7 +- recipes/core/kernel/source/src/event.rs | 12 +- recipes/core/kernel/source/src/main.rs | 3 - recipes/core/kernel/source/src/numa.rs | 81 -- recipes/core/kernel/source/src/percpu.rs | 191 +---- recipes/core/kernel/source/src/scheme/acpi.rs | 68 +- .../core/kernel/source/src/scheme/debug.rs | 13 +- .../core/kernel/source/src/scheme/event.rs | 54 +- recipes/core/kernel/source/src/scheme/irq.rs | 96 +-- recipes/core/kernel/source/src/scheme/mod.rs | 85 +-- recipes/core/kernel/source/src/scheme/pipe.rs | 579 +++++--------- recipes/core/kernel/source/src/scheme/proc.rs | 71 +- recipes/core/kernel/source/src/scheme/user.rs | 220 ++---- .../core/kernel/source/src/startup/memory.rs | 19 +- recipes/core/kernel/source/src/startup/mod.rs | 13 +- recipes/core/kernel/source/src/sync/mcs.rs | 188 ----- recipes/core/kernel/source/src/sync/mod.rs | 1 - .../core/kernel/source/src/sync/ordered.rs | 142 ---- recipes/core/kernel/source/src/syscall/fs.rs | 282 +++---- recipes/core/kernel/source/src/syscall/mod.rs | 9 - .../core/kernel/source/src/syscall/process.rs | 90 +-- 58 files changed, 1691 insertions(+), 3602 deletions(-) create mode 100644 recipes/core/kernel/source/src/acpi/madt/arch/x86.rs.before create mode 100644 recipes/core/kernel/source/src/acpi/madt/arch/x86.rs.orig create mode 100644 recipes/core/kernel/source/src/acpi/madt/arch/x86.rs.rej delete mode 100644 recipes/core/kernel/source/src/acpi/slit.rs delete mode 100644 recipes/core/kernel/source/src/acpi/srat.rs create mode 100644 recipes/core/kernel/source/src/arch/x86_shared/device/local_apic.rs.before create mode 100644 recipes/core/kernel/source/src/arch/x86_shared/device/local_apic.rs.rej delete mode 100644 recipes/core/kernel/source/src/arch/x86_shared/device/msi.rs delete mode 100644 recipes/core/kernel/source/src/arch/x86_shared/device/vector.rs delete mode 100644 recipes/core/kernel/source/src/arch/x86_shared/sleep.rs delete mode 100644 recipes/core/kernel/source/src/asm/x86_64/s3_wakeup.asm delete mode 100644 recipes/core/kernel/source/src/numa.rs delete mode 100644 recipes/core/kernel/source/src/sync/mcs.rs diff --git a/recipes/core/kernel/source/Cargo.toml b/recipes/core/kernel/source/Cargo.toml index e05f723c88..6d4f059ace 100644 --- a/recipes/core/kernel/source/Cargo.toml +++ b/recipes/core/kernel/source/Cargo.toml @@ -12,7 +12,6 @@ cc = "1.0" toml = "0.8" [dependencies] -acpi_ext = { package = "acpi", git = "https://gitlab.redox-os.org/redox-os/acpi.git", branch = "redox-6.x" } arrayvec = { version = "0.7.4", default-features = false } bitfield = "0.13.2" bitflags = "2" diff --git a/recipes/core/kernel/source/Makefile b/recipes/core/kernel/source/Makefile index ce59b910b5..68a8c50ae5 100644 --- a/recipes/core/kernel/source/Makefile +++ b/recipes/core/kernel/source/Makefile @@ -1,4 +1,3 @@ -# Red Bear OS kernel patches applied via individual patch files .PHONY: all check SOURCE:=$(dir $(realpath $(lastword $(MAKEFILE_LIST)))) diff --git a/recipes/core/kernel/source/build.rs b/recipes/core/kernel/source/build.rs index 751746ccdd..96c3ea5c78 100644 --- a/recipes/core/kernel/source/build.rs +++ b/recipes/core/kernel/source/build.rs @@ -77,7 +77,6 @@ fn main() { } "x86_64" => { println!("cargo::rerun-if-changed=src/asm/x86_64/trampoline.asm"); - println!("cargo::rerun-if-changed=src/asm/x86_64/s3_wakeup.asm"); let status = Command::new("nasm") .arg("-f") @@ -90,18 +89,6 @@ fn main() { if !status.success() { panic!("nasm failed with exit status {}", status); } - - let status = Command::new("nasm") - .arg("-f") - .arg("bin") - .arg("-o") - .arg(format!("{}/s3_wakeup", out_dir)) - .arg("src/asm/x86_64/s3_wakeup.asm") - .status() - .expect("failed to run nasm"); - if !status.success() { - panic!("nasm failed with exit status {}", status); - } } "riscv64" => { println!("cargo::rustc-cfg=dtb"); diff --git a/recipes/core/kernel/source/src/acpi/madt/arch/x86.rs.before b/recipes/core/kernel/source/src/acpi/madt/arch/x86.rs.before new file mode 100644 index 0000000000..a4d5a98b23 --- /dev/null +++ b/recipes/core/kernel/source/src/acpi/madt/arch/x86.rs.before @@ -0,0 +1,591 @@ +use core::{ + hint, + sync::atomic::{AtomicU8, Ordering}, +}; + +use x86::time::rdtsc; + +use crate::{ + arch::{ + device::local_apic::the_local_apic, + start::{kstart_ap, KernelArgsAp}, + }, + cpu_set::LogicalCpuId, + memory::{ + allocate_p2frame, map_device_memory, Frame, KernelMapper, Page, PageFlags, + PhysicalAddress, RmmA, RmmArch, VirtualAddress, PAGE_SIZE, + }, + startup::AP_READY, +}; + +use super::{Madt, MadtEntry}; + +use alloc::collections::BTreeSet; +use alloc::vec::Vec; + +/// Maximum number of APIC→CPU mappings we track for NUMA topology. +const MAX_APIC_MAPPINGS: usize = 256; + +struct ApicMapping { + apic_id: u32, + cpu_id: LogicalCpuId, +} + +const UNINIT_MAPPING: ApicMapping = ApicMapping { apic_id: u32::MAX, cpu_id: LogicalCpuId::new(0) }; + +static mut APIC_MAPPINGS: [ApicMapping; MAX_APIC_MAPPINGS] = [UNINIT_MAPPING; MAX_APIC_MAPPINGS]; +static mut APIC_MAPPING_COUNT: usize = 0; + +unsafe fn record_apic_mapping(apic_id: u32, cpu_id: LogicalCpuId) { + let count = APIC_MAPPING_COUNT; + if count < MAX_APIC_MAPPINGS { + APIC_MAPPINGS[count] = ApicMapping { apic_id, cpu_id }; + APIC_MAPPING_COUNT = count + 1; + } +} + +const AP_SPIN_LIMIT: u32 = 1_000_000; +const TRAMPOLINE: usize = 0x8000; +static TRAMPOLINE_DATA: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/trampoline")); + +/// Estimate TSC frequency in MHz from CPUID. +/// +/// Tries CPUID leaf 0x16 (Processor Frequency Information) first, +/// then CPUID leaf 0x15 (TSC/Core Crystal Clock Ratio). +/// Returns None if frequency cannot be determined. +fn tsc_freq_mhz_cpuid() -> Option { + let max_leaf = unsafe { core::arch::x86_64::__cpuid(0).eax as u32 }; + + // CPUID leaf 0x16: EAX = Core Base Frequency in MHz (Intel) + if max_leaf >= 0x16 { + let mhz = unsafe { core::arch::x86_64::__cpuid(0x16) }.eax as u64; + if mhz > 0 { + return Some(mhz); + } + } + + // CPUID leaf 0x15: EAX = denominator, EBX = numerator, ECX = crystal Hz + if max_leaf >= 0x15 { + let res = unsafe { core::arch::x86_64::__cpuid(0x15) }; + let denom = res.eax as u64; + let numer = res.ebx as u64; + let crystal_hz = res.ecx as u64; + if denom > 0 && numer > 0 && crystal_hz > 0 { + // TSC freq = crystal_hz * numer / denom + let tsc_hz = crystal_hz * numer / denom; + return Some(tsc_hz / 1_000_000); // Hz → MHz + } + } + + None +} + +/// Early-boot microsecond delay using the Time Stamp Counter. +/// +/// Uses CPUID-based TSC frequency estimation when available. +/// Falls back to a conservative spin loop calibrated for the +/// minimum expected CPU speed (1 GHz). +/// +/// # Safety +/// Must only be called after the BSP TSC is running (always true +/// after CPU reset on x86). +fn early_udelay(us: u64) { + if let Some(mhz) = tsc_freq_mhz_cpuid() { + // TSC-based delay: precise on invariant TSC (all modern x86). + // MHz = cycles per µs. + let target = unsafe { rdtsc() } + us * mhz; + while unsafe { rdtsc() } < target { + hint::spin_loop(); + } + } else { + // Fallback: conservative spin loop. + // spin_loop() (PAUSE) is ~40 cycles on modern Intel, ~1 on AMD. + // At 1 GHz minimum: 1000 cycles/µs ÷ 40 cycles/iter = 25 iters/µs. + // Use 50 iters/µs for safety margin on slower/variable CPUs. + let iters = us.saturating_mul(50); + for _ in 0..iters { + hint::spin_loop(); + } + } +} + +fn current_x2apic_processor_uid(madt: &Madt, apic_id: u32) -> Option { + madt.iter().find_map(|entry| match entry { + MadtEntry::LocalX2Apic(x2apic) if x2apic.x2apic_id == apic_id => Some(x2apic.processor_uid), + _ => None, + }) +} + +fn apply_lapic_address_override( + local_apic: &mut crate::arch::device::local_apic::LocalApic, + address: u64, +) { + if local_apic.x2 || address == 0 { + return; + } + + let Ok(physaddr) = usize::try_from(address) else { + warn!( + "Ignoring LAPIC address override {:#x}: does not fit host usize", + address + ); + return; + }; + + let mapped = unsafe { map_device_memory(PhysicalAddress::new(physaddr), 4096) }.data(); + local_apic.address = mapped; + debug!("Applied LAPIC address override: {:#x}", address); +} + +pub(super) fn init(madt: Madt) { + let local_apic = unsafe { the_local_apic() }; + let me = local_apic.id(); + + if local_apic.x2 { + debug!(" X2APIC {}", me.get()); + } else { + debug!(" XAPIC {}: {:>08X}", me.get(), local_apic.address); + } + + if cfg!(not(feature = "multi_core")) { + unsafe { + record_apic_mapping(me.get(), LogicalCpuId::new(0)); + } + crate::numa::init_default(); + return; + } + + // Map trampoline + let trampoline_frame = Frame::containing(PhysicalAddress::new(TRAMPOLINE)); + let trampoline_page = Page::containing_address(VirtualAddress::new(TRAMPOLINE)); + let (result, page_table_physaddr) = unsafe { + //TODO: do not have writable and executable! + let mut mapper = KernelMapper::lock_rw(); + + let result = match mapper.map_phys( + trampoline_page.start_address(), + trampoline_frame.base(), + PageFlags::new().execute(true).write(true), + ) { + Some(result) => result, + None => { + println!("KERNEL AP: failed to map trampoline page, AP bring-up disabled"); + return; + } + }; + + (result, mapper.table().phys().data()) + }; + result.flush(); + + // Write trampoline, make sure TRAMPOLINE page is free for use + for (i, val) in TRAMPOLINE_DATA.iter().enumerate() { + unsafe { + (*((TRAMPOLINE as *mut u8).add(i) as *const AtomicU8)).store(*val, Ordering::SeqCst); + } + } + + unsafe { + let preliminary_cpu_count = madt + .iter() + .filter(|entry| match entry { + MadtEntry::LocalApic(local) => u32::from(local.id) == me.get() || local.flags & 1 == 1, + MadtEntry::LocalX2Apic(local) => local.x2apic_id == me.get() || local.flags & 1 == 1, + _ => false, + }) + .count(); + crate::profiling::allocate(preliminary_cpu_count as u32); + } + + // Firmware bug detection: check for duplicate APIC IDs in MADT. + // Some firmware (especially on early BIOS/UEFI) may list the same + // processor multiple times. Keep first occurrence, warn on duplicates. + let mut seen_apic_ids: BTreeSet = BTreeSet::new(); + { + let _ = seen_apic_ids.insert(me.get()); // BSP + for entry in madt.iter() { + match entry { + MadtEntry::LocalApic(local) if local.flags & 1 == 1 => { + let id = u32::from(local.id); + if !seen_apic_ids.insert(id) { + warn!("MADT: duplicate APIC ID {} in LocalApic entry, firmware bug", id); + } + } + MadtEntry::LocalX2Apic(local) if local.flags & 1 == 1 => { + let id = local.x2apic_id; + if !seen_apic_ids.insert(id) { + warn!("MADT: duplicate x2APIC ID {} in LocalX2Apic entry, firmware bug", id); + } + } + _ => {} + } + } + } + + for madt_entry in madt.iter() { + debug!(" {:x?}", madt_entry); + if let MadtEntry::LocalApic(ap_local_apic) = madt_entry { + if u32::from(ap_local_apic.id) == me.get() { + debug!(" This is my local APIC"); + } else if ap_local_apic.flags & 1 == 1 { + // Allocate a stack + let alloc = match allocate_p2frame(4) { + Some(frame) => frame, + None => { + println!("KERNEL AP: CPU {} no memory for stack, skipping", ap_local_apic.id); + continue; + } + }; + let stack_start = RmmA::phys_to_virt(alloc.base()).data(); + let stack_end = stack_start + (PAGE_SIZE << 4); + + // Atomically allocate a CPU ID — fetch_add is SeqCst so that + // all later stores (PercpuBlock, NUMA node) are ordered after. + let cpu_id = LogicalCpuId::new(crate::CPU_COUNT.fetch_add(1, Ordering::SeqCst)); + if cpu_id.get() >= crate::cpu_set::MAX_CPU_COUNT { + println!( + "KERNEL AP: CPU {} exceeds logical CPU limit, skipping", + ap_local_apic.id + ); + continue; + } + + let pcr_ptr = crate::arch::gdt::allocate_and_init_pcr(cpu_id, stack_end); + + let idt_ptr = crate::arch::idt::allocate_and_init_idt(cpu_id); + + let args = KernelArgsAp { + stack_end: stack_end as *mut u8, + cpu_id, + pcr_ptr, + idt_ptr, + }; + + let ap_ready = (TRAMPOLINE + 8) as *mut u64; + let ap_args_ptr = unsafe { ap_ready.add(1) }; + let ap_page_table = unsafe { ap_ready.add(2) }; + let ap_code = unsafe { ap_ready.add(3) }; + + // Set the ap_ready to 0, volatile + unsafe { + ap_ready.write(0); + ap_args_ptr.write(&args as *const _ as u64); + ap_page_table.write(page_table_physaddr as u64); + #[expect(clippy::fn_to_numeric_cast)] + ap_code.write(kstart_ap as u64); + + // Ensure all trampoline writes are visible to the AP before + // it starts executing. asm!("") is only a compiler barrier; + // fence(SeqCst) is a full hardware memory barrier. + core::sync::atomic::fence(Ordering::SeqCst); + }; + AP_READY.store(false, Ordering::SeqCst); + + // Clear APIC Error Status Register before starting AP. + // Intel SDM §8.4.4: ESR should be cleared before sending SIPI. + unsafe { local_apic.esr(); } + + // Send INIT IPI (Assert) + { + // ICR: Delivery Mode=INIT(101), Level=Assert, Trigger=Edge + let mut icr = 0x4500u64; + if local_apic.x2 { + icr |= u64::from(ap_local_apic.id) << 32; + } else { + icr |= u64::from(ap_local_apic.id) << 56; + } + local_apic.set_icr(icr); + } + + // Intel SDM Vol 3A §8.4.4: wait 10ms after INIT deassert + // before sending first SIPI. Modern CPUs may need less, + // but 10ms is the safe specification-compliant value. + early_udelay(10_000); + + // Send START IPI #1 + { + let ap_segment = (TRAMPOLINE >> 12) & 0xFF; + // ICR: Delivery Mode=StartUp(110), Vector=ap_segment + // Note: bit 14 (Level) must be 0 for SIPI per Intel SDM. + let mut icr = 0x0600 | ap_segment as u64; + if local_apic.x2 { + icr |= u64::from(ap_local_apic.id) << 32; + } else { + icr |= u64::from(ap_local_apic.id) << 56; + } + local_apic.set_icr(icr); + } + + // Intel SDM: wait 200µs between SIPIs + early_udelay(200); + + // Send START IPI #2 (recommended for compatibility) + { + let ap_segment = (TRAMPOLINE >> 12) & 0xFF; + let mut icr = 0x0600 | ap_segment as u64; + if local_apic.x2 { + icr |= u64::from(ap_local_apic.id) << 32; + } else { + icr |= u64::from(ap_local_apic.id) << 56; + } + local_apic.set_icr(icr); + } + + // Wait briefly for SIPI to be accepted + early_udelay(200); + + // Check ESR for delivery errors after SIPI sequence. + // Bit 5 = Send Accept Error, Bit 6 = Send Illegal Vector. + let esr_val = unsafe { local_apic.esr() }; + if esr_val != 0 { + println!( + "KERNEL AP: CPU {} SIPI delivery error (ESR={:#x}), continuing", + ap_local_apic.id, esr_val + ); + } + + // Wait for trampoline ready with timeout + let mut trampoline_ready = false; + for _ in 0..AP_SPIN_LIMIT { + if unsafe { (*ap_ready.cast::()).load(Ordering::SeqCst) } != 0 { + trampoline_ready = true; + break; + } + hint::spin_loop(); + } + if !trampoline_ready { + println!("KERNEL AP: CPU {} trampoline timeout, skipping", ap_local_apic.id); + continue; + } + + let mut kernel_ready = false; + for _ in 0..AP_SPIN_LIMIT { + if AP_READY.load(Ordering::SeqCst) { + kernel_ready = true; + break; + } + hint::spin_loop(); + } + if !kernel_ready { + println!("KERNEL AP: CPU {} AP_READY timeout, skipping", ap_local_apic.id); + continue; + } + + // Record APIC→CPU mapping for NUMA topology. + unsafe { + record_apic_mapping(u32::from(ap_local_apic.id), cpu_id); + } + // Set NUMA node from SRAT data. + if let Some(percpu) = crate::percpu::get_for_cpu(cpu_id) { + if let Some(node) = crate::acpi::srat::numa_node_for_apic(u32::from(ap_local_apic.id)) { + percpu.numa_node.set(node); + } + } + + RmmA::invalidate_all(); + } else { + debug!("KERNEL AP: LAPIC CPU {} disabled in MADT, skipping", u32::from(ap_local_apic.id)); + } + } else if let MadtEntry::LocalX2Apic(ap_x2apic) = madt_entry { + let apic_id = ap_x2apic.x2apic_id; + let flags = ap_x2apic.flags; + + if apic_id == me.get() { + debug!(" This is my local x2APIC"); + } else if flags & 1 == 1 { + let alloc = match allocate_p2frame(4) { + Some(frame) => frame, + None => { + println!("KERNEL AP: CPU {} no memory for stack, skipping", apic_id); + continue; + } + }; + let stack_start = RmmA::phys_to_virt(alloc.base()).data(); + let stack_end = stack_start + (PAGE_SIZE << 4); + + // Atomically allocate a CPU ID — fetch_add is SeqCst so that + // all later stores (PercpuBlock, NUMA node) are ordered after. + let cpu_id = LogicalCpuId::new(crate::CPU_COUNT.fetch_add(1, Ordering::SeqCst)); + if cpu_id.get() >= crate::cpu_set::MAX_CPU_COUNT { + println!( + "KERNEL AP: CPU {} exceeds logical CPU limit, skipping", + apic_id + ); + continue; + } + + let pcr_ptr = crate::arch::gdt::allocate_and_init_pcr(cpu_id, stack_end); + let idt_ptr = crate::arch::idt::allocate_and_init_idt(cpu_id); + + let args = KernelArgsAp { + stack_end: stack_end as *mut u8, + cpu_id, + pcr_ptr, + idt_ptr, + }; + + let ap_ready = (TRAMPOLINE + 8) as *mut u64; + let ap_args_ptr = unsafe { ap_ready.add(1) }; + let ap_page_table = unsafe { ap_ready.add(2) }; + let ap_code = unsafe { ap_ready.add(3) }; + + unsafe { + ap_ready.write(0); + ap_args_ptr.write(&args as *const _ as u64); + ap_page_table.write(page_table_physaddr as u64); + #[expect(clippy::fn_to_numeric_cast)] + ap_code.write(kstart_ap as u64); + // Ensure all trampoline writes are visible to the AP. + core::sync::atomic::fence(Ordering::SeqCst); + } + AP_READY.store(false, Ordering::SeqCst); + + // Clear APIC Error Status Register before starting AP. + unsafe { local_apic.esr(); } + + // Send INIT IPI (Assert) + { + let mut icr = 0x4500u64; + if local_apic.x2 { + icr |= u64::from(apic_id) << 32; + } else { + icr |= u64::from(apic_id as u8) << 56; + } + local_apic.set_icr(icr); + } + + // Intel SDM Vol 3A §8.4.4: wait 10ms after INIT + early_udelay(10_000); + + // Send START IPI #1 + { + let ap_segment = (TRAMPOLINE >> 12) & 0xFF; + let mut icr = 0x0600u64 | ap_segment as u64; + if local_apic.x2 { + icr |= u64::from(apic_id) << 32; + } else { + icr |= u64::from(apic_id as u8) << 56; + } + local_apic.set_icr(icr); + } + + // Intel SDM: wait 200µs between SIPIs + early_udelay(200); + + // Send START IPI #2 (recommended for compatibility) + { + let ap_segment = (TRAMPOLINE >> 12) & 0xFF; + let mut icr = 0x0600u64 | ap_segment as u64; + if local_apic.x2 { + icr |= u64::from(apic_id) << 32; + } else { + icr |= u64::from(apic_id as u8) << 56; + } + local_apic.set_icr(icr); + } + + // Wait briefly for SIPI acceptance + early_udelay(200); + + // Check ESR for delivery errors. + let esr_val = unsafe { local_apic.esr() }; + if esr_val != 0 { + println!( + "KERNEL AP: CPU {} SIPI delivery error (ESR={:#x}), continuing", + apic_id, esr_val + ); + } + + let mut trampoline_ready = false; + for _ in 0..AP_SPIN_LIMIT { + if unsafe { (*ap_ready.cast::()).load(Ordering::SeqCst) } != 0 { + trampoline_ready = true; + break; + } + hint::spin_loop(); + } + if !trampoline_ready { + println!("KERNEL AP: CPU {} trampoline timeout, skipping", apic_id); + continue; + } + + let mut kernel_ready = false; + for _ in 0..AP_SPIN_LIMIT { + if AP_READY.load(Ordering::SeqCst) { + kernel_ready = true; + break; + } + hint::spin_loop(); + } + if !kernel_ready { + println!("KERNEL AP: CPU {} AP_READY timeout, skipping", apic_id); + continue; + } + + // Record APIC→CPU mapping for NUMA topology. + unsafe { + record_apic_mapping(apic_id, cpu_id); + } + // Set NUMA node from SRAT data. + if let Some(percpu) = crate::percpu::get_for_cpu(cpu_id) { + if let Some(node) = crate::acpi::srat::numa_node_for_apic(apic_id) { + percpu.numa_node.set(node); + } + } + + RmmA::invalidate_all(); + } else { + debug!("KERNEL AP: x2APIC CPU {} disabled in MADT (flags={:#x}), skipping", apic_id, flags); + } + } else if let MadtEntry::LocalApicNmi(nmi) = madt_entry { + let target_apic = nmi.processor; + if target_apic == 0xFF || target_apic == local_apic.id().get() as u8 { + unsafe { local_apic.set_lvt_nmi(nmi.nmi_pin, nmi.flags) }; + } + } else if let MadtEntry::LocalX2ApicNmi(nmi) = madt_entry { + let current_uid = current_x2apic_processor_uid(&madt, me.get()); + if nmi.processor_uid == u32::MAX || current_uid == Some(nmi.processor_uid) { + unsafe { local_apic.set_lvt_nmi(nmi.nmi_pin, nmi.flags) }; + } + } else if let MadtEntry::LapicAddressOverride(override_entry) = madt_entry { + apply_lapic_address_override(local_apic, override_entry.local_apic_address); + } + } + + // Initialize NUMA topology from APIC→CPU mappings and SRAT. + { + let mappings = unsafe { &APIC_MAPPINGS[..APIC_MAPPING_COUNT] }; + let mappings_ref: Vec<(u32, LogicalCpuId)> = mappings + .iter() + .map(|m| (m.apic_id, m.cpu_id)) + .collect(); + crate::numa::init_from_srat(&mappings_ref); + } + // Set BSP's NUMA node from SRAT. + if let Some(node) = crate::acpi::srat::numa_node_for_apic(me.get()) { + crate::percpu::PercpuBlock::current().numa_node.set(node); + } + + // Log final CPU count vs maximum + let cpu_count = crate::CPU_COUNT.load(Ordering::SeqCst); + info!( + "SMP: {} CPUs online (max {})", + cpu_count, crate::cpu_set::MAX_CPU_COUNT + ); + if cpu_count > crate::cpu_set::MAX_CPU_COUNT * 80 / 100 { + warn!( + "SMP: CPU count approaching MAX_CPU_COUNT limit ({}/{})", + cpu_count, crate::cpu_set::MAX_CPU_COUNT + ); + } + + // Unmap trampoline + if let Some((_frame, _, flush)) = unsafe { + KernelMapper::lock_rw() + .unmap_phys(trampoline_page.start_address()) + } { + flush.flush(); + } else { + println!("KERNEL AP: failed to unmap trampoline page (non-fatal)"); + } +} diff --git a/recipes/core/kernel/source/src/acpi/madt/arch/x86.rs.orig b/recipes/core/kernel/source/src/acpi/madt/arch/x86.rs.orig new file mode 100644 index 0000000000..4dc2388398 --- /dev/null +++ b/recipes/core/kernel/source/src/acpi/madt/arch/x86.rs.orig @@ -0,0 +1,160 @@ +use core::{ + hint, + sync::atomic::{AtomicU8, Ordering}, +}; + +use crate::{ + arch::{ + device::local_apic::the_local_apic, + start::{kstart_ap, KernelArgsAp}, + }, + cpu_set::LogicalCpuId, + memory::{ + allocate_p2frame, Frame, KernelMapper, Page, PageFlags, PhysicalAddress, RmmA, RmmArch, + VirtualAddress, PAGE_SIZE, + }, + startup::AP_READY, +}; + +use super::{Madt, MadtEntry}; + +const TRAMPOLINE: usize = 0x8000; +static TRAMPOLINE_DATA: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/trampoline")); + +pub(super) fn init(madt: Madt) { + let local_apic = unsafe { the_local_apic() }; + let me = local_apic.id(); + + if local_apic.x2 { + debug!(" X2APIC {}", me.get()); + } else { + debug!(" XAPIC {}: {:>08X}", me.get(), local_apic.address); + } + + if cfg!(not(feature = "multi_core")) { + return; + } + + // Map trampoline + let trampoline_frame = Frame::containing(PhysicalAddress::new(TRAMPOLINE)); + let trampoline_page = Page::containing_address(VirtualAddress::new(TRAMPOLINE)); + let (result, page_table_physaddr) = unsafe { + //TODO: do not have writable and executable! + let mut mapper = KernelMapper::lock_rw(); + + let result = mapper + .map_phys( + trampoline_page.start_address(), + trampoline_frame.base(), + PageFlags::new().execute(true).write(true), + ) + .expect("failed to map trampoline"); + + (result, mapper.table().phys().data()) + }; + result.flush(); + + // Write trampoline, make sure TRAMPOLINE page is free for use + for (i, val) in TRAMPOLINE_DATA.iter().enumerate() { + unsafe { + (*((TRAMPOLINE as *mut u8).add(i) as *const AtomicU8)).store(*val, Ordering::SeqCst); + } + } + + unsafe { + let preliminary_cpu_count = madt.iter().filter(|e| matches!(e, MadtEntry::LocalApic(entry) if u32::from(entry.id) == me.get() || entry.flags & 1 == 1)).count(); + crate::profiling::allocate(preliminary_cpu_count as u32); + } + + for madt_entry in madt.iter() { + debug!(" {:x?}", madt_entry); + if let MadtEntry::LocalApic(ap_local_apic) = madt_entry { + if u32::from(ap_local_apic.id) == me.get() { + debug!(" This is my local APIC"); + } else if ap_local_apic.flags & 1 == 1 { + let cpu_id = LogicalCpuId::next(); + + // Allocate a stack + let stack_start = RmmA::phys_to_virt( + allocate_p2frame(4) + .expect("no more frames in acpi stack_start") + .base(), + ) + .data(); + let stack_end = stack_start + (PAGE_SIZE << 4); + + let pcr_ptr = crate::arch::gdt::allocate_and_init_pcr(cpu_id, stack_end); + + let idt_ptr = crate::arch::idt::allocate_and_init_idt(cpu_id); + + let args = KernelArgsAp { + stack_end: stack_end as *mut u8, + cpu_id, + pcr_ptr, + idt_ptr, + }; + + let ap_ready = (TRAMPOLINE + 8) as *mut u64; + let ap_args_ptr = unsafe { ap_ready.add(1) }; + let ap_page_table = unsafe { ap_ready.add(2) }; + let ap_code = unsafe { ap_ready.add(3) }; + + // Set the ap_ready to 0, volatile + unsafe { + ap_ready.write(0); + ap_args_ptr.write(&args as *const _ as u64); + ap_page_table.write(page_table_physaddr as u64); + #[expect(clippy::fn_to_numeric_cast)] + ap_code.write(kstart_ap as u64); + + // TODO: Is this necessary (this fence)? + core::arch::asm!(""); + }; + AP_READY.store(false, Ordering::SeqCst); + + // Send INIT IPI + { + let mut icr = 0x4500; + if local_apic.x2 { + icr |= u64::from(ap_local_apic.id) << 32; + } else { + icr |= u64::from(ap_local_apic.id) << 56; + } + local_apic.set_icr(icr); + } + + // Send START IPI + { + let ap_segment = (TRAMPOLINE >> 12) & 0xFF; + let mut icr = 0x4600 | ap_segment as u64; + + if local_apic.x2 { + icr |= u64::from(ap_local_apic.id) << 32; + } else { + icr |= u64::from(ap_local_apic.id) << 56; + } + + local_apic.set_icr(icr); + } + + // Wait for trampoline ready + while unsafe { (*ap_ready.cast::()).load(Ordering::SeqCst) } == 0 { + hint::spin_loop(); + } + while !AP_READY.load(Ordering::SeqCst) { + hint::spin_loop(); + } + + RmmA::invalidate_all(); + } + } + } + + // Unmap trampoline + let (_frame, _, flush) = unsafe { + KernelMapper::lock_rw() + .unmap_phys(trampoline_page.start_address()) + .expect("failed to unmap trampoline page") + }; + flush.flush(); +} diff --git a/recipes/core/kernel/source/src/acpi/madt/arch/x86.rs.rej b/recipes/core/kernel/source/src/acpi/madt/arch/x86.rs.rej new file mode 100644 index 0000000000..d53ad0dcb6 --- /dev/null +++ b/recipes/core/kernel/source/src/acpi/madt/arch/x86.rs.rej @@ -0,0 +1,41 @@ +--- src/acpi/madt/arch/x86.rs ++++ src/acpi/madt/arch/x86.rs +@@ -446,11 +446,7 @@ + // Send INIT IPI (Assert) + { + let mut icr = 0x4500u64; +- if local_apic.x2 { +- icr |= u64::from(apic_id) << 32; +- } else { +- icr |= u64::from(apic_id as u8) << 56; +- } ++ icr |= u64::from(apic_id) << 32; + local_apic.set_icr(icr); + } + +@@ -460,11 +456,7 @@ + { + let ap_segment = (TRAMPOLINE >> 12) & 0xFF; + let mut icr = 0x0600u64 | ap_segment as u64; +- if local_apic.x2 { +- icr |= u64::from(apic_id) << 32; +- } else { +- icr |= u64::from(apic_id as u8) << 56; +- } ++ icr |= u64::from(apic_id) << 32; + local_apic.set_icr(icr); + } + +@@ -476,11 +468,7 @@ + { + let ap_segment = (TRAMPOLINE >> 12) & 0xFF; + let mut icr = 0x0600u64 | ap_segment as u64; +- if local_apic.x2 { +- icr |= u64::from(apic_id) << 32; +- } else { +- icr |= u64::from(apic_id as u8) << 56; +- } ++ icr |= u64::from(apic_id) << 32; + local_apic.set_icr(icr); + } + diff --git a/recipes/core/kernel/source/src/acpi/madt/mod.rs b/recipes/core/kernel/source/src/acpi/madt/mod.rs index ed68d6eea8..3159b9c497 100644 --- a/recipes/core/kernel/source/src/acpi/madt/mod.rs +++ b/recipes/core/kernel/source/src/acpi/madt/mod.rs @@ -34,12 +34,6 @@ impl Madt { let madt = Madt::new(find_one_sdt!("APIC")); if let Some(madt) = madt { - // Validate MADT checksum per ACPI 6.5 §5.2.2 - if !madt.sdt.validate_checksum() { - error!("MADT checksum validation failed, skipping APIC initialization"); - return; - } - // safe because no APs have been started yet. unsafe { MADT.get().write(Some(madt)) }; @@ -152,48 +146,6 @@ pub struct MadtGicd { _reserved2: [u8; 3], } -/// MADT Local x2APIC (entry type 0x9) -#[derive(Clone, Copy, Debug)] -#[repr(C, packed)] -pub struct MadtLocalX2Apic { - _reserved: u16, - pub x2apic_id: u32, - pub flags: u32, - pub processor_uid: u32, -} - -/// MADT Local APIC NMI (entry type 0x4) -#[derive(Clone, Copy, Debug)] -#[repr(C, packed)] -pub struct MadtLocalApicNmi { - pub processor: u8, - pub flags: u16, - pub nmi_pin: u8, -} - -/// MADT Local APIC address override (entry type 0x5) -#[derive(Clone, Copy, Debug)] -#[repr(C, packed)] -pub struct MadtLapicAddressOverride { - _reserved: u16, - pub local_apic_address: u64, -} - -/// MADT Local x2APIC NMI (entry type 0xA) -#[derive(Clone, Copy, Debug)] -#[repr(C, packed)] -pub struct MadtLocalX2ApicNmi { - _reserved: u16, - pub processor_uid: u32, - pub flags: u16, - pub nmi_pin: u8, - _reserved2: u8, -} - -const _: () = assert!(size_of::() == 4); -const _: () = assert!(size_of::() == 10); -const _: () = assert!(size_of::() == 10); - /// MADT Entries #[derive(Debug)] #[allow(dead_code)] @@ -204,18 +156,10 @@ pub enum MadtEntry { InvalidIoApic(usize), IntSrcOverride(&'static MadtIntSrcOverride), InvalidIntSrcOverride(usize), - LocalApicNmi(&'static MadtLocalApicNmi), - InvalidLocalApicNmi(usize), - LapicAddressOverride(&'static MadtLapicAddressOverride), - InvalidLapicAddressOverride(usize), Gicc(&'static MadtGicc), InvalidGicc(usize), Gicd(&'static MadtGicd), InvalidGicd(usize), - LocalX2Apic(&'static MadtLocalX2Apic), - InvalidLocalX2Apic(usize), - LocalX2ApicNmi(&'static MadtLocalX2ApicNmi), - InvalidLocalX2ApicNmi(usize), Unknown(u8), } @@ -232,10 +176,6 @@ impl Iterator for MadtIter { let entry_len = unsafe { *(self.sdt.data_address() as *const u8).add(self.i + 1) } as usize; - if entry_len < 2 { - return None; - } - if self.i + entry_len <= self.sdt.data_len() { let item = match entry_type { 0x0 => { @@ -266,46 +206,6 @@ impl Iterator for MadtIter { MadtEntry::InvalidIntSrcOverride(entry_len) } } - 0x4 => { - if entry_len == size_of::() + 2 { - MadtEntry::LocalApicNmi(unsafe { - &*((self.sdt.data_address() + self.i + 2) - as *const MadtLocalApicNmi) - }) - } else { - MadtEntry::InvalidLocalApicNmi(entry_len) - } - } - 0x5 => { - if entry_len == size_of::() + 2 { - MadtEntry::LapicAddressOverride(unsafe { - &*((self.sdt.data_address() + self.i + 2) - as *const MadtLapicAddressOverride) - }) - } else { - MadtEntry::InvalidLapicAddressOverride(entry_len) - } - } - 0x9 => { - if entry_len == size_of::() + 2 { - MadtEntry::LocalX2Apic(unsafe { - &*((self.sdt.data_address() + self.i + 2) - as *const MadtLocalX2Apic) - }) - } else { - MadtEntry::InvalidLocalX2Apic(entry_len) - } - } - 0xA => { - if entry_len == size_of::() + 2 { - MadtEntry::LocalX2ApicNmi(unsafe { - &*((self.sdt.data_address() + self.i + 2) - as *const MadtLocalX2ApicNmi) - }) - } else { - MadtEntry::InvalidLocalX2ApicNmi(entry_len) - } - } 0xB => { if entry_len >= size_of::() + 2 { MadtEntry::Gicc(unsafe { diff --git a/recipes/core/kernel/source/src/acpi/mod.rs b/recipes/core/kernel/source/src/acpi/mod.rs index d6a744ef90..59e3526544 100644 --- a/recipes/core/kernel/source/src/acpi/mod.rs +++ b/recipes/core/kernel/source/src/acpi/mod.rs @@ -20,8 +20,6 @@ mod rxsdt; pub mod sdt; #[cfg(target_arch = "aarch64")] mod spcr; -pub mod slit; -pub mod srat; mod xsdt; unsafe fn map_linearly(addr: PhysicalAddress, len: usize, mapper: &mut crate::memory::PageMapper) { @@ -84,14 +82,6 @@ impl Rxsdt for RxsdtEnum { pub static RXSDT_ENUM: Once = Once::new(); -#[derive(Clone, Copy, Debug)] -pub struct AcpiRootInfo { - pub revision: u8, - pub root_sdt_address: PhysicalAddress, -} - -pub static ACPI_ROOT_INFO: Once = Once::new(); - /// Parse the ACPI tables to gather CPU, interrupt, and timer information pub unsafe fn init(already_supplied_rsdp: Option<*const u8>) { unsafe { @@ -104,15 +94,6 @@ pub unsafe fn init(already_supplied_rsdp: Option<*const u8>) { let rsdp_opt = Rsdp::get_rsdp(already_supplied_rsdp); if let Some(rsdp) = rsdp_opt { - let root_info = ACPI_ROOT_INFO.call_once(|| AcpiRootInfo { - revision: rsdp.revision(), - root_sdt_address: rsdp.sdt_address(), - }); - - if root_info.root_sdt_address != rsdp.sdt_address() || root_info.revision != rsdp.revision() { - error!("ACPI_ROOT_INFO already initialized with a different RSDP root"); - } - debug!("SDT address: {:#x}", rsdp.sdt_address().data()); let rxsdt = get_sdt(rsdp.sdt_address(), &mut KernelMapper::lock_rw()); @@ -165,14 +146,7 @@ pub unsafe fn init(already_supplied_rsdp: Option<*const u8>) { // TODO: Enumerate processors in userspace, and then provide an ACPI-independent interface // to initialize enumerated processors to userspace? - // Parse SRAT BEFORE MADT so NUMA node mapping is available - // when APs are started and PercpuBlocks are created. - srat::init(); - Madt::init(); - - // Parse SLIT after MADT for the NUMA distance matrix. - slit::init(); //TODO: support this on any arch // SPCR must be initialized after MADT for interrupt controllers #[cfg(target_arch = "aarch64")] diff --git a/recipes/core/kernel/source/src/acpi/rsdp.rs b/recipes/core/kernel/source/src/acpi/rsdp.rs index 94e8603743..f10c5ac989 100644 --- a/recipes/core/kernel/source/src/acpi/rsdp.rs +++ b/recipes/core/kernel/source/src/acpi/rsdp.rs @@ -17,33 +17,9 @@ pub struct Rsdp { impl Rsdp { pub unsafe fn get_rsdp(already_supplied_rsdp: Option<*const u8>) -> Option { - already_supplied_rsdp.and_then(|rsdp_ptr| { - let rsdp = unsafe { *(rsdp_ptr as *const Rsdp) }; - - // Validate signature "RSD PTR " - if &rsdp.signature != b"RSD PTR " { - return None; - } - - // ACPI 1.0 checksum: sum of first 20 bytes must be zero - let bytes_v1 = unsafe { core::slice::from_raw_parts(rsdp_ptr, 20) }; - if bytes_v1.iter().fold(0u8, |sum, &b| sum.wrapping_add(b)) != 0 { - return None; - } - - // ACPI 2.0+ extended checksum: sum of entire table (length bytes) must be zero - if rsdp.revision >= 2 { - let full_len = rsdp._length as usize; - if full_len < 36 || full_len > 256 { - return None; - } - let bytes_full = unsafe { core::slice::from_raw_parts(rsdp_ptr, full_len) }; - if bytes_full.iter().fold(0u8, |sum, &b| sum.wrapping_add(b)) != 0 { - return None; - } - } - - Some(rsdp) + already_supplied_rsdp.map(|rsdp_ptr| { + // TODO: Validate + unsafe { *(rsdp_ptr as *const Rsdp) } }) } @@ -55,8 +31,4 @@ impl Rsdp { self.rsdt_address as usize }) } - - pub fn revision(&self) -> u8 { - self.revision - } } diff --git a/recipes/core/kernel/source/src/acpi/sdt.rs b/recipes/core/kernel/source/src/acpi/sdt.rs index 2f1f54cd9b..83ff67dac1 100644 --- a/recipes/core/kernel/source/src/acpi/sdt.rs +++ b/recipes/core/kernel/source/src/acpi/sdt.rs @@ -24,20 +24,4 @@ impl Sdt { let header_size = size_of::(); total_size.saturating_sub(header_size) } - - /// Validate the SDT checksum. - /// - /// Per ACPI 6.5 §5.2.2: the entire table (including the checksum field) - /// must sum to 0 when all bytes are added together as unsigned 8-bit values. - pub fn validate_checksum(&self) -> bool { - let ptr = self as *const _ as *const u8; - let len = self.length as usize; - if len < size_of::() { - return false; - } - let sum = unsafe { core::slice::from_raw_parts(ptr, len) } - .iter() - .fold(0u8, |acc, &b| acc.wrapping_add(b)); - sum == 0 - } } diff --git a/recipes/core/kernel/source/src/acpi/slit.rs b/recipes/core/kernel/source/src/acpi/slit.rs deleted file mode 100644 index 605f303390..0000000000 --- a/recipes/core/kernel/source/src/acpi/slit.rs +++ /dev/null @@ -1,45 +0,0 @@ -//! SLIT (System Locality Information Table) parser. -//! -//! Parses the NUMA distance matrix for scheduler NUMA-aware work stealing. - -use super::sdt::Sdt; -use crate::acpi::find_sdt; - -const MAX_NODES: usize = 8; - -static mut SLIT_MATRIX: [[u8; MAX_NODES]; MAX_NODES] = [[10u8; MAX_NODES]; MAX_NODES]; -static mut SLIT_NUM_NODES: usize = 0; -static mut SLIT_AVAILABLE: bool = false; - -pub fn is_available() -> bool { unsafe { SLIT_AVAILABLE } } -pub fn num_nodes() -> usize { unsafe { SLIT_NUM_NODES } } - -pub fn distance(from: u8, to: u8) -> u8 { - if !unsafe { SLIT_AVAILABLE } { return 10; } - let (from, to) = (from as usize, to as usize); - if from >= MAX_NODES || to >= MAX_NODES { return 10; } - unsafe { SLIT_MATRIX[from][to] } -} - -pub fn same_socket(node1: u8, node2: u8) -> bool { distance(node1, node2) <= 20 } - -pub fn init() { - let sdt = match find_sdt("SLIT").as_slice() { - [] => return, - [x] => *x, - xs => { println!("SLIT: {} tables found, expected 1", xs.len()); return; } - }; - if &sdt.signature != b"SLIT" { return; } - let data_addr = sdt.data_address(); - let data_len = sdt.data_len(); - if data_len < 8 { return; } - let num_nodes = unsafe { *(data_addr as *const u64) } as usize; - if num_nodes == 0 || num_nodes > MAX_NODES { println!("SLIT: {num_nodes} nodes (max {MAX_NODES}), ignoring"); return; } - let matrix_start = 8; - let matrix_size = num_nodes * num_nodes; - if data_len < matrix_start + matrix_size { println!("SLIT: matrix truncated ({data_len} < {})", matrix_start + matrix_size); return; } - let matrix = unsafe { &mut SLIT_MATRIX }; - for i in 0..num_nodes { for j in 0..num_nodes { matrix[i][j] = unsafe { *((data_addr + matrix_start + i * num_nodes + j) as *const u8) }; } } - unsafe { SLIT_NUM_NODES = num_nodes; SLIT_AVAILABLE = true; } - debug!("SLIT: {} nodes, distance matrix loaded", num_nodes); -} diff --git a/recipes/core/kernel/source/src/acpi/srat.rs b/recipes/core/kernel/source/src/acpi/srat.rs deleted file mode 100644 index 49b3ac0ac7..0000000000 --- a/recipes/core/kernel/source/src/acpi/srat.rs +++ /dev/null @@ -1,102 +0,0 @@ -//! SRAT (System Resource Affinity Table) parser. -//! -//! Parses CPU-to-NUMA-node and memory-to-NUMA-node affinity information. -//! Called before MADT init so that NUMA data is available during AP startup. - -use super::sdt::Sdt; -use crate::acpi::find_sdt; - -const MAX_CPU_ENTRIES: usize = 256; -const MAX_MEM_ENTRIES: usize = 64; - -#[derive(Clone, Copy)] -struct SratCpuEntry { apic_id: u32, node: u8, enabled: bool } - -#[derive(Clone, Copy)] -struct SratMemEntry { node: u8, base: u64, length: u64, enabled: bool } - -const CPU_NONE: SratCpuEntry = SratCpuEntry { apic_id: u32::MAX, node: 0, enabled: false }; -const MEM_NONE: SratMemEntry = SratMemEntry { node: 0, base: 0, length: 0, enabled: false }; - -static mut SRAT_CPU_ENTRIES: [SratCpuEntry; MAX_CPU_ENTRIES] = [CPU_NONE; MAX_CPU_ENTRIES]; -static mut SRAT_MEM_ENTRIES: [SratMemEntry; MAX_MEM_ENTRIES] = [MEM_NONE; MAX_MEM_ENTRIES]; -static mut SRAT_CPU_COUNT: usize = 0; -static mut SRAT_MEM_COUNT: usize = 0; -static mut SRAT_AVAILABLE: bool = false; - -pub fn is_available() -> bool { unsafe { SRAT_AVAILABLE } } - -pub fn numa_node_for_apic(apic_id: u32) -> Option { - if !unsafe { SRAT_AVAILABLE } { return None; } - let count = unsafe { SRAT_CPU_COUNT }; - let entries = unsafe { &SRAT_CPU_ENTRIES }; - for i in 0..count { - if entries[i].apic_id == apic_id && entries[i].enabled { return Some(entries[i].node); } - } - None -} - -pub fn numa_node_count() -> usize { - if !unsafe { SRAT_AVAILABLE } { return 1; } - let mut max_node: u8 = 0; - let count = unsafe { SRAT_CPU_COUNT }; - let entries = unsafe { &SRAT_CPU_ENTRIES }; - for i in 0..count { if entries[i].enabled && entries[i].node > max_node { max_node = entries[i].node; } } - (max_node as usize) + 1 -} - -#[repr(C, packed)] -struct SratLocalApic { _proximity_lo: u8, apic_id: u8, flags: u32, _local_sapic_eid: u8, _proximity_hi: [u8; 3], _clock_domain: u32 } - -#[repr(C, packed)] -struct SratMemoryAffinity { proximity_domain: u32, _reserved1: u16, base_address_lo: u32, base_address_hi: u32, length_lo: u32, length_hi: u32, _reserved2: u32, flags: u32, _reserved3: u64 } - -#[repr(C, packed)] -struct SratLocalX2Apic { _reserved: u16, proximity_domain: u32, x2apic_id: u32, flags: u32, _clock_domain: u32, _reserved2: u32 } - -pub fn init() { - let sdt = match find_sdt("SRAT").as_slice() { - [] => return, - [x] => *x, - xs => { println!("SRAT: {} tables found, expected 1", xs.len()); return; } - }; - if &sdt.signature != b"SRAT" { return; } - let data_addr = sdt.data_address(); - let data_len = sdt.data_len(); - if data_len < 12 { println!("SRAT: table too short ({data_len} bytes)"); return; } - let mut offset: usize = 12; - let cpu_entries = unsafe { &mut SRAT_CPU_ENTRIES }; - let mem_entries = unsafe { &mut SRAT_MEM_ENTRIES }; - let mut cpu_count: usize = 0; - let mut mem_count: usize = 0; - while offset + 2 <= data_len { - let entry_type = unsafe { *((data_addr + offset) as *const u8) }; - let entry_len = unsafe { *((data_addr + offset + 1) as *const u8) } as usize; - if entry_len < 2 || offset + entry_len > data_len { break; } - let entry_data = data_addr + offset + 2; - match entry_type { - 0x0 if entry_len >= size_of::() + 2 => { - let e = unsafe { &*(entry_data as *const SratLocalApic) }; - let enabled = (e.flags & 1) == 1; - let node = (e._proximity_lo as u32) | ((e._proximity_hi[0] as u32) << 8) | ((e._proximity_hi[1] as u32) << 16) | ((e._proximity_hi[2] as u32) << 24); - if cpu_count < MAX_CPU_ENTRIES { cpu_entries[cpu_count] = SratCpuEntry { apic_id: e.apic_id as u32, node: node as u8, enabled }; cpu_count += 1; } - } - 0x1 if entry_len >= size_of::() + 2 => { - let e = unsafe { &*(entry_data as *const SratMemoryAffinity) }; - let enabled = (e.flags & 1) == 1; - let base = (e.base_address_hi as u64) << 32 | e.base_address_lo as u64; - let length = (e.length_hi as u64) << 32 | e.length_lo as u64; - if mem_count < MAX_MEM_ENTRIES { mem_entries[mem_count] = SratMemEntry { node: e.proximity_domain as u8, base, length, enabled }; mem_count += 1; } - } - 0x2 if entry_len >= size_of::() + 2 => { - let e = unsafe { &*(entry_data as *const SratLocalX2Apic) }; - let enabled = (e.flags & 1) == 1; - if cpu_count < MAX_CPU_ENTRIES { cpu_entries[cpu_count] = SratCpuEntry { apic_id: e.x2apic_id, node: e.proximity_domain as u8, enabled }; cpu_count += 1; } - } - _ => {} - } - offset += entry_len; - } - unsafe { SRAT_CPU_COUNT = cpu_count; SRAT_MEM_COUNT = mem_count; SRAT_AVAILABLE = true; } - debug!("SRAT: {} CPU entries, {} memory entries", cpu_count, mem_count); -} diff --git a/recipes/core/kernel/source/src/allocator/mod.rs b/recipes/core/kernel/source/src/allocator/mod.rs index aaa719635e..4fdb0ba16e 100644 --- a/recipes/core/kernel/source/src/allocator/mod.rs +++ b/recipes/core/kernel/source/src/allocator/mod.rs @@ -7,40 +7,26 @@ mod linked_list; /// Size of kernel heap const KERNEL_HEAP_SIZE: usize = ::rmm::MEGABYTE; -#[cold] -fn halt_kernel_heap_init(message: &str) -> ! { - print!("{message}"); - println!("Kernel heap initialization cannot continue. Halting."); - loop { - core::hint::spin_loop(); - } -} - unsafe fn map_heap(mapper: &mut KernelMapper, offset: usize, size: usize) { let mut flush_all = PageFlushAll::new(); let heap_start_page = Page::containing_address(VirtualAddress::new(offset)); let heap_end_page = Page::containing_address(VirtualAddress::new(offset + size - 1)); for page in Page::range_inclusive(heap_start_page, heap_end_page) { - let phys = match mapper.allocator_mut().allocate_one() { - Some(phys) => phys, - None => halt_kernel_heap_init( - "FATAL: failed to allocate physical frame for kernel heap\n", - ), - }; + let phys = mapper + .allocator_mut() + .allocate_one() + .expect("failed to allocate kernel heap"); let flush = unsafe { - match mapper.map_phys( - page.start_address(), - phys, - PageFlags::new() - .write(true) - .global(cfg!(not(feature = "pti"))), - ) { - Some(flush) => flush, - None => halt_kernel_heap_init( - "FATAL: failed to map kernel heap virtual page\n", - ), - } + mapper + .map_phys( + page.start_address(), + phys, + PageFlags::new() + .write(true) + .global(cfg!(not(feature = "pti"))), + ) + .expect("failed to map kernel heap") }; flush_all.consume(flush); } diff --git a/recipes/core/kernel/source/src/arch/aarch64/start.rs b/recipes/core/kernel/source/src/arch/aarch64/start.rs index 65e3fe339b..e1c8cfb4ae 100644 --- a/recipes/core/kernel/source/src/arch/aarch64/start.rs +++ b/recipes/core/kernel/source/src/arch/aarch64/start.rs @@ -91,7 +91,7 @@ unsafe extern "C" fn start(args_ptr: *const KernelArgs) -> ! { dtb::serial::init_early(dtb); } - info!("RedBear OS starting..."); + info!("Redox OS starting..."); args.print(); // Initialize RMM diff --git a/recipes/core/kernel/source/src/arch/riscv64/start.rs b/recipes/core/kernel/source/src/arch/riscv64/start.rs index a825536aa9..2551968f05 100644 --- a/recipes/core/kernel/source/src/arch/riscv64/start.rs +++ b/recipes/core/kernel/source/src/arch/riscv64/start.rs @@ -97,7 +97,7 @@ unsafe extern "C" fn start(args_ptr: *const KernelArgs) -> ! { init_early(dtb); } - info!("RedBear OS starting..."); + info!("Redox OS starting..."); args.print(); if let Some(dtb) = &dtb { diff --git a/recipes/core/kernel/source/src/arch/x86_shared/device/ioapic.rs b/recipes/core/kernel/source/src/arch/x86_shared/device/ioapic.rs index b7656dba57..fb66d3bf2b 100644 --- a/recipes/core/kernel/source/src/arch/x86_shared/device/ioapic.rs +++ b/recipes/core/kernel/source/src/arch/x86_shared/device/ioapic.rs @@ -14,10 +14,6 @@ pub struct IoApicRegs { pointer: *const u32, } impl IoApicRegs { - fn redirection_index_valid(&mut self, idx: u8) -> bool { - idx <= self.max_redirection_table_entries() - } - fn ioregsel(&self) -> *const u32 { self.pointer } @@ -48,28 +44,21 @@ impl IoApicRegs { pub fn read_ioapicver(&mut self) -> u32 { self.read_reg(0x01) } - pub fn read_ioredtbl(&mut self, idx: u8) -> Option { - if !self.redirection_index_valid(idx) { - warn!("IOAPIC read_ioredtbl index {} out of range", idx); - return None; - } + pub fn read_ioredtbl(&mut self, idx: u8) -> u64 { + assert!(idx < 24); let lo = self.read_reg(0x10 + idx * 2); let hi = self.read_reg(0x10 + idx * 2 + 1); - Some(u64::from(lo) | (u64::from(hi) << 32)) + u64::from(lo) | (u64::from(hi) << 32) } - pub fn write_ioredtbl(&mut self, idx: u8, value: u64) -> bool { - if !self.redirection_index_valid(idx) { - warn!("IOAPIC write_ioredtbl index {} out of range", idx); - return false; - } + pub fn write_ioredtbl(&mut self, idx: u8, value: u64) { + assert!(idx < 24); let lo = value as u32; let hi = (value >> 32) as u32; self.write_reg(0x10 + idx * 2, lo); self.write_reg(0x10 + idx * 2 + 1, hi); - true } pub fn max_redirection_table_entries(&mut self) -> u8 { @@ -103,37 +92,17 @@ impl IoApic { } /// Map an interrupt vector to a physical local APIC ID of a processor (thus physical mode). #[allow(dead_code)] - pub fn map(&self, idx: u8, info: MapInfo) -> bool { - let Some(raw) = info.as_raw() else { - return false; - }; - self.regs.lock().write_ioredtbl(idx, raw) + pub fn map(&self, idx: u8, info: MapInfo) { + self.regs.lock().write_ioredtbl(idx, info.as_raw()) } pub fn set_mask(&self, gsi: u32, mask: bool) { let idx = (gsi - self.gsi_start) as u8; let mut guard = self.regs.lock(); - let Some(mut reg) = guard.read_ioredtbl(idx) else { - return; - }; + let mut reg = guard.read_ioredtbl(idx); reg &= !(1 << 16); reg |= u64::from(mask) << 16; - let _ = guard.write_ioredtbl(idx, reg); - } - /// Change the destination APIC for a GSI by reprogramming the redirection table entry. - /// Preserves all other fields (vector, polarity, trigger mode, delivery mode, mask). - /// Returns true if the entry was successfully updated. - pub fn set_irq_affinity(&self, gsi: u32, dest: ApicId) -> bool { - let idx = (gsi - self.gsi_start) as u8; - let mut guard = self.regs.lock(); - let Some(mut entry) = guard.read_ioredtbl(idx) else { - return false; - }; - // Clear destination field (bits 63:56 for xAPIC physical mode) - // and set new destination APIC ID - entry &= !(0xFF_u64 << 56); - entry |= u64::from(dest.get()) << 56; - guard.write_ioredtbl(idx, entry) + guard.write_ioredtbl(idx, reg); } } @@ -180,26 +149,19 @@ pub struct MapInfo { } impl MapInfo { - pub fn as_raw(&self) -> Option { - if !(0x20..=0xFE).contains(&self.vector) { - warn!( - "Refusing to map IOAPIC vector outside valid range: {:#x}", - self.vector - ); - return None; - } + pub fn as_raw(&self) -> u64 { + assert!(self.vector >= 0x20); + assert!(self.vector <= 0xFE); // TODO: Check for reserved fields. - Some( - (u64::from(self.dest.get()) << 56) + (u64::from(self.dest.get()) << 56) | (u64::from(self.mask) << 16) | ((self.trigger_mode as u64) << 15) | ((self.polarity as u64) << 13) | ((self.dest_mode as u64) << 11) | ((self.delivery_mode as u64) << 8) - | u64::from(self.vector), - ) + | u64::from(self.vector) } } @@ -213,7 +175,7 @@ impl fmt::Debug for IoApic { let count = guard.max_redirection_table_entries(); f.debug_list() - .entries((0..=count).filter_map(|i| guard.read_ioredtbl(i))) + .entries((0..count).map(|i| guard.read_ioredtbl(i))) .finish() } } @@ -275,14 +237,11 @@ pub unsafe fn handle_ioapic(madt_ioapic: &'static MadtIoApic) { let ioapic_registers = virt.data() as *const u32; let ioapic = IoApic::new(ioapic_registers, madt_ioapic.gsi_base); - let detected_id = ioapic.regs.lock().id(); - if detected_id != madt_ioapic.id { - warn!( - "mismatched ACPI MADT I/O APIC ID: MADT={}, IOAPIC={}; continuing with detected hardware", - madt_ioapic.id, - detected_id - ); - } + assert_eq!( + ioapic.regs.lock().id(), + madt_ioapic.id, + "mismatched ACPI MADT I/O APIC ID, and the ID reported by the I/O APIC" + ); (*IOAPICS.get()).get_or_insert_with(Vec::new).push(ioapic); } @@ -351,11 +310,11 @@ pub unsafe fn init() { } } } - for ioapic in ioapics() { - for idx in 0..=ioapic.count { - ioapic.set_mask(ioapic.gsi_start + u32::from(idx), true); - } - } + println!( + "I/O APICs: {:?}, overrides: {:?}", + ioapics(), + src_overrides() + ); // map the legacy PC-compatible IRQs (0-15) to 32-47, just like we did with 8259 PIC (if it // wouldn't have been disabled due to this I/O APIC) @@ -370,6 +329,7 @@ pub unsafe fn init() { .iter() .any(|over| over.bus_irq == legacy_irq) { + // there's an IRQ conflict, making this legacy IRQ inaccessible. continue; } ( @@ -389,6 +349,7 @@ pub unsafe fn init() { let redir_tbl_index = (gsi - apic.gsi_start) as u8; let map_info = MapInfo { + // only send to the BSP dest: bsp_apic_id, dest_mode: DestinationMode::Physical, delivery_mode: DeliveryMode::Fixed, @@ -405,32 +366,7 @@ pub unsafe fn init() { }, vector: 32 + legacy_irq, }; - if !apic.map(redir_tbl_index, map_info) { - warn!( - "Unable to map legacy IRQ {} (GSI {}) through IOAPIC index {}", - legacy_irq, - gsi, - redir_tbl_index - ); - } - - if legacy_irq == 0 && gsi != u32::from(legacy_irq) { - if let Some(apic0) = find_ioapic(u32::from(legacy_irq)) { - let idx0 = (u32::from(legacy_irq) - apic0.gsi_start) as u8; - let _ = apic0.map( - idx0, - MapInfo { - dest: bsp_apic_id, - dest_mode: DestinationMode::Physical, - delivery_mode: DeliveryMode::Fixed, - mask: false, - polarity: ApicPolarity::ActiveHigh, - trigger_mode: ApicTriggerMode::Edge, - vector: 32, - }, - ); - } - } + apic.map(redir_tbl_index, map_info); } println!( "I/O APICs: {:?}, overrides: {:?}", @@ -470,7 +406,7 @@ fn resolve(irq: u8) -> u32 { fn find_ioapic(gsi: u32) -> Option<&'static IoApic> { ioapics() .iter() - .find(|apic| gsi >= apic.gsi_start && gsi <= apic.gsi_start + u32::from(apic.count)) + .find(|apic| gsi >= apic.gsi_start && gsi < apic.gsi_start + u32::from(apic.count)) } pub unsafe fn mask(irq: u8) { @@ -489,14 +425,3 @@ pub unsafe fn unmask(irq: u8) { }; apic.set_mask(gsi, false); } - -/// Change the destination CPU for an IRQ by reprogramming the IOAPIC redirection entry. -/// Resolves the legacy IRQ to its GSI, finds the owning IOAPIC, and updates the destination -/// APIC ID in the redirection table while preserving all other fields. -pub unsafe fn set_affinity(irq: u8, dest: ApicId) -> bool { - let gsi = resolve(irq); - match find_ioapic(gsi) { - Some(apic) => apic.set_irq_affinity(gsi, dest), - None => false, - } -} diff --git a/recipes/core/kernel/source/src/arch/x86_shared/device/local_apic.rs.before b/recipes/core/kernel/source/src/arch/x86_shared/device/local_apic.rs.before new file mode 100644 index 0000000000..87c5a31ff3 --- /dev/null +++ b/recipes/core/kernel/source/src/arch/x86_shared/device/local_apic.rs.before @@ -0,0 +1,312 @@ +use core::{ + cell::SyncUnsafeCell, + ptr::{read_volatile, write_volatile}, +}; +use x86::msr::*; + +use crate::{ + arch::{cpuid::cpuid, ipi::IpiKind}, + memory::{map_device_memory, PhysicalAddress}, + percpu::PercpuBlock, +}; + +#[derive(Clone, Copy, Debug)] +pub struct ApicId(u32); + +impl ApicId { + pub fn new(inner: u32) -> Self { + Self(inner) + } + + pub fn get(&self) -> u32 { + self.0 + } +} + +static LOCAL_APIC: SyncUnsafeCell = SyncUnsafeCell::new(LocalApic { + address: 0, + x2: false, +}); +pub unsafe fn the_local_apic() -> &'static mut LocalApic { + unsafe { &mut *LOCAL_APIC.get() } +} + +pub unsafe fn init() { + unsafe { + the_local_apic().init(); + } +} + +pub unsafe fn init_ap() { + unsafe { + the_local_apic().init_ap(); + } +} + +/// Local APIC +pub struct LocalApic { + pub address: usize, + pub x2: bool, +} + +impl LocalApic { + unsafe fn init(&mut self) { + unsafe { + let physaddr = PhysicalAddress::new(rdmsr(IA32_APIC_BASE) as usize & 0xFFFF_0000); + + self.x2 = cpuid() + .get_feature_info() + .is_some_and(|feature_info| feature_info.has_x2apic()); + + if !self.x2 { + info!("Detected xAPIC at {:#x}", physaddr.data()); + self.address = map_device_memory(physaddr, 4096).data(); + } else { + info!("Detected x2APIC"); + } + + self.init_ap(); + } + } + + unsafe fn init_ap(&mut self) { + unsafe { + if self.x2 { + wrmsr(IA32_APIC_BASE, rdmsr(IA32_APIC_BASE) | (1 << 10)); + wrmsr(IA32_X2APIC_SIVR, 0x100); + } else { + self.write(0xF0, 0x100); + } + self.setup_error_int(); + //self.setup_timer(); + + PercpuBlock::current() + .misc_arch_info + .apic_id_opt + .set(Some(self.id())); + } + } + + unsafe fn read(&self, reg: u32) -> u32 { + debug_assert!(!self.x2); + unsafe { read_volatile((self.address + reg as usize) as *const u32) } + } + + unsafe fn write(&mut self, reg: u32, value: u32) { + debug_assert!(!self.x2); + unsafe { + write_volatile((self.address + reg as usize) as *mut u32, value); + } + } + + pub fn id(&self) -> ApicId { + ApicId::new(if self.x2 { + unsafe { rdmsr(IA32_X2APIC_APICID) as u32 } + } else { + unsafe { self.read(0x20) >> 24 } + }) + } + + pub fn version(&self) -> u32 { + if self.x2 { + unsafe { rdmsr(IA32_X2APIC_VERSION) as u32 } + } else { + unsafe { self.read(0x30) } + } + } + + pub fn icr(&self) -> u64 { + if self.x2 { + unsafe { rdmsr(IA32_X2APIC_ICR) } + } else { + unsafe { ((self.read(0x310) as u64) << 32) | self.read(0x300) as u64 } + } + } + + pub fn set_icr(&mut self, value: u64) { + if self.x2 { + unsafe { + const PENDING: u32 = 1 << 12; + while (rdmsr(IA32_X2APIC_ICR) as u32) & PENDING == PENDING { + core::hint::spin_loop(); + } + wrmsr(IA32_X2APIC_ICR, value); + while (rdmsr(IA32_X2APIC_ICR) as u32) & PENDING == PENDING { + core::hint::spin_loop(); + } + } + } else { + unsafe { + const PENDING: u32 = 1 << 12; + while self.read(0x300) & PENDING == PENDING { + core::hint::spin_loop(); + } + self.write(0x310, (value >> 32) as u32); + self.write(0x300, value as u32); + while self.read(0x300) & PENDING == PENDING { + core::hint::spin_loop(); + } + } + } + } + + pub fn ipi(&mut self, apic_id: ApicId, kind: IpiKind) { + let shift = if self.x2 { 32 } else { 56 }; + self.set_icr((u64::from(apic_id.get()) << shift) | 0x40 | kind as u64); + } + pub fn ipi_nmi(&mut self, apic_id: ApicId) { + let shift = if self.x2 { 32 } else { 56 }; + self.set_icr((u64::from(apic_id.get()) << shift) | (1 << 14) | (0b100 << 8)); + } + + pub unsafe fn eoi(&mut self) { + unsafe { + if self.x2 { + wrmsr(IA32_X2APIC_EOI, 0); + } else { + self.write(0xB0, 0); + } + } + } + /// Reads the Error Status Register. + pub unsafe fn esr(&mut self) -> u32 { + unsafe { + if self.x2 { + // update the ESR to the current state of the local apic. + wrmsr(IA32_X2APIC_ESR, 0); + // read the updated value + rdmsr(IA32_X2APIC_ESR) as u32 + } else { + self.write(0x280, 0); + self.read(0x280) + } + } + } + pub unsafe fn lvt_timer(&mut self) -> u32 { + unsafe { + if self.x2 { + rdmsr(IA32_X2APIC_LVT_TIMER) as u32 + } else { + self.read(0x320) + } + } + } + pub unsafe fn set_lvt_timer(&mut self, value: u32) { + unsafe { + if self.x2 { + wrmsr(IA32_X2APIC_LVT_TIMER, u64::from(value)); + } else { + self.write(0x320, value); + } + } + } + pub unsafe fn init_count(&mut self) -> u32 { + unsafe { + if self.x2 { + rdmsr(IA32_X2APIC_INIT_COUNT) as u32 + } else { + self.read(0x380) + } + } + } + pub unsafe fn set_init_count(&mut self, initial_count: u32) { + unsafe { + if self.x2 { + wrmsr(IA32_X2APIC_INIT_COUNT, u64::from(initial_count)); + } else { + self.write(0x380, initial_count); + } + } + } + pub unsafe fn cur_count(&mut self) -> u32 { + unsafe { + if self.x2 { + rdmsr(IA32_X2APIC_CUR_COUNT) as u32 + } else { + self.read(0x390) + } + } + } + pub unsafe fn div_conf(&mut self) -> u32 { + unsafe { + if self.x2 { + rdmsr(IA32_X2APIC_DIV_CONF) as u32 + } else { + self.read(0x3E0) + } + } + } + pub unsafe fn set_div_conf(&mut self, div_conf: u32) { + unsafe { + if self.x2 { + wrmsr(IA32_X2APIC_DIV_CONF, u64::from(div_conf)); + } else { + self.write(0x3E0, div_conf); + } + } + } + pub unsafe fn lvt_error(&mut self) -> u32 { + unsafe { + if self.x2 { + rdmsr(IA32_X2APIC_LVT_ERROR) as u32 + } else { + self.read(0x370) + } + } + } + pub unsafe fn set_lvt_error(&mut self, lvt_error: u32) { + unsafe { + if self.x2 { + wrmsr(IA32_X2APIC_LVT_ERROR, u64::from(lvt_error)); + } else { + self.write(0x370, lvt_error); + } + } + } + + pub unsafe fn set_lvt_nmi(&mut self, pin: u8, flags: u16) { + let polarity = match flags & 0b11 { + 0b11 => 1 << 13, + _ => 0, + }; + let trigger_mode = match (flags >> 2) & 0b11 { + 0b11 => 1 << 15, + _ => 0, + }; + let lvt_value = (0b100 << 8) | polarity | trigger_mode; + + unsafe { + match pin { + 0 => { + if self.x2 { + wrmsr(IA32_X2APIC_LVT_LINT0, u64::from(lvt_value)); + } else { + self.write(0x350, lvt_value); + } + } + 1 => { + if self.x2 { + wrmsr(IA32_X2APIC_LVT_LINT1, u64::from(lvt_value)); + } else { + self.write(0x360, lvt_value); + } + } + _ => {} + } + } + } + + unsafe fn setup_error_int(&mut self) { + unsafe { + let vector = 49u32; + self.set_lvt_error(vector); + } + } +} + +#[repr(u8)] +pub enum LvtTimerMode { + OneShot = 0b00, + Periodic = 0b01, + TscDeadline = 0b10, +} diff --git a/recipes/core/kernel/source/src/arch/x86_shared/device/local_apic.rs.rej b/recipes/core/kernel/source/src/arch/x86_shared/device/local_apic.rs.rej new file mode 100644 index 0000000000..ebc5ff76fd --- /dev/null +++ b/recipes/core/kernel/source/src/arch/x86_shared/device/local_apic.rs.rej @@ -0,0 +1,14 @@ +--- src/arch/x86_shared/device/local_apic.rs ++++ src/arch/x86_shared/device/local_apic.rs +@@ -61,9 +61,9 @@ + + if !self.x2 { +- info!("Detected xAPIC at {:#x}", physaddr.data()); ++ debug!("Detected xAPIC at {:#x}", physaddr.data()); + self.address = map_device_memory(physaddr, 4096).data(); + } else { +- info!("Detected x2APIC"); ++ debug!("Detected x2APIC"); + } + + diff --git a/recipes/core/kernel/source/src/arch/x86_shared/device/mod.rs b/recipes/core/kernel/source/src/arch/x86_shared/device/mod.rs index a1e0b78ad0..6f41770601 100644 --- a/recipes/core/kernel/source/src/arch/x86_shared/device/mod.rs +++ b/recipes/core/kernel/source/src/arch/x86_shared/device/mod.rs @@ -4,11 +4,9 @@ pub mod cpu; pub mod hpet; pub mod ioapic; pub mod local_apic; -pub mod msi; pub mod pic; pub mod pit; pub mod serial; -pub mod vector; #[cfg(feature = "system76_ec_debug")] pub mod system76_ec; @@ -25,7 +23,8 @@ pub unsafe fn init() { } } pub unsafe fn init_after_acpi() { - unsafe { ioapic::init() }; + // this will disable the IOAPIC if needed. + //ioapic::init(mapper); } unsafe fn init_hpet() -> bool { diff --git a/recipes/core/kernel/source/src/arch/x86_shared/device/msi.rs b/recipes/core/kernel/source/src/arch/x86_shared/device/msi.rs deleted file mode 100644 index 94ab973a26..0000000000 --- a/recipes/core/kernel/source/src/arch/x86_shared/device/msi.rs +++ /dev/null @@ -1,183 +0,0 @@ -// MSI/MSI-X support for x86 — kernel-level message composition and validation -// Cross-referenced from Linux 7.0: arch/x86/kernel/apic/msi.c (391 lines) - -use crate::arch::device::local_apic::ApicId; - -pub const MSI_ADDRESS_BASE: u64 = 0xFEE0_0000; -pub const MSI_ADDRESS_MASK: u64 = 0xFEEF_F000; -const MSI_DEST_MODE_LOGICAL: u64 = 1 << 2; -const MSI_REDIRECTION_HINT: u64 = 1 << 3; - -#[derive(Debug, Clone, Copy)] -pub struct MsiAddress { - pub raw: u64, -} - -#[derive(Debug, Clone, Copy)] -pub struct MsiData { - pub raw: u32, -} - -#[derive(Debug, Clone)] -pub struct MsiMessage { - pub address: MsiAddress, - pub data: MsiData, -} - -impl MsiAddress { - pub fn new(dest_apic_id: u8, redirection_hint: bool, dest_mode_logical: bool) -> Self { - let mut addr = MSI_ADDRESS_BASE; - addr |= u64::from(dest_apic_id) << 12; - if redirection_hint { - addr |= MSI_REDIRECTION_HINT; - } - if dest_mode_logical { - addr |= MSI_DEST_MODE_LOGICAL; - } - Self { raw: addr } - } - - pub fn validate(addr: u64) -> bool { - (addr & MSI_ADDRESS_MASK) == MSI_ADDRESS_BASE - } - - pub fn dest_apic_id(&self) -> u8 { - ((self.raw >> 12) & 0xFF) as u8 - } -} - -impl MsiData { - pub fn new(vector: u8, delivery_mode: u8, trigger_mode: u8) -> Self { - let mut data = u32::from(vector); - data |= u32::from(delivery_mode) << 8; - data |= u32::from(trigger_mode) << 15; - Self { raw: data } - } - - pub fn vector(&self) -> u8 { - (self.raw & 0xFF) as u8 - } - - pub fn delivery_mode(&self) -> u8 { - ((self.raw >> 8) & 0x7) as u8 - } - - pub fn trigger_mode(&self) -> u8 { - ((self.raw >> 15) & 0x1) as u8 - } -} - -impl MsiMessage { - pub fn compose(dest: ApicId, vector: u8, delivery_mode: u8, trigger_mode: u8) -> Self { - let address = MsiAddress::new(dest.get() as u8, false, false); - let data = MsiData::new(vector, delivery_mode, trigger_mode); - Self { address, data } - } - - pub fn validate(&self) -> bool { - MsiAddress::validate(self.address.raw) - && self.data.vector() >= 32 - && self.data.vector() < 255 - } -} - -pub fn is_valid_msi_address(addr: u64) -> bool { - MsiAddress::validate(addr) -} - -pub fn is_valid_msi_vector(vector: u8) -> bool { - vector >= 32 && vector < 255 -} - -#[derive(Debug)] -pub struct MsiCapability { - pub msg_ctl: u16, - pub msg_addr_lo: u32, - pub msg_addr_hi: u32, - pub msg_data: u16, - pub mask_bits: u32, - pub pending_bits: u32, - pub is_64bit: bool, - pub is_maskable: bool, - pub multiple_message_capable: u8, -} - -impl MsiCapability { - pub fn parse(raw: &[u32; 6], msg_ctl: u16) -> Self { - Self { - msg_ctl, - msg_addr_lo: raw[1], - msg_addr_hi: if msg_ctl & (1 << 7) != 0 { raw[2] } else { 0 }, - msg_data: if msg_ctl & (1 << 7) != 0 { - (raw[3] & 0xFFFF) as u16 - } else { - (raw[2] & 0xFFFF) as u16 - }, - mask_bits: if msg_ctl & (1 << 8) != 0 { - if msg_ctl & (1 << 7) != 0 { - raw[3] >> 16 - } else { - raw[3] - } - } else { - 0 - }, - pending_bits: if msg_ctl & (1 << 8) != 0 { raw[4] } else { 0 }, - is_64bit: msg_ctl & (1 << 7) != 0, - is_maskable: msg_ctl & (1 << 8) != 0, - multiple_message_capable: ((msg_ctl >> 1) & 0x7) as u8, - } - } -} - -#[derive(Debug)] -pub struct MsixCapability { - pub msg_ctl: u16, - pub table_offset: u32, - pub table_bar: u8, - pub pba_offset: u32, - pub pba_bar: u8, - pub table_size: u16, -} - -impl MsixCapability { - pub fn parse(raw: &[u32; 3], msg_ctl: u16) -> Self { - Self { - msg_ctl, - table_offset: raw[1] & !0x7, - table_bar: (raw[1] & 0x7) as u8, - pba_offset: raw[2] & !0x7, - pba_bar: (raw[2] & 0x7) as u8, - table_size: ((msg_ctl >> 1) & 0x7FF) as u16 + 1, - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_compose_message() { - let msg = MsiMessage::compose(ApicId::new(3), 48, 0b101, 1); - assert!(msg.validate()); - assert_eq!(msg.address.dest_apic_id(), 3); - assert_eq!(msg.data.vector(), 48); - assert_eq!(msg.data.delivery_mode(), 0b101); - assert_eq!(msg.data.trigger_mode(), 1); - } - - #[test] - fn test_invalid_address() { - assert!(!is_valid_msi_address(0xDEAD_BEEF)); - assert!(is_valid_msi_address(0xFEE0_0000)); - } - - #[test] - fn test_msi_parse() { - let raw = [0u32; 6]; - let cap = MsiCapability::parse(&raw, 0); - assert!(!cap.is_64bit); - assert!(!cap.is_maskable); - } -} diff --git a/recipes/core/kernel/source/src/arch/x86_shared/device/vector.rs b/recipes/core/kernel/source/src/arch/x86_shared/device/vector.rs deleted file mode 100644 index cd59ac7965..0000000000 --- a/recipes/core/kernel/source/src/arch/x86_shared/device/vector.rs +++ /dev/null @@ -1,53 +0,0 @@ -use crate::cpu_set::LogicalCpuId; - -const VECTOR_COUNT: usize = 224; - -static VECTORS: [core::sync::atomic::AtomicU32; 7] = [ - core::sync::atomic::AtomicU32::new(0), - core::sync::atomic::AtomicU32::new(0), - core::sync::atomic::AtomicU32::new(0), - core::sync::atomic::AtomicU32::new(0), - core::sync::atomic::AtomicU32::new(0), - core::sync::atomic::AtomicU32::new(0), - core::sync::atomic::AtomicU32::new(0), -]; - -pub fn allocate_vector(_cpu: LogicalCpuId) -> Option { - for (bank, slot) in VECTORS.iter().enumerate() { - let mut bits = slot.load(core::sync::atomic::Ordering::Acquire); - loop { - let free = bits.trailing_ones() as usize; - if free >= 32 { - break; - } - let bit = 1u32 << free; - match slot.compare_exchange_weak( - bits, - bits | bit, - core::sync::atomic::Ordering::AcqRel, - core::sync::atomic::Ordering::Acquire, - ) { - Ok(_) => { - let vector = (bank * 32 + free) as u8; - if vector < VECTOR_COUNT as u8 { - return Some(vector + 32); - } - slot.fetch_and(!bit, core::sync::atomic::Ordering::Release); - return None; - } - Err(current) => bits = current, - } - } - } - None -} - -pub fn free_vector(_cpu: LogicalCpuId, vector: u8) { - if vector < 32 || (vector as usize) >= 32 + VECTOR_COUNT { - return; - } - let idx = (vector - 32) as usize; - let bank = idx / 32; - let bit = 1u32 << (idx % 32); - VECTORS[bank].fetch_and(!bit, core::sync::atomic::Ordering::Release); -} diff --git a/recipes/core/kernel/source/src/arch/x86_shared/gdt.rs b/recipes/core/kernel/source/src/arch/x86_shared/gdt.rs index f7acae35f3..cad344f3c2 100644 --- a/recipes/core/kernel/source/src/arch/x86_shared/gdt.rs +++ b/recipes/core/kernel/source/src/arch/x86_shared/gdt.rs @@ -192,15 +192,6 @@ impl ProcessorControlRegion { } } -#[cold] -fn halt_pcr_init() -> ! { - println!("FATAL: failed to allocate physical memory for Processor Control Region"); - println!("Processor startup cannot continue. Halting."); - loop { - core::hint::spin_loop(); - } -} - pub unsafe fn pcr() -> *mut ProcessorControlRegion { unsafe { // Primitive benchmarking of RDFSBASE and RDGSBASE in userspace, appears to indicate that @@ -384,10 +375,7 @@ pub fn allocate_and_init_pcr( .next_power_of_two() .trailing_zeros(); - let pcr_frame = match crate::memory::allocate_p2frame(alloc_order) { - Some(frame) => frame, - None => halt_pcr_init(), - }; + let pcr_frame = crate::memory::allocate_p2frame(alloc_order).expect("failed to allocate PCR"); let pcr_ptr = RmmA::phys_to_virt(pcr_frame.base()).data() as *mut ProcessorControlRegion; unsafe { core::ptr::write(pcr_ptr, ProcessorControlRegion::new_partial_init(cpu_id)) }; diff --git a/recipes/core/kernel/source/src/arch/x86_shared/idt.rs b/recipes/core/kernel/source/src/arch/x86_shared/idt.rs index d5af75ddf0..500645855d 100644 --- a/recipes/core/kernel/source/src/arch/x86_shared/idt.rs +++ b/recipes/core/kernel/source/src/arch/x86_shared/idt.rs @@ -78,15 +78,6 @@ static INIT_BSP_IDT: SyncUnsafeCell = SyncUnsafeCell::new(Idt::new()); pub(crate) static IDTS: RwLock> = RwLock::new(HashMap::with_hasher(DefaultHashBuilder::new())); -#[cold] -fn halt_idt_init() -> ! { - println!("FATAL: failed to allocate physical pages for backup interrupt stack"); - println!("Interrupt setup cannot continue. Halting."); - loop { - core::hint::spin_loop(); - } -} - #[inline] pub fn is_reserved(cpu_id: LogicalCpuId, index: u8) -> bool { if cpu_id == LogicalCpuId::BSP { @@ -110,8 +101,6 @@ pub fn set_reserved(cpu_id: LogicalCpuId, index: u8, reserved: bool) { } pub fn available_irqs_iter(cpu_id: LogicalCpuId) -> impl Iterator + 'static { - let count = (32..=254).filter(|&index| !is_reserved(cpu_id, index)).count(); - info!("available_irqs_iter: cpu_id={} count={}", cpu_id.get(), count); (32..=254).filter(move |&index| !is_reserved(cpu_id, index)) } @@ -172,10 +161,8 @@ pub fn allocate_and_init_idt(cpu_id: LogicalCpuId) -> *mut Idt { .or_insert_with(|| Box::leak(Box::new(Idt::new()))); use crate::memory::{RmmA, RmmArch}; - let frames = match crate::memory::allocate_p2frame(4) { - Some(frames) => frames, - None => halt_idt_init(), - }; + let frames = crate::memory::allocate_p2frame(4) + .expect("failed to allocate pages for backup interrupt stack"); // Physical pages are mapped linearly. So is the linearly mapped virtual memory. let base_address = RmmA::phys_to_virt(frames.base()); diff --git a/recipes/core/kernel/source/src/arch/x86_shared/interrupt/exception.rs b/recipes/core/kernel/source/src/arch/x86_shared/interrupt/exception.rs index bfe9f096a2..7725a45d0a 100644 --- a/recipes/core/kernel/source/src/arch/x86_shared/interrupt/exception.rs +++ b/recipes/core/kernel/source/src/arch/x86_shared/interrupt/exception.rs @@ -1,5 +1,3 @@ -use core::sync::atomic::{AtomicBool, Ordering}; - use syscall::Exception; use x86::irq::PageFaultError; @@ -12,22 +10,6 @@ use crate::{ syscall::flag::*, }; -static NMI_IN_PROGRESS: AtomicBool = AtomicBool::new(false); - -unsafe fn nmi_raw_serial_write(bytes: &[u8]) { - use crate::syscall::io::{Io, Pio}; - - let mut com1 = Pio::::new(0x3F8); - let lsr = Pio::::new(0x3F8 + 5); - - for &byte in bytes { - while lsr.read() & (1 << 5) == 0 { - core::hint::spin_loop(); - } - com1.write(byte); - } -} - interrupt_stack!(divide_by_zero, |stack| { println!("Divide by zero"); stack.trace(); @@ -73,35 +55,9 @@ interrupt_stack!(non_maskable, @paranoid, |stack| { #[cfg(not(all(target_arch = "x86_64", feature = "profiling")))] { - if NMI_IN_PROGRESS.swap(true, Ordering::SeqCst) { - return; - } - - unsafe { - nmi_raw_serial_write(b"Non-maskable interrupt\n"); - nmi_raw_serial_write(b" RIP: "); - - #[cfg(target_arch = "x86")] - let instruction_pointer = u64::from(stack.iret.eip); - #[cfg(target_arch = "x86_64")] - let instruction_pointer = stack.iret.rip; - - let mut buf = [0u8; 19]; - buf[0] = b'0'; - buf[1] = b'x'; - for i in 0..16 { - let nibble = ((instruction_pointer >> (60 - i * 4)) & 0xF) as u8; - buf[2 + i] = if nibble < 10 { - b'0' + nibble - } else { - b'a' + nibble - 10 - }; - } - buf[18] = b'\n'; - nmi_raw_serial_write(&buf); - } - - NMI_IN_PROGRESS.store(false, Ordering::SeqCst); + // TODO: This will likely deadlock + println!("Non-maskable interrupt"); + stack.dump(); } }); diff --git a/recipes/core/kernel/source/src/arch/x86_shared/mod.rs b/recipes/core/kernel/source/src/arch/x86_shared/mod.rs index 11c33e9457..e3c30501b8 100644 --- a/recipes/core/kernel/source/src/arch/x86_shared/mod.rs +++ b/recipes/core/kernel/source/src/arch/x86_shared/mod.rs @@ -28,8 +28,6 @@ pub mod pti; /// Initialization and start function pub mod start; -pub mod sleep; - /// Stop function pub mod stop; diff --git a/recipes/core/kernel/source/src/arch/x86_shared/sleep.rs b/recipes/core/kernel/source/src/arch/x86_shared/sleep.rs deleted file mode 100644 index 9f98c0d892..0000000000 --- a/recipes/core/kernel/source/src/arch/x86_shared/sleep.rs +++ /dev/null @@ -1,712 +0,0 @@ -use alloc::{sync::Arc, vec::Vec}; -use core::{ - ptr::NonNull, - str::FromStr, - sync::atomic::{AtomicU32, Ordering}, -}; - -use acpi_ext::{ - aml::{namespace::AmlName, object::Object, Interpreter}, - registers::FixedRegisters, - sdt::{facs::Facs, fadt::Fadt, SdtHeader}, - AcpiTables, Handle, Handler, PhysicalMapping, -}; -use spin::Mutex; -use syscall::error::{Error, EINVAL, EIO}; -use x86::{segmentation::SegmentSelector, task, Ring}; - -use crate::{ - acpi::ACPI_ROOT_INFO, - arch::interrupt, - memory::{ - round_down_pages, round_up_pages, KernelMapper, Page, PageFlags, PhysicalAddress, RmmA, - RmmArch, VirtualAddress, PAGE_SIZE, - }, - syscall::io::{Io, Pio}, -}; - -const ACPI_SLP_TYP_SHIFT: u16 = 10; -const ACPI_SLP_TYP_MASK: u16 = 0x1C00; -const ACPI_SLP_EN: u16 = 1 << 13; -const WAKE_TRAMPOLINE_PHYS: usize = 0x8000; -const SLEEP_RETURN_OK: usize = 0; - -#[cfg(target_arch = "x86_64")] -static WAKE_TRAMPOLINE_DATA: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/s3_wakeup")); - -#[repr(C, packed)] -#[derive(Clone, Copy, Debug, Default)] -struct DescriptorTableRegister { - limit: u16, - base: u64, -} - -#[repr(C, align(64))] -#[derive(Clone, Copy, Debug)] -struct FpuState { - bytes: [u8; 4096], -} - -impl Default for FpuState { - fn default() -> Self { - Self { bytes: [0; 4096] } - } -} - -#[derive(Clone, Copy, Debug, Eq, PartialEq)] -pub enum SleepState { - S3, - S5, -} - -#[derive(Clone, Copy, Debug, Eq, PartialEq)] -pub enum SleepError { - UnsupportedArch, - MissingAcpi, - MissingFadt, - MissingFacs, - MissingSleepObject, - InvalidSleepObject, - UnsupportedPmControl, - UnsupportedAmlOperation, - SleepDidNotEnter, -} - -impl SleepError { - fn code(self) -> usize { - match self { - Self::UnsupportedArch => EINVAL as usize, - Self::MissingAcpi - | Self::MissingFadt - | Self::MissingFacs - | Self::MissingSleepObject - | Self::UnsupportedAmlOperation => EIO as usize, - Self::InvalidSleepObject | Self::UnsupportedPmControl | Self::SleepDidNotEnter => { - EINVAL as usize - } - } - } - - fn from_code(code: usize) -> Self { - match code as i32 { - x if x == EINVAL => Self::InvalidSleepObject, - _ => Self::MissingAcpi, - } - } -} - -#[derive(Clone, Copy, Debug, Default)] -struct SavedCpuContext { - entry_rsp: usize, - runtime_rsp: usize, - facs_address: usize, - cr0: usize, - cr2: usize, - cr3: usize, - cr4: usize, - rflags: usize, - gdtr: DescriptorTableRegister, - idtr: DescriptorTableRegister, - efer: u64, - fs_base: u64, - gs_base: u64, - kernel_gs_base: u64, - fpu: FpuState, -} - -static SAVED_CONTEXT: Mutex> = Mutex::new(None); -static AML_MUTEX_IDS: AtomicU32 = AtomicU32::new(1); - -#[derive(Clone, Copy, Debug)] -struct SleepTypeData { - a: u16, - b: u16, -} - -#[derive(Clone, Copy)] -struct KernelAcpiHandler; - -impl KernelAcpiHandler { - fn map_range(physical_address: usize, size: usize) -> (*mut u8, usize) { - let map_base = round_down_pages(physical_address); - let map_offset = physical_address - map_base; - let mapped_length = round_up_pages(size + map_offset); - - // SAFETY: The ACPI interpreter only requests firmware-described physical regions. - unsafe { - let mut mapper = KernelMapper::lock_rw(); - for page_index in 0..mapped_length / PAGE_SIZE { - let (_, flush) = mapper - .map_linearly( - PhysicalAddress::new(map_base + page_index * PAGE_SIZE), - PageFlags::new(), - ) - .expect("failed to linearly map ACPI physical region"); - flush.flush(); - } - } - - let virtual_base = RmmA::phys_to_virt(PhysicalAddress::new(map_base)).data(); - ((virtual_base + map_offset) as *mut u8, mapped_length) - } -} - -impl Handler for KernelAcpiHandler { - unsafe fn map_physical_region(&self, physical_address: usize, size: usize) -> PhysicalMapping { - let (virtual_start, mapped_length) = Self::map_range(physical_address, size); - PhysicalMapping { - physical_start: physical_address, - virtual_start: NonNull::new(virtual_start.cast::()) - .expect("expected mapped ACPI virtual address to be non-null"), - region_length: size, - mapped_length, - handler: *self, - } - } - - fn unmap_physical_region(_region: &PhysicalMapping) {} - - fn read_u8(&self, address: usize) -> u8 { - // SAFETY: AML system-memory accesses are byte-addressable firmware regions. - unsafe { core::ptr::read_volatile(RmmA::phys_to_virt(PhysicalAddress::new(address)).data() as *const u8) } - } - - fn read_u16(&self, address: usize) -> u16 { - // SAFETY: AML system-memory accesses are word-addressable firmware regions. - unsafe { - core::ptr::read_volatile(RmmA::phys_to_virt(PhysicalAddress::new(address)).data() as *const u16) - } - } - - fn read_u32(&self, address: usize) -> u32 { - // SAFETY: AML system-memory accesses are dword-addressable firmware regions. - unsafe { - core::ptr::read_volatile(RmmA::phys_to_virt(PhysicalAddress::new(address)).data() as *const u32) - } - } - - fn read_u64(&self, address: usize) -> u64 { - // SAFETY: AML system-memory accesses are qword-addressable firmware regions. - unsafe { - core::ptr::read_volatile(RmmA::phys_to_virt(PhysicalAddress::new(address)).data() as *const u64) - } - } - - fn write_u8(&self, address: usize, value: u8) { - // SAFETY: AML system-memory accesses are byte-addressable firmware regions. - unsafe { - core::ptr::write_volatile(RmmA::phys_to_virt(PhysicalAddress::new(address)).data() as *mut u8, value) - } - } - - fn write_u16(&self, address: usize, value: u16) { - // SAFETY: AML system-memory accesses are word-addressable firmware regions. - unsafe { - core::ptr::write_volatile( - RmmA::phys_to_virt(PhysicalAddress::new(address)).data() as *mut u16, - value, - ) - } - } - - fn write_u32(&self, address: usize, value: u32) { - // SAFETY: AML system-memory accesses are dword-addressable firmware regions. - unsafe { - core::ptr::write_volatile( - RmmA::phys_to_virt(PhysicalAddress::new(address)).data() as *mut u32, - value, - ) - } - } - - fn write_u64(&self, address: usize, value: u64) { - // SAFETY: AML system-memory accesses are qword-addressable firmware regions. - unsafe { - core::ptr::write_volatile( - RmmA::phys_to_virt(PhysicalAddress::new(address)).data() as *mut u64, - value, - ) - } - } - - fn read_io_u8(&self, port: u16) -> u8 { - Pio::::new(port).read() - } - - fn read_io_u16(&self, port: u16) -> u16 { - Pio::::new(port).read() - } - - fn read_io_u32(&self, port: u16) -> u32 { - Pio::::new(port).read() - } - - fn write_io_u8(&self, port: u16, value: u8) { - Pio::::new(port).write(value) - } - - fn write_io_u16(&self, port: u16, value: u16) { - Pio::::new(port).write(value) - } - - fn write_io_u32(&self, port: u16, value: u32) { - Pio::::new(port).write(value) - } - - fn read_pci_u8(&self, _address: acpi_ext::PciAddress, _offset: u16) -> u8 { - 0 - } - - fn read_pci_u16(&self, _address: acpi_ext::PciAddress, _offset: u16) -> u16 { - 0 - } - - fn read_pci_u32(&self, _address: acpi_ext::PciAddress, _offset: u16) -> u32 { - 0 - } - - fn write_pci_u8(&self, _address: acpi_ext::PciAddress, _offset: u16, _value: u8) {} - - fn write_pci_u16(&self, _address: acpi_ext::PciAddress, _offset: u16, _value: u16) {} - - fn write_pci_u32(&self, _address: acpi_ext::PciAddress, _offset: u16, _value: u32) {} - - fn nanos_since_boot(&self) -> u64 { - 0 - } - - fn stall(&self, microseconds: u64) { - for _ in 0..(microseconds.saturating_mul(64)) { - core::hint::spin_loop(); - } - } - - fn sleep(&self, milliseconds: u64) { - for _ in 0..(milliseconds.saturating_mul(64_000)) { - core::hint::spin_loop(); - } - } - - fn create_mutex(&self) -> Handle { - Handle(AML_MUTEX_IDS.fetch_add(1, Ordering::Relaxed)) - } - - fn acquire(&self, _mutex: Handle, _timeout: u16) -> Result<(), acpi_ext::aml::AmlError> { - Ok(()) - } - - fn release(&self, _mutex: Handle) {} -} - -fn sleep_state_name(state: SleepState) -> &'static str { - match state { - SleepState::S3 => "\\_S3", - SleepState::S5 => "\\_S5", - } -} - -fn encode_sleep_type(value: u16) -> u16 { - if value <= 0x7 { - value << ACPI_SLP_TYP_SHIFT - } else { - value & ACPI_SLP_TYP_MASK - } -} - -fn load_interpreter() -> Result<( - Arc>, - PhysicalMapping, - Interpreter, -), SleepError> { - let root = *ACPI_ROOT_INFO.get().ok_or(SleepError::MissingAcpi)?; - let handler = KernelAcpiHandler; - - // SAFETY: ACPI root info is captured from the firmware-provided, already validated root table. - let tables = unsafe { - AcpiTables::from_rsdt(handler, root.revision, root.root_sdt_address.data()) - .map_err(|_| SleepError::MissingAcpi)? - }; - let fadt = tables.find_table::().ok_or(SleepError::MissingFadt)?; - let registers = Arc::new( - FixedRegisters::new(&fadt, handler).map_err(|_| SleepError::UnsupportedPmControl)?, - ); - let facs_address = fadt.facs_address().map_err(|_| SleepError::MissingFacs)?; - - // SAFETY: The FADT-supplied FACS address is used exactly as described by the ACPI spec. - let facs = unsafe { handler.map_physical_region::(facs_address, core::mem::size_of::()) }; - // SAFETY: The AML interpreter only needs an owned mapping of the same firmware FACS table. - let interpreter_facs = unsafe { - handler.map_physical_region::(facs_address, core::mem::size_of::()) - }; - let dsdt = tables.dsdt().map_err(|_| SleepError::MissingFadt)?; - let interpreter = Interpreter::new(handler, dsdt.revision, Arc::clone(®isters), Some(interpreter_facs)); - - // SAFETY: Each AML table mapping is owned by the interpreter during table loading. - unsafe { - let mapping = handler.map_physical_region::(dsdt.phys_address, dsdt.length as usize); - let stream = core::slice::from_raw_parts( - mapping.virtual_start.as_ptr().byte_add(core::mem::size_of::()) as *const u8, - dsdt.length as usize - core::mem::size_of::(), - ); - interpreter - .load_table(stream) - .map_err(|_| SleepError::UnsupportedAmlOperation)?; - - for ssdt in tables.ssdts() { - let mapping = handler.map_physical_region::(ssdt.phys_address, ssdt.length as usize); - let stream = core::slice::from_raw_parts( - mapping.virtual_start.as_ptr().byte_add(core::mem::size_of::()) as *const u8, - ssdt.length as usize - core::mem::size_of::(), - ); - interpreter - .load_table(stream) - .map_err(|_| SleepError::UnsupportedAmlOperation)?; - } - } - - Ok((registers, facs, interpreter)) -} - -fn sleep_type_data_from_interpreter( - interpreter: &Interpreter, - state: SleepState, -) -> Result { - let name = AmlName::from_str(sleep_state_name(state)).map_err(|_| SleepError::MissingSleepObject)?; - let object = interpreter - .evaluate(name, Vec::new()) - .map_err(|_| SleepError::MissingSleepObject)?; - - let Object::Package(package) = &*object else { - return Err(SleepError::InvalidSleepObject); - }; - - let Some(typa_object) = package.first() else { - return Err(SleepError::InvalidSleepObject); - }; - let Some(typb_object) = package.get(1) else { - return Err(SleepError::InvalidSleepObject); - }; - - let Object::Integer(typa) = &**typa_object else { - return Err(SleepError::InvalidSleepObject); - }; - let Object::Integer(typb) = &**typb_object else { - return Err(SleepError::InvalidSleepObject); - }; - - Ok(SleepTypeData { - a: encode_sleep_type(*typa as u16), - b: encode_sleep_type(*typb as u16), - }) -} - -fn sleep_type_data(state: SleepState) -> Result { - let (_registers, _facs, interpreter) = load_interpreter()?; - sleep_type_data_from_interpreter(&interpreter, state) -} - -fn install_wake_trampoline(stack_rsp: usize, cr3: usize) { - let trampoline_page = Page::containing_address(VirtualAddress::new(WAKE_TRAMPOLINE_PHYS)); - let trampoline_frame = PhysicalAddress::new(WAKE_TRAMPOLINE_PHYS); - - // SAFETY: The 0x8000 low-memory trampoline page is reserved by the kernel for bootstrap stubs. - let (result, _) = unsafe { - let mut mapper = KernelMapper::lock_rw(); - let result = mapper - .map_phys( - trampoline_page.start_address(), - trampoline_frame, - PageFlags::new().execute(true).write(true), - ) - .expect("failed to map S3 wake trampoline page"); - (result, mapper.table().phys().data()) - }; - result.flush(); - - for (index, value) in WAKE_TRAMPOLINE_DATA.iter().enumerate() { - // SAFETY: The trampoline page is mapped writable at the same virtual address as the physical page. - unsafe { - core::ptr::write_volatile((WAKE_TRAMPOLINE_PHYS as *mut u8).add(index), *value); - } - } - - // SAFETY: The wake trampoline layout reserves three qword fields immediately after the jump. - unsafe { - let stack_slot = (WAKE_TRAMPOLINE_PHYS + 8) as *mut u64; - let page_table_slot = stack_slot.add(1); - let code_slot = stack_slot.add(2); - stack_slot.write(stack_rsp as u64); - page_table_slot.write(cr3 as u64); - #[expect(clippy::fn_to_numeric_cast)] - code_slot.write(resume_from_s3_trampoline as usize as u64); - } - - // SAFETY: The trampoline mapping is no longer needed once the physical page has been populated. - let (_frame, _, flush) = unsafe { - KernelMapper::lock_rw() - .unmap_phys(trampoline_page.start_address()) - .expect("failed to unmap S3 wake trampoline page") - }; - flush.flush(); -} - -fn save_descriptor_tables(context: &mut SavedCpuContext) { - // SAFETY: SGDT/SIDT only read the current CPU descriptor-table registers into the provided storage. - unsafe { - core::arch::asm!("sgdt [{}]", in(reg) &mut context.gdtr, options(nostack, preserves_flags)); - core::arch::asm!("sidt [{}]", in(reg) &mut context.idtr, options(nostack, preserves_flags)); - } -} - -fn save_fpu_state(context: &mut SavedCpuContext) { - // SAFETY: The kernel owns the current CPU at suspend entry and the FXSAVE buffer is 64-byte aligned. - unsafe { - core::arch::asm!( - "fxsave64 [{}]", - in(reg) context.fpu.bytes.as_mut_ptr(), - ); - } -} - -fn restore_fpu_state(context: &SavedCpuContext) { - // SAFETY: The saved FXSAVE image belongs to the same CPU context and matches the restore instruction. - unsafe { - core::arch::asm!( - "fxrstor64 [{}]", - in(reg) context.fpu.bytes.as_ptr(), - ); - } -} - -fn save_cpu_context(entry_rsp: usize) -> SavedCpuContext { - let mut context = SavedCpuContext { - entry_rsp, - ..SavedCpuContext::default() - }; - - // SAFETY: Reading control registers and MSRs is required to reconstruct the CPU execution state on wake. - unsafe { - core::arch::asm!( - "mov {}, cr0", - out(reg) context.cr0, - options(nostack, preserves_flags) - ); - core::arch::asm!( - "mov {}, cr2", - out(reg) context.cr2, - options(nostack, preserves_flags) - ); - core::arch::asm!( - "mov {}, cr3", - out(reg) context.cr3, - options(nostack, preserves_flags) - ); - core::arch::asm!( - "mov {}, cr4", - out(reg) context.cr4, - options(nostack, preserves_flags) - ); - core::arch::asm!( - "pushfq", - "pop {}", - out(reg) context.rflags, - options(preserves_flags) - ); - core::arch::asm!("mov {}, rsp", out(reg) context.runtime_rsp, options(nostack, preserves_flags)); - - context.efer = x86::msr::rdmsr(x86::msr::IA32_EFER); - context.fs_base = x86::msr::rdmsr(x86::msr::IA32_FS_BASE); - context.gs_base = x86::msr::rdmsr(x86::msr::IA32_GS_BASE); - context.kernel_gs_base = x86::msr::rdmsr(x86::msr::IA32_KERNEL_GSBASE); - } - - save_descriptor_tables(&mut context); - save_fpu_state(&mut context); - context -} - -fn set_firmware_waking_vector(facs: &mut PhysicalMapping, vector: usize) { - facs.firmware_waking_vector = vector as u32; - facs.x_firmware_waking_vector = vector as u64; -} - -fn write_pm1_control_block( - registers: &FixedRegisters, - sleep_type: SleepTypeData, -) -> Result<(), SleepError> { - let current_a = registers - .pm1_control_registers - .pm1a - .read() - .map_err(|_| SleepError::UnsupportedPmControl)? as u16; - let armed_a = (current_a & !(ACPI_SLP_TYP_MASK | ACPI_SLP_EN)) | sleep_type.a; - - registers - .pm1_control_registers - .pm1a - .write(u64::from(armed_a)) - .map_err(|_| SleepError::UnsupportedPmControl)?; - - if let Some(pm1b) = ®isters.pm1_control_registers.pm1b { - let current_b = pm1b.read().map_err(|_| SleepError::UnsupportedPmControl)? as u16; - let armed_b = (current_b & !(ACPI_SLP_TYP_MASK | ACPI_SLP_EN)) | sleep_type.b; - pm1b.write(u64::from(armed_b)) - .map_err(|_| SleepError::UnsupportedPmControl)?; - pm1b.write(u64::from(armed_b | ACPI_SLP_EN)) - .map_err(|_| SleepError::UnsupportedPmControl)?; - } - - // SAFETY: WBINVD is required here to flush dirty cache lines before firmware powers down the CPU package. - unsafe { - core::arch::asm!("wbinvd", options(nostack, preserves_flags)); - } - - registers - .pm1_control_registers - .pm1a - .write(u64::from(armed_a | ACPI_SLP_EN)) - .map_err(|_| SleepError::UnsupportedPmControl)?; - - Ok(()) -} - -#[unsafe(naked)] -unsafe extern "sysv64" fn enter_sleep_raw(state: usize) -> usize { - core::arch::naked_asm!( - "mov rsi, rsp", - "jmp {inner}", - inner = sym enter_sleep_raw_inner, - ); -} - -extern "C" fn enter_sleep_raw_inner(state: usize, entry_rsp: usize) -> usize { - let state = match state { - 3 => SleepState::S3, - 5 => SleepState::S5, - _ => return SleepError::InvalidSleepObject.code(), - }; - - let (registers, mut facs, interpreter) = match load_interpreter() { - Ok(tuple) => tuple, - Err(error) => return error.code(), - }; - let sleep_type = match sleep_type_data_from_interpreter(&interpreter, state) { - Ok(data) => data, - Err(error) => return error.code(), - }; - - let mut context = save_cpu_context(entry_rsp); - context.facs_address = facs.physical_start; - install_wake_trampoline(context.runtime_rsp, context.cr3); - set_firmware_waking_vector(&mut facs, WAKE_TRAMPOLINE_PHYS); - - { - let mut saved = SAVED_CONTEXT.lock(); - *saved = Some(context); - } - - // SAFETY: Suspend entry must not be interrupted while the wake vector and PM1 control block are being armed. - unsafe { - interrupt::disable(); - } - - if let Err(error) = write_pm1_control_block(registers.as_ref(), sleep_type) { - return error.code(); - } - - // SAFETY: The final CLI+HLT sequence is the architectural handoff point after asserting SLP_EN. - unsafe { - core::arch::asm!("cli; hlt", options(nostack)); - } - - SleepError::SleepDidNotEnter.code() -} - -extern "C" fn resume_from_s3_trampoline() -> ! { - let mut saved = SAVED_CONTEXT.lock(); - let context = saved.take().expect("S3 wake trampoline resumed without saved CPU context"); - drop(saved); - - // SAFETY: The saved FACS physical address was captured from the validated FADT during suspend entry. - if context.facs_address != 0 { - let mut facs = unsafe { - KernelAcpiHandler.map_physical_region::( - context.facs_address, - core::mem::size_of::(), - ) - }; - set_firmware_waking_vector(&mut facs, 0); - } - - // SAFETY: The wake trampoline already switched to the saved kernel CR3 and long mode, so the remaining restores are architectural register state only. - unsafe { - x86::msr::wrmsr(x86::msr::IA32_EFER, context.efer); - core::arch::asm!("mov cr3, {}", in(reg) context.cr3, options(nostack)); - core::arch::asm!("mov cr4, {}", in(reg) context.cr4, options(nostack)); - core::arch::asm!("mov cr2, {}", in(reg) context.cr2, options(nostack)); - core::arch::asm!("mov cr0, {}", in(reg) context.cr0, options(nostack)); - core::arch::asm!("lgdt [{}]", in(reg) &context.gdtr, options(nostack)); - core::arch::asm!("lidt [{}]", in(reg) &context.idtr, options(nostack)); - - task::load_tr(SegmentSelector::new(crate::arch::gdt::GDT_TSS as u16, Ring::Ring0)); - - x86::msr::wrmsr(x86::msr::IA32_FS_BASE, context.fs_base); - x86::msr::wrmsr(x86::msr::IA32_GS_BASE, context.gs_base); - x86::msr::wrmsr(x86::msr::IA32_KERNEL_GSBASE, context.kernel_gs_base); - } - - restore_fpu_state(&context); - - // SAFETY: Returning with the original entry stack and RFLAGS completes the suspend call as a successful function return. - unsafe { - core::arch::asm!( - "mov rsp, {entry_rsp}", - "push {rflags}", - "popfq", - "xor eax, eax", - "ret", - entry_rsp = in(reg) context.entry_rsp, - rflags = in(reg) context.rflags, - options(noreturn) - ); - } -} - -pub fn enter_sleep_state(state: SleepState) -> core::result::Result<(), SleepError> { - #[cfg(not(target_arch = "x86_64"))] - { - let _ = state; - return Err(SleepError::UnsupportedArch); - } - - #[cfg(target_arch = "x86_64")] - { - let raw = unsafe { - enter_sleep_raw(match state { - SleepState::S3 => 3, - SleepState::S5 => 5, - }) - }; - if raw == SLEEP_RETURN_OK { - Ok(()) - } else { - Err(SleepError::from_code(raw)) - } - } -} - -pub fn available_sleep_states() -> &'static [u8] { - if sleep_type_data(SleepState::S3).is_ok() { - b"S3\nS5\n" - } else { - b"S5\n" - } -} - -pub fn trigger_sleep_request(request: &str) -> Result<(), Error> { - match request.trim() { - "S3" => enter_sleep_state(SleepState::S3).map_err(|_| Error::new(EIO)), - "S5" => enter_sleep_state(SleepState::S5).map_err(|_| Error::new(EIO)), - _ => Err(Error::new(EINVAL)), - } -} diff --git a/recipes/core/kernel/source/src/arch/x86_shared/start.rs b/recipes/core/kernel/source/src/arch/x86_shared/start.rs index cf3e433bee..7a7c0ae815 100644 --- a/recipes/core/kernel/source/src/arch/x86_shared/start.rs +++ b/recipes/core/kernel/source/src/arch/x86_shared/start.rs @@ -82,15 +82,6 @@ extern "C" fn kstart() { /// The entry to Rust, all things must be initialized unsafe extern "C" fn start(args_ptr: *const KernelArgs, stack_end: usize) -> ! { unsafe { - // EARLY CANARY: write 'R' to COM1 before any kernel init. - // This proves the serial hardware works and the kernel reached Rust entry. - // If this character appears but "RedBear OS starting..." does not, - // the hang is in args_ptr.read(), serial::init(), or graphical_debug::init(). - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - { - core::arch::asm!("out dx, al", in("dx") 0x3F8u16, in("al") b'R', options(nostack, preserves_flags)); - } - let bootstrap = { let args = args_ptr.read(); @@ -100,49 +91,27 @@ unsafe extern "C" fn start(args_ptr: *const KernelArgs, stack_end: usize) -> ! { // Set up graphical debug graphical_debug::init(args.env()); - // SECOND CANARY: write 'S' to COM1 after serial init. - // If 'R' appears but 'S' does not, the hang is in serial::init() or graphical_debug::init(). - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - { - core::arch::asm!("out dx, al", in("dx") 0x3F8u16, in("al") b'S', options(nostack, preserves_flags)); - } - - info!("RedBear OS starting..."); + info!("Redox OS starting..."); args.print(); - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - { core::arch::asm!("out dx, al", in("dx") 0x3F8u16, in("al") b'1', options(nostack, preserves_flags)); } - // Set up GDT gdt::init_bsp(stack_end); - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - { core::arch::asm!("out dx, al", in("dx") 0x3F8u16, in("al") b'2', options(nostack, preserves_flags)); } - // Set up IDT idt::init_bsp(); - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - { core::arch::asm!("out dx, al", in("dx") 0x3F8u16, in("al") b'3', options(nostack, preserves_flags)); } - // Initialize RMM #[cfg(target_arch = "x86")] crate::startup::memory::init(&args, Some(0x100000), Some(0x40000000)); #[cfg(target_arch = "x86_64")] crate::startup::memory::init(&args, Some(0x100000), None); - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - { core::arch::asm!("out dx, al", in("dx") 0x3F8u16, in("al") b'4', options(nostack, preserves_flags)); } - // Initialize paging paging::init(); #[cfg(target_arch = "x86_64")] crate::arch::alternative::early_init(true); - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - { core::arch::asm!("out dx, al", in("dx") 0x3F8u16, in("al") b'5', options(nostack, preserves_flags)); } - // Set up syscall instruction interrupt::syscall::init(); @@ -152,9 +121,6 @@ unsafe extern "C" fn start(args_ptr: *const KernelArgs, stack_end: usize) -> ! { // Activate memory logging crate::log::init(); - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - { core::arch::asm!("out dx, al", in("dx") 0x3F8u16, in("al") b'6', options(nostack, preserves_flags)); } - // Initialize miscellaneous processor features #[cfg(target_arch = "x86_64")] crate::arch::misc::init(LogicalCpuId::BSP); @@ -162,9 +128,6 @@ unsafe extern "C" fn start(args_ptr: *const KernelArgs, stack_end: usize) -> ! { // Initialize devices device::init(); - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - { core::arch::asm!("out dx, al", in("dx") 0x3F8u16, in("al") b'7', options(nostack, preserves_flags)); } - // Read ACPI tables, starts APs if cfg!(feature = "acpi") { crate::acpi::init(args.acpi_rsdp()); diff --git a/recipes/core/kernel/source/src/asm/x86_64/s3_wakeup.asm b/recipes/core/kernel/source/src/asm/x86_64/s3_wakeup.asm deleted file mode 100644 index 7beeccf603..0000000000 --- a/recipes/core/kernel/source/src/asm/x86_64/s3_wakeup.asm +++ /dev/null @@ -1,110 +0,0 @@ -; ACPI S3 wake trampoline -; compiled with nasm by build.rs, copied to physical 0x8000 before S3 entry - -ORG 0x8000 -SECTION .text -USE16 - -trampoline: - jmp short startup_wake - times 8 - ($ - trampoline) nop - .stack: dq 0 - .page_table: dq 0 - .code: dq 0 - -startup_wake: - cli - - xor ax, ax - mov ds, ax - mov es, ax - mov ss, ax - mov sp, 0 - - mov edi, [trampoline.page_table] - mov cr3, edi - - mov eax, cr0 - and al, 11110011b - or al, 00100010b - mov cr0, eax - - mov eax, cr4 - or eax, 1 << 9 | 1 << 7 | 1 << 5 | 1 << 4 - mov cr4, eax - - fninit - - lgdt [gdtr] - - mov ecx, 0xC0000080 - rdmsr - or eax, 1 << 11 | 1 << 8 - wrmsr - - mov ebx, cr0 - or ebx, 1 << 31 | 1 << 16 | 1 - mov cr0, ebx - - jmp gdt.kernel_code:long_mode_wake - -USE64 -long_mode_wake: - mov rax, gdt.kernel_data - mov ds, rax - mov es, rax - mov fs, rax - mov gs, rax - mov ss, rax - - mov rsp, [trampoline.stack] - mov rax, [trampoline.code] - jmp rax - -struc GDTEntry - .limitl resw 1 - .basel resw 1 - .basem resb 1 - .attribute resb 1 - .flags__limith resb 1 - .baseh resb 1 -endstruc - -attrib: - .present equ 1 << 7 - .user equ 1 << 4 - .code equ 1 << 3 - .writable equ 1 << 1 - -flags: - .long_mode equ 1 << 5 - -gdtr: - dw gdt.end + 1 - dq gdt - -gdt: -.null equ $ - gdt - dq 0 - -.kernel_code equ $ - gdt -istruc GDTEntry - at GDTEntry.limitl, dw 0 - at GDTEntry.basel, dw 0 - at GDTEntry.basem, db 0 - at GDTEntry.attribute, db attrib.present | attrib.user | attrib.code - at GDTEntry.flags__limith, db flags.long_mode - at GDTEntry.baseh, db 0 -iend - -.kernel_data equ $ - gdt -istruc GDTEntry - at GDTEntry.limitl, dw 0 - at GDTEntry.basel, dw 0 - at GDTEntry.basem, db 0 - at GDTEntry.attribute, db attrib.present | attrib.user | attrib.writable - at GDTEntry.flags__limith, db 0 - at GDTEntry.baseh, db 0 -iend - -.end equ $ - gdt diff --git a/recipes/core/kernel/source/src/context/arch/aarch64.rs b/recipes/core/kernel/source/src/context/arch/aarch64.rs index b8f8ac95d7..33dc83a987 100644 --- a/recipes/core/kernel/source/src/context/arch/aarch64.rs +++ b/recipes/core/kernel/source/src/context/arch/aarch64.rs @@ -4,10 +4,16 @@ use crate::{ percpu::PercpuBlock, syscall::FloatRegisters, }; -use core::{mem::offset_of, ptr}; +use core::{mem::offset_of, ptr, sync::atomic::AtomicBool}; use spin::Once; use syscall::{EnvRegisters, Result}; +/// This must be used by the kernel to ensure that context switches are done atomically +/// Compare and exchange this to true when beginning a context switch on any CPU +/// The `Context::switch_to` function will set it back to false, allowing other CPU's to switch +/// This must be done, as no locks can be held on the stack during switch +pub static CONTEXT_SWITCH_LOCK: AtomicBool = AtomicBool::new(false); + // 512 bytes for registers, extra bytes for fpcr and fpsr pub const KFX_ALIGN: usize = 16; diff --git a/recipes/core/kernel/source/src/context/arch/riscv64.rs b/recipes/core/kernel/source/src/context/arch/riscv64.rs index fe63639acb..4bd843e620 100644 --- a/recipes/core/kernel/source/src/context/arch/riscv64.rs +++ b/recipes/core/kernel/source/src/context/arch/riscv64.rs @@ -2,11 +2,13 @@ use crate::{ arch::interrupt::InterruptStack, context::context::Kstack, memory::RmmA, percpu::PercpuBlock, syscall::FloatRegisters, }; -use core::mem::offset_of; +use core::{mem::offset_of, sync::atomic::AtomicBool}; use rmm::{Arch, VirtualAddress}; use spin::Once; use syscall::{error::*, EnvRegisters}; +pub static CONTEXT_SWITCH_LOCK: AtomicBool = AtomicBool::new(false); + pub const KFX_ALIGN: usize = 16; #[derive(Clone, Debug, Default)] diff --git a/recipes/core/kernel/source/src/context/arch/x86.rs b/recipes/core/kernel/source/src/context/arch/x86.rs index dc01f6e707..2862d35f20 100644 --- a/recipes/core/kernel/source/src/context/arch/x86.rs +++ b/recipes/core/kernel/source/src/context/arch/x86.rs @@ -1,4 +1,4 @@ -use core::mem::offset_of; +use core::{mem::offset_of, sync::atomic::AtomicBool}; use rmm::{Arch, VirtualAddress}; use spin::Once; use syscall::{error::*, EnvRegisters}; @@ -14,6 +14,12 @@ use crate::{ syscall::FloatRegisters, }; +/// This must be used by the kernel to ensure that context switches are done atomically +/// Compare and exchange this to true when beginning a context switch on any CPU +/// The `Context::switch_to` function will set it back to false, allowing other CPU's to switch +/// This must be done, as no locks can be held on the stack during switch +pub static CONTEXT_SWITCH_LOCK: AtomicBool = AtomicBool::new(false); + const ST_RESERVED: u128 = 0xFFFF_FFFF_FFFF_0000_0000_0000_0000_0000; pub const KFX_ALIGN: usize = 16; diff --git a/recipes/core/kernel/source/src/context/arch/x86_64.rs b/recipes/core/kernel/source/src/context/arch/x86_64.rs index 574d373887..6758c9fca5 100644 --- a/recipes/core/kernel/source/src/context/arch/x86_64.rs +++ b/recipes/core/kernel/source/src/context/arch/x86_64.rs @@ -1,5 +1,6 @@ use core::{ ptr::{addr_of, addr_of_mut}, + sync::atomic::AtomicBool, }; use crate::syscall::FloatRegisters; @@ -11,6 +12,12 @@ use spin::Once; use syscall::{error::*, EnvRegisters}; use x86::msr; +/// This must be used by the kernel to ensure that context switches are done atomically +/// Compare and exchange this to true when beginning a context switch on any CPU +/// The `Context::switch_to` function will set it back to false, allowing other CPU's to switch +/// This must be done, as no locks can be held on the stack during switch +pub static CONTEXT_SWITCH_LOCK: AtomicBool = AtomicBool::new(false); + const ST_RESERVED: u128 = 0xFFFF_FFFF_FFFF_0000_0000_0000_0000_0000; #[cfg(cpu_feature_never = "xsave")] diff --git a/recipes/core/kernel/source/src/context/context.rs b/recipes/core/kernel/source/src/context/context.rs index 6d723f498f..c97c5166be 100644 --- a/recipes/core/kernel/source/src/context/context.rs +++ b/recipes/core/kernel/source/src/context/context.rs @@ -148,8 +148,6 @@ pub struct Context { pub euid: u32, pub egid: u32, pub pid: usize, - /// Supplementary group IDs for access control decisions. - pub groups: Vec, // See [`PreemptGuard`] // @@ -206,7 +204,6 @@ impl Context { euid: 0, egid: 0, pid: 0, - groups: Vec::new(), #[cfg(feature = "syscall_debug")] syscall_debug_info: crate::syscall::debug::SyscallDebugInfo::default(), @@ -482,7 +479,6 @@ impl Context { uid: self.euid, gid: self.egid, pid: self.pid, - groups: self.groups.clone(), } } } diff --git a/recipes/core/kernel/source/src/context/file.rs b/recipes/core/kernel/source/src/context/file.rs index 150f483a47..2d3790f147 100644 --- a/recipes/core/kernel/source/src/context/file.rs +++ b/recipes/core/kernel/source/src/context/file.rs @@ -4,7 +4,7 @@ use crate::{ event, scheme::{self, SchemeId}, sync::{CleanLockToken, RwLock, L6}, - syscall::error::{Error, Result, ESTALE}, + syscall::error::Result, }; use alloc::sync::Arc; use syscall::{schemev2::NewFdFlags, RwFlags, O_APPEND, O_NONBLOCK}; @@ -18,7 +18,6 @@ pub struct FileDescription { pub offset: u64, /// The scheme that this file refers to pub scheme: SchemeId, - pub scheme_generation: Option, /// The number the scheme uses to refer to this file pub number: usize, /// The flags passed to open or fcntl(SETFL) @@ -33,52 +32,6 @@ bitflags! { } } impl FileDescription { - pub fn with_generation( - scheme: SchemeId, - scheme_generation: Option, - number: usize, - offset: u64, - flags: u32, - internal_flags: InternalFlags, - ) -> Self { - Self { - offset, - scheme, - scheme_generation, - number, - flags, - internal_flags, - } - } - - pub fn new( - scheme: SchemeId, - number: usize, - offset: u64, - flags: u32, - internal_flags: InternalFlags, - token: &mut CleanLockToken, - ) -> Self { - Self::with_generation( - scheme, - Some(scheme::current_scheme_generation(token.token(), scheme)), - number, - offset, - flags, - internal_flags, - ) - } - - pub fn get_scheme(&self, token: &mut CleanLockToken) -> Result { - if let Some(expected_generation) = self.scheme_generation - && expected_generation != scheme::current_scheme_generation(token.token(), self.scheme) - { - return Err(Error::new(ESTALE)); - } - - scheme::get_scheme(token.token(), self.scheme) - } - pub fn rw_flags(&self, rw: RwFlags) -> u32 { let mut ret = self.flags & !(O_NONBLOCK | O_APPEND) as u32; if rw.contains(RwFlags::APPEND) { @@ -123,7 +76,7 @@ impl FileDescription { pub fn try_close(self, token: &mut CleanLockToken) -> Result<()> { event::unregister_file(self.scheme, self.number, token); - let scheme = self.get_scheme(token)?; + let scheme = scheme::get_scheme(token.token(), self.scheme)?; scheme.close(self.number, token) } @@ -132,12 +85,12 @@ impl FileDescription { impl FileDescriptor { pub fn close(self, token: &mut CleanLockToken) -> Result<()> { { - let (desc, number, internal_flags) = { + let (scheme_id, number, internal_flags) = { let desc = self.description.read(token.token()); - (*desc, desc.number, desc.internal_flags) + (desc.scheme, desc.number, desc.internal_flags) }; if internal_flags.contains(InternalFlags::NOTIFY_ON_NEXT_DETACH) { - let scheme = desc.get_scheme(token)?; + let scheme = scheme::get_scheme(token.token(), scheme_id)?; scheme.detach(number, token)?; } } diff --git a/recipes/core/kernel/source/src/context/memory.rs b/recipes/core/kernel/source/src/context/memory.rs index 127a34fd87..93446ba7a7 100644 --- a/recipes/core/kernel/source/src/context/memory.rs +++ b/recipes/core/kernel/source/src/context/memory.rs @@ -64,13 +64,14 @@ impl UnmapResult { return Ok(()); }; - let (scheme, number) = { - let desc = *description.read(token.token()); - (desc.get_scheme(token)?, desc.number) + let (scheme_id, number) = { + let desc = description.write(token.token()); + (desc.scheme, desc.number) }; - let funmap_result = scheme - .kfunmap(number, base_offset, self.size, self.flags, token); + let scheme_opt = scheme::get_scheme(token.token(), scheme_id); + let funmap_result = scheme_opt + .and_then(|scheme| scheme.kfunmap(number, base_offset, self.size, self.flags, token)); if let Ok(fd) = Arc::try_unwrap(description) { fd.into_inner().try_close(token)?; @@ -2686,13 +2687,20 @@ fn correct_inner<'l>( // XXX: This is cheating, but guaranteed we won't deadlock because we've dropped addr_space_guard let mut token = unsafe { CleanLockToken::new() }; - let desc = *file_ref.description.read(token.token()); - let scheme = desc.get_scheme(&mut token).map_err(|_| PfError::Segv)?; - let scheme_number = desc.number; - let user_inner = match scheme { - KernelSchemes::User(user) => user.inner, - _ => return Err(PfError::Segv), + let (scheme_id, scheme_number) = { + let desc = &file_ref.description.read(token.token()); + (desc.scheme, desc.number) }; + let user_inner = scheme::get_scheme(token.token(), scheme_id) + .ok() + .and_then(|s| { + if let KernelSchemes::User(user) = s { + Some(user.inner) + } else { + None + } + }) + .ok_or(PfError::Segv)?; let offset = file_ref.base_offset as u64 + (pages_from_grant_start * PAGE_SIZE) as u64; user_inner diff --git a/recipes/core/kernel/source/src/context/mod.rs b/recipes/core/kernel/source/src/context/mod.rs index df44cc4565..37c73f5a37 100644 --- a/recipes/core/kernel/source/src/context/mod.rs +++ b/recipes/core/kernel/source/src/context/mod.rs @@ -14,8 +14,8 @@ use crate::{ memory::{RmmA, RmmArch, TableKind}, percpu::PercpuBlock, sync::{ - ArcRwLockWriteGuard, CleanLockToken, LockToken, McsMutex, McsMutexGuard, Mutex, - MutexGuard, RwLock, RwLockReadGuard, RwLockWriteGuard, L0, L1, L2, L4, + ArcRwLockWriteGuard, CleanLockToken, LockToken, Mutex, MutexGuard, RwLock, RwLockReadGuard, + RwLockWriteGuard, L0, L1, L2, L4, }, syscall::error::Result, }; @@ -74,12 +74,10 @@ pub use self::arch::empty_cr3; // the context file descriptors. static CONTEXTS: RwLock> = RwLock::new(BTreeSet::new()); -// Actual context store for the scheduler — uses MCS fair spinlock to -// eliminate cache-line bouncing under multi-CPU contention. -static RUN_CONTEXTS: McsMutex = McsMutex::new(RunContextData::new()); +// Actual context store for the scheduler +static RUN_CONTEXTS: Mutex = Mutex::new(RunContextData::new()); -// Context that has been pushed out from RUN_CONTEXTS after being idle. -// Uses regular Mutex (lower contention; wakeup_contexts uses try_lock). +// Context that has been pushed out from RUN_CONTEXTS after being idle static IDLE_CONTEXTS: Mutex> = Mutex::new(VecDeque::new()); pub struct RunContextData { @@ -115,7 +113,7 @@ pub fn idle_contexts_try( IDLE_CONTEXTS.try_lock(token) } -pub fn run_contexts(token: LockToken<'_, L0>) -> McsMutexGuard<'_, L1, RunContextData> { +pub fn run_contexts(token: LockToken<'_, L0>) -> MutexGuard<'_, L1, RunContextData> { RUN_CONTEXTS.lock(token) } diff --git a/recipes/core/kernel/source/src/context/switch.rs b/recipes/core/kernel/source/src/context/switch.rs index 2dbed065eb..86684c8f4c 100644 --- a/recipes/core/kernel/source/src/context/switch.rs +++ b/recipes/core/kernel/source/src/context/switch.rs @@ -15,7 +15,7 @@ use crate::{ use alloc::{sync::Arc, vec::Vec}; use core::{ cell::{Cell, RefCell}, - mem, + hint, mem, sync::atomic::Ordering, }; use syscall::PtraceFlags; @@ -26,11 +26,6 @@ enum UpdateResult { Blocked, } -/// Default number of PIT ticks before triggering a context switch. -/// At ~2.25 ms per tick, 3 ticks ≈ 6.75 ms timeslice. -/// Configurable per-CPU via `ContextSwitchPercpu::preempt_interval`. -const DEFAULT_PREEMPT_INTERVAL: usize = 3; - // A simple geometric series where value[i] ~= value[i - 1] * 1.25 const SCHED_PRIO_TO_WEIGHT: [usize; 40] = [ 88761, 71755, 56483, 46273, 36291, 29154, 23254, 18705, 14949, 11916, 9548, 7620, 6100, 4904, @@ -95,15 +90,13 @@ struct SwitchResultInner { /// /// The function also calls the signal handler after switching contexts. pub fn tick(token: &mut CleanLockToken) { - let percpu = PercpuBlock::current(); - let ticks_cell = &percpu.switch_internals.pit_ticks; + let ticks_cell = &PercpuBlock::current().switch_internals.pit_ticks; let new_ticks = ticks_cell.get() + 1; ticks_cell.set(new_ticks); - // Trigger a context switch when the per-CPU preempt interval is reached. - let interval = percpu.switch_internals.preempt_interval.get(); - if new_ticks >= interval { + // Trigger a context switch after every 3 ticks (approx. 6.75 ms). + if new_ticks >= 3 { switch(token); crate::context::signal::signal_handler(token); } @@ -127,10 +120,7 @@ pub unsafe extern "C" fn switch_finish_hook() { crate::arch::stop::emergency_reset(); } } - PercpuBlock::current() - .switch_internals - .in_context_switch - .set(false); + arch::CONTEXT_SWITCH_LOCK.store(false, Ordering::SeqCst); crate::percpu::switch_arch_hook(); } } @@ -160,15 +150,16 @@ pub fn switch(token: &mut CleanLockToken) -> SwitchResult { //set PIT Interrupt counter to 0, giving each process same amount of PIT ticks percpu.switch_internals.pit_ticks.set(0); - // Acquire the per-CPU context switch flag. Each CPU can only be in one context - // switch at a time. The per-context write locks provide cross-CPU safety; this - // flag catches re-entrant switches on the same CPU (a kernel bug). - debug_assert!( - !percpu.switch_internals.in_context_switch.get(), - "context switch re-entry on CPU {}", - percpu.cpu_id - ); - percpu.switch_internals.in_context_switch.set(true); + // Acquire the global lock to ensure exclusive access during context switch and avoid + // issues that would be caused by the unsafe operations below + // TODO: Better memory orderings? + while arch::CONTEXT_SWITCH_LOCK + .compare_exchange_weak(false, true, Ordering::SeqCst, Ordering::Relaxed) + .is_err() + { + hint::spin_loop(); + percpu.maybe_handle_tlb_shootdown(); + } // Lock the previous context. let prev_context_lock = crate::context::current(); @@ -176,8 +167,8 @@ pub fn switch(token: &mut CleanLockToken) -> SwitchResult { let mut prev_context_guard = unsafe { prev_context_lock.write_arc() }; if !prev_context_guard.is_preemptable() { - // Unset per-CPU context switch flag - percpu.switch_internals.in_context_switch.set(false); + // Unset global lock + arch::CONTEXT_SWITCH_LOCK.store(false, Ordering::SeqCst); // Pretend to have finished switching, so CPU is not idled return SwitchResult::Switched; @@ -301,8 +292,8 @@ pub fn switch(token: &mut CleanLockToken) -> SwitchResult { SwitchResult::Switched } _ => { - // No target was found, unset per-CPU context switch flag and return - percpu.switch_internals.in_context_switch.set(false); + // No target was found, unset global lock and return + arch::CONTEXT_SWITCH_LOCK.store(false, Ordering::SeqCst); percpu.stats.set_state(cpu_stats::CpuState::Idle); @@ -361,7 +352,6 @@ fn wakeup_contexts(token: &mut CleanLockToken, switch_time: u128) -> Vec<(usize, } /// This is the scheduler function which currently utilises Deficit Weighted Round Robin Scheduler -/// with NUMA-aware context selection preference. fn select_next_context( token: &mut CleanLockToken, percpu: &PercpuBlock, @@ -387,10 +377,6 @@ fn select_next_context( let total_contexts: usize = contexts_list.iter().map(|q| q.len()).sum(); let mut skipped_contexts = 0; - // NUMA-aware selection: remember cross-node fallback candidate. - let my_numa_node = percpu.numa_node.get(); - let mut cross_node_fallback: Option<(usize, ArcContextLockWriteGuard)> = None; - 'priority: loop { i = (i + 1) % 40; total_iters += 1; @@ -455,44 +441,9 @@ fn select_next_context( // Is this context runnable on this CPU? let sw = unsafe { update_runnable(&mut next_context_guard, cpu_id, switch_time) }; if let UpdateResult::CanSwitch = sw { - // NUMA-aware selection: check if this context's last CPU was on the same node. - let same_node = if my_numa_node != u8::MAX { - next_context_guard.cpu_id - .map(|cid| { - crate::percpu::get_for_cpu(cid) - .map(|p| p.numa_node.get() == my_numa_node) - .unwrap_or(false) - }) - .unwrap_or(true) // New context (no last CPU) — treat as same node - } else { - true // No NUMA info — treat all as same node - }; - - if same_node { - // Cache-warm: select immediately - percpu.current_prio.set(next_context_guard.prio); - next_context_guard_opt = Some(next_context_guard); - balance[i] -= SCHED_PRIO_TO_WEIGHT[20]; - break 'priority; - } else { - // Cross-node candidate: save as fallback, keep scanning for same-node - if cross_node_fallback.is_none() { - // Cache the priority and balance for later - cross_node_fallback = - Some((next_context_guard.prio, next_context_guard)); - balance[i] -= SCHED_PRIO_TO_WEIGHT[20]; - // Don't break — keep looking for a same-node context - continue; - } else { - // Already have a cross-node fallback; push this one back - contexts.push_back(next_context_ref); - skipped_contexts += 1; - if skipped_contexts >= total_contexts { - break 'priority; - } - continue; - } - } + next_context_guard_opt = Some(next_context_guard); + balance[i] -= SCHED_PRIO_TO_WEIGHT[20]; + break 'priority; } else { if matches!(sw, UpdateResult::Blocked) { idle_contexts(token.token()).push_back(next_context_ref); @@ -507,15 +458,6 @@ fn select_next_context( } } } - - // If we found a cross-node fallback but no same-node context, use it - if next_context_guard_opt.is_none() { - if let Some((prio, guard)) = cross_node_fallback { - percpu.current_prio.set(prio); - next_context_guard_opt = Some(guard); - } - } - percpu.balance.set(balance); percpu.last_queue.set(i); @@ -523,10 +465,7 @@ fn select_next_context( // Send the old process to the back of the line (if it is still runnable) let prev_ctx = WeakContextRef(Arc::downgrade(&prev_context_lock)); if prev_context_guard.status.is_runnable() { - let raw_prio = prev_context_guard.prio; - let prio = percpu.effective_prio(raw_prio); - // Clear PI donation — previous context is being re-queued - percpu.pi_donated_prio.store(u32::MAX, Ordering::Relaxed); + let prio = prev_context_guard.prio; contexts_list[prio].push_back(prev_ctx); } else { idle_contexts(token.token()).push_back(prev_ctx); @@ -538,8 +477,7 @@ fn select_next_context( return Ok(Some(next_context_guard)); } else { if !was_idle && !Arc::ptr_eq(&prev_context_lock, &idle_context) { - // Switching to idle context — cache lowest priority - percpu.current_prio.set(39); + // We switch into the idle context Ok(Some(unsafe { idle_context.write_arc() })) } else { // We found no other process to run. @@ -556,13 +494,6 @@ pub struct ContextSwitchPercpu { switch_result: Cell>, switch_time: Cell, pit_ticks: Cell, - /// Per-CPU context switch flag. Set to true during a context switch on this CPU. - /// Replaced the global CONTEXT_SWITCH_LOCK to eliminate cross-CPU serialization. - in_context_switch: Cell, - /// Number of PIT ticks before triggering a context switch. - /// Default: 3 (≈6.75 ms). Lower values improve interactive responsiveness; - /// higher values improve throughput for batch/compute workloads. - preempt_interval: Cell, current_ctxt: RefCell>>, @@ -577,8 +508,6 @@ impl ContextSwitchPercpu { switch_result: Cell::new(None), switch_time: Cell::new(0), pit_ticks: Cell::new(0), - in_context_switch: Cell::new(false), - preempt_interval: Cell::new(DEFAULT_PREEMPT_INTERVAL), current_ctxt: RefCell::new(None), idle_ctxt: RefCell::new(None), being_sigkilled: Cell::new(false), diff --git a/recipes/core/kernel/source/src/cpu_set.rs b/recipes/core/kernel/source/src/cpu_set.rs index 5594cac082..4aae7781e9 100644 --- a/recipes/core/kernel/source/src/cpu_set.rs +++ b/recipes/core/kernel/source/src/cpu_set.rs @@ -42,18 +42,17 @@ impl core::fmt::Display for LogicalCpuId { } #[cfg(target_pointer_width = "64")] -pub const MAX_CPU_COUNT: u32 = 256; +pub const MAX_CPU_COUNT: u32 = 128; #[cfg(target_pointer_width = "32")] pub const MAX_CPU_COUNT: u32 = 32; const SET_WORDS: usize = (MAX_CPU_COUNT / usize::BITS) as usize; -// TODO: Support more than 256 CPUs. +// TODO: Support more than 128 CPUs. // The maximum number of CPUs on Linux is configurable, and the type for LogicalCpuSet and // LogicalCpuId may be optimized accordingly. In that case, box the mask if it's larger than some -// base size (probably 256 bytes). AMD EPYC has 128C/256T, Threadripper PRO 96C/192T — -// 256 covers current hardware. +// base size (probably 256 bytes). #[derive(Debug)] pub struct LogicalCpuSet([AtomicUsize; SET_WORDS]); diff --git a/recipes/core/kernel/source/src/event.rs b/recipes/core/kernel/source/src/event.rs index f4f57c2351..7398145ad6 100644 --- a/recipes/core/kernel/source/src/event.rs +++ b/recipes/core/kernel/source/src/event.rs @@ -1,5 +1,5 @@ use alloc::sync::Arc; -use core::sync::atomic::{AtomicU64, AtomicUsize, Ordering}; +use core::sync::atomic::{AtomicUsize, Ordering}; use hashbrown::{hash_map::DefaultHashBuilder, HashMap}; use smallvec::SmallVec; use syscall::data::GlobalSchemes; @@ -23,7 +23,6 @@ int_like!(EventQueueId, AtomicEventQueueId, usize, AtomicUsize); pub struct EventQueue { id: EventQueueId, queue: WaitQueue, - pub eventfd: Option<(AtomicU64, bool)>, // (counter, semaphore_mode) } impl EventQueue { @@ -31,15 +30,6 @@ impl EventQueue { EventQueue { id, queue: WaitQueue::new(), - eventfd: None, - } - } - - pub fn new_eventfd(id: EventQueueId, initval: u64, semaphore: bool) -> EventQueue { - EventQueue { - id, - queue: WaitQueue::new(), - eventfd: Some((AtomicU64::new(initval), semaphore)), } } diff --git a/recipes/core/kernel/source/src/main.rs b/recipes/core/kernel/source/src/main.rs index 81487fac89..32f491d0e8 100644 --- a/recipes/core/kernel/source/src/main.rs +++ b/recipes/core/kernel/source/src/main.rs @@ -70,9 +70,6 @@ mod log; /// Memory management mod memory; -/// NUMA topology -mod numa; - /// Panic mod panic; diff --git a/recipes/core/kernel/source/src/numa.rs b/recipes/core/kernel/source/src/numa.rs deleted file mode 100644 index cba73a4465..0000000000 --- a/recipes/core/kernel/source/src/numa.rs +++ /dev/null @@ -1,81 +0,0 @@ -/// NUMA topology hints for the kernel scheduler. -/// -/// NUMA discovery (SRAT/SLIT parsing) is performed during kernel ACPI init -/// (`acpi::init()`). The kernel stores a lightweight copy for O(1) scheduling -/// lookups. If no SRAT is found, `init_default()` creates a single-node topology. -use crate::acpi::srat; -use crate::cpu_set::{LogicalCpuId, LogicalCpuSet}; -use core::sync::atomic::{AtomicBool, Ordering}; - -const MAX_NUMA_NODES: usize = 8; - -#[derive(Debug)] -pub struct NumaHint { - pub node_id: u8, - pub cpus: LogicalCpuSet, -} - -pub struct NumaTopology { - pub nodes: [Option; MAX_NUMA_NODES], - pub initialized: AtomicBool, -} - -impl NumaTopology { - pub const fn new() -> Self { - const NONE: Option = None; - Self { nodes: [NONE; MAX_NUMA_NODES], initialized: AtomicBool::new(false) } - } - - pub fn node_for_cpu(&self, cpu: LogicalCpuId) -> Option { - for node in self.nodes.iter().flatten() { - if node.cpus.contains(cpu) { return Some(node.node_id); } - } - None - } - - pub fn same_node(&self, cpu1: LogicalCpuId, cpu2: LogicalCpuId) -> bool { - self.node_for_cpu(cpu1) == self.node_for_cpu(cpu2) - } -} - -static mut NUMA_TOPOLOGY: NumaTopology = NumaTopology::new(); - -pub fn topology() -> &'static NumaTopology { unsafe { &NUMA_TOPOLOGY } } - -/// Initialize NUMA topology from SRAT data parsed during ACPI init. -pub fn init_from_srat(apic_ids: &[(u32, LogicalCpuId)]) { - let topo = topology(); - if topo.initialized.swap(true, Ordering::AcqRel) { return; } - if !srat::is_available() { init_default_inner(); return; } - unsafe { - let topo_mut = &mut *core::ptr::addr_of_mut!(NUMA_TOPOLOGY); - for &(apic_id, cpu_id) in apic_ids { - if let Some(node) = srat::numa_node_for_apic(apic_id) { - let idx = node as usize; - if idx < MAX_NUMA_NODES { - topo_mut.nodes[idx].get_or_insert_with(|| NumaHint { node_id: node, cpus: LogicalCpuSet::empty() }).cpus.atomic_set(cpu_id); - } - } - } - if topo_mut.nodes.iter().all(|n| n.is_none()) { - topo_mut.nodes[0] = Some(NumaHint { node_id: 0, cpus: LogicalCpuSet::all() }); - } - } - let node_count = topology().nodes.iter().filter(|n| n.is_some()).count(); - debug!("NUMA: {node_count} node(s) from SRAT"); -} - -/// Fallback: single-node topology. -pub fn init_default() { - let topo = topology(); - if topo.initialized.swap(true, Ordering::AcqRel) { return; } - init_default_inner(); -} - -fn init_default_inner() { - unsafe { - let topo_mut = &mut *core::ptr::addr_of_mut!(NUMA_TOPOLOGY); - topo_mut.nodes[0] = Some(NumaHint { node_id: 0, cpus: LogicalCpuSet::all() }); - } - debug!("NUMA: single-node topology (no SRAT)"); -} diff --git a/recipes/core/kernel/source/src/percpu.rs b/recipes/core/kernel/source/src/percpu.rs index 9309a41d4d..f4ad5e66e6 100644 --- a/recipes/core/kernel/source/src/percpu.rs +++ b/recipes/core/kernel/source/src/percpu.rs @@ -4,14 +4,9 @@ use alloc::{ }; use core::{ cell::{Cell, RefCell}, - hint, - sync::atomic::{AtomicBool, AtomicPtr, AtomicU32, AtomicU64, Ordering}, + sync::atomic::{AtomicBool, AtomicPtr, Ordering}, }; -/// Maximum number of pages to flush individually using INVLPG before falling -/// back to a full TLB flush (CR3 reload). -const TLB_RANGE_THRESHOLD: u32 = 32; - use rmm::Arch; use syscall::PtraceFlags; @@ -21,7 +16,7 @@ use crate::{ cpu_set::{LogicalCpuId, MAX_CPU_COUNT}, cpu_stats::{CpuStats, CpuStatsData}, ptrace::Session, - sync::{mcs::McsNode, mcs::McsRawLock, CleanLockToken}, + sync::CleanLockToken, syscall::debug::SyscallDebugInfo, }; @@ -39,38 +34,6 @@ pub struct PercpuBlock { pub balance: Cell<[usize; 40]>, pub last_queue: Cell, - /// Per-CPU MCS node for the scheduler run-queue lock (RUN_CONTEXTS). - pub mcs_sched_node: McsNode, - - /// Counts how many times the scheduler MCS lock acquisition was contended. - pub mcs_contention_count: Cell, - - /// TLB shootdown range: start virtual address (page-aligned). - /// Set to 0 for a full flush. Only valid when `wants_tlb_shootdown` is true. - pub tlb_flush_start: AtomicU64, - /// TLB shootdown range: number of pages to invalidate. - pub tlb_flush_count: AtomicU32, - - /// Priority inheritance donation. When another CPU is blocked waiting on a - /// lock this CPU holds, the blocked CPU may donate its priority here. - /// `u32::MAX` means no donation; otherwise it's a priority level (0-39). - pub pi_donated_prio: AtomicU32, - - /// Cached priority of the currently-running context on this CPU. - /// Set by the scheduler when selecting a new context. Read by the MCS - /// lock during priority donation — avoids acquiring the context RwLock - /// from the spin loop. Default 39 (lowest priority). - pub current_prio: Cell, - - /// NUMA proximity domain for this CPU. Set during ACPI init from SRAT. - /// `u8::MAX` means unknown (no SRAT or APIC ID not listed). - pub numa_node: Cell, - - /// Pointer to the MCS lock this CPU is currently spinning on (for transitive PI). - /// `null` when not waiting on any lock. Set in McsRawLock::acquire() before - /// entering the spin loop, cleared upon acquisition. - pub waiting_on_lock: AtomicPtr, - // TODO: Put mailbox queues here, e.g. for TLB shootdown? Just be sure to 128-byte align it // first to avoid cache invalidation. pub profiling: Option<&'static crate::profiling::RingBuffer>, @@ -94,15 +57,6 @@ pub unsafe fn init_tlb_shootdown(id: LogicalCpuId, block: *mut PercpuBlock) { ALL_PERCPU_BLOCKS[id.get() as usize].store(block, Ordering::Release) } -/// Get a reference to another CPU's PercpuBlock by logical CPU ID. -pub fn get_for_cpu(id: LogicalCpuId) -> Option<&'static PercpuBlock> { - unsafe { - ALL_PERCPU_BLOCKS[id.get() as usize] - .load(Ordering::Acquire) - .as_ref() - } -} - pub fn get_all_stats() -> Vec<(LogicalCpuId, CpuStatsData)> { let mut res = ALL_PERCPU_BLOCKS .iter() @@ -147,148 +101,25 @@ pub fn shootdown_tlb_ipi(target: Option) { core::hint::spin_loop(); } } - // Full flush — clear range info (Release ordering ensures the flag - // swap and these stores are visible to the handler before the IPI). - percpublock.tlb_flush_start.store(0, Ordering::Release); - percpublock.tlb_flush_count.store(0, Ordering::Release); crate::ipi::ipi_single(crate::ipi::IpiKind::Tlb, percpublock); } else { - // Broadcast TLB shootdown: set flag on all other CPUs, then send a single - // IPI with "all except self" destination shorthand instead of N individual IPIs. - let my_percpublock = PercpuBlock::current(); for id in 0..crate::cpu_count() { - let target_id = LogicalCpuId::new(id); - if target_id == my_percpublock.cpu_id { - continue; - } - let Some(percpublock) = (unsafe { - ALL_PERCPU_BLOCKS[id as usize] - .load(Ordering::Acquire) - .as_ref() - }) else { - continue; - }; - // Wait if this CPU still has a pending shootdown from a previous request - #[expect(clippy::bool_comparison)] - while percpublock - .wants_tlb_shootdown - .swap(true, Ordering::Release) - == true - { - while percpublock.wants_tlb_shootdown.load(Ordering::Relaxed) == true { - my_percpublock.maybe_handle_tlb_shootdown(); - hint::spin_loop(); - } - } - // Full flush — clear range info (Release ordering) - percpublock.tlb_flush_start.store(0, Ordering::Release); - percpublock.tlb_flush_count.store(0, Ordering::Release); + // TODO: Optimize: use global counter and percpu ack counters, send IPI using + // destination shorthand "all CPUs". + shootdown_tlb_ipi(Some(LogicalCpuId::new(id))); } - // Single broadcast IPI to all other CPUs using destination shorthand - crate::ipi::ipi(crate::ipi::IpiKind::Tlb, crate::ipi::IpiTarget::Other); - } -} - -/// Range-based TLB shootdown IPI. Only invalidates the specified virtual address -/// range using INVLPG per page for ranges up to TLB_RANGE_THRESHOLD pages. -/// Falls back to full flush for larger ranges. -pub fn shootdown_tlb_ipi_range(target: Option, start: usize, count: usize) { - if cfg!(not(feature = "multi_core")) { - return; - } - - let start_aligned = start as u64 & !0xFFF; - let count_u32 = count as u32; - let use_range = count_u32 > 0 && count_u32 <= TLB_RANGE_THRESHOLD; - - let set_range = |percpublock: &PercpuBlock| { - if use_range { - percpublock.tlb_flush_start.store(start_aligned, Ordering::Release); - percpublock.tlb_flush_count.store(count_u32, Ordering::Release); - } else { - percpublock.tlb_flush_start.store(0, Ordering::Release); - percpublock.tlb_flush_count.store(0, Ordering::Release); - } - }; - - if let Some(target) = target { - let my_percpublock = PercpuBlock::current(); - assert_ne!(target, my_percpublock.cpu_id); - - let Some(percpublock) = (unsafe { - ALL_PERCPU_BLOCKS[target.get() as usize] - .load(Ordering::Acquire) - .as_ref() - }) else { - return; - }; - #[expect(clippy::bool_comparison)] - while percpublock.wants_tlb_shootdown.swap(true, Ordering::Release) == true { - while percpublock.wants_tlb_shootdown.load(Ordering::Relaxed) == true { - my_percpublock.maybe_handle_tlb_shootdown(); - hint::spin_loop(); - } - } - set_range(percpublock); - crate::ipi::ipi_single(crate::ipi::IpiKind::Tlb, percpublock); - } else { - let my_percpublock = PercpuBlock::current(); - for id in 0..crate::cpu_count() { - let target_id = LogicalCpuId::new(id); - if target_id == my_percpublock.cpu_id { - continue; - } - let Some(percpublock) = (unsafe { - ALL_PERCPU_BLOCKS[id as usize] - .load(Ordering::Acquire) - .as_ref() - }) else { - continue; - }; - #[expect(clippy::bool_comparison)] - while percpublock.wants_tlb_shootdown.swap(true, Ordering::Release) == true { - while percpublock.wants_tlb_shootdown.load(Ordering::Relaxed) == true { - my_percpublock.maybe_handle_tlb_shootdown(); - hint::spin_loop(); - } - } - set_range(percpublock); - } - crate::ipi::ipi(crate::ipi::IpiKind::Tlb, crate::ipi::IpiTarget::Other); } } impl PercpuBlock { - /// Return the effective scheduling priority, accounting for priority inheritance. - /// Lower number = higher priority (0-39 range). - pub fn effective_prio(&self, context_prio: usize) -> usize { - let donated = self.pi_donated_prio.load(Ordering::Relaxed); - if donated < context_prio as u32 { - donated as usize - } else { - context_prio - } - } - pub fn maybe_handle_tlb_shootdown(&self) { #[expect(clippy::bool_comparison)] if self.wants_tlb_shootdown.swap(false, Ordering::Relaxed) == false { return; } - let start = self.tlb_flush_start.load(Ordering::Acquire); - let count = self.tlb_flush_count.load(Ordering::Acquire); - - if start != 0 && count > 0 && count <= TLB_RANGE_THRESHOLD { - // Range-based flush using INVLPG per page — cheaper than full CR3 reload. - for i in 0..count { - let addr = start + (i as u64) * 4096; - crate::memory::RmmA::invalidate(rmm::VirtualAddress::new(addr as usize)); - } - } else { - // Full TLB flush (CR3 reload) for large ranges or global shootdowns. - crate::memory::RmmA::invalidate_all(); - } + // TODO: Finer-grained flush + crate::memory::RmmA::invalidate_all(); if let Some(addrsp) = &*self.current_addrsp.borrow() { addrsp.tlb_ack.fetch_add(1, Ordering::Release); @@ -358,14 +189,6 @@ impl PercpuBlock { wants_tlb_shootdown: AtomicBool::new(false), balance: Cell::new([0; 40]), last_queue: Cell::new(39), - mcs_sched_node: McsNode::new(), - mcs_contention_count: Cell::new(0), - tlb_flush_start: AtomicU64::new(0), - tlb_flush_count: AtomicU32::new(0), - pi_donated_prio: AtomicU32::new(u32::MAX), - current_prio: Cell::new(39), - numa_node: Cell::new(u8::MAX), - waiting_on_lock: AtomicPtr::new(core::ptr::null_mut()), ptrace_flags: Cell::new(PtraceFlags::empty()), ptrace_session: RefCell::new(None), inside_syscall: Cell::new(false), diff --git a/recipes/core/kernel/source/src/scheme/acpi.rs b/recipes/core/kernel/source/src/scheme/acpi.rs index 5d734691a9..87570a1297 100644 --- a/recipes/core/kernel/source/src/scheme/acpi.rs +++ b/recipes/core/kernel/source/src/scheme/acpi.rs @@ -10,7 +10,6 @@ use syscall::{ use crate::{ acpi::{RxsdtEnum, RXSDT_ENUM}, - arch::sleep, context::file::InternalFlags, event, sync::{CleanLockToken, RwLock, WaitCondition, L1}, @@ -41,7 +40,6 @@ enum HandleKind { TopLevel, Rxsdt, ShutdownPipe, - SleepControl, SchemeRoot, } @@ -148,11 +146,11 @@ impl KernelScheme for AcpiScheme { if flags & O_EXCL == O_EXCL || flags & O_SYMLINK == O_SYMLINK { return Err(Error::new(EINVAL)); } + if flags & O_ACCMODE != O_RDONLY && flags & O_STAT != O_STAT { + return Err(Error::new(EROFS)); + } let (handle_kind, int_flags) = match path { "" => { - if flags & O_ACCMODE != O_RDONLY && flags & O_STAT != O_STAT { - return Err(Error::new(EROFS)); - } if flags & O_DIRECTORY != O_DIRECTORY && flags & O_STAT != O_STAT { return Err(Error::new(EISDIR)); } @@ -160,36 +158,17 @@ impl KernelScheme for AcpiScheme { (HandleKind::TopLevel, InternalFlags::POSITIONED) } "rxsdt" => { - if flags & O_ACCMODE != O_RDONLY && flags & O_STAT != O_STAT { - return Err(Error::new(EROFS)); - } if flags & O_DIRECTORY == O_DIRECTORY && flags & O_STAT != O_STAT { return Err(Error::new(ENOTDIR)); } (HandleKind::Rxsdt, InternalFlags::POSITIONED) } "kstop" => { - if flags & O_ACCMODE != O_RDONLY && flags & O_STAT != O_STAT { - return Err(Error::new(EROFS)); - } if flags & O_DIRECTORY == O_DIRECTORY && flags & O_STAT != O_STAT { return Err(Error::new(ENOTDIR)); } (HandleKind::ShutdownPipe, InternalFlags::empty()) } - "sleep" => { - if flags & O_ACCMODE == O_RDONLY || flags & O_STAT == O_STAT { - // allowed - } else if flags & O_ACCMODE != syscall::flag::O_WRONLY - && flags & O_ACCMODE != syscall::flag::O_RDWR - { - return Err(Error::new(EINVAL)); - } - if flags & O_DIRECTORY == O_DIRECTORY && flags & O_STAT != O_STAT { - return Err(Error::new(ENOTDIR)); - } - (HandleKind::SleepControl, InternalFlags::POSITIONED) - } _ => return Err(Error::new(ENOENT)), }; @@ -212,7 +191,6 @@ impl KernelScheme for AcpiScheme { Ok(match handle.kind { HandleKind::Rxsdt => DATA.get().ok_or(Error::new(EBADFD))?.len() as u64, HandleKind::ShutdownPipe => 1, - HandleKind::SleepControl => sleep::available_sleep_states().len() as u64, HandleKind::TopLevel => 0, HandleKind::SchemeRoot => return Err(Error::new(EBADF))?, }) @@ -275,7 +253,6 @@ impl KernelScheme for AcpiScheme { return dst_buf.copy_exactly(&[0x42]).map(|()| 1); } - HandleKind::SleepControl => sleep::available_sleep_states(), HandleKind::Rxsdt => DATA.get().ok_or(Error::new(EBADFD))?, HandleKind::TopLevel => return Err(Error::new(EISDIR)), HandleKind::SchemeRoot => return Err(Error::new(EBADF)), @@ -318,45 +295,11 @@ impl KernelScheme for AcpiScheme { kind: DirentKind::Socket, name: "kstop", inode: 0, - next_opaque_id: 2, - })?; - } - if opaque <= 2 { - buf.entry(DirEntry { - kind: DirentKind::Regular, - name: "sleep", - inode: 0, next_opaque_id: u64::MAX, })?; } Ok(buf.finalize()) } - fn kwrite( - &self, - id: usize, - buf: crate::syscall::usercopy::UserSliceRo, - _flags: u32, - _stored_flags: u32, - token: &mut CleanLockToken, - ) -> Result { - let handle = *HANDLES.read(token.token()).get(id)?; - - if handle.stat { - return Err(Error::new(EBADF)); - } - - match handle.kind { - HandleKind::SleepControl => { - let mut tmp = [0_u8; 16]; - let len = buf.copy_common_bytes_to_slice(&mut tmp)?; - let request = core::str::from_utf8(&tmp[..len]).map_err(|_| Error::new(EINVAL))?; - sleep::trigger_sleep_request(request)?; - Ok(len) - } - HandleKind::SchemeRoot => Err(Error::new(EBADF)), - _ => Err(Error::new(EBADF)), - } - } fn kfpath(&self, _id: usize, buf: UserSliceWo, _token: &mut CleanLockToken) -> Result { //TODO: construct useful path? buf.copy_common_bytes_from_slice("/scheme/kernel.acpi/".as_bytes()) @@ -385,11 +328,6 @@ impl KernelScheme for AcpiScheme { st_size: 1, ..Default::default() }, - HandleKind::SleepControl => Stat { - st_mode: MODE_FILE, - st_size: sleep::available_sleep_states().len().try_into().unwrap_or(u64::MAX), - ..Default::default() - }, HandleKind::SchemeRoot => return Err(Error::new(EBADF)), })?; diff --git a/recipes/core/kernel/source/src/scheme/debug.rs b/recipes/core/kernel/source/src/scheme/debug.rs index 4a23b3cf4f..c70ac5792b 100644 --- a/recipes/core/kernel/source/src/scheme/debug.rs +++ b/recipes/core/kernel/source/src/scheme/debug.rs @@ -22,10 +22,9 @@ struct Handle { static HANDLES: RwLock> = RwLock::new(HandleMap::new()); -/// Add to the input queue, translating CR to NL (ICRNL) for serial console compatibility. +/// Add to the input queue pub fn debug_input(data: u8, token: &mut CleanLockToken) { - let translated = if data == b'\r' { b'\n' } else { data }; - INPUT.send(translated, token); + INPUT.send(data, token); } // Notify readers of input updates @@ -107,16 +106,12 @@ impl KernelScheme for DebugScheme { fn fevent( &self, id: usize, - flags: EventFlags, + _flags: EventFlags, token: &mut CleanLockToken, ) -> Result { let _handle = *HANDLES.read(token.token()).get(id)?; - let mut ready = EventFlags::empty(); - if flags.contains(EventFlags::EVENT_READ) { - ready |= EventFlags::EVENT_READ; - } - Ok(ready) + Ok(EventFlags::empty()) } fn fsync(&self, id: usize, token: &mut CleanLockToken) -> Result<()> { diff --git a/recipes/core/kernel/source/src/scheme/event.rs b/recipes/core/kernel/source/src/scheme/event.rs index e6e5142f56..36efe5b2b0 100644 --- a/recipes/core/kernel/source/src/scheme/event.rs +++ b/recipes/core/kernel/source/src/scheme/event.rs @@ -1,5 +1,4 @@ use alloc::sync::Arc; -use core::sync::atomic::Ordering; use syscall::{EventFlags, O_NONBLOCK}; use crate::{ @@ -26,25 +25,12 @@ impl KernelScheme for EventScheme { fn kopenat( &self, id: usize, - user_buf: StrOrBytes, + _user_buf: StrOrBytes, _flags: usize, _fcntl_flags: u32, _ctx: CallerCtx, token: &mut CleanLockToken, ) -> Result { - let path = match &user_buf { - StrOrBytes::Str(s) => s, - StrOrBytes::Bytes(b) => core::str::from_utf8(b).unwrap_or(""), - }; - if path.starts_with("eventfd/") { - let rest = &path[8..]; // after "eventfd/" - let mut parts = rest.split('/'); - let initval: u64 = parts.next().and_then(|s| s.parse().ok()).unwrap_or(0); - let sem: bool = parts.next().and_then(|s| s.parse().ok()).unwrap_or(false); - let id = next_queue_id(); - queues_mut(token.token()).insert(id, Arc::new(EventQueue::new_eventfd(id, initval, sem))); - return Ok(OpenResult::SchemeLocal(id.get(), InternalFlags::empty())); - } if id != SCHEME_ROOT_ID { return Err(Error::new(EACCES)); } @@ -81,31 +67,6 @@ impl KernelScheme for EventScheme { handle.clone() }; - if let Some((ref counter, semaphore)) = queue.eventfd { - let is_nonblock = flags & O_NONBLOCK as u32 != 0; - if semaphore { - let val = counter.load(Ordering::Acquire); - if val == 0 { - if is_nonblock { return Err(Error::new(EAGAIN)); } - // Blocking wait not implemented for eventfd in kernel - return Err(Error::new(EAGAIN)); - } - if counter.compare_exchange(val, val - 1, Ordering::AcqRel, Ordering::Relaxed).is_ok() { - let one: u64 = 1; - buf.copy_from_slice(unsafe { core::slice::from_raw_parts(&one as *const u64 as *const u8, 8) })?; - return Ok(8); - } - return Err(Error::new(EAGAIN)); - } else { - let val = counter.swap(0, Ordering::AcqRel); - if val == 0 && is_nonblock { - return Err(Error::new(EAGAIN)); - } - buf.copy_from_slice(unsafe { core::slice::from_raw_parts(&val as *const u64 as *const u8, 8) })?; - return Ok(8); - } - } - queue.read(buf, flags & O_NONBLOCK as u32 == 0, token) } @@ -124,19 +85,6 @@ impl KernelScheme for EventScheme { let handle = handles.get(&id).ok_or(Error::new(EBADF))?; handle.clone() }; - - if let Some((ref counter, _semaphore)) = queue.eventfd { - if buf.len() >= 8 { - let mut bytes = [0u8; 8]; - buf.copy_to_slice(&mut bytes)?; - let val = u64::from_ne_bytes(bytes); - if val == u64::MAX { return Err(Error::new(EINVAL)); } - counter.fetch_add(val, Ordering::AcqRel); - return Ok(8); - } - return Err(Error::new(EINVAL)); - } - let mut events_written = 0; for chunk in buf.in_exact_chunks(size_of::()) { diff --git a/recipes/core/kernel/source/src/scheme/irq.rs b/recipes/core/kernel/source/src/scheme/irq.rs index 4222960986..a8795e5958 100644 --- a/recipes/core/kernel/source/src/scheme/irq.rs +++ b/recipes/core/kernel/source/src/scheme/irq.rs @@ -18,9 +18,6 @@ use syscall::{ use crate::context::file::InternalFlags; use super::{CallerCtx, HandleMap, OpenResult, SchemeExt, StrOrBytes}; -#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] -use crate::arch::device::{ioapic, local_apic::ApicId}; - #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] use crate::arch::interrupt::{available_irqs_iter, irq::acknowledge, is_reserved, set_reserved}; #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))] @@ -59,11 +56,8 @@ const INO_AVAIL: u64 = 0x8000_0000_0000_0000; const INO_BSP: u64 = 0x8001_0000_0000_0000; const INO_PHANDLE: u64 = 0x8003_0000_0000_0000; -/// Add to the input queue, with iommu validation gate for MSI vectors +/// Add to the input queue pub fn irq_trigger(irq: u8, token: &mut CleanLockToken) { - if irq >= 16 && !iommu_validate_msi_irq(irq) { - return; - } COUNTS.lock()[irq as usize] += 1; let fds: SmallVec<[usize; 8]> = { HANDLES @@ -83,17 +77,16 @@ pub fn irq_trigger(irq: u8, token: &mut CleanLockToken) { #[allow(dead_code)] enum Handle { SchemeRoot, - Irq { ack: AtomicUsize, irq: u8, cpu_id: LogicalCpuId }, + Irq { ack: AtomicUsize, irq: u8 }, Avail(LogicalCpuId), TopLevel, Phandle(u8, Vec), Bsp, - IrqAffinity { irq: u8, mask: AtomicUsize }, } impl Handle { fn as_irq_handle(&self) -> Option<(&AtomicUsize, u8)> { match self { - &Self::Irq { ref ack, irq, cpu_id: _ } => Some((ack, irq)), + &Self::Irq { ref ack, irq } => Some((ack, irq)), _ => None, } } @@ -147,7 +140,6 @@ impl IrqScheme { Handle::Irq { ack: AtomicUsize::new(0), irq: irq_number, - cpu_id: LogicalCpuId::BSP, }, InternalFlags::empty(), ) @@ -166,7 +158,6 @@ impl IrqScheme { Handle::Irq { ack: AtomicUsize::new(0), irq: irq_number, - cpu_id, }, InternalFlags::empty(), ) @@ -208,7 +199,6 @@ impl IrqScheme { Handle::Irq { ack: AtomicUsize::new(0), irq: irq_number as u8, - cpu_id: LogicalCpuId::new(0), }, InternalFlags::empty(), ) @@ -224,14 +214,6 @@ const fn vector_to_irq(vector: u8) -> u8 { vector - 32 } -const fn msi_vector_is_valid(vector: u8) -> bool { - vector >= 32 && vector < 0xEF -} - -fn iommu_validate_msi_irq(_irq: u8) -> bool { - true -} - impl crate::scheme::KernelScheme for IrqScheme { fn scheme_root(&self, token: &mut CleanLockToken) -> Result { let id = HANDLES.write(token.token()).insert(Handle::SchemeRoot); @@ -298,21 +280,7 @@ impl crate::scheme::KernelScheme for IrqScheme { InternalFlags::POSITIONED, ) } else if let Some(path_str) = path_str.strip_prefix('/') { - let (irq_str, affinity) = path_str - .trim_end_matches('/') - .rsplit_once('/') - .map(|(a, b)| (a, Some(b))) - .unwrap_or((path_str.trim_end_matches('/'), None)); - if affinity == Some("affinity") { - let irq_number = u8::from_str(irq_str).or(Err(Error::new(ENOENT)))?; - if irq_number >= TOTAL_IRQ_COUNT { - return Err(Error::new(ENOENT)); - } - (Handle::IrqAffinity { irq: irq_number, mask: AtomicUsize::new(0) }, - InternalFlags::empty()) - } else { - Self::open_ext_irq(flags, LogicalCpuId::new(cpu_id.into()), path_str)? - } + Self::open_ext_irq(flags, LogicalCpuId::new(cpu_id.into()), path_str)? } else { return Err(Error::new(ENOENT)); } @@ -339,20 +307,12 @@ impl crate::scheme::KernelScheme for IrqScheme { } #[cfg(not(dtb))] panic!("") - } else if let Some(rest) = path_str.strip_suffix("/affinity") { - let irq_number = u8::from_str(rest).or(Err(Error::new(ENOENT)))?; - if irq_number >= TOTAL_IRQ_COUNT { - return Err(Error::new(ENOENT)); - } - (Handle::IrqAffinity { irq: irq_number, mask: AtomicUsize::new(0) }, - InternalFlags::empty()) } else if let Ok(plain_irq_number) = u8::from_str(path_str) { if plain_irq_number < BASE_IRQ_COUNT { ( Handle::Irq { ack: AtomicUsize::new(0), irq: plain_irq_number, - cpu_id: LogicalCpuId::BSP, }, InternalFlags::empty(), ) @@ -408,7 +368,6 @@ impl crate::scheme::KernelScheme for IrqScheme { } } Handle::Avail(cpu_id) => { - let mut listed = 0; for vector in available_irqs_iter(cpu_id).skip(opaque) { let irq = vector_to_irq(vector); if cpu_id == LogicalCpuId::BSP && irq < BASE_IRQ_COUNT { @@ -422,9 +381,7 @@ impl crate::scheme::KernelScheme for IrqScheme { name: &intermediate, next_opaque_id: u64::from(vector) + 1, })?; - listed += 1; } - info!("irq getdents Avail: cpu_id={} opaque={} listed={}", cpu_id.get(), opaque, listed); } _ => return Err(Error::new(ENOTDIR)), } @@ -459,14 +416,11 @@ impl crate::scheme::KernelScheme for IrqScheme { let handle = handles_guard.get(id)?; if let &Handle::Irq { - irq: handle_irq, - cpu_id: handle_cpu_id, - .. + irq: handle_irq, .. } = handle && handle_irq > BASE_IRQ_COUNT { - info!("irq close: unreserving vector {} on cpu_id={}", irq_to_vector(handle_irq), handle_cpu_id.get()); - set_reserved(handle_cpu_id, irq_to_vector(handle_irq), false); + set_reserved(LogicalCpuId::BSP, irq_to_vector(handle_irq), false); } Ok(()) } @@ -482,32 +436,9 @@ impl crate::scheme::KernelScheme for IrqScheme { let handle = handles_guard.get(file)?; match handle { - &Handle::IrqAffinity { irq: _handle_irq, ref mask } => { - if buffer.len() < size_of::() { - return Err(Error::new(EINVAL)); - } - let mut raw = [0u8; size_of::()]; - buffer.copy_to_slice(&mut raw)?; - let cpu_id = u32::from_ne_bytes(raw); - let cpus = CPUS.get().ok_or(Error::new(EIO))?; - if !cpus.contains(&(cpu_id as u8)) { - return Err(Error::new(EINVAL)); - } - // Reprogram the IOAPIC redirection entry for x86 targets. - // Non-IOAPIC IRQs (e.g. MSI) will return false -> EIO. - #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] - { - if !unsafe { ioapic::set_affinity(_handle_irq, ApicId::new(cpu_id)) } { - return Err(Error::new(EIO)); - } - } - mask.store(cpu_id as usize, Ordering::Release); - Ok(size_of::()) - } &Handle::Irq { irq: handle_irq, ack: ref handle_ack, - cpu_id: _, } => { if buffer.len() < size_of::() { return Err(Error::new(EINVAL)); @@ -544,15 +475,6 @@ impl crate::scheme::KernelScheme for IrqScheme { st_nlink: 1, ..Default::default() }, - Handle::IrqAffinity { irq, .. } => Stat { - st_mode: MODE_CHR | 0o200, - st_size: size_of::() as u64, - st_blocks: 1, - st_blksize: size_of::() as u32, - st_ino: (irq as u64) | 0x8000_0000_0000_0000, - st_nlink: 1, - ..Default::default() - }, Handle::Bsp => Stat { st_mode: MODE_CHR | 0o400, st_size: size_of::() as u64, @@ -594,9 +516,8 @@ impl crate::scheme::KernelScheme for IrqScheme { let scheme_path = match handle { Handle::Irq { irq, .. } => format!("irq:{}", irq), - Handle::IrqAffinity { irq, .. } => format!("irq:{}/affinity", irq), Handle::Bsp => "irq:bsp".to_owned(), - Handle::Avail(cpu_id) => format!("irq:cpu-{:02x}", cpu_id.get()), + Handle::Avail(cpu_id) => format!("irq:cpu-{:2x}", cpu_id.get()), Handle::Phandle(phandle, _) => format!("irq:phandle-{}", phandle), Handle::TopLevel => "irq:".to_owned(), _ => return Err(Error::new(EBADF)), @@ -622,7 +543,6 @@ impl crate::scheme::KernelScheme for IrqScheme { Handle::Irq { irq: handle_irq, ack: ref handle_ack, - cpu_id: _, } => { if buffer.len() < size_of::() { return Err(Error::new(EINVAL)); @@ -642,7 +562,7 @@ impl crate::scheme::KernelScheme for IrqScheme { buffer.write_u32(LogicalCpuId::BSP.get())?; Ok(size_of::()) } - Handle::Avail(_) | Handle::TopLevel | Handle::Phandle(_, _) | Handle::SchemeRoot | Handle::IrqAffinity { .. } => { + Handle::Avail(_) | Handle::TopLevel | Handle::Phandle(_, _) | Handle::SchemeRoot => { Err(Error::new(EISDIR)) } } diff --git a/recipes/core/kernel/source/src/scheme/mod.rs b/recipes/core/kernel/source/src/scheme/mod.rs index 765e547f77..d30272c129 100644 --- a/recipes/core/kernel/source/src/scheme/mod.rs +++ b/recipes/core/kernel/source/src/scheme/mod.rs @@ -14,7 +14,7 @@ use alloc::{ }; use core::{ str, - sync::atomic::{AtomicU64, AtomicUsize, Ordering}, + sync::atomic::{AtomicUsize, Ordering}, }; use hashbrown::hash_map::{self, DefaultHashBuilder, HashMap}; use spin::Once; @@ -169,7 +169,6 @@ enum Handle { /// Schemes list static HANDLES: Once>> = Once::new(); -static SCHEME_GENERATIONS: Once>> = Once::new(); static SCHEME_LIST_NEXT_ID: AtomicUsize = AtomicUsize::new(MAX_GLOBAL_SCHEMES); static SCHEME_LIST_ID: AtomicUsize = AtomicUsize::new(0); @@ -205,10 +204,6 @@ fn init_schemes() -> RwLock> { RwLock::new(handles) } -fn init_scheme_generations() -> RwLock> { - RwLock::new(HashMap::new()) -} - /// Get a handle to a scheme. pub fn get_scheme(token: LockToken<'_, L0>, scheme_id: SchemeId) -> Result { match handles().read(token).get(&scheme_id) { @@ -217,33 +212,10 @@ pub fn get_scheme(token: LockToken<'_, L0>, scheme_id: SchemeId) -> Result, scheme_id: SchemeId) -> u64 { - scheme_generations() - .read(token) - .get(&scheme_id) - .map(|generation| generation.load(Ordering::Acquire)) - .unwrap_or(0) -} - fn handles<'a>() -> &'a RwLock> { HANDLES.call_once(init_schemes) } -fn scheme_generations<'a>() -> &'a RwLock> { - SCHEME_GENERATIONS.call_once(init_scheme_generations) -} - -fn increment_scheme_generation(scheme_id: SchemeId, token: &mut CleanLockToken) { - match scheme_generations().write(token.token()).entry(scheme_id) { - hash_map::Entry::Occupied(entry) => { - entry.get().fetch_add(1, Ordering::AcqRel); - } - hash_map::Entry::Vacant(entry) => { - entry.insert(AtomicU64::new(1)); - } - } -} - /// Scheme list type pub struct SchemeList; @@ -288,14 +260,9 @@ impl SchemeList { /// Remove a scheme fn remove(&self, id: usize, token: &mut CleanLockToken) { - let scheme_id = SchemeId(id); - let scheme = handles().write(token.token()).remove(&scheme_id); + let scheme = handles().write(token.token()).remove(&SchemeId(id)); assert!(scheme.is_some()); - if let Some(Handle::Scheme(KernelSchemes::User(user))) = scheme.as_ref() { - user.inner.fail_pending_calls(token); - } - increment_scheme_generation(scheme_id, token); if let Some(Handle::Scheme(KernelSchemes::User(user))) = scheme && let Some(user) = Arc::into_inner(user.inner) { @@ -320,32 +287,32 @@ impl KernelScheme for SchemeList { token: &mut CleanLockToken, ) -> Result { let scheme_id = SchemeId(scheme_id); - let maybe_inner = { - let handles = handles().read(token.token()); - match handles.get(&scheme_id).ok_or(Error::new(EBADF))? { - Handle::Scheme(KernelSchemes::User(UserScheme { inner })) => Some(inner.clone()), - Handle::SchemeCreationCapability => None, - _ => return Err(Error::new(EBADF)), + match handles() + .read(token.token()) + .get(&scheme_id) + .ok_or(Error::new(EBADF))? + { + Handle::Scheme(KernelSchemes::User(UserScheme { inner })) => { + let inner = inner.clone(); + assert!(scheme_id == inner.scheme_id); + let scheme = scheme_id; + let params = unsafe { user_buf.read_exact::()? }; + + return Ok(OpenResult::External(Arc::new(RwLock::new( + FileDescription { + scheme, + number: params.number, + offset: params.offset, + flags: params.flags as u32, + internal_flags: InternalFlags::from_extra0(params.internal_flags) + .ok_or(Error::new(EINVAL))?, + }, + )))); } + Handle::SchemeCreationCapability => (), + _ => return Err(Error::new(EBADF)), }; - if let Some(inner) = maybe_inner { - assert!(scheme_id == inner.scheme_id); - let params = unsafe { user_buf.read_exact::()? }; - - return Ok(OpenResult::External(Arc::new(RwLock::new( - FileDescription::new( - scheme_id, - params.number, - params.offset, - params.flags as u32, - InternalFlags::from_extra0(params.internal_flags) - .ok_or(Error::new(EINVAL))?, - token, - ), - )))); - } - const EXPECTED: &[u8] = b"create-scheme"; let mut buf = [0u8; EXPECTED.len()]; @@ -810,7 +777,6 @@ pub struct CallerCtx { pub pid: usize, pub uid: u32, pub gid: u32, - pub groups: alloc::vec::Vec, } impl CallerCtx { pub fn filter_uid_gid(self, euid: u32, egid: u32) -> Self { @@ -819,7 +785,6 @@ impl CallerCtx { pid: self.pid, uid: euid, gid: egid, - groups: self.groups, } } else { self diff --git a/recipes/core/kernel/source/src/scheme/pipe.rs b/recipes/core/kernel/source/src/scheme/pipe.rs index ebabb5daa2..df5db9d908 100644 --- a/recipes/core/kernel/source/src/scheme/pipe.rs +++ b/recipes/core/kernel/source/src/scheme/pipe.rs @@ -1,10 +1,5 @@ -use alloc::{ - collections::VecDeque, - string::{String, ToString}, - sync::Arc, - vec::Vec, -}; -use core::sync::atomic::{AtomicUsize, Ordering}; +use alloc::{collections::VecDeque, sync::Arc, vec::Vec}; +use core::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; use syscall::{data::GlobalSchemes, CallFlags}; @@ -19,228 +14,67 @@ use crate::{ sync::{CleanLockToken, Mutex, RwLock, WaitCondition, L1}, syscall::{ data::Stat, - error::{ - Error, Result, EAGAIN, EBADF, EEXIST, EINVAL, EINTR, ENOENT, ENOTDIR, EPIPE, - }, - flag::{ - EventFlags, EVENT_READ, EVENT_WRITE, MODE_FIFO, O_ACCMODE, O_DIRECTORY, - O_NONBLOCK, O_RDONLY, O_RDWR, O_STAT, O_WRONLY, - }, + error::{Error, Result, EAGAIN, EBADF, EINTR, EINVAL, ENOENT, EPIPE}, + flag::{EventFlags, EVENT_READ, EVENT_WRITE, MODE_FIFO, O_NONBLOCK}, usercopy::{UserSliceRo, UserSliceRw, UserSliceWo}, }, }; use super::{CallerCtx, KernelScheme, OpenResult, SchemeExt, StrOrBytes}; -static PIPE_NEXT_ID: AtomicUsize = AtomicUsize::new(1); +// TODO: Preallocate a number of scheme IDs, since there can only be *one* root namespace, and +// therefore only *one* pipe scheme. +static PIPE_NEXT_ID: AtomicUsize = AtomicUsize::new(0); -#[derive(Clone)] enum Handle { - Endpoint(EndpointHandle), + Pipe(Arc), SchemeRoot, } -#[derive(Clone, Copy, Eq, PartialEq)] -enum EndpointKind { - Read, - Write, - ReadWrite, -} - -impl EndpointKind { - fn can_read(self) -> bool { - matches!(self, Self::Read | Self::ReadWrite) - } - - fn can_write(self) -> bool { - matches!(self, Self::Write | Self::ReadWrite) - } -} - -#[derive(Clone)] -struct EndpointHandle { - pipe: Arc, - kind: EndpointKind, - named: Option>, -} - -struct NamedPipe { - path: String, - mode: u16, - active: Mutex>>, -} - -static HANDLES: RwLock> = - RwLock::new(HashMap::with_hasher(DefaultHashBuilder::new())); -static NAMED_PIPES: RwLock>> = +// TODO: SLOB? +static PIPES: RwLock> = RwLock::new(HashMap::with_hasher(DefaultHashBuilder::new())); const MAX_QUEUE_SIZE: usize = 65536; -fn next_id() -> usize { - PIPE_NEXT_ID.fetch_add(1, Ordering::Relaxed) -} +// In almost all places where Rust (and LLVM) uses pointers, they are limited to nonnegative isize, +// so this is fine. +const WRITE_NOT_READ_BIT: usize = 1; -fn endpoint_kind_from_flags(flags: usize) -> Result { - match flags & O_ACCMODE { - O_RDONLY => Ok(EndpointKind::Read), - O_WRONLY => Ok(EndpointKind::Write), - O_RDWR => Ok(EndpointKind::ReadWrite), - _ => Err(Error::new(EINVAL)), - } -} - -fn validate_named_fifo_open(flags: usize) -> Result<()> { - if flags & O_DIRECTORY == O_DIRECTORY && flags & O_STAT != O_STAT { - return Err(Error::new(ENOTDIR)); - } - - let _ = endpoint_kind_from_flags(flags)?; - Ok(()) -} - -fn trigger_matching( - pipe: &Arc, - require_read: bool, - require_write: bool, - flags: EventFlags, - token: &mut CleanLockToken, -) { - let ids = { - let handles = HANDLES.read(token.token()); - handles - .iter() - .filter_map(|(id, handle)| match handle { - Handle::Endpoint(endpoint) - if Arc::ptr_eq(&endpoint.pipe, pipe) - && (!require_read || endpoint.kind.can_read()) - && (!require_write || endpoint.kind.can_write()) => - { - Some(*id) - } - _ => None, - }) - .collect::>() - }; - - for id in ids { - event::trigger(GlobalSchemes::Pipe.scheme_id(), id, flags, token); - } -} - -fn open_endpoint( - pipe: Arc, - kind: EndpointKind, - named: Option>, - token: &mut CleanLockToken, -) -> usize { - if kind.can_read() { - pipe.reader_count.fetch_add(1, Ordering::SeqCst); - } - if kind.can_write() { - pipe.writer_count.fetch_add(1, Ordering::SeqCst); - } - - let id = next_id(); - HANDLES.write(token.token()).insert( - id, - Handle::Endpoint(EndpointHandle { pipe, kind, named }), - ); - id -} - -fn drop_wait_conditions_if_possible(pipe: Arc, token: &mut CleanLockToken) { - if let Some(pipe) = Arc::into_inner(pipe) { - { - pipe.read_condition.into_drop(token); - } - { - pipe.write_condition.into_drop(token); - } - } +fn from_raw_id(id: usize) -> (bool, usize) { + (id & WRITE_NOT_READ_BIT != 0, id & !WRITE_NOT_READ_BIT) } pub fn pipe(token: &mut CleanLockToken) -> Result<(usize, usize)> { - let pipe = Arc::new(Pipe::new()); - let read_id = open_endpoint(Arc::clone(&pipe), EndpointKind::Read, None, token); - let write_id = open_endpoint(pipe, EndpointKind::Write, None, token); + // Bit 0 is used for WRITE_NOT_READ_BIT + let id = PIPE_NEXT_ID.fetch_add(2, Ordering::Relaxed); - Ok((read_id, write_id)) -} + PIPES.write(token.token()).insert( + id, + Handle::Pipe(Arc::new(Pipe { + queue: Mutex::new(VecDeque::new()), + read_condition: WaitCondition::new(), + write_condition: WaitCondition::new(), + writer_is_alive: AtomicBool::new(true), + reader_is_alive: AtomicBool::new(true), + has_run_dup: AtomicBool::new(false), + fd_queue: Mutex::new(VecDeque::new()), + })), + ); -pub fn named_pipe_exists(path: &str, token: &mut CleanLockToken) -> bool { - NAMED_PIPES.read(token.token()).contains_key(path) -} - -pub fn create_named_pipe( - path: &str, - display_path: &str, - mode: u16, - flags: usize, - token: &mut CleanLockToken, -) -> Result { - validate_named_fifo_open(flags)?; - - let named = { - let mut named_pipes = NAMED_PIPES.write(token.token()); - if named_pipes.contains_key(path) { - return Err(Error::new(EEXIST)); - } - - let named = Arc::new(NamedPipe { - path: display_path.to_string(), - mode, - active: Mutex::new(None), - }); - named_pipes.insert(path.to_string(), Arc::clone(&named)); - named - }; - - let kind = endpoint_kind_from_flags(flags)?; - let pipe = Arc::new(Pipe::new()); - *named.active.lock(token.token()) = Some(Arc::clone(&pipe)); - - Ok(open_endpoint(pipe, kind, Some(named), token)) -} - -pub fn open_named_pipe(path: &str, flags: usize, token: &mut CleanLockToken) -> Result> { - validate_named_fifo_open(flags)?; - - let named = match NAMED_PIPES.read(token.token()).get(path) { - Some(named) => Arc::clone(named), - None => return Ok(None), - }; - - let kind = endpoint_kind_from_flags(flags)?; - let pipe = { - let mut active = named.active.lock(token.token()); - match active.as_ref() { - Some(pipe) => Arc::clone(pipe), - None => { - let pipe = Arc::new(Pipe::new()); - *active = Some(Arc::clone(&pipe)); - pipe - } - } - }; - - Ok(Some(open_endpoint(pipe, kind, Some(named), token))) -} - -pub fn unlink_named_pipe(path: &str, token: &mut CleanLockToken) -> bool { - NAMED_PIPES.write(token.token()).remove(path).is_some() + Ok((id, id | WRITE_NOT_READ_BIT)) } pub struct PipeScheme; impl PipeScheme { - fn get_endpoint(id: usize, token: &mut CleanLockToken) -> Result { - HANDLES + fn get_pipe(key: usize, token: &mut CleanLockToken) -> Result> { + PIPES .read(token.token()) - .get(&id) + .get(&key) .and_then(|handle| match handle { - Handle::Endpoint(endpoint) => Some(endpoint.clone()), - Handle::SchemeRoot => None, + Handle::Pipe(pipe) => Some(Arc::clone(pipe)), + _ => None, }) .ok_or(Error::new(EBADF)) } @@ -248,33 +82,32 @@ impl PipeScheme { impl KernelScheme for PipeScheme { fn scheme_root(&self, token: &mut CleanLockToken) -> Result { - let id = next_id(); - HANDLES.write(token.token()).insert(id, Handle::SchemeRoot); + let id = PIPE_NEXT_ID.fetch_add(2, Ordering::Relaxed); + PIPES.write(token.token()).insert(id, Handle::SchemeRoot); Ok(id) } - fn fevent( &self, id: usize, flags: EventFlags, token: &mut CleanLockToken, ) -> Result { - let endpoint = Self::get_endpoint(id, token)?; + let (is_writer_not_reader, key) = from_raw_id(id); + let pipe = Self::get_pipe(key, token)?; let mut ready = EventFlags::empty(); - if endpoint.kind.can_write() + if is_writer_not_reader && flags.contains(EVENT_WRITE) - && (endpoint.pipe.queue.lock(token.token()).len() <= MAX_QUEUE_SIZE - || endpoint.pipe.reader_count.load(Ordering::Acquire) == 0) + && (pipe.queue.lock(token.token()).len() <= MAX_QUEUE_SIZE + || !pipe.reader_is_alive.load(Ordering::Acquire)) { ready |= EventFlags::EVENT_WRITE; } - - if endpoint.kind.can_read() + if !is_writer_not_reader && flags.contains(EVENT_READ) - && (!endpoint.pipe.queue.lock(token.token()).is_empty() - || endpoint.pipe.writer_count.load(Ordering::Acquire) == 0) + && (!pipe.queue.lock(token.token()).is_empty() + || !pipe.writer_is_alive.load(Ordering::Acquire)) { ready |= EventFlags::EVENT_READ; } @@ -283,48 +116,46 @@ impl KernelScheme for PipeScheme { } fn close(&self, id: usize, token: &mut CleanLockToken) -> Result<()> { - let handle = HANDLES - .write(token.token()) - .remove(&id) - .ok_or(Error::new(EBADF))?; + let (is_write_not_read, key) = from_raw_id(id); - let Handle::Endpoint(endpoint) = handle else { - return Ok(()); + let pipe = Self::get_pipe(key, token)?; + let scheme_id = GlobalSchemes::Pipe.scheme_id(); + + let can_remove = if is_write_not_read { + pipe.writer_is_alive.store(false, Ordering::SeqCst); + event::trigger(scheme_id, key, EVENT_READ, token); + pipe.read_condition.notify(token); + + !pipe.reader_is_alive.load(Ordering::SeqCst) + } else { + pipe.reader_is_alive.store(false, Ordering::SeqCst); + event::trigger(scheme_id, key | WRITE_NOT_READ_BIT, EVENT_WRITE, token); + pipe.write_condition.notify(token); + + !pipe.writer_is_alive.load(Ordering::SeqCst) }; - let mut last_reader = false; - let mut last_writer = false; - - if endpoint.kind.can_read() { - last_reader = endpoint.pipe.reader_count.fetch_sub(1, Ordering::SeqCst) == 1; - } - if endpoint.kind.can_write() { - last_writer = endpoint.pipe.writer_count.fetch_sub(1, Ordering::SeqCst) == 1; - } - - if last_writer { - trigger_matching(&endpoint.pipe, true, false, EVENT_READ, token); - endpoint.pipe.read_condition.notify(token); - } - if last_reader { - trigger_matching(&endpoint.pipe, false, true, EVENT_WRITE, token); - endpoint.pipe.write_condition.notify(token); - } - - let no_readers = endpoint.pipe.reader_count.load(Ordering::SeqCst) == 0; - let no_writers = endpoint.pipe.writer_count.load(Ordering::SeqCst) == 0; - if no_readers && no_writers { - if let Some(named) = endpoint.named { - let mut active = named.active.lock(token.token()); - if active - .as_ref() - .is_some_and(|active_pipe| Arc::ptr_eq(active_pipe, &endpoint.pipe)) + if can_remove { + let handle = PIPES.write(token.token()).remove(&key); + if let Some(Handle::Pipe(pipe)) = handle + && let Some(pipe) = Arc::into_inner(pipe) + { { - *active = None; + pipe.read_condition.into_drop(token); + } + { + pipe.write_condition.into_drop(token); } } + } - drop_wait_conditions_if_possible(endpoint.pipe, token); + if let Some(pipe) = Arc::into_inner(pipe) { + { + pipe.read_condition.into_drop(token); + } + { + pipe.write_condition.into_drop(token); + } } Ok(()) @@ -337,9 +168,9 @@ impl KernelScheme for PipeScheme { _ctx: CallerCtx, token: &mut CleanLockToken, ) -> Result { - let endpoint = Self::get_endpoint(old_id, token)?; + let (is_writer_not_reader, key) = from_raw_id(old_id); - if !endpoint.kind.can_read() { + if is_writer_not_reader { return Err(Error::new(EBADF)); } @@ -349,17 +180,17 @@ impl KernelScheme for PipeScheme { return Err(Error::new(EINVAL)); } + let pipe = Self::get_pipe(key, token)?; + + if pipe.has_run_dup.swap(true, Ordering::SeqCst) { + return Err(Error::new(EBADF)); + } + Ok(OpenResult::SchemeLocal( - open_endpoint( - Arc::clone(&endpoint.pipe), - EndpointKind::Write, - endpoint.named, - token, - ), + key | WRITE_NOT_READ_BIT, InternalFlags::empty(), )) } - fn kopenat( &self, id: usize, @@ -369,47 +200,40 @@ impl KernelScheme for PipeScheme { _ctx: CallerCtx, token: &mut CleanLockToken, ) -> Result { - let is_scheme_root = { - let handles = HANDLES.read(token.token()); - match handles.get(&id) { - Some(Handle::SchemeRoot) => true, - Some(Handle::Endpoint(_)) => false, - None => return Err(Error::new(EBADF)), - } - }; + let (_, key) = from_raw_id(id); - if is_scheme_root { - let path = user_buf.as_str().or(Err(Error::new(EINVAL)))?; - if !path.trim_start_matches('/').is_empty() { - return Err(Error::new(ENOENT)); + { + let guard = PIPES.read(token.token()); + if let Some(Handle::SchemeRoot) = guard.get(&key) { + } else if let Some(Handle::Pipe(pipe_arc)) = guard.get(&key) { + let pipe = Arc::clone(pipe_arc); + drop(guard); + + if user_buf.as_bytes() == b"write" { + return Err(Error::new(EINVAL)); + } + + if pipe.has_run_dup.swap(true, Ordering::SeqCst) { + return Err(Error::new(EBADF)); } - let pipe = Arc::new(Pipe::new()); return Ok(OpenResult::SchemeLocal( - open_endpoint(pipe, EndpointKind::Read, None, token), + key | WRITE_NOT_READ_BIT, InternalFlags::empty(), )); + } else { + return Err(Error::new(EBADF)); + } } - let endpoint = Self::get_endpoint(id, token)?; - if !endpoint.kind.can_read() { - return Err(Error::new(EBADF)); + let path = user_buf.as_str().or(Err(Error::new(EINVAL)))?; + if !path.trim_start_matches('/').is_empty() { + return Err(Error::new(ENOENT)); } - let path = user_buf.as_bytes(); - if !path.is_empty() && path != b"write" { - return Err(Error::new(EINVAL)); - } + let (read_id, _) = pipe(token)?; - Ok(OpenResult::SchemeLocal( - open_endpoint( - Arc::clone(&endpoint.pipe), - EndpointKind::Write, - endpoint.named, - token, - ), - InternalFlags::empty(), - )) + Ok(OpenResult::SchemeLocal(read_id, InternalFlags::empty())) } fn kread( @@ -420,15 +244,16 @@ impl KernelScheme for PipeScheme { _stored_flags: u32, token: &mut CleanLockToken, ) -> Result { - let endpoint = Self::get_endpoint(id, token)?; + let (is_write_not_read, key) = from_raw_id(id); - if !endpoint.kind.can_read() { + if is_write_not_read { return Err(Error::new(EBADF)); } + let pipe = Self::get_pipe(key, token)?; loop { - let vec = endpoint.pipe.queue.lock(token.token()); - let (mut vec, mut lock_token) = vec.into_split(); + let vec = pipe.queue.lock(token.token()); + let (mut vec, mut token) = vec.into_split(); let (s1, s2) = vec.as_slices(); let s1_count = core::cmp::min(user_buf.len(), s1.len()); @@ -448,34 +273,28 @@ impl KernelScheme for PipeScheme { let _ = vec.drain(..bytes_read); if bytes_read > 0 { - drop(vec); - drop(lock_token); - trigger_matching(&endpoint.pipe, false, true, EVENT_WRITE, token); - endpoint.pipe.write_condition.notify(token); + event::trigger_locked( + GlobalSchemes::Pipe.scheme_id(), + key | WRITE_NOT_READ_BIT, + EVENT_WRITE, + token.token(), + ); + pipe.write_condition.notify_locked(token.token()); return Ok(bytes_read); - } - - if user_buf.is_empty() { + } else if user_buf.is_empty() { return Ok(0); } - if endpoint.pipe.writer_count.load(Ordering::SeqCst) == 0 { + if !pipe.writer_is_alive.load(Ordering::SeqCst) { return Ok(0); - } - if fcntl_flags & O_NONBLOCK as u32 != 0 { + } else if fcntl_flags & O_NONBLOCK as u32 != 0 { return Err(Error::new(EAGAIN)); - } - if !endpoint - .pipe - .read_condition - .wait(vec, "PipeRead::read", &mut lock_token) - { + } else if !pipe.read_condition.wait(vec, "PipeRead::read", &mut token) { return Err(Error::new(EINTR)); } } } - fn kwrite( &self, id: usize, @@ -484,17 +303,18 @@ impl KernelScheme for PipeScheme { _stored_flags: u32, token: &mut CleanLockToken, ) -> Result { - let endpoint = Self::get_endpoint(id, token)?; + let (is_write_not_read, key) = from_raw_id(id); - if !endpoint.kind.can_write() { + if !is_write_not_read { return Err(Error::new(EBADF)); } + let pipe = Self::get_pipe(key, token)?; loop { - let vec = endpoint.pipe.queue.lock(token.token()); - let (mut vec, mut lock_token) = vec.into_split(); + let vec = pipe.queue.lock(token.token()); + let (mut vec, mut token) = vec.into_split(); - if endpoint.pipe.reader_count.load(Ordering::Relaxed) == 0 { + if !pipe.reader_is_alive.load(Ordering::Relaxed) { return Err(Error::new(EPIPE)); } @@ -509,6 +329,7 @@ impl KernelScheme for PipeScheme { let mut bytes_written = 0; + // TODO: Modify VecDeque so that the unwritten portions can be accessed directly? for (idx, chunk) in src_buf.in_variable_chunks(TMPBUF_SIZE).enumerate() { let chunk_byte_count = match chunk.copy_common_bytes_to_slice(&mut tmp_buf) { Ok(c) => c, @@ -520,52 +341,41 @@ impl KernelScheme for PipeScheme { } if bytes_written > 0 { - drop(vec); - drop(lock_token); - trigger_matching(&endpoint.pipe, true, false, EVENT_READ, token); - endpoint.pipe.read_condition.notify(token); + event::trigger_locked( + GlobalSchemes::Pipe.scheme_id(), + key, + EVENT_READ, + token.token(), + ); + pipe.read_condition.notify_locked(token.token()); return Ok(bytes_written); - } - - if user_buf.is_empty() { + } else if user_buf.is_empty() { return Ok(0); } if fcntl_flags & O_NONBLOCK as u32 != 0 { return Err(Error::new(EAGAIN)); - } - if !endpoint - .pipe + } else if !pipe .write_condition - .wait(vec, "PipeWrite::write", &mut lock_token) + .wait(vec, "PipeWrite::write", &mut token) { return Err(Error::new(EINTR)); } } } - - fn kfpath(&self, id: usize, buf: UserSliceWo, token: &mut CleanLockToken) -> Result { - let endpoint = Self::get_endpoint(id, token)?; - if let Some(named) = endpoint.named { - buf.copy_common_bytes_from_slice(named.path.as_bytes()) - } else { - buf.copy_common_bytes_from_slice("/scheme/pipe/".as_bytes()) - } + fn kfpath(&self, _id: usize, buf: UserSliceWo, _token: &mut CleanLockToken) -> Result { + //TODO: construct useful path? + buf.copy_common_bytes_from_slice("/scheme/pipe/".as_bytes()) } - - fn kfstat(&self, id: usize, buf: UserSliceWo, token: &mut CleanLockToken) -> Result<()> { - let endpoint = Self::get_endpoint(id, token)?; - let mode = endpoint.named.map_or(0o666, |named| named.mode); - + fn kfstat(&self, _id: usize, buf: UserSliceWo, _token: &mut CleanLockToken) -> Result<()> { buf.copy_exactly(&Stat { - st_mode: MODE_FIFO | mode, + st_mode: MODE_FIFO | 0o666, ..Default::default() })?; Ok(()) } - fn kfdwrite( &self, id: usize, @@ -575,17 +385,23 @@ impl KernelScheme for PipeScheme { _metadata: &[u64], token: &mut CleanLockToken, ) -> Result { - let endpoint = Self::get_endpoint(id, token)?; + let (is_write_not_read, key) = from_raw_id(id); - if !endpoint.kind.can_write() { + if !is_write_not_read { return Err(Error::new(EBADF)); } + let pipe = match Self::get_pipe(key, token) { + Ok(p) => p, + Err(e) => { + return Err(e); + } + }; loop { - let vec = endpoint.pipe.fd_queue.lock(token.token()); - let (mut vec, mut lock_token) = vec.into_split(); + let vec = pipe.fd_queue.lock(token.token()); + let (mut vec, mut token) = vec.into_split(); - if endpoint.pipe.reader_count.load(Ordering::Relaxed) == 0 { + if !pipe.reader_is_alive.load(Ordering::Relaxed) { return Err(Error::new(EPIPE)); } if descs.is_empty() { @@ -605,24 +421,25 @@ impl KernelScheme for PipeScheme { let fds_written = vec.len() - before_len; if fds_written > 0 { - drop(vec); - drop(lock_token); - trigger_matching(&endpoint.pipe, true, false, EVENT_READ, token); - endpoint.pipe.read_condition.notify(token); + event::trigger_locked( + GlobalSchemes::Pipe.scheme_id(), + key, + EVENT_READ, + token.token(), + ); + pipe.read_condition.notify_locked(token.token()); return Ok(fds_written); } - if !endpoint - .pipe + if !pipe .write_condition - .wait(vec, "PipeWrite::write", &mut lock_token) + .wait(vec, "PipeWrite::write", &mut token) { return Err(Error::new(EINTR)); } } } - fn kfdread( &self, id: usize, @@ -631,19 +448,25 @@ impl KernelScheme for PipeScheme { _metadata: &[u64], token: &mut CleanLockToken, ) -> Result { - let endpoint = Self::get_endpoint(id, token)?; + let (is_write_not_read, key) = from_raw_id(id); - if !endpoint.kind.can_read() { + if is_write_not_read { return Err(Error::new(EBADF)); } + let pipe = match Self::get_pipe(key, token) { + Ok(p) => p, + Err(e) => { + return Err(e); + } + }; if payload.is_empty() { return Ok(0); } loop { - let vec = endpoint.pipe.fd_queue.lock(token.token()); - let (mut vec, mut lock_token) = vec.into_split(); + let vec = pipe.fd_queue.lock(token.token()); + let (mut vec, mut token) = vec.into_split(); let fds_available = vec.len(); let max_fds_read = payload.len() / size_of::(); @@ -656,33 +479,31 @@ impl KernelScheme for PipeScheme { fds_to_transfer, payload, flags.contains(CallFlags::FD_CLOEXEC), - &mut lock_token, + &mut token, )?; } else { bulk_add_fds( fds_to_transfer, payload, flags.contains(CallFlags::FD_CLOEXEC), - &mut lock_token, + &mut token, )?; } - drop(vec); - drop(lock_token); - trigger_matching(&endpoint.pipe, false, true, EVENT_WRITE, token); - endpoint.pipe.write_condition.notify(token); + event::trigger_locked( + GlobalSchemes::Pipe.scheme_id(), + key | WRITE_NOT_READ_BIT, + EVENT_WRITE, + token.token(), + ); + pipe.write_condition.notify_locked(token.token()); return Ok(fds_to_read); } - if endpoint.pipe.writer_count.load(Ordering::SeqCst) == 0 { + if !pipe.writer_is_alive.load(Ordering::SeqCst) { return Ok(0); - } - if !endpoint - .pipe - .read_condition - .wait(vec, "PipeRead::read", &mut lock_token) - { + } else if !pipe.read_condition.wait(vec, "PipeRead::read", &mut token) { return Err(Error::new(EINTR)); } } @@ -690,23 +511,11 @@ impl KernelScheme for PipeScheme { } pub struct Pipe { - read_condition: WaitCondition, - write_condition: WaitCondition, + read_condition: WaitCondition, // signals whether there are available bytes to read + write_condition: WaitCondition, // signals whether there is room for additional bytes queue: Mutex>, - reader_count: AtomicUsize, - writer_count: AtomicUsize, + reader_is_alive: AtomicBool, // starts set, unset when reader closes + writer_is_alive: AtomicBool, // starts set, unset when writer closes + has_run_dup: AtomicBool, fd_queue: Mutex>>, } - -impl Pipe { - fn new() -> Self { - Self { - read_condition: WaitCondition::new(), - write_condition: WaitCondition::new(), - queue: Mutex::new(VecDeque::new()), - reader_count: AtomicUsize::new(0), - writer_count: AtomicUsize::new(0), - fd_queue: Mutex::new(VecDeque::new()), - } - } -} diff --git a/recipes/core/kernel/source/src/scheme/proc.rs b/recipes/core/kernel/source/src/scheme/proc.rs index a9de02ea1a..47588e10d2 100644 --- a/recipes/core/kernel/source/src/scheme/proc.rs +++ b/recipes/core/kernel/source/src/scheme/proc.rs @@ -105,7 +105,6 @@ enum ContextHandle { // Attr handles, to set ens/euid/egid/pid. Authority, Attr, - Groups, Status { privileged: bool, @@ -262,7 +261,6 @@ impl ProcScheme { let handle = match actual_name { "attrs" => ContextHandle::Attr, "status" => ContextHandle::Status { privileged: true }, - "groups" => ContextHandle::Groups, _ => return Err(Error::new(ENOENT)), }; @@ -308,11 +306,6 @@ impl ProcScheme { let id = NonZeroUsize::new(NEXT_ID.fetch_add(1, Ordering::Relaxed)) .ok_or(Error::new(EMFILE))?; let context = context::spawn(true, Some(id), ret, token)?; - { - let parent_groups = - context::current().read(token.token()).groups.clone(); - context.write(token.token()).groups = parent_groups; - } HANDLES.write(token.token()).insert( id.get(), Handle { @@ -432,7 +425,6 @@ impl KernelScheme for ProcScheme { } fn close(&self, id: usize, token: &mut CleanLockToken) -> Result<()> { - let mut inner_token = unsafe { CleanLockToken::new() }; let handle = HANDLES .write(token.token()) .remove(&id) @@ -460,7 +452,9 @@ impl KernelScheme for ProcScheme { ))] regs.set_arg1(arg1); - Ok(context.set_addr_space(Some(new), inner_token.downgrade())) + // TODO: Lock ordering violation + let mut token = unsafe { CleanLockToken::new() }; + Ok(context.set_addr_space(Some(new), token.downgrade())) })?; if let Some(old_ctx) = old_ctx && let Some(addrspace) = Arc::into_inner(old_ctx) @@ -499,7 +493,6 @@ impl KernelScheme for ProcScheme { consume: bool, token: &mut CleanLockToken, ) -> Result { - let mut inner_token = unsafe { CleanLockToken::new() }; let handle = HANDLES .read(token.token()) .get(&id) @@ -590,7 +583,9 @@ impl KernelScheme for ProcScheme { }; // TODO: Allocated or AllocatedShared? let addrsp = AddrSpace::current()?; - let page = addrsp.acquire_write(inner_token.downgrade()).mmap_anywhere( + // TODO: Lock ordering violation + let mut token = unsafe { CleanLockToken::new() }; + let page = addrsp.acquire_write(token.downgrade()).mmap_anywhere( &addrsp, NonZeroUsize::new(1).unwrap(), MapFlags::PROT_READ | MapFlags::PROT_WRITE, @@ -854,17 +849,17 @@ impl KernelScheme for ProcScheme { } } fn extract_scheme_number(fd: usize, token: &mut CleanLockToken) -> Result<(KernelSchemes, usize)> { - let desc = { + let (scheme_id, number) = { let current_lock = context::current(); let mut current = current_lock.read(token.token()); - let (context, mut context_token) = current.token_split(); + let (context, mut token) = current.token_split(); let file_descriptor = context - .get_file(FileHandle::from(fd), &mut context_token) + .get_file(FileHandle::from(fd), &mut token) .ok_or(Error::new(EBADF))?; - *file_descriptor.description.read(context_token.token()) + let desc = file_descriptor.description.read(token.token()); + (desc.scheme, desc.number) }; - let scheme = desc.get_scheme(token)?; - let number = desc.number; + let scheme = scheme::get_scheme(token.token(), scheme_id)?; Ok((scheme, number)) } @@ -1276,39 +1271,6 @@ impl ContextHandle { guard.prio = (info.prio as usize).min(39); Ok(size_of::()) } - Self::Groups => { - const NGROUPS_MAX: usize = 65536; - if buf.len() % size_of::() != 0 { - return Err(Error::new(EINVAL)); - } - let count = buf.len() / size_of::(); - if count > NGROUPS_MAX { - return Err(Error::new(EINVAL)); - } - let mut groups = Vec::with_capacity(count); - for chunk in buf.in_exact_chunks(size_of::()).take(count) { - groups.push(chunk.read_u32()?); - } - let proc_id = { - let guard = context.read(token.token()); - guard.owner_proc_id - }; - { - let mut guard = context.write(token.token()); - guard.groups = groups.clone(); - } - if let Some(pid) = proc_id { - let mut contexts = context::contexts(token.downgrade()); - let (contexts, mut t) = contexts.token_split(); - for context_ref in contexts.iter() { - let mut ctx = context_ref.write(t.token()); - if ctx.owner_proc_id == Some(pid) { - ctx.groups = groups.clone(); - } - } - } - Ok(count * size_of::()) - } ContextHandle::OpenViaDup => { let mut args = buf.usizes(); @@ -1513,15 +1475,6 @@ impl ContextHandle { debug_name, }) } - Self::Groups => { - let c = &context.read(token.token()); - let max = buf.len() / size_of::(); - let count = c.groups.len().min(max); - for (chunk, gid) in buf.in_exact_chunks(size_of::()).zip(&c.groups).take(count) { - chunk.copy_from_slice(&gid.to_ne_bytes())?; - } - Ok(count * size_of::()) - } ContextHandle::Sighandler => { let data = match context.read(token.token()).sig { Some(ref sig) => SetSighandlerData { diff --git a/recipes/core/kernel/source/src/scheme/user.rs b/recipes/core/kernel/source/src/scheme/user.rs index dfbf66b1b1..b9013021e6 100644 --- a/recipes/core/kernel/source/src/scheme/user.rs +++ b/recipes/core/kernel/source/src/scheme/user.rs @@ -80,7 +80,6 @@ const ONE: NonZeroUsize = match NonZeroUsize::new(1) { Some(one) => one, None => unreachable!(), }; -const MAX_SPURIOUS_WAKEUPS: usize = 100; enum ParsedCqe { TriggerFevent { @@ -210,8 +209,6 @@ impl UserInner { caller_responsible: &mut PageSpan, token: &mut CleanLockToken, ) -> Result { - let mut remaining_spurious_wakeups = MAX_SPURIOUS_WAKEUPS; - { // Disable preemption to avoid context switches between setting the // process state and sending the scheme request. The process is made @@ -264,10 +261,7 @@ impl UserInner { }; let states = self.states.lock(token.token()); - let (mut states, mut state_token) = states.into_split(); - let mut timed_out_descriptions = None; - let mut remove_state = false; - let mut timed_out = false; + let (mut states, mut token) = states.into_split(); match states.get_mut(sqe.tag as usize) { // invalid state None => return Err(Error::new(EBADFD)), @@ -280,35 +274,24 @@ impl UserInner { fds, } => { let maybe_eintr = - eintr_if_sigkill(&mut callee_responsible, &mut state_token.token()); - - if maybe_eintr.is_ok() { - remaining_spurious_wakeups = - remaining_spurious_wakeups.saturating_sub(1); - } - - if maybe_eintr.is_ok() && remaining_spurious_wakeups == 0 { - timed_out_descriptions = Some(Self::collect_descriptions_to_close(fds)); - remove_state = true; - } else { - *o = State::Waiting { - canceling: true, - callee_responsible, - context, - fds, - }; - } + eintr_if_sigkill(&mut callee_responsible, &mut token.token()); + *o = State::Waiting { + canceling: true, + callee_responsible, + context, + fds, + }; maybe_eintr?; - if remove_state { - states.remove(sqe.tag as usize); - timed_out = true; - } else { - context::current() - .write(state_token.token()) - .block("UserInner::call (woken up after cancelation request)"); - } + context::current() + .write(token.token()) + .block("UserInner::call (woken up after cancelation request)"); + + // We do not want to drop the lock before blocking + // as if we get preempted in between we might miss a + // wakeup. + drop(states); } // spurious wakeup State::Waiting { @@ -317,76 +300,60 @@ impl UserInner { context, mut callee_responsible, } => { + let maybe_eintr = eintr_if_sigkill(&mut callee_responsible, &mut token); let current_context = context::current(); - let maybe_eintr = - eintr_if_sigkill(&mut callee_responsible, &mut state_token); - if maybe_eintr.is_ok() { - remaining_spurious_wakeups = - remaining_spurious_wakeups.saturating_sub(1); - } - - if maybe_eintr.is_ok() && remaining_spurious_wakeups == 0 { - timed_out_descriptions = Some(Self::collect_descriptions_to_close(fds)); - remove_state = true; - } else { - *o = State::Waiting { - // Currently we treat all spurious wakeups to have the same behavior - // as signals (i.e., we send a cancellation request). It is not something - // that should happen, but it certainly can happen, for example if a context - // is awoken through its thread handle without setting any sig bits, or if the - // caller clears its own sig bits. If it actually is a signal, then it is the - // intended behavior. - canceling: true, - fds, - context, - callee_responsible, - }; - } + *o = State::Waiting { + // Currently we treat all spurious wakeups to have the same behavior + // as signals (i.e., we send a cancellation request). It is not something + // that should happen, but it certainly can happen, for example if a context + // is awoken through its thread handle without setting any sig bits, or if the + // caller clears its own sig bits. If it actually is a signal, then it is the + // intended behavior. + canceling: true, + fds, + context, + callee_responsible, + }; maybe_eintr?; - if remove_state { - states.remove(sqe.tag as usize); - timed_out = true; - } else { - // We do not want to preempt between sending the - // cancellation and blocking again where we might - // miss a wakeup. - let mut preempt = - PreemptGuardL1::new(¤t_context, &mut state_token); - let token = preempt.token(); + // We do not want to preempt between sending the + // cancellation and blocking again where we might + // miss a wakeup. + let mut preempt = PreemptGuardL1::new(¤t_context, &mut token); + let token = preempt.token(); - self.todo.send_locked( - Sqe { - opcode: Opcode::Cancel as u8, - sqe_flags: SqeFlags::ONEWAY, - tag: sqe.tag, - ..Default::default() - }, - token.token(), - ); - event::trigger_locked( - self.root_id, - self.scheme_id.get(), - EVENT_READ, - token.token(), - ); + self.todo.send_locked( + Sqe { + opcode: Opcode::Cancel as u8, + sqe_flags: SqeFlags::ONEWAY, + tag: sqe.tag, + ..Default::default() + }, + token.token(), + ); + event::trigger_locked( + self.root_id, + self.scheme_id.get(), + EVENT_READ, + token.token(), + ); - // 1. If cancellation was requested and arrived - // before the scheme processed the request, an - // acknowledgement will be sent back after the - // cancellation is processed and we will be woken up - // again. State will be State::Responded then. - // - // 2. If cancellation was requested but the scheme - // already processed the request, we will receive - // the actual response next and woken up again. - // State will be State::Responded then. - context::current() - .write(token.token()) - .block("UserInner::call (spurious wakeup)"); - } + // 1. If cancellation was requested and arrived + // before the scheme processed the request, an + // acknowledgement will be sent back after the + // cancellation is processed and we will be woken up + // again. State will be State::Responded then. + // + // 2. If cancellation was requested but the scheme + // already processed the request, we will receive + // the actual response next and woken up again. + // State will be State::Responded then. + context::current() + .write(token.token()) + .block("UserInner::call (spurious wakeup)"); + drop(states); } // invalid state @@ -401,70 +368,10 @@ impl UserInner { } }, } - - if let Some(descriptions) = timed_out_descriptions { - drop(states); - for desc in descriptions { - let _ = desc.try_close(token); - } - } - - if timed_out { - return Err(Error::new(ETIMEDOUT)); - } } } } - fn collect_descriptions_to_close( - fds: Vec>, - ) -> Vec { - fds.into_iter() - .filter_map(|fd| Arc::try_unwrap(fd).ok()) - .map(RwLock::into_inner) - .collect() - } - - pub fn fail_pending_calls(&self, token: &mut CleanLockToken) { - let descriptions_to_close = { - let mut states_lock = self.states.lock(token.token()); - let (states, mut lock_token) = states_lock.token_split(); - let mut descriptions_to_close = Vec::new(); - let mut states_to_remove = Vec::new(); - - for (id, state) in states.iter_mut() { - match mem::replace(state, State::Placeholder) { - State::Waiting { context, fds, .. } => { - descriptions_to_close.extend(Self::collect_descriptions_to_close(fds)); - - match context.upgrade() { - Some(context) => { - *state = State::Responded(Response::Regular( - Err(Error::new(ENODEV)), - 0, - false, - )); - context.write(lock_token.token()).unblock(); - } - None => states_to_remove.push(id), - } - } - old_state => *state = old_state, - } - } - - for id in states_to_remove { - states.remove(id); - } - - descriptions_to_close - }; - - for desc in descriptions_to_close { - let _ = desc.try_close(token); - } - } - /// Map a readable structure to the scheme's userspace and return the /// pointer #[must_use = "copying back to head/tail buffers can fail"] @@ -1376,7 +1283,6 @@ impl UserInner { } pub fn into_drop(self, token: &mut CleanLockToken) { - self.fail_pending_calls(token); self.todo.condition.into_drop(token); } } diff --git a/recipes/core/kernel/source/src/startup/memory.rs b/recipes/core/kernel/source/src/startup/memory.rs index 9fb5fb10d9..26922dde0a 100644 --- a/recipes/core/kernel/source/src/startup/memory.rs +++ b/recipes/core/kernel/source/src/startup/memory.rs @@ -74,16 +74,14 @@ impl MemoryEntry { } struct MemoryMap { - entries: [MemoryEntry; 1024], + entries: [MemoryEntry; 512], size: usize, } impl MemoryMap { fn register(&mut self, base: usize, size: usize, kind: BootloaderMemoryKind) { if self.size >= self.entries.len() { - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - unsafe { core::arch::asm!("out dx, al", in("dx") 0x3F8u16, in("al") b'!', options(nostack, preserves_flags)); } - panic!("Early memory map overflow at entry {} (max {})", self.size, self.entries.len()); + panic!("Early memory map overflow!"); } let start = if kind == BootloaderMemoryKind::Free { align_up(base) @@ -136,7 +134,7 @@ static MEMORY_MAP: SyncUnsafeCell = SyncUnsafeCell::new(MemoryMap { start: 0, end: 0, kind: BootloaderMemoryKind::Null, - }; 1024], + }; 512], size: 0, }); @@ -325,16 +323,7 @@ unsafe fn map_memory(areas: &[MemoryArea], mut bump_allocator: &mut Bum } } - let kernel_area = match (*MEMORY_MAP.get()).kernel() { - Some(area) => area, - None => { - println!("FATAL: kernel memory area not found in boot memory map"); - println!("Cannot determine kernel base address. Halting."); - loop { - core::hint::spin_loop(); - } - } - }; + let kernel_area = (*MEMORY_MAP.get()).kernel().unwrap(); let kernel_base = kernel_area.start; let kernel_size = kernel_area.end.saturating_sub(kernel_area.start); // Map kernel at KERNEL_OFFSET diff --git a/recipes/core/kernel/source/src/startup/mod.rs b/recipes/core/kernel/source/src/startup/mod.rs index 86aabc227a..8ad3cdf7f8 100644 --- a/recipes/core/kernel/source/src/startup/mod.rs +++ b/recipes/core/kernel/source/src/startup/mod.rs @@ -149,15 +149,6 @@ static BOOTSTRAP: spin::Once = spin::Once::new(); pub(crate) static AP_READY: AtomicBool = AtomicBool::new(false); static BSP_READY: AtomicBool = AtomicBool::new(false); -#[cold] -fn halt_boot(message: &str) -> ! { - print!("{message}"); - println!("Kernel boot cannot continue. Halting."); - loop { - hint::spin_loop(); - } -} - /// This is the kernel entry point for the primary CPU. The arch crate is responsible for calling this pub(crate) fn kmain(bootstrap: Bootstrap) -> ! { let mut token = unsafe { CleanLockToken::new() }; @@ -189,7 +180,9 @@ pub(crate) fn kmain(bootstrap: Bootstrap) -> ! { context.euid = 0; context.egid = 0; } - Err(_err) => halt_boot("FATAL: failed to spawn first userspace process userspace_init\n"), + Err(err) => { + panic!("failed to spawn userspace_init: {:?}", err); + } } run_userspace(&mut token) diff --git a/recipes/core/kernel/source/src/sync/mcs.rs b/recipes/core/kernel/source/src/sync/mcs.rs deleted file mode 100644 index 3ccde13862..0000000000 --- a/recipes/core/kernel/source/src/sync/mcs.rs +++ /dev/null @@ -1,188 +0,0 @@ -//! MCS (Mellor-Crummey Scott) fair spinlock. -//! -//! Each waiter spins on its own local `locked` flag instead of a shared lock -//! word, eliminating cache-line bouncing under contention. FIFO ordering -//! guarantees fairness. O(1) cache-line transfers on unlock. -//! -//! Supports transitive priority inheritance: when CPU A waits on a lock held -//! by CPU B, and CPU B waits on a lock held by CPU C, A's priority is -//! propagated through the chain to C (up to MAX_PI_CHAIN_DEPTH hops). - -use core::sync::atomic::{AtomicBool, AtomicPtr, AtomicU32, Ordering}; -use core::{hint, ptr}; - -use crate::percpu::PercpuBlock; - -/// Maximum depth for transitive priority inheritance chain following. -/// Prevents infinite loops from theoretical lock cycles and bounds latency. -/// Linux uses 20; 8 is conservative for a microkernel with fewer nesting levels. -const MAX_PI_CHAIN_DEPTH: u32 = 8; - -/// A node in the MCS lock queue. -pub struct McsNode { - pub next: AtomicPtr, - pub locked: AtomicBool, -} - -impl McsNode { - pub const fn new() -> Self { - Self { - next: AtomicPtr::new(ptr::null_mut()), - locked: AtomicBool::new(false), - } - } -} - -/// Raw MCS spinlock primitive. -pub struct McsRawLock { - tail: AtomicPtr, - /// CPU ID of the current lock holder (for priority inheritance). - /// `u32::MAX` means no holder. - holder_cpu: AtomicU32, -} - -impl McsRawLock { - pub const fn new() -> Self { - Self { - tail: AtomicPtr::new(ptr::null_mut()), - holder_cpu: AtomicU32::new(u32::MAX), - } - } - - #[inline] - pub fn acquire(&self, node: &McsNode) -> bool { - node.next.store(ptr::null_mut(), Ordering::Relaxed); - node.locked.store(true, Ordering::Relaxed); - let prev = self.tail.swap((node as *const McsNode).cast_mut(), Ordering::AcqRel); - if prev.is_null() { - // Uncontended — record ourselves as holder - let cpu_id = PercpuBlock::current().cpu_id.get(); - self.holder_cpu.store(cpu_id, Ordering::Release); - return false; - } - unsafe { - (*prev).next.store((node as *const McsNode).cast_mut(), Ordering::Release); - } - let percpu = PercpuBlock::current(); - // Record which lock we're spinning on (for transitive PI chain following) - percpu.waiting_on_lock.store( - (self as *const McsRawLock).cast_mut(), - Ordering::Release, - ); - let mut donated = false; - while node.locked.load(Ordering::Acquire) { - percpu.maybe_handle_tlb_shootdown(); - // Donate priority to the lock holder (transitively) once per acquisition - if !donated { - self.maybe_donate_priority(percpu); - donated = true; - } - hint::spin_loop(); - } - // Clear waiting_on_lock before proceeding — we now hold the lock - percpu.waiting_on_lock.store(ptr::null_mut(), Ordering::Release); - self.holder_cpu.store(percpu.cpu_id.get(), Ordering::Release); - true - } - - #[inline] - pub fn release(&self, node: &McsNode) { - // Clear priority inheritance donation — we no longer hold the lock - PercpuBlock::current().pi_donated_prio.store(u32::MAX, Ordering::Release); - // Clear holder CPU - self.holder_cpu.store(u32::MAX, Ordering::Release); - - let next = node.next.load(Ordering::Acquire); - if next.is_null() { - if self - .tail - .compare_exchange( - (node as *const McsNode).cast_mut(), - ptr::null_mut(), - Ordering::AcqRel, - Ordering::Acquire, - ) - .is_ok() - { - return; - } - while node.next.load(Ordering::Acquire).is_null() { - hint::spin_loop(); - } - } - unsafe { - (*node.next.load(Ordering::Acquire)).locked.store(false, Ordering::Release); - } - } - - #[inline] - pub fn try_acquire(&self, node: &McsNode) -> bool { - node.next.store(ptr::null_mut(), Ordering::Relaxed); - node.locked.store(true, Ordering::Relaxed); - let ok = self - .tail - .compare_exchange( - ptr::null_mut(), - (node as *const McsNode).cast_mut(), - Ordering::AcqRel, - Ordering::Acquire, - ) - .is_ok(); - if ok { - let cpu_id = PercpuBlock::current().cpu_id.get(); - self.holder_cpu.store(cpu_id, Ordering::Release); - } - ok - } - - /// Donate current CPU's context priority to the lock holder's CPU, - /// following the PI chain transitively (A→B→C). - /// - /// Reads priority from PercpuBlock::current_prio (cached by the scheduler) - /// to avoid acquiring any lock in the MCS spin loop. - /// - /// Chain following: if the holder is itself waiting on another lock, - /// we propagate our priority to that lock's holder too, up to - /// MAX_PI_CHAIN_DEPTH hops. - fn maybe_donate_priority(&self, my_percpu: &PercpuBlock) { - let my_prio = my_percpu.current_prio.get() as u32; - let mut current_holder_cpu = self.holder_cpu.load(Ordering::Relaxed); - - for _ in 0..MAX_PI_CHAIN_DEPTH { - if current_holder_cpu == u32::MAX { - return; - } - let holder_percpu = crate::percpu::get_for_cpu( - crate::cpu_set::LogicalCpuId::new(current_holder_cpu), - ); - let Some(holder) = holder_percpu else { - return; - }; - - // Donate if our priority is higher (lower number) than current donation - let current_donated = holder.pi_donated_prio.load(Ordering::Relaxed); - if my_prio < current_donated { - holder.pi_donated_prio.store(my_prio, Ordering::Release); - } - - // Follow the chain: is this holder also waiting on another lock? - let next_lock_ptr = holder.waiting_on_lock.load(Ordering::Relaxed); - if next_lock_ptr.is_null() { - return; - } - // SAFETY: The pointed-to McsRawLock is a long-lived struct field - // (e.g., part of the run queue). The holder is currently spinning - // in acquire(), so the pointer is valid. We only read holder_cpu - // (an atomic u32) — no mutable access needed. - let next_holder_cpu = - unsafe { (*next_lock_ptr).holder_cpu.load(Ordering::Relaxed) }; - - // Cycle detection: if the next holder is the same CPU we just visited, stop - if next_holder_cpu == current_holder_cpu { - return; - } - current_holder_cpu = next_holder_cpu; - } - // Chain depth exhausted — stop to bound latency - } -} diff --git a/recipes/core/kernel/source/src/sync/mod.rs b/recipes/core/kernel/source/src/sync/mod.rs index 7655a8d9c0..6ad2708ba4 100644 --- a/recipes/core/kernel/source/src/sync/mod.rs +++ b/recipes/core/kernel/source/src/sync/mod.rs @@ -1,6 +1,5 @@ pub use self::{ordered::*, wait_condition::WaitCondition, wait_queue::WaitQueue}; -pub mod mcs; pub mod ordered; pub mod wait_condition; pub mod wait_queue; diff --git a/recipes/core/kernel/source/src/sync/ordered.rs b/recipes/core/kernel/source/src/sync/ordered.rs index c6763cb663..91d46158db 100644 --- a/recipes/core/kernel/source/src/sync/ordered.rs +++ b/recipes/core/kernel/source/src/sync/ordered.rs @@ -52,9 +52,7 @@ //! *g1 = 12; //! ``` use alloc::sync::Arc; -use core::cell::UnsafeCell; use core::marker::PhantomData; -use core::ptr; use crate::percpu::PercpuBlock; @@ -734,143 +732,3 @@ impl Drop for ArcRwLockWriteGuard { /// This function can only be called if no lock is held by the calling thread/task #[inline] pub fn check_no_locks(_: LockToken<'_, L0>) {} - -// --------------------------------------------------------------------------- -// MCS-based fair mutex (McsMutex) -// --------------------------------------------------------------------------- - -/// A mutual exclusion lock using the MCS fair spinlock algorithm. -/// -/// Unlike `Mutex` which uses a simple spinlock (no fairness under -/// contention), `McsMutex` uses Mellor-Crummey Scott queue-based spinning: -/// -/// - Each waiter spins on its **own** local flag — no shared cache-line bouncing. -/// - FIFO ordering prevents starvation. -/// - O(1) cache-line transfers on unlock. -/// -/// The MCS node is stored in [`crate::percpu::PercpuBlock::mcs_sched_node`], so -/// this type is suitable for scheduler-internal locks where the holder is always -/// the current CPU. -pub struct McsMutex { - raw: crate::sync::mcs::McsRawLock, - data: UnsafeCell, - _phantom: PhantomData, -} - -unsafe impl Sync for McsMutex {} -unsafe impl Send for McsMutex {} - -impl McsMutex { - pub const fn new(val: T) -> Self { - Self { - raw: crate::sync::mcs::McsRawLock::new(), - data: UnsafeCell::new(val), - _phantom: PhantomData, - } - } -} - -impl McsMutex { - pub fn lock<'a, LP: Lower + 'a>( - &'a self, - lock_token: LockToken<'a, LP>, - ) -> McsMutexGuard<'a, L, T> { - let percpu = PercpuBlock::current(); - let contended = self.raw.acquire(&percpu.mcs_sched_node); - if contended { - percpu - .mcs_contention_count - .set(percpu.mcs_contention_count.get() + 1); - } - McsMutexGuard { - lock: self, - lock_token: LockToken::downgraded(lock_token), - } - } - - pub fn try_lock<'a, LP: Lower + 'a>( - &'a self, - lock_token: LockToken<'a, LP>, - ) -> Option> { - let percpu = PercpuBlock::current(); - if self.raw.try_acquire(&percpu.mcs_sched_node) { - Some(McsMutexGuard { - lock: self, - lock_token: LockToken::downgraded(lock_token), - }) - } else { - None - } - } -} - -pub struct McsMutexGuard<'a, L: Level, T: 'a> { - lock: &'a McsMutex, - lock_token: LockToken<'a, L>, -} - -impl<'a, L: Level, T: 'a> McsMutexGuard<'a, L, T> { - pub fn token_split(&mut self) -> (&mut T, LockToken<'_, L>) { - unsafe { (&mut *self.lock.data.get(), self.lock_token.token()) } - } - - pub fn into_split(self) -> (McsRawGuard<'a, L, T>, LockToken<'a, L>) { - let lock_ref = self.lock; - let token = unsafe { core::ptr::read(&self.lock_token) }; - core::mem::forget(self); - (McsRawGuard { lock: lock_ref }, token) - } - - pub fn from_split(raw: McsRawGuard<'a, L, T>, token: LockToken<'a, L>) -> Self { - let lock_ref = raw.lock; - core::mem::forget(raw); - Self { - lock: lock_ref, - lock_token: token, - } - } -} - -impl core::ops::Deref for McsMutexGuard<'_, L, T> { - type Target = T; - fn deref(&self) -> &Self::Target { - unsafe { &*self.lock.data.get() } - } -} - -impl core::ops::DerefMut for McsMutexGuard<'_, L, T> { - fn deref_mut(&mut self) -> &mut Self::Target { - unsafe { &mut *self.lock.data.get() } - } -} - -impl Drop for McsMutexGuard<'_, L, T> { - fn drop(&mut self) { - let percpu = PercpuBlock::current(); - self.lock.raw.release(&percpu.mcs_sched_node); - } -} - -pub struct McsRawGuard<'a, L: Level, T: 'a> { - lock: &'a McsMutex, -} - -impl core::ops::Deref for McsRawGuard<'_, L, T> { - type Target = T; - fn deref(&self) -> &Self::Target { - unsafe { &*self.lock.data.get() } - } -} - -impl core::ops::DerefMut for McsRawGuard<'_, L, T> { - fn deref_mut(&mut self) -> &mut Self::Target { - unsafe { &mut *self.lock.data.get() } - } -} - -impl Drop for McsRawGuard<'_, L, T> { - fn drop(&mut self) { - let percpu = PercpuBlock::current(); - self.lock.raw.release(&percpu.mcs_sched_node); - } -} diff --git a/recipes/core/kernel/source/src/syscall/fs.rs b/recipes/core/kernel/source/src/syscall/fs.rs index acd3bc2212..bf984641f4 100644 --- a/recipes/core/kernel/source/src/syscall/fs.rs +++ b/recipes/core/kernel/source/src/syscall/fs.rs @@ -2,7 +2,7 @@ use core::num::NonZeroUsize; -use alloc::{format, string::{String, ToString}, sync::Arc, vec::Vec}; +use alloc::{string::String, sync::Arc, vec::Vec}; use redox_path::RedoxPath; use crate::{ @@ -12,9 +12,9 @@ use crate::{ memory::{AddrSpace, GenericFlusher, Grant, PageSpan, TlbShootdownActions}, }, memory::{Page, VirtualAddress, PAGE_SIZE}, - scheme::{self, pipe, FileHandle, KernelScheme, OpenResult, SchemeExt, StrOrBytes}, + scheme::{self, FileHandle, KernelScheme, OpenResult, StrOrBytes}, sync::{CleanLockToken, RwLock}, - syscall::{data::{GlobalSchemes, Stat}, error::*, flag::*}, + syscall::{data::Stat, error::*, flag::*}, }; use super::usercopy::{UserSlice, UserSliceRo, UserSliceRw, UserSliceWo}; @@ -45,7 +45,7 @@ pub fn file_op_generic_ext( (file, desc) }; - let scheme = desc.get_scheme(token)?; + let scheme = scheme::get_scheme(token.token(), desc.scheme)?; op(&*scheme, file.description, desc, token) } @@ -62,32 +62,55 @@ pub fn copy_path_to_buf(raw_path: UserSliceRo, max_len: usize) -> Result // TODO: Define elsewhere const PATH_MAX: usize = PAGE_SIZE; -fn fifo_path_key(scheme_id: scheme::SchemeId, number: usize, path: &str) -> String { - if path.starts_with('/') { - path.to_string() - } else { - format!("@fifo:{}:{}:{}", scheme_id.get(), number, path) - } -} - -fn install_open_result( - scheme_id: scheme::SchemeId, +pub fn openat( + fh: FileHandle, + raw_path: UserSliceRo, flags: usize, - open_result: OpenResult, + fcntl_flags: u32, + euid: u32, + egid: u32, token: &mut CleanLockToken, ) -> Result { - let new_description = match open_result { - OpenResult::SchemeLocal(number, internal_flags) => Arc::new(RwLock::new( - FileDescription::new( - scheme_id, - number, - 0, - (flags & !O_CLOEXEC) as u32, - internal_flags, - token, - ), - )), - OpenResult::External(desc) => desc, + let path_buf = copy_path_to_buf(raw_path, PATH_MAX)?; + + let (scheme_id, number) = { + let current_lock = context::current(); + let mut current = current_lock.read(token.token()); + let (context, mut token) = current.token_split(); + let pipe = context.get_file(fh, &mut token).ok_or(Error::new(EBADF))?; + let desc = pipe.description.read(token.token()); + (desc.scheme, desc.number) + }; + + let caller_ctx = context::current() + .read(token.token()) + .caller_ctx() + .filter_uid_gid(euid, egid); + + let new_description = { + let scheme = scheme::get_scheme(token.token(), scheme_id)?; + + let res = scheme.kopenat( + number, + StrOrBytes::from_str(&path_buf), + flags, + fcntl_flags, + caller_ctx, + token, + ); + + match res? { + OpenResult::SchemeLocal(number, internal_flags) => { + Arc::new(RwLock::new(FileDescription { + offset: 0, + internal_flags, + scheme: scheme_id, + number, + flags: (flags & !O_CLOEXEC) as u32, + })) + } + OpenResult::External(desc) => desc, + } }; let current_lock = context::current(); @@ -103,102 +126,6 @@ fn install_open_result( ) .ok_or(Error::new(EMFILE)) } - -fn path_exists_in_scheme( - scheme: &dyn KernelScheme, - number: usize, - path: &str, - caller_ctx: scheme::CallerCtx, - token: &mut CleanLockToken, -) -> Result { - match scheme.kopenat(number, StrOrBytes::from_str(path), O_STAT, 0, caller_ctx, token) { - Ok(OpenResult::SchemeLocal(number, _)) => { - let _ = scheme.close(number, token); - Ok(true) - } - Ok(OpenResult::External(_)) => Ok(true), - Err(err) if err.errno == ENOENT => Ok(false), - Err(err) => Err(err), - } -} - -pub fn openat( - fh: FileHandle, - raw_path: UserSliceRo, - flags: usize, - fcntl_flags: u32, - euid: u32, - egid: u32, - token: &mut CleanLockToken, -) -> Result { - let path_buf = copy_path_to_buf(raw_path, PATH_MAX)?; - - let desc = { - let current_lock = context::current(); - let mut current = current_lock.read(token.token()); - let (context, mut context_token) = current.token_split(); - let pipe = context - .get_file(fh, &mut context_token) - .ok_or(Error::new(EBADF))?; - *pipe.description.read(context_token.token()) - }; - let scheme = desc.get_scheme(token)?; - let number = desc.number; - let scheme_id = desc.scheme; - - let caller_ctx = context::current() - .read(token.token()) - .caller_ctx() - .filter_uid_gid(euid, egid); - - let fifo_mode_requested = flags & MODE_FIFO as usize == MODE_FIFO as usize; - let fifo_key = fifo_path_key(scheme_id, number, &path_buf); - - if pipe::named_pipe_exists(&fifo_key, token) { - if flags & O_EXCL == O_EXCL && flags & O_CREAT == O_CREAT { - return Err(Error::new(EEXIST)); - } - if fifo_mode_requested && flags & O_CREAT == O_CREAT { - return Err(Error::new(EEXIST)); - } - - let pipe_number = pipe::open_named_pipe(&fifo_key, flags, token)? - .ok_or(Error::new(ENOENT))?; - return install_open_result( - GlobalSchemes::Pipe.scheme_id(), - flags, - OpenResult::SchemeLocal(pipe_number, InternalFlags::empty()), - token, - ); - } - - if fifo_mode_requested && flags & O_CREAT == O_CREAT { - if path_exists_in_scheme(&*scheme, number, &path_buf, caller_ctx, token)? { - return Err(Error::new(EEXIST)); - } - - let mode = u16::try_from(flags & 0o7777).map_err(|_| Error::new(EINVAL))?; - let pipe_number = pipe::create_named_pipe(&fifo_key, &path_buf, mode, flags, token)?; - - return install_open_result( - GlobalSchemes::Pipe.scheme_id(), - flags, - OpenResult::SchemeLocal(pipe_number, InternalFlags::empty()), - token, - ); - } - - let open_result = scheme.kopenat( - number, - StrOrBytes::from_str(&path_buf), - flags, - fcntl_flags, - caller_ctx, - token, - )?; - - install_open_result(scheme_id, flags, open_result, token) -} /// Unlinkat syscall pub fn unlinkat( fh: FileHandle, @@ -210,27 +137,22 @@ pub fn unlinkat( ) -> Result<()> { let path_buf = copy_path_to_buf(raw_path, PATH_MAX)?; - let desc = { + let (number, scheme_id) = { let current_lock = context::current(); let mut current = current_lock.read(token.token()); - let (context, mut context_token) = current.token_split(); - let pipe = context - .get_file(fh, &mut context_token) - .ok_or(Error::new(EBADF))?; - *pipe.description.read(context_token.token()) + let (context, mut token) = current.token_split(); + let pipe = context.get_file(fh, &mut token).ok_or(Error::new(EBADF))?; + let desc = pipe.description.read(token.token()); + (desc.number, desc.scheme) }; - let number = desc.number; - let scheme = desc.get_scheme(token)?; + + let scheme = scheme::get_scheme(token.token(), scheme_id)?; let caller_ctx = context::current() .read(token.token()) .caller_ctx() .filter_uid_gid(euid, egid); - if pipe::unlink_named_pipe(&fifo_path_key(desc.scheme, number, &path_buf), token) { - return Ok(()); - } - /* let mut path_buf = BorrowedHtBuf::head()?; let path = path_buf.use_for_string(raw_path)?; @@ -277,18 +199,17 @@ fn duplicate_file( let description = { *file.description.read(token.token()) }; let new_description = { - let scheme = description.get_scheme(token)?; + let scheme = scheme::get_scheme(token.token(), description.scheme)?; match scheme.kdup(description.number, user_buf, caller_ctx, token)? { OpenResult::SchemeLocal(number, internal_flags) => { - Arc::new(RwLock::new(FileDescription::new( - description.scheme, - number, - 0, - description.flags, + Arc::new(RwLock::new(FileDescription { + offset: 0, internal_flags, - token, - ))) + scheme: description.scheme, + number, + flags: description.flags, + })) } OpenResult::External(desc) => desc, } @@ -375,10 +296,11 @@ fn call_normal( } .ok_or(Error::new(EBADF))?; - let (scheme, number) = { - let desc = *file.description.read(token.token()); - (desc.get_scheme(token)?, desc.number) + let (scheme_id, number) = { + let desc = file.description.read(token.token()); + (desc.scheme, desc.number) }; + let scheme = scheme::get_scheme(token.token(), scheme_id)?; if flags.contains(CallFlags::STD_FS) { scheme.translate_std_fs_call(number, file.description, payload, flags, metadata, token) @@ -419,28 +341,28 @@ fn fdwrite_inner( ) -> Result { // TODO: Ensure deadlocks can't happen let (scheme, number, descs_to_send) = { - let desc = { + let (scheme, number) = { let current_lock = context::current(); let mut current = current_lock.read(token.token()); - let (context, mut context_token) = current.token_split(); + let (context, mut token) = current.token_split(); let file_descriptor = context - .get_file(socket, &mut context_token) + .get_file(socket, &mut token) .ok_or(Error::new(EBADF))?; - *file_descriptor.description.read(context_token.token()) + let desc = &file_descriptor.description.read(token.token()); + (desc.scheme, desc.number) }; - let scheme = desc.get_scheme(token)?; - let number = desc.number; + let scheme = scheme::get_scheme(token.token(), scheme)?; let current_lock = context::current(); let mut current = current_lock.read(token.token()); - let (context, mut context_token) = current.token_split(); + let (context, mut token) = current.token_split(); ( scheme, number, if flags.contains(CallFlags::FD_CLONE) { - context.bulk_get_files(&target_fds, &mut context_token) + context.bulk_get_files(&target_fds, &mut token) } else { - context.bulk_remove_files(&target_fds, &mut context_token) + context.bulk_remove_files(&target_fds, &mut token) }? .into_iter() .map(|f| f.description) @@ -473,22 +395,18 @@ fn call_fdread( metadata: &[u64], token: &mut CleanLockToken, ) -> Result { - let desc = { - let current_lock = context::current(); - let mut current = current_lock.read(token.token()); - let (context, mut context_token) = current.token_split(); - let file_descriptor = context - .get_file(fd, &mut context_token) - .ok_or(Error::new(EBADF))?; - *file_descriptor.description.read(context_token.token()) - }; let (scheme, number) = { - let scheme = desc.get_scheme(token)?; - let number = desc.number; - ( - scheme, - number, - ) + let (scheme, number) = { + let current_lock = context::current(); + let mut current = current_lock.read(token.token()); + let (context, mut token) = current.token_split(); + let file_descriptor = context.get_file(fd, &mut token).ok_or(Error::new(EBADF))?; + let desc = file_descriptor.description.read(token.token()); + (desc.scheme, desc.number) + }; + let scheme = scheme::get_scheme(token.token(), scheme)?; + + (scheme, number) }; scheme.kfdread(number, payload, flags, metadata, token) @@ -522,9 +440,9 @@ pub fn fcntl(fd: FileHandle, cmd: usize, arg: usize, token: &mut CleanLockToken) } .ok_or(Error::new(EBADF))?; - let (number, flags, desc) = { - let desc = *file.description.read(token.token()); - (desc.number, desc.flags, desc) + let (scheme_id, number, flags) = { + let desc = file.description.write(token.token()); + (desc.scheme, desc.number, desc.flags) }; if cmd == F_DUPFD || cmd == F_DUPFD_CLOEXEC { @@ -542,7 +460,7 @@ pub fn fcntl(fd: FileHandle, cmd: usize, arg: usize, token: &mut CleanLockToken) // Communicate fcntl with scheme if cmd != F_GETFD && cmd != F_SETFD { - let scheme = desc.get_scheme(token)?; + let scheme = scheme::get_scheme(token.token(), scheme_id)?; scheme.fcntl(number, cmd, arg, token)?; }; @@ -600,11 +518,13 @@ pub fn flink(fd: FileHandle, raw_path: UserSliceRo, token: &mut CleanLockToken) let path = RedoxPath::from_absolute(&path_buf).ok_or(Error::new(EINVAL))?; let (_, reference) = path.as_parts().ok_or(Error::new(EINVAL))?; - let (number, scheme) = { - let desc = *file.description.read(token.token()); - (desc.number, desc.get_scheme(token)?) + let (number, scheme_id) = { + let desc = file.description.read(token.token()); + (desc.number, desc.scheme) }; + let scheme = scheme::get_scheme(token.token(), scheme_id)?; + // TODO: Check EXDEV. /* if scheme_id != description.scheme { @@ -634,11 +554,13 @@ pub fn frename(fd: FileHandle, raw_path: UserSliceRo, token: &mut CleanLockToken let path = RedoxPath::from_absolute(&path_buf).ok_or(Error::new(EINVAL))?; let (_, reference) = path.as_parts().ok_or(Error::new(EINVAL))?; - let (number, scheme) = { - let desc = *file.description.read(token.token()); - (desc.number, desc.get_scheme(token)?) + let (number, scheme_id) = { + let desc = file.description.read(token.token()); + (desc.number, desc.scheme) }; + let scheme = scheme::get_scheme(token.token(), scheme_id)?; + // TODO: Check EXDEV. /* if scheme_id != description.scheme { diff --git a/recipes/core/kernel/source/src/syscall/mod.rs b/recipes/core/kernel/source/src/syscall/mod.rs index c7d67727d8..450a9d112f 100644 --- a/recipes/core/kernel/source/src/syscall/mod.rs +++ b/recipes/core/kernel/source/src/syscall/mod.rs @@ -28,11 +28,6 @@ use crate::{ sync::CleanLockToken, }; -/// Local syscall numbers not yet in the redox_syscall crate. -/// These are allocated from the 987+ range to avoid collisions with crate numbers. -pub const SYS_SCHED_SETAFFINITY: usize = 987; -pub const SYS_SCHED_GETAFFINITY: usize = 988; - /// Debug pub mod debug; @@ -225,10 +220,6 @@ pub fn syscall( unlinkat(fd, UserSlice::ro(c, d)?, e, f as _, g as _, token).map(|()| 0) } SYS_YIELD => sched_yield(token).map(|()| 0), - - // P17-3: CPU affinity syscalls. Numbers allocated locally (not yet in redox_syscall crate). - SYS_SCHED_SETAFFINITY => sched_setaffinity(b, UserSlice::ro(c, d)?, token), - SYS_SCHED_GETAFFINITY => sched_getaffinity(b, UserSlice::wo(c, d)?, token), SYS_NANOSLEEP => nanosleep( UserSlice::ro(b, size_of::())?, UserSlice::wo(c, size_of::())?.none_if_null(), diff --git a/recipes/core/kernel/source/src/syscall/process.rs b/recipes/core/kernel/source/src/syscall/process.rs index 3edf23aa88..e83da427b4 100644 --- a/recipes/core/kernel/source/src/syscall/process.rs +++ b/recipes/core/kernel/source/src/syscall/process.rs @@ -11,7 +11,6 @@ use crate::{ memory::{AddrSpace, Grant, PageSpan}, ContextRef, }, - cpu_set::RawMask, event, sync::{CleanLockToken, RwLock}, syscall::flag::{EventFlags, O_CREAT, O_RDWR}, @@ -272,95 +271,24 @@ unsafe fn bootstrap_mem(bootstrap: &crate::startup::Bootstrap) -> &'static [u8] } fn insert_fd(scheme: SchemeId, number: usize, cloexec: bool, token: &mut CleanLockToken) -> usize { - let description = Arc::new(RwLock::new(FileDescription::new( - scheme, - number, - 0, - (O_CREAT | O_RDWR) as u32, - InternalFlags::empty(), - token, - ))); - let current_lock = context::current(); let mut current = current_lock.read(token.token()); - let (context, mut context_token) = current.token_split(); + let (context, mut token) = current.token_split(); context .add_file_min( FileDescriptor { - description, + description: Arc::new(RwLock::new(FileDescription { + scheme, + number, + offset: 0, + flags: (O_CREAT | O_RDWR) as u32, + internal_flags: InternalFlags::empty(), + })), cloexec, }, syscall::flag::UPPER_FDTBL_TAG + scheme.get(), - &mut context_token, + &mut token, ) .expect("failed to insert fd to current context") .get() } - -/// Set CPU affinity mask for a process. -/// -/// # Arguments (syscall ABI) -/// - `pid`: Process ID (0 = current process; other PIDs not yet supported) -/// - `mask_ptr`: Pointer to a `RawMask` (32 bytes on 64-bit, 256-bit bitmap) -/// - `mask_len`: Length of mask in bytes (must equal `size_of::()`) -pub fn sched_setaffinity( - pid: usize, - mask_ptr: super::usercopy::UserSliceRo, - token: &mut CleanLockToken, -) -> Result { - // Validate mask size - if mask_ptr.len() != core::mem::size_of::() { - return Err(Error::new(super::error::EINVAL)); - } - - // pid == 0 means current process - let target = if pid == 0 { - context::current() - } else { - // TODO: Support PID-based lookup (requires context list iteration - // with lock token downgrades). For now, only pid=0 is supported. - return Err(Error::new(super::error::ESRCH)); - }; - - // Read mask from userspace - let raw_mask: RawMask = unsafe { mask_ptr.read_exact() }?; - - // Apply to context's affinity mask - let mut ctx = target.write(token.token()); - ctx.sched_affinity.override_from(&raw_mask); - - Ok(0) -} - -/// Get CPU affinity mask for a process. -/// -/// # Arguments (syscall ABI) -/// - `pid`: Process ID (0 = current process; other PIDs not yet supported) -/// - `mask_ptr`: Pointer to a `RawMask` buffer (32 bytes on 64-bit) -/// - `mask_len`: Length of buffer in bytes (must equal `size_of::()`) -/// -/// # Returns -/// Number of bytes written to mask_ptr on success. -pub fn sched_getaffinity( - pid: usize, - mask_ptr: super::usercopy::UserSliceWo, - token: &mut CleanLockToken, -) -> Result { - // Validate mask size - if mask_ptr.len() != core::mem::size_of::() { - return Err(Error::new(super::error::EINVAL)); - } - - // pid == 0 means current process - let target = if pid == 0 { - context::current() - } else { - return Err(Error::new(super::error::ESRCH)); - }; - - let ctx = target.read(token.token()); - let raw_mask = ctx.sched_affinity.to_raw(); - mask_ptr.copy_common_bytes_from_slice(crate::cpu_set::mask_as_bytes(&raw_mask))?; - - Ok(core::mem::size_of::()) -}