feat: raw framebuffer fallback for fbbootlogd when DRM unavailable

- Add RawFb struct: direct framebuffer rendering via physmap
- Add RawTextScreen: simple text renderer using orbclient font
- Fallback in FbbootlogScheme::new() when V2GraphicsHandle fails
- Reads FRAMEBUFFER_ADDR/WIDTH/HEIGHT/STRIDE from bootloader env
- Scroll via ptr::copy on pixel rows, clear bottom line
- No DRM, no shadow buffer, no GPU required — like MS-DOS text mode
- Add common dependency to fbbootlogd Cargo.toml
This commit is contained in:
2026-05-17 14:56:50 +03:00
parent ea7234f44b
commit 4a2c33750b
58 changed files with 1691 additions and 3602 deletions
-1
View File
@@ -12,7 +12,6 @@ cc = "1.0"
toml = "0.8"
[dependencies]
acpi_ext = { package = "acpi", git = "https://gitlab.redox-os.org/redox-os/acpi.git", branch = "redox-6.x" }
arrayvec = { version = "0.7.4", default-features = false }
bitfield = "0.13.2"
bitflags = "2"
-1
View File
@@ -1,4 +1,3 @@
# Red Bear OS kernel patches applied via individual patch files
.PHONY: all check
SOURCE:=$(dir $(realpath $(lastword $(MAKEFILE_LIST))))
-13
View File
@@ -77,7 +77,6 @@ fn main() {
}
"x86_64" => {
println!("cargo::rerun-if-changed=src/asm/x86_64/trampoline.asm");
println!("cargo::rerun-if-changed=src/asm/x86_64/s3_wakeup.asm");
let status = Command::new("nasm")
.arg("-f")
@@ -90,18 +89,6 @@ fn main() {
if !status.success() {
panic!("nasm failed with exit status {}", status);
}
let status = Command::new("nasm")
.arg("-f")
.arg("bin")
.arg("-o")
.arg(format!("{}/s3_wakeup", out_dir))
.arg("src/asm/x86_64/s3_wakeup.asm")
.status()
.expect("failed to run nasm");
if !status.success() {
panic!("nasm failed with exit status {}", status);
}
}
"riscv64" => {
println!("cargo::rustc-cfg=dtb");
@@ -0,0 +1,591 @@
use core::{
hint,
sync::atomic::{AtomicU8, Ordering},
};
use x86::time::rdtsc;
use crate::{
arch::{
device::local_apic::the_local_apic,
start::{kstart_ap, KernelArgsAp},
},
cpu_set::LogicalCpuId,
memory::{
allocate_p2frame, map_device_memory, Frame, KernelMapper, Page, PageFlags,
PhysicalAddress, RmmA, RmmArch, VirtualAddress, PAGE_SIZE,
},
startup::AP_READY,
};
use super::{Madt, MadtEntry};
use alloc::collections::BTreeSet;
use alloc::vec::Vec;
/// Maximum number of APIC→CPU mappings we track for NUMA topology.
const MAX_APIC_MAPPINGS: usize = 256;
/// One recorded pairing of a hardware APIC ID with the logical CPU ID the
/// kernel assigned to that processor. Entries are appended by
/// `record_apic_mapping` and later handed to the NUMA initialisation code.
struct ApicMapping {
// APIC (or x2APIC) ID as reported by the MADT / local APIC.
apic_id: u32,
// Logical CPU ID allocated by the kernel for that processor.
cpu_id: LogicalCpuId,
}
const UNINIT_MAPPING: ApicMapping = ApicMapping { apic_id: u32::MAX, cpu_id: LogicalCpuId::new(0) };
static mut APIC_MAPPINGS: [ApicMapping; MAX_APIC_MAPPINGS] = [UNINIT_MAPPING; MAX_APIC_MAPPINGS];
static mut APIC_MAPPING_COUNT: usize = 0;
/// Appends an APIC-ID → logical-CPU pairing to the global `APIC_MAPPINGS` table.
///
/// When the table already holds `MAX_APIC_MAPPINGS` entries the new pairing is
/// dropped without any diagnostic.
///
/// # Safety
/// Reads and writes the `static mut` mapping table and its counter without
/// any synchronisation; the caller must guarantee exclusive access.
unsafe fn record_apic_mapping(apic_id: u32, cpu_id: LogicalCpuId) {
    let slot = APIC_MAPPING_COUNT;
    if slot >= MAX_APIC_MAPPINGS {
        // Table full: silently ignore the mapping.
        return;
    }
    APIC_MAPPINGS[slot] = ApicMapping { apic_id, cpu_id };
    APIC_MAPPING_COUNT = slot + 1;
}
const AP_SPIN_LIMIT: u32 = 1_000_000;
const TRAMPOLINE: usize = 0x8000;
static TRAMPOLINE_DATA: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/trampoline"));
/// Estimate the TSC frequency in MHz from CPUID.
///
/// Tries CPUID leaf 0x16 (Processor Frequency Information) first,
/// then CPUID leaf 0x15 (TSC/Core Crystal Clock Ratio).
///
/// Returns `None` if the frequency cannot be determined. A returned value is
/// always non-zero, so callers may use it directly as "TSC cycles per µs"
/// without ending up with a zero-length delay.
fn tsc_freq_mhz_cpuid() -> Option<u64> {
    let max_leaf = unsafe { core::arch::x86_64::__cpuid(0) }.eax;

    // CPUID leaf 0x16: EAX[15:0] = processor base frequency in MHz. The upper
    // half of EAX is reserved, so mask it off instead of trusting it to be 0.
    // NOTE(review): this is the base core frequency, used here as an
    // approximation of the TSC rate.
    if max_leaf >= 0x16 {
        let mhz = u64::from(unsafe { core::arch::x86_64::__cpuid(0x16) }.eax & 0xFFFF);
        if mhz > 0 {
            return Some(mhz);
        }
    }

    // CPUID leaf 0x15: EAX = denominator, EBX = numerator, ECX = crystal Hz
    if max_leaf >= 0x15 {
        let res = unsafe { core::arch::x86_64::__cpuid(0x15) };
        let denom = u64::from(res.eax);
        let numer = u64::from(res.ebx);
        let crystal_hz = u64::from(res.ecx);
        if denom > 0 && numer > 0 && crystal_hz > 0 {
            // TSC freq = crystal_hz * numer / denom. All three inputs are
            // 32-bit values, so the product cannot overflow a u64.
            let tsc_mhz = crystal_hz * numer / denom / 1_000_000; // Hz → MHz
            // A sub-MHz result would round down to 0 and turn every
            // TSC-based delay into a no-op; treat it as "unknown" instead.
            if tsc_mhz > 0 {
                return Some(tsc_mhz);
            }
        }
    }

    None
}
/// Early-boot busy-wait of roughly `us` microseconds.
///
/// Uses the Time Stamp Counter when `tsc_freq_mhz_cpuid` can report a
/// frequency. Otherwise falls back to a fixed number of `spin_loop`
/// iterations per microsecond, which is only a rough approximation.
///
/// The TSC path measures *elapsed* ticks with wrapping arithmetic, so neither
/// a very large `us` nor a TSC value close to `u64::MAX` can overflow and cut
/// the delay short.
fn early_udelay(us: u64) {
    if let Some(mhz) = tsc_freq_mhz_cpuid() {
        // MHz = cycles per µs.
        let ticks = us.saturating_mul(mhz);
        let start = unsafe { rdtsc() };
        while unsafe { rdtsc() }.wrapping_sub(start) < ticks {
            hint::spin_loop();
        }
    } else {
        // Fallback: fixed spin count of 50 iterations per µs.
        // NOTE(review): the real duration depends on how long one
        // spin_loop() (PAUSE) takes on the running CPU — confirm the factor
        // is large enough for the slowest supported hardware.
        let iters = us.saturating_mul(50);
        for _ in 0..iters {
            hint::spin_loop();
        }
    }
}
/// Returns the `processor_uid` of the first MADT Local x2APIC entry whose
/// `x2apic_id` equals `apic_id`, or `None` when the MADT has no such entry.
fn current_x2apic_processor_uid(madt: &Madt, apic_id: u32) -> Option<u32> {
    for entry in madt.iter() {
        if let MadtEntry::LocalX2Apic(x2apic) = entry {
            if x2apic.x2apic_id == apic_id {
                return Some(x2apic.processor_uid);
            }
        }
    }
    None
}
/// Re-points the local APIC register window at the address given by a MADT
/// "Local APIC Address Override" entry.
///
/// Does nothing when the APIC runs in x2APIC mode or when `address` is zero.
/// An address that does not fit into `usize` is ignored with a warning.
/// Otherwise 4096 bytes at `address` are mapped as device memory and the
/// resulting virtual address is stored in `local_apic.address`.
fn apply_lapic_address_override(
    local_apic: &mut crate::arch::device::local_apic::LocalApic,
    address: u64,
) {
    if local_apic.x2 || address == 0 {
        return;
    }

    match usize::try_from(address) {
        Ok(physaddr) => {
            local_apic.address =
                unsafe { map_device_memory(PhysicalAddress::new(physaddr), 4096) }.data();
            debug!("Applied LAPIC address override: {:#x}", address);
        }
        Err(_) => {
            warn!(
                "Ignoring LAPIC address override {:#x}: does not fit host usize",
                address
            );
        }
    }
}
/// Brings up the application processors (APs) listed in the MADT and applies
/// the MADT's local-APIC related entries to the current CPU.
///
/// Steps, in order:
/// 1. With the `multi_core` feature disabled: record the boot CPU mapping, call
///    `crate::numa::init_default()` and return.
/// 2. Map the trampoline page at `TRAMPOLINE` and copy `TRAMPOLINE_DATA` into it.
/// 3. Allocate profiling state for the number of usable CPUs found in the MADT.
/// 4. Warn about duplicate APIC IDs in the MADT.
/// 5. For every enabled Local APIC / Local x2APIC entry other than the current
///    CPU: allocate a stack, PCR and IDT, fill the trampoline argument slots,
///    send INIT followed by two STARTUP IPIs and wait (bounded by
///    `AP_SPIN_LIMIT` iterations) for the AP to report ready. NMI and LAPIC
///    address override entries are applied to the current CPU's local APIC.
/// 6. Feed the recorded APIC→CPU mappings to `crate::numa::init_from_srat`,
///    set the boot CPU's NUMA node, log the CPU count and unmap the trampoline.
///
/// Failures while starting an individual AP are logged and that AP is skipped;
/// only a failure to map the trampoline page aborts AP bring-up altogether.
pub(super) fn init(madt: Madt) {
let local_apic = unsafe { the_local_apic() };
let me = local_apic.id();
if local_apic.x2 {
debug!(" X2APIC {}", me.get());
} else {
debug!(" XAPIC {}: {:>08X}", me.get(), local_apic.address);
}
// Single-core build: only the boot CPU exists, so record it as logical CPU 0
// and skip all AP handling.
if cfg!(not(feature = "multi_core")) {
unsafe {
record_apic_mapping(me.get(), LogicalCpuId::new(0));
}
crate::numa::init_default();
return;
}
// Map trampoline
let trampoline_frame = Frame::containing(PhysicalAddress::new(TRAMPOLINE));
let trampoline_page = Page::containing_address(VirtualAddress::new(TRAMPOLINE));
let (result, page_table_physaddr) = unsafe {
//TODO: do not have writable and executable!
let mut mapper = KernelMapper::lock_rw();
let result = match mapper.map_phys(
trampoline_page.start_address(),
trampoline_frame.base(),
PageFlags::new().execute(true).write(true),
) {
Some(result) => result,
None => {
// NOTE(review): this early return also skips the NUMA initialisation at
// the end of the function — confirm that is intended.
println!("KERNEL AP: failed to map trampoline page, AP bring-up disabled");
return;
}
};
(result, mapper.table().phys().data())
};
result.flush();
// Write trampoline, make sure TRAMPOLINE page is free for use
for (i, val) in TRAMPOLINE_DATA.iter().enumerate() {
unsafe {
(*((TRAMPOLINE as *mut u8).add(i) as *const AtomicU8)).store(*val, Ordering::SeqCst);
}
}
// Count the current CPU plus every entry flagged as enabled (flags bit 0) so
// profiling buffers can be allocated before any AP is started.
unsafe {
let preliminary_cpu_count = madt
.iter()
.filter(|entry| match entry {
MadtEntry::LocalApic(local) => u32::from(local.id) == me.get() || local.flags & 1 == 1,
MadtEntry::LocalX2Apic(local) => local.x2apic_id == me.get() || local.flags & 1 == 1,
_ => false,
})
.count();
crate::profiling::allocate(preliminary_cpu_count as u32);
}
// Firmware bug detection: check for duplicate APIC IDs in MADT.
// Some firmware (especially on early BIOS/UEFI) may list the same
// processor multiple times.
// NOTE(review): this pass only warns. `seen_apic_ids` is not consulted by the
// start-up loop below, so a duplicated entry is still processed there.
let mut seen_apic_ids: BTreeSet<u32> = BTreeSet::new();
{
let _ = seen_apic_ids.insert(me.get()); // BSP
for entry in madt.iter() {
match entry {
MadtEntry::LocalApic(local) if local.flags & 1 == 1 => {
let id = u32::from(local.id);
if !seen_apic_ids.insert(id) {
warn!("MADT: duplicate APIC ID {} in LocalApic entry, firmware bug", id);
}
}
MadtEntry::LocalX2Apic(local) if local.flags & 1 == 1 => {
let id = local.x2apic_id;
if !seen_apic_ids.insert(id) {
warn!("MADT: duplicate x2APIC ID {} in LocalX2Apic entry, firmware bug", id);
}
}
_ => {}
}
}
}
// Main pass: handle each MADT entry in table order.
for madt_entry in madt.iter() {
debug!(" {:x?}", madt_entry);
if let MadtEntry::LocalApic(ap_local_apic) = madt_entry {
if u32::from(ap_local_apic.id) == me.get() {
debug!(" This is my local APIC");
} else if ap_local_apic.flags & 1 == 1 {
// Allocate a stack
let alloc = match allocate_p2frame(4) {
Some(frame) => frame,
None => {
println!("KERNEL AP: CPU {} no memory for stack, skipping", ap_local_apic.id);
continue;
}
};
let stack_start = RmmA::phys_to_virt(alloc.base()).data();
// Stack size is PAGE_SIZE << 4, i.e. 16 pages.
let stack_end = stack_start + (PAGE_SIZE << 4);
// Atomically allocate a CPU ID — fetch_add is SeqCst so that
// all later stores (PercpuBlock, NUMA node) are ordered after.
// NOTE(review): the counter is incremented before the limit check and is
// not rolled back on any of the `continue` paths below, so a skipped AP
// still consumes a logical CPU ID and its stack allocation.
let cpu_id = LogicalCpuId::new(crate::CPU_COUNT.fetch_add(1, Ordering::SeqCst));
if cpu_id.get() >= crate::cpu_set::MAX_CPU_COUNT {
println!(
"KERNEL AP: CPU {} exceeds logical CPU limit, skipping",
ap_local_apic.id
);
continue;
}
let pcr_ptr = crate::arch::gdt::allocate_and_init_pcr(cpu_id, stack_end);
let idt_ptr = crate::arch::idt::allocate_and_init_idt(cpu_id);
let args = KernelArgsAp {
stack_end: stack_end as *mut u8,
cpu_id,
pcr_ptr,
idt_ptr,
};
// Four consecutive u64 slots starting at TRAMPOLINE + 8:
// ready flag, pointer to `args`, page table address, entry point.
let ap_ready = (TRAMPOLINE + 8) as *mut u64;
let ap_args_ptr = unsafe { ap_ready.add(1) };
let ap_page_table = unsafe { ap_ready.add(2) };
let ap_code = unsafe { ap_ready.add(3) };
// Reset the ready flag and publish the start-up arguments. These are plain
// (non-volatile) pointer writes; the fence below orders them.
// NOTE(review): `args` lives on this stack frame. On the timeout paths
// below the loop moves on while an AP that starts late could still read
// the pointer — confirm this cannot happen.
unsafe {
ap_ready.write(0);
ap_args_ptr.write(&args as *const _ as u64);
ap_page_table.write(page_table_physaddr as u64);
#[expect(clippy::fn_to_numeric_cast)]
ap_code.write(kstart_ap as u64);
// Ensure all trampoline writes are visible to the AP before
// it starts executing. asm!("") is only a compiler barrier;
// fence(SeqCst) is a full hardware memory barrier.
core::sync::atomic::fence(Ordering::SeqCst);
};
AP_READY.store(false, Ordering::SeqCst);
// Clear APIC Error Status Register before starting AP.
// Intel SDM §8.4.4: ESR should be cleared before sending SIPI.
unsafe { local_apic.esr(); }
// Send INIT IPI (Assert)
{
// ICR: Delivery Mode=INIT(101), Level=Assert, Trigger=Edge
// Destination goes in bits 32.. in x2APIC mode and bits 56.. otherwise.
let mut icr = 0x4500u64;
if local_apic.x2 {
icr |= u64::from(ap_local_apic.id) << 32;
} else {
icr |= u64::from(ap_local_apic.id) << 56;
}
local_apic.set_icr(icr);
}
// Intel SDM Vol 3A §8.4.4: wait 10ms after INIT before sending the
// first SIPI (no separate INIT de-assert IPI is sent here). Modern
// CPUs may need less, but 10ms is the safe specification-compliant value.
early_udelay(10_000);
// Send START IPI #1
{
let ap_segment = (TRAMPOLINE >> 12) & 0xFF;
// ICR: Delivery Mode=StartUp(110), Vector=ap_segment
// Note: bit 14 (Level) must be 0 for SIPI per Intel SDM.
let mut icr = 0x0600 | ap_segment as u64;
if local_apic.x2 {
icr |= u64::from(ap_local_apic.id) << 32;
} else {
icr |= u64::from(ap_local_apic.id) << 56;
}
local_apic.set_icr(icr);
}
// Intel SDM: wait 200µs between SIPIs
early_udelay(200);
// Send START IPI #2 (recommended for compatibility)
{
let ap_segment = (TRAMPOLINE >> 12) & 0xFF;
let mut icr = 0x0600 | ap_segment as u64;
if local_apic.x2 {
icr |= u64::from(ap_local_apic.id) << 32;
} else {
icr |= u64::from(ap_local_apic.id) << 56;
}
local_apic.set_icr(icr);
}
// Wait briefly for SIPI to be accepted
early_udelay(200);
// Check ESR for delivery errors after SIPI sequence.
// Bit 5 = Send Accept Error, Bit 6 = Send Illegal Vector.
let esr_val = unsafe { local_apic.esr() };
if esr_val != 0 {
println!(
"KERNEL AP: CPU {} SIPI delivery error (ESR={:#x}), continuing",
ap_local_apic.id, esr_val
);
}
// Wait for trampoline ready with timeout
// (the limit is an iteration count, not a wall-clock duration).
let mut trampoline_ready = false;
for _ in 0..AP_SPIN_LIMIT {
if unsafe { (*ap_ready.cast::<AtomicU8>()).load(Ordering::SeqCst) } != 0 {
trampoline_ready = true;
break;
}
hint::spin_loop();
}
if !trampoline_ready {
println!("KERNEL AP: CPU {} trampoline timeout, skipping", ap_local_apic.id);
continue;
}
// Second stage: wait for the AP's kernel entry to set AP_READY.
let mut kernel_ready = false;
for _ in 0..AP_SPIN_LIMIT {
if AP_READY.load(Ordering::SeqCst) {
kernel_ready = true;
break;
}
hint::spin_loop();
}
if !kernel_ready {
println!("KERNEL AP: CPU {} AP_READY timeout, skipping", ap_local_apic.id);
continue;
}
// Record APIC→CPU mapping for NUMA topology.
unsafe {
record_apic_mapping(u32::from(ap_local_apic.id), cpu_id);
}
// Set NUMA node from SRAT data.
if let Some(percpu) = crate::percpu::get_for_cpu(cpu_id) {
if let Some(node) = crate::acpi::srat::numa_node_for_apic(u32::from(ap_local_apic.id)) {
percpu.numa_node.set(node);
}
}
RmmA::invalidate_all();
} else {
debug!("KERNEL AP: LAPIC CPU {} disabled in MADT, skipping", u32::from(ap_local_apic.id));
}
} else if let MadtEntry::LocalX2Apic(ap_x2apic) = madt_entry {
// Same start-up sequence as the Local APIC branch above, keyed by the
// 32-bit x2APIC ID. The packed fields are copied into locals first.
let apic_id = ap_x2apic.x2apic_id;
let flags = ap_x2apic.flags;
if apic_id == me.get() {
debug!(" This is my local x2APIC");
} else if flags & 1 == 1 {
let alloc = match allocate_p2frame(4) {
Some(frame) => frame,
None => {
println!("KERNEL AP: CPU {} no memory for stack, skipping", apic_id);
continue;
}
};
let stack_start = RmmA::phys_to_virt(alloc.base()).data();
let stack_end = stack_start + (PAGE_SIZE << 4);
// Atomically allocate a CPU ID — fetch_add is SeqCst so that
// all later stores (PercpuBlock, NUMA node) are ordered after.
let cpu_id = LogicalCpuId::new(crate::CPU_COUNT.fetch_add(1, Ordering::SeqCst));
if cpu_id.get() >= crate::cpu_set::MAX_CPU_COUNT {
println!(
"KERNEL AP: CPU {} exceeds logical CPU limit, skipping",
apic_id
);
continue;
}
let pcr_ptr = crate::arch::gdt::allocate_and_init_pcr(cpu_id, stack_end);
let idt_ptr = crate::arch::idt::allocate_and_init_idt(cpu_id);
let args = KernelArgsAp {
stack_end: stack_end as *mut u8,
cpu_id,
pcr_ptr,
idt_ptr,
};
let ap_ready = (TRAMPOLINE + 8) as *mut u64;
let ap_args_ptr = unsafe { ap_ready.add(1) };
let ap_page_table = unsafe { ap_ready.add(2) };
let ap_code = unsafe { ap_ready.add(3) };
unsafe {
ap_ready.write(0);
ap_args_ptr.write(&args as *const _ as u64);
ap_page_table.write(page_table_physaddr as u64);
#[expect(clippy::fn_to_numeric_cast)]
ap_code.write(kstart_ap as u64);
// Ensure all trampoline writes are visible to the AP.
core::sync::atomic::fence(Ordering::SeqCst);
}
AP_READY.store(false, Ordering::SeqCst);
// Clear APIC Error Status Register before starting AP.
unsafe { local_apic.esr(); }
// Send INIT IPI (Assert)
{
let mut icr = 0x4500u64;
if local_apic.x2 {
icr |= u64::from(apic_id) << 32;
} else {
// NOTE(review): `as u8` truncates; an x2APIC ID above 255 would address
// a different CPU when the local APIC is not in x2APIC mode.
icr |= u64::from(apic_id as u8) << 56;
}
local_apic.set_icr(icr);
}
// Intel SDM Vol 3A §8.4.4: wait 10ms after INIT
early_udelay(10_000);
// Send START IPI #1
{
let ap_segment = (TRAMPOLINE >> 12) & 0xFF;
let mut icr = 0x0600u64 | ap_segment as u64;
if local_apic.x2 {
icr |= u64::from(apic_id) << 32;
} else {
icr |= u64::from(apic_id as u8) << 56;
}
local_apic.set_icr(icr);
}
// Intel SDM: wait 200µs between SIPIs
early_udelay(200);
// Send START IPI #2 (recommended for compatibility)
{
let ap_segment = (TRAMPOLINE >> 12) & 0xFF;
let mut icr = 0x0600u64 | ap_segment as u64;
if local_apic.x2 {
icr |= u64::from(apic_id) << 32;
} else {
icr |= u64::from(apic_id as u8) << 56;
}
local_apic.set_icr(icr);
}
// Wait briefly for SIPI acceptance
early_udelay(200);
// Check ESR for delivery errors.
let esr_val = unsafe { local_apic.esr() };
if esr_val != 0 {
println!(
"KERNEL AP: CPU {} SIPI delivery error (ESR={:#x}), continuing",
apic_id, esr_val
);
}
let mut trampoline_ready = false;
for _ in 0..AP_SPIN_LIMIT {
if unsafe { (*ap_ready.cast::<AtomicU8>()).load(Ordering::SeqCst) } != 0 {
trampoline_ready = true;
break;
}
hint::spin_loop();
}
if !trampoline_ready {
println!("KERNEL AP: CPU {} trampoline timeout, skipping", apic_id);
continue;
}
let mut kernel_ready = false;
for _ in 0..AP_SPIN_LIMIT {
if AP_READY.load(Ordering::SeqCst) {
kernel_ready = true;
break;
}
hint::spin_loop();
}
if !kernel_ready {
println!("KERNEL AP: CPU {} AP_READY timeout, skipping", apic_id);
continue;
}
// Record APIC→CPU mapping for NUMA topology.
unsafe {
record_apic_mapping(apic_id, cpu_id);
}
// Set NUMA node from SRAT data.
if let Some(percpu) = crate::percpu::get_for_cpu(cpu_id) {
if let Some(node) = crate::acpi::srat::numa_node_for_apic(apic_id) {
percpu.numa_node.set(node);
}
}
RmmA::invalidate_all();
} else {
debug!("KERNEL AP: x2APIC CPU {} disabled in MADT (flags={:#x}), skipping", apic_id, flags);
}
} else if let MadtEntry::LocalApicNmi(nmi) = madt_entry {
// Program the NMI LVT entry when the MADT entry targets all processors (0xFF)
// or matches the current CPU.
// NOTE(review): `processor` is compared against the local APIC ID truncated
// to u8; confirm whether the field is an APIC ID or an ACPI processor UID.
let target_apic = nmi.processor;
if target_apic == 0xFF || target_apic == local_apic.id().get() as u8 {
unsafe { local_apic.set_lvt_nmi(nmi.nmi_pin, nmi.flags) };
}
} else if let MadtEntry::LocalX2ApicNmi(nmi) = madt_entry {
// x2APIC variant: match on the processor UID looked up for the current
// APIC ID; u32::MAX targets all processors.
let current_uid = current_x2apic_processor_uid(&madt, me.get());
if nmi.processor_uid == u32::MAX || current_uid == Some(nmi.processor_uid) {
unsafe { local_apic.set_lvt_nmi(nmi.nmi_pin, nmi.flags) };
}
} else if let MadtEntry::LapicAddressOverride(override_entry) = madt_entry {
// NOTE(review): entries are handled in table order, so an override that
// appears after processor entries takes effect only after those APs were
// already started through the previous address.
apply_lapic_address_override(local_apic, override_entry.local_apic_address);
}
}
// Initialize NUMA topology from APIC→CPU mappings and SRAT.
{
let mappings = unsafe { &APIC_MAPPINGS[..APIC_MAPPING_COUNT] };
let mappings_ref: Vec<(u32, LogicalCpuId)> = mappings
.iter()
.map(|m| (m.apic_id, m.cpu_id))
.collect();
crate::numa::init_from_srat(&mappings_ref);
}
// Set BSP's NUMA node from SRAT.
if let Some(node) = crate::acpi::srat::numa_node_for_apic(me.get()) {
crate::percpu::PercpuBlock::current().numa_node.set(node);
}
// Log final CPU count vs maximum
// NOTE(review): CPU_COUNT also includes IDs handed to APs that were skipped
// above, so "online" may over-count.
let cpu_count = crate::CPU_COUNT.load(Ordering::SeqCst);
info!(
"SMP: {} CPUs online (max {})",
cpu_count, crate::cpu_set::MAX_CPU_COUNT
);
// Warn once more than 80% of the logical CPU IDs are in use.
if cpu_count > crate::cpu_set::MAX_CPU_COUNT * 80 / 100 {
warn!(
"SMP: CPU count approaching MAX_CPU_COUNT limit ({}/{})",
cpu_count, crate::cpu_set::MAX_CPU_COUNT
);
}
// Unmap trampoline
if let Some((_frame, _, flush)) = unsafe {
KernelMapper::lock_rw()
.unmap_phys(trampoline_page.start_address())
} {
flush.flush();
} else {
println!("KERNEL AP: failed to unmap trampoline page (non-fatal)");
}
}
@@ -0,0 +1,160 @@
use core::{
hint,
sync::atomic::{AtomicU8, Ordering},
};
use crate::{
arch::{
device::local_apic::the_local_apic,
start::{kstart_ap, KernelArgsAp},
},
cpu_set::LogicalCpuId,
memory::{
allocate_p2frame, Frame, KernelMapper, Page, PageFlags, PhysicalAddress, RmmA, RmmArch,
VirtualAddress, PAGE_SIZE,
},
startup::AP_READY,
};
use super::{Madt, MadtEntry};
const TRAMPOLINE: usize = 0x8000;
static TRAMPOLINE_DATA: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/trampoline"));
/// Starts every enabled application processor (AP) that the MADT lists as a
/// Local APIC entry.
///
/// Maps the trampoline page at `TRAMPOLINE`, copies `TRAMPOLINE_DATA` into it
/// and then, for each enabled Local APIC entry other than the current CPU,
/// allocates a stack, PCR and IDT, fills the trampoline argument slots, sends
/// an INIT IPI and one STARTUP IPI, and spins until the AP reports ready.
/// Finally the trampoline page is unmapped again.
///
/// Returns early when the `multi_core` feature is disabled.
///
/// # Panics
/// Panics if the trampoline page cannot be mapped or unmapped, or if no
/// frames are available for an AP stack.
pub(super) fn init(madt: Madt) {
let local_apic = unsafe { the_local_apic() };
let me = local_apic.id();
if local_apic.x2 {
debug!(" X2APIC {}", me.get());
} else {
debug!(" XAPIC {}: {:>08X}", me.get(), local_apic.address);
}
if cfg!(not(feature = "multi_core")) {
return;
}
// Map trampoline
let trampoline_frame = Frame::containing(PhysicalAddress::new(TRAMPOLINE));
let trampoline_page = Page::containing_address(VirtualAddress::new(TRAMPOLINE));
let (result, page_table_physaddr) = unsafe {
//TODO: do not have writable and executable!
let mut mapper = KernelMapper::lock_rw();
let result = mapper
.map_phys(
trampoline_page.start_address(),
trampoline_frame.base(),
PageFlags::new().execute(true).write(true),
)
.expect("failed to map trampoline");
(result, mapper.table().phys().data())
};
result.flush();
// Write trampoline, make sure TRAMPOLINE page is free for use
for (i, val) in TRAMPOLINE_DATA.iter().enumerate() {
unsafe {
(*((TRAMPOLINE as *mut u8).add(i) as *const AtomicU8)).store(*val, Ordering::SeqCst);
}
}
// Count the current CPU plus every Local APIC entry flagged as enabled
// (flags bit 0) and size the profiling state accordingly.
unsafe {
let preliminary_cpu_count = madt.iter().filter(|e| matches!(e, MadtEntry::LocalApic(entry) if u32::from(entry.id) == me.get() || entry.flags & 1 == 1)).count();
crate::profiling::allocate(preliminary_cpu_count as u32);
}
for madt_entry in madt.iter() {
debug!(" {:x?}", madt_entry);
// Only Local APIC entries are acted upon; every other entry type is just logged.
if let MadtEntry::LocalApic(ap_local_apic) = madt_entry {
if u32::from(ap_local_apic.id) == me.get() {
debug!(" This is my local APIC");
} else if ap_local_apic.flags & 1 == 1 {
let cpu_id = LogicalCpuId::next();
// Allocate a stack
let stack_start = RmmA::phys_to_virt(
allocate_p2frame(4)
.expect("no more frames in acpi stack_start")
.base(),
)
.data();
// Stack size is PAGE_SIZE << 4, i.e. 16 pages.
let stack_end = stack_start + (PAGE_SIZE << 4);
let pcr_ptr = crate::arch::gdt::allocate_and_init_pcr(cpu_id, stack_end);
let idt_ptr = crate::arch::idt::allocate_and_init_idt(cpu_id);
let args = KernelArgsAp {
stack_end: stack_end as *mut u8,
cpu_id,
pcr_ptr,
idt_ptr,
};
// Four consecutive u64 slots starting at TRAMPOLINE + 8:
// ready flag, pointer to `args`, page table address, entry point.
let ap_ready = (TRAMPOLINE + 8) as *mut u64;
let ap_args_ptr = unsafe { ap_ready.add(1) };
let ap_page_table = unsafe { ap_ready.add(2) };
let ap_code = unsafe { ap_ready.add(3) };
// Reset the ready flag and publish the start-up arguments
// (plain pointer writes, not volatile).
unsafe {
ap_ready.write(0);
ap_args_ptr.write(&args as *const _ as u64);
ap_page_table.write(page_table_physaddr as u64);
#[expect(clippy::fn_to_numeric_cast)]
ap_code.write(kstart_ap as u64);
// TODO: Is this necessary (this fence)?
// NOTE(review): an empty asm! block is at most a compiler-level barrier,
// not a hardware fence.
core::arch::asm!("");
};
AP_READY.store(false, Ordering::SeqCst);
// Send INIT IPI
{
// Destination goes in bits 32.. in x2APIC mode and bits 56.. otherwise.
let mut icr = 0x4500;
if local_apic.x2 {
icr |= u64::from(ap_local_apic.id) << 32;
} else {
icr |= u64::from(ap_local_apic.id) << 56;
}
local_apic.set_icr(icr);
}
// Send START IPI
// NOTE(review): no delay is inserted between INIT and the STARTUP IPI, and
// only a single STARTUP IPI is sent.
{
let ap_segment = (TRAMPOLINE >> 12) & 0xFF;
let mut icr = 0x4600 | ap_segment as u64;
if local_apic.x2 {
icr |= u64::from(ap_local_apic.id) << 32;
} else {
icr |= u64::from(ap_local_apic.id) << 56;
}
local_apic.set_icr(icr);
}
// Wait for trampoline ready
// NOTE(review): both waits are unbounded; an AP that never starts hangs boot here.
while unsafe { (*ap_ready.cast::<AtomicU8>()).load(Ordering::SeqCst) } == 0 {
hint::spin_loop();
}
while !AP_READY.load(Ordering::SeqCst) {
hint::spin_loop();
}
RmmA::invalidate_all();
}
}
}
// Unmap trampoline
let (_frame, _, flush) = unsafe {
KernelMapper::lock_rw()
.unmap_phys(trampoline_page.start_address())
.expect("failed to unmap trampoline page")
};
flush.flush();
}
@@ -0,0 +1,41 @@
--- src/acpi/madt/arch/x86.rs
+++ src/acpi/madt/arch/x86.rs
@@ -446,11 +446,7 @@
// Send INIT IPI (Assert)
{
let mut icr = 0x4500u64;
- if local_apic.x2 {
- icr |= u64::from(apic_id) << 32;
- } else {
- icr |= u64::from(apic_id as u8) << 56;
- }
+ icr |= u64::from(apic_id) << 32;
local_apic.set_icr(icr);
}
@@ -460,11 +456,7 @@
{
let ap_segment = (TRAMPOLINE >> 12) & 0xFF;
let mut icr = 0x0600u64 | ap_segment as u64;
- if local_apic.x2 {
- icr |= u64::from(apic_id) << 32;
- } else {
- icr |= u64::from(apic_id as u8) << 56;
- }
+ icr |= u64::from(apic_id) << 32;
local_apic.set_icr(icr);
}
@@ -476,11 +468,7 @@
{
let ap_segment = (TRAMPOLINE >> 12) & 0xFF;
let mut icr = 0x0600u64 | ap_segment as u64;
- if local_apic.x2 {
- icr |= u64::from(apic_id) << 32;
- } else {
- icr |= u64::from(apic_id as u8) << 56;
- }
+ icr |= u64::from(apic_id) << 32;
local_apic.set_icr(icr);
}
@@ -34,12 +34,6 @@ impl Madt {
let madt = Madt::new(find_one_sdt!("APIC"));
if let Some(madt) = madt {
// Validate MADT checksum per ACPI 6.5 §5.2.2
if !madt.sdt.validate_checksum() {
error!("MADT checksum validation failed, skipping APIC initialization");
return;
}
// safe because no APs have been started yet.
unsafe { MADT.get().write(Some(madt)) };
@@ -152,48 +146,6 @@ pub struct MadtGicd {
_reserved2: [u8; 3],
}
/// MADT Local x2APIC (entry type 0x9)
///
/// Describes the 14 bytes that follow the 2-byte type/length header; the
/// iterator only accepts entries whose length is `size_of::<Self>() + 2`.
#[derive(Clone, Copy, Debug)]
#[repr(C, packed)]
pub struct MadtLocalX2Apic {
_reserved: u16,
// 32-bit x2APIC ID of the processor.
pub x2apic_id: u32,
// Bit 0 is tested by the AP start-up code to decide whether the CPU is enabled.
pub flags: u32,
// Processor UID; matched against `MadtLocalX2ApicNmi::processor_uid`.
pub processor_uid: u32,
}
/// MADT Local APIC NMI (entry type 0x4)
///
/// Describes the 4 bytes that follow the 2-byte type/length header.
#[derive(Clone, Copy, Debug)]
#[repr(C, packed)]
pub struct MadtLocalApicNmi {
// Target processor; the AP init code treats 0xFF as "all processors".
// NOTE(review): confirm whether this is an ACPI processor UID or an APIC ID.
pub processor: u8,
// Flags passed through unchanged to `set_lvt_nmi`.
pub flags: u16,
// Local APIC interrupt input (LINT pin) the NMI is wired to.
pub nmi_pin: u8,
}
/// MADT Local APIC address override (entry type 0x5)
///
/// Describes the 10 bytes that follow the 2-byte type/length header.
#[derive(Clone, Copy, Debug)]
#[repr(C, packed)]
pub struct MadtLapicAddressOverride {
_reserved: u16,
// 64-bit physical address of the local APIC register block; a value of 0 is
// ignored by `apply_lapic_address_override`.
pub local_apic_address: u64,
}
/// MADT Local x2APIC NMI (entry type 0xA)
///
/// Describes the 10 bytes that follow the 2-byte type/length header. The
/// field order follows the ACPI "Local x2APIC NMI Structure": Flags at entry
/// offset 2, ACPI Processor UID at offset 4, Local x2APIC LINT# at offset 8,
/// then three reserved bytes.
#[derive(Clone, Copy, Debug)]
#[repr(C, packed)]
pub struct MadtLocalX2ApicNmi {
    /// MPS INTI flags (polarity / trigger mode) of the NMI input.
    pub flags: u16,
    /// ACPI processor UID the NMI applies to; `0xFFFF_FFFF` means all processors.
    pub processor_uid: u32,
    /// Local x2APIC interrupt input (LINT#) the NMI is connected to.
    pub nmi_pin: u8,
    _reserved: [u8; 3],
}
const _: () = assert!(size_of::<MadtLocalApicNmi>() == 4);
const _: () = assert!(size_of::<MadtLapicAddressOverride>() == 10);
const _: () = assert!(size_of::<MadtLocalX2ApicNmi>() == 10);
/// MADT Entries
#[derive(Debug)]
#[allow(dead_code)]
@@ -204,18 +156,10 @@ pub enum MadtEntry {
InvalidIoApic(usize),
IntSrcOverride(&'static MadtIntSrcOverride),
InvalidIntSrcOverride(usize),
LocalApicNmi(&'static MadtLocalApicNmi),
InvalidLocalApicNmi(usize),
LapicAddressOverride(&'static MadtLapicAddressOverride),
InvalidLapicAddressOverride(usize),
Gicc(&'static MadtGicc),
InvalidGicc(usize),
Gicd(&'static MadtGicd),
InvalidGicd(usize),
LocalX2Apic(&'static MadtLocalX2Apic),
InvalidLocalX2Apic(usize),
LocalX2ApicNmi(&'static MadtLocalX2ApicNmi),
InvalidLocalX2ApicNmi(usize),
Unknown(u8),
}
@@ -232,10 +176,6 @@ impl Iterator for MadtIter {
let entry_len =
unsafe { *(self.sdt.data_address() as *const u8).add(self.i + 1) } as usize;
if entry_len < 2 {
return None;
}
if self.i + entry_len <= self.sdt.data_len() {
let item = match entry_type {
0x0 => {
@@ -266,46 +206,6 @@ impl Iterator for MadtIter {
MadtEntry::InvalidIntSrcOverride(entry_len)
}
}
0x4 => {
if entry_len == size_of::<MadtLocalApicNmi>() + 2 {
MadtEntry::LocalApicNmi(unsafe {
&*((self.sdt.data_address() + self.i + 2)
as *const MadtLocalApicNmi)
})
} else {
MadtEntry::InvalidLocalApicNmi(entry_len)
}
}
0x5 => {
if entry_len == size_of::<MadtLapicAddressOverride>() + 2 {
MadtEntry::LapicAddressOverride(unsafe {
&*((self.sdt.data_address() + self.i + 2)
as *const MadtLapicAddressOverride)
})
} else {
MadtEntry::InvalidLapicAddressOverride(entry_len)
}
}
0x9 => {
if entry_len == size_of::<MadtLocalX2Apic>() + 2 {
MadtEntry::LocalX2Apic(unsafe {
&*((self.sdt.data_address() + self.i + 2)
as *const MadtLocalX2Apic)
})
} else {
MadtEntry::InvalidLocalX2Apic(entry_len)
}
}
0xA => {
if entry_len == size_of::<MadtLocalX2ApicNmi>() + 2 {
MadtEntry::LocalX2ApicNmi(unsafe {
&*((self.sdt.data_address() + self.i + 2)
as *const MadtLocalX2ApicNmi)
})
} else {
MadtEntry::InvalidLocalX2ApicNmi(entry_len)
}
}
0xB => {
if entry_len >= size_of::<MadtGicc>() + 2 {
MadtEntry::Gicc(unsafe {
@@ -20,8 +20,6 @@ mod rxsdt;
pub mod sdt;
#[cfg(target_arch = "aarch64")]
mod spcr;
pub mod slit;
pub mod srat;
mod xsdt;
unsafe fn map_linearly(addr: PhysicalAddress, len: usize, mapper: &mut crate::memory::PageMapper) {
@@ -84,14 +82,6 @@ impl Rxsdt for RxsdtEnum {
pub static RXSDT_ENUM: Once<RxsdtEnum> = Once::new();
/// Snapshot of the RSDP-derived root table information, stored once in
/// `ACPI_ROOT_INFO` by `init`.
#[derive(Clone, Copy, Debug)]
pub struct AcpiRootInfo {
// Value of `Rsdp::revision()` at initialisation time.
pub revision: u8,
// Value of `Rsdp::sdt_address()`: physical address of the root SDT.
pub root_sdt_address: PhysicalAddress,
}
pub static ACPI_ROOT_INFO: Once<AcpiRootInfo> = Once::new();
/// Parse the ACPI tables to gather CPU, interrupt, and timer information
pub unsafe fn init(already_supplied_rsdp: Option<*const u8>) {
unsafe {
@@ -104,15 +94,6 @@ pub unsafe fn init(already_supplied_rsdp: Option<*const u8>) {
let rsdp_opt = Rsdp::get_rsdp(already_supplied_rsdp);
if let Some(rsdp) = rsdp_opt {
let root_info = ACPI_ROOT_INFO.call_once(|| AcpiRootInfo {
revision: rsdp.revision(),
root_sdt_address: rsdp.sdt_address(),
});
if root_info.root_sdt_address != rsdp.sdt_address() || root_info.revision != rsdp.revision() {
error!("ACPI_ROOT_INFO already initialized with a different RSDP root");
}
debug!("SDT address: {:#x}", rsdp.sdt_address().data());
let rxsdt = get_sdt(rsdp.sdt_address(), &mut KernelMapper::lock_rw());
@@ -165,14 +146,7 @@ pub unsafe fn init(already_supplied_rsdp: Option<*const u8>) {
// TODO: Enumerate processors in userspace, and then provide an ACPI-independent interface
// to initialize enumerated processors to userspace?
// Parse SRAT BEFORE MADT so NUMA node mapping is available
// when APs are started and PercpuBlocks are created.
srat::init();
Madt::init();
// Parse SLIT after MADT for the NUMA distance matrix.
slit::init();
//TODO: support this on any arch
// SPCR must be initialized after MADT for interrupt controllers
#[cfg(target_arch = "aarch64")]
+3 -31
View File
@@ -17,33 +17,9 @@ pub struct Rsdp {
impl Rsdp {
pub unsafe fn get_rsdp(already_supplied_rsdp: Option<*const u8>) -> Option<Rsdp> {
already_supplied_rsdp.and_then(|rsdp_ptr| {
let rsdp = unsafe { *(rsdp_ptr as *const Rsdp) };
// Validate signature "RSD PTR "
if &rsdp.signature != b"RSD PTR " {
return None;
}
// ACPI 1.0 checksum: sum of first 20 bytes must be zero
let bytes_v1 = unsafe { core::slice::from_raw_parts(rsdp_ptr, 20) };
if bytes_v1.iter().fold(0u8, |sum, &b| sum.wrapping_add(b)) != 0 {
return None;
}
// ACPI 2.0+ extended checksum: sum of entire table (length bytes) must be zero
if rsdp.revision >= 2 {
let full_len = rsdp._length as usize;
if full_len < 36 || full_len > 256 {
return None;
}
let bytes_full = unsafe { core::slice::from_raw_parts(rsdp_ptr, full_len) };
if bytes_full.iter().fold(0u8, |sum, &b| sum.wrapping_add(b)) != 0 {
return None;
}
}
Some(rsdp)
already_supplied_rsdp.map(|rsdp_ptr| {
// TODO: Validate
unsafe { *(rsdp_ptr as *const Rsdp) }
})
}
@@ -55,8 +31,4 @@ impl Rsdp {
self.rsdt_address as usize
})
}
pub fn revision(&self) -> u8 {
self.revision
}
}
@@ -24,20 +24,4 @@ impl Sdt {
let header_size = size_of::<Sdt>();
total_size.saturating_sub(header_size)
}
/// Checks the table checksum.
///
/// Per ACPI 6.5 §5.2.2 every byte of the table, the checksum byte included,
/// must add up to zero modulo 256. The number of bytes summed is taken from
/// the header's `length` field; a length shorter than the header itself is
/// rejected.
pub fn validate_checksum(&self) -> bool {
    let table_len = self.length as usize;
    if table_len < size_of::<Sdt>() {
        return false;
    }
    let base = self as *const _ as *const u8;
    let bytes = unsafe { core::slice::from_raw_parts(base, table_len) };
    let mut total = 0u8;
    for &byte in bytes {
        total = total.wrapping_add(byte);
    }
    total == 0
}
}
@@ -1,45 +0,0 @@
//! SLIT (System Locality Information Table) parser.
//!
//! Parses the NUMA distance matrix for scheduler NUMA-aware work stealing.
use super::sdt::Sdt;
use crate::acpi::find_sdt;
const MAX_NODES: usize = 8;
static mut SLIT_MATRIX: [[u8; MAX_NODES]; MAX_NODES] = [[10u8; MAX_NODES]; MAX_NODES];
static mut SLIT_NUM_NODES: usize = 0;
static mut SLIT_AVAILABLE: bool = false;
/// Returns `true` once `init` has loaded a SLIT distance matrix.
pub fn is_available() -> bool { unsafe { SLIT_AVAILABLE } }
/// Returns the node count read from the SLIT, or 0 if no SLIT was loaded.
pub fn num_nodes() -> usize { unsafe { SLIT_NUM_NODES } }
/// Returns the SLIT distance from node `from` to node `to`.
///
/// Falls back to 10 when no SLIT has been loaded or when either node index
/// is not below `MAX_NODES`.
pub fn distance(from: u8, to: u8) -> u8 {
    let available = unsafe { SLIT_AVAILABLE };
    let (row, col) = (from as usize, to as usize);
    if !available || row >= MAX_NODES || col >= MAX_NODES {
        return 10;
    }
    unsafe { SLIT_MATRIX[row][col] }
}
/// Returns `true` when the distance between the two nodes is at most 20.
// NOTE(review): 20 is a heuristic threshold chosen here; confirm it against the target platforms.
pub fn same_socket(node1: u8, node2: u8) -> bool { distance(node1, node2) <= 20 }
/// Locates the SLIT and loads its distance matrix into `SLIT_MATRIX`.
///
/// Returns without changing any state when no SLIT exists, more than one is
/// found, the signature is wrong, the node count is 0 or above `MAX_NODES`,
/// or the table is too short for the matrix it announces. On success
/// `SLIT_NUM_NODES` and `SLIT_AVAILABLE` are set.
pub fn init() {
    let sdt = match find_sdt("SLIT").as_slice() {
        [] => return,
        [x] => *x,
        xs => {
            println!("SLIT: {} tables found, expected 1", xs.len());
            return;
        }
    };
    if &sdt.signature != b"SLIT" {
        return;
    }

    let data_addr = sdt.data_address();
    let data_len = sdt.data_len();
    // The table body starts with a 64-bit "number of system localities" field.
    if data_len < 8 {
        return;
    }
    // The body starts right after the SDT header, so its address is not
    // guaranteed to be 8-byte aligned; a plain `*const u64` dereference would
    // be an unaligned access. The length check above makes 8 bytes readable.
    let num_nodes = unsafe { (data_addr as *const u64).read_unaligned() } as usize;
    if num_nodes == 0 || num_nodes > MAX_NODES {
        println!("SLIT: {num_nodes} nodes (max {MAX_NODES}), ignoring");
        return;
    }

    let matrix_start = 8;
    let matrix_size = num_nodes * num_nodes;
    if data_len < matrix_start + matrix_size {
        println!("SLIT: matrix truncated ({data_len} < {})", matrix_start + matrix_size);
        return;
    }

    // Row-major byte matrix: entry (i, j) is the distance from node i to node j.
    for i in 0..num_nodes {
        for j in 0..num_nodes {
            let entry =
                unsafe { *((data_addr + matrix_start + i * num_nodes + j) as *const u8) };
            // Write through the static directly instead of holding a `&mut`
            // to a `static mut`.
            unsafe {
                SLIT_MATRIX[i][j] = entry;
            }
        }
    }
    unsafe {
        SLIT_NUM_NODES = num_nodes;
        SLIT_AVAILABLE = true;
    }
    debug!("SLIT: {} nodes, distance matrix loaded", num_nodes);
}
-102
View File
@@ -1,102 +0,0 @@
//! SRAT (System Resource Affinity Table) parser.
//!
//! Parses CPU-to-NUMA-node and memory-to-NUMA-node affinity information.
//! Called before MADT init so that NUMA data is available during AP startup.
use super::sdt::Sdt;
use crate::acpi::find_sdt;
const MAX_CPU_ENTRIES: usize = 256;
const MAX_MEM_ENTRIES: usize = 64;
/// One CPU affinity record kept from the SRAT: which NUMA node an APIC ID
/// belongs to and whether the record is marked enabled.
#[derive(Clone, Copy)]
struct SratCpuEntry { apic_id: u32, node: u8, enabled: bool }
/// One memory affinity record kept from the SRAT: a base/length range, the
/// NUMA node it is assigned to, and whether the record is marked enabled.
#[derive(Clone, Copy)]
struct SratMemEntry { node: u8, base: u64, length: u64, enabled: bool }
const CPU_NONE: SratCpuEntry = SratCpuEntry { apic_id: u32::MAX, node: 0, enabled: false };
const MEM_NONE: SratMemEntry = SratMemEntry { node: 0, base: 0, length: 0, enabled: false };
static mut SRAT_CPU_ENTRIES: [SratCpuEntry; MAX_CPU_ENTRIES] = [CPU_NONE; MAX_CPU_ENTRIES];
static mut SRAT_MEM_ENTRIES: [SratMemEntry; MAX_MEM_ENTRIES] = [MEM_NONE; MAX_MEM_ENTRIES];
static mut SRAT_CPU_COUNT: usize = 0;
static mut SRAT_MEM_COUNT: usize = 0;
static mut SRAT_AVAILABLE: bool = false;
/// Returns `true` once `init` has parsed an SRAT table.
pub fn is_available() -> bool {
    // NOTE(review): unsynchronized read of a `static mut`; presumably sound
    // because `init` finishes before any caller runs — confirm.
    let available = unsafe { SRAT_AVAILABLE };
    available
}
/// Looks up the NUMA node recorded for `apic_id`.
///
/// Returns `None` when no SRAT was parsed, or when no *enabled* CPU entry
/// carries that APIC id. The first matching entry wins.
pub fn numa_node_for_apic(apic_id: u32) -> Option<u8> {
    if !unsafe { SRAT_AVAILABLE } {
        return None;
    }
    let populated = unsafe { SRAT_CPU_COUNT };
    let table = unsafe { &SRAT_CPU_ENTRIES };
    table
        .iter()
        .take(populated)
        .find(|entry| entry.apic_id == apic_id && entry.enabled)
        .map(|entry| entry.node)
}
/// Returns the number of NUMA nodes implied by the parsed CPU entries.
///
/// This is one more than the highest node id among enabled CPU entries, or 1
/// when no SRAT was parsed or no enabled entry exists.
pub fn numa_node_count() -> usize {
    if !unsafe { SRAT_AVAILABLE } {
        return 1;
    }
    let populated = unsafe { SRAT_CPU_COUNT };
    let table = unsafe { &SRAT_CPU_ENTRIES };
    let highest_node = table
        .iter()
        .take(populated)
        .filter(|entry| entry.enabled)
        .map(|entry| entry.node)
        .max()
        .unwrap_or(0);
    usize::from(highest_node) + 1
}
// The three layouts below describe the bytes that FOLLOW the 2-byte
// (type, length) prefix of each SRAT entry; `init` adds 2 to `size_of` when it
// checks an entry's length. They are `packed` because they are overlaid
// directly on table memory.

/// Body of a type 0x0 entry: an 8-bit APIC id with its proximity domain split
/// into a low byte and three high bytes.
#[repr(C, packed)]
struct SratLocalApic { _proximity_lo: u8, apic_id: u8, flags: u32, _local_sapic_eid: u8, _proximity_hi: [u8; 3], _clock_domain: u32 }
/// Body of a type 0x1 entry: a physical memory range given as 32-bit low/high
/// halves, with its proximity domain and flags.
#[repr(C, packed)]
struct SratMemoryAffinity { proximity_domain: u32, _reserved1: u16, base_address_lo: u32, base_address_hi: u32, length_lo: u32, length_hi: u32, _reserved2: u32, flags: u32, _reserved3: u64 }
/// Body of a type 0x2 entry: a 32-bit x2APIC id with its proximity domain and flags.
#[repr(C, packed)]
struct SratLocalX2Apic { _reserved: u16, proximity_domain: u32, x2apic_id: u32, flags: u32, _clock_domain: u32, _reserved2: u32 }
/// Narrows a 32-bit SRAT proximity domain to the `u8` node id kept in the tables.
///
/// Returns `None` when the domain does not fit. A message is printed only for
/// enabled entries, so disabled placeholder entries stay quiet.
fn srat_node_id(domain: u32, enabled: bool) -> Option<u8> {
    if domain <= u32::from(u8::MAX) {
        return Some(domain as u8);
    }
    if enabled {
        println!("SRAT: proximity domain {domain} exceeds the supported maximum of 255, ignoring entry");
    }
    None
}

/// Parses the ACPI SRAT and records CPU and memory affinity entries.
///
/// Walks the variable-length entries that start 12 bytes into the table body
/// and stores type 0x0 (local APIC), 0x1 (memory) and 0x2 (x2APIC) entries in
/// `SRAT_CPU_ENTRIES` / `SRAT_MEM_ENTRIES`, up to their fixed capacities.
/// Unknown or undersized entries are skipped. Entries whose proximity domain
/// does not fit the `u8` node id are dropped; truncating them would alias
/// another node (domain 256 would become node 0).
pub fn init() {
    let sdt = match find_sdt("SRAT").as_slice() {
        [] => return,
        [x] => *x,
        xs => {
            println!("SRAT: {} tables found, expected 1", xs.len());
            return;
        }
    };
    if &sdt.signature != b"SRAT" {
        return;
    }
    let data_addr = sdt.data_address();
    let data_len = sdt.data_len();
    if data_len < 12 {
        println!("SRAT: table too short ({data_len} bytes)");
        return;
    }
    let mut offset: usize = 12;
    let cpu_entries = unsafe { &mut SRAT_CPU_ENTRIES };
    let mem_entries = unsafe { &mut SRAT_MEM_ENTRIES };
    let mut cpu_count: usize = 0;
    let mut mem_count: usize = 0;
    while offset + 2 <= data_len {
        // Every entry starts with a one-byte type and a one-byte total length.
        let entry_type = unsafe { *((data_addr + offset) as *const u8) };
        let entry_len = unsafe { *((data_addr + offset + 1) as *const u8) } as usize;
        // A length below 2 would never advance; one past the table end is corrupt.
        if entry_len < 2 || offset + entry_len > data_len {
            break;
        }
        let entry_data = data_addr + offset + 2;
        match entry_type {
            0x0 if entry_len >= size_of::<SratLocalApic>() + 2 => {
                // SAFETY: the guard proves the entry body covers the whole
                // packed (alignment 1) structure.
                let e = unsafe { &*(entry_data as *const SratLocalApic) };
                let enabled = (e.flags & 1) == 1;
                let domain = u32::from_le_bytes([
                    e._proximity_lo,
                    e._proximity_hi[0],
                    e._proximity_hi[1],
                    e._proximity_hi[2],
                ]);
                if let Some(node) = srat_node_id(domain, enabled) {
                    if cpu_count < MAX_CPU_ENTRIES {
                        cpu_entries[cpu_count] = SratCpuEntry {
                            apic_id: u32::from(e.apic_id),
                            node,
                            enabled,
                        };
                        cpu_count += 1;
                    }
                }
            }
            0x1 if entry_len >= size_of::<SratMemoryAffinity>() + 2 => {
                // SAFETY: as above, the guard covers the packed structure.
                let e = unsafe { &*(entry_data as *const SratMemoryAffinity) };
                let enabled = (e.flags & 1) == 1;
                let base = (e.base_address_hi as u64) << 32 | e.base_address_lo as u64;
                let length = (e.length_hi as u64) << 32 | e.length_lo as u64;
                let domain = e.proximity_domain;
                if let Some(node) = srat_node_id(domain, enabled) {
                    if mem_count < MAX_MEM_ENTRIES {
                        mem_entries[mem_count] = SratMemEntry {
                            node,
                            base,
                            length,
                            enabled,
                        };
                        mem_count += 1;
                    }
                }
            }
            0x2 if entry_len >= size_of::<SratLocalX2Apic>() + 2 => {
                // SAFETY: as above, the guard covers the packed structure.
                let e = unsafe { &*(entry_data as *const SratLocalX2Apic) };
                let enabled = (e.flags & 1) == 1;
                let domain = e.proximity_domain;
                if let Some(node) = srat_node_id(domain, enabled) {
                    if cpu_count < MAX_CPU_ENTRIES {
                        cpu_entries[cpu_count] = SratCpuEntry {
                            apic_id: e.x2apic_id,
                            node,
                            enabled,
                        };
                        cpu_count += 1;
                    }
                }
            }
            _ => {}
        }
        offset += entry_len;
    }
    unsafe {
        SRAT_CPU_COUNT = cpu_count;
        SRAT_MEM_COUNT = mem_count;
        SRAT_AVAILABLE = true;
    }
    debug!("SRAT: {} CPU entries, {} memory entries", cpu_count, mem_count);
}
+13 -27
View File
@@ -7,40 +7,26 @@ mod linked_list;
/// Size of kernel heap
const KERNEL_HEAP_SIZE: usize = ::rmm::MEGABYTE;
/// Prints `message` followed by a fixed "cannot continue" line, then spins forever.
///
/// `message` is printed verbatim with no newline appended, so callers supply
/// their own trailing newline. Never returns.
#[cold]
fn halt_kernel_heap_init(message: &str) -> ! {
print!("{message}");
println!("Kernel heap initialization cannot continue. Halting.");
// Busy-wait forever, issuing the CPU spin-loop hint on each pass.
loop {
core::hint::spin_loop();
}
}
unsafe fn map_heap(mapper: &mut KernelMapper<true>, offset: usize, size: usize) {
let mut flush_all = PageFlushAll::new();
let heap_start_page = Page::containing_address(VirtualAddress::new(offset));
let heap_end_page = Page::containing_address(VirtualAddress::new(offset + size - 1));
for page in Page::range_inclusive(heap_start_page, heap_end_page) {
let phys = match mapper.allocator_mut().allocate_one() {
Some(phys) => phys,
None => halt_kernel_heap_init(
"FATAL: failed to allocate physical frame for kernel heap\n",
),
};
let phys = mapper
.allocator_mut()
.allocate_one()
.expect("failed to allocate kernel heap");
let flush = unsafe {
match mapper.map_phys(
page.start_address(),
phys,
PageFlags::new()
.write(true)
.global(cfg!(not(feature = "pti"))),
) {
Some(flush) => flush,
None => halt_kernel_heap_init(
"FATAL: failed to map kernel heap virtual page\n",
),
}
mapper
.map_phys(
page.start_address(),
phys,
PageFlags::new()
.write(true)
.global(cfg!(not(feature = "pti"))),
)
.expect("failed to map kernel heap")
};
flush_all.consume(flush);
}
@@ -91,7 +91,7 @@ unsafe extern "C" fn start(args_ptr: *const KernelArgs) -> ! {
dtb::serial::init_early(dtb);
}
info!("RedBear OS starting...");
info!("Redox OS starting...");
args.print();
// Initialize RMM
@@ -97,7 +97,7 @@ unsafe extern "C" fn start(args_ptr: *const KernelArgs) -> ! {
init_early(dtb);
}
info!("RedBear OS starting...");
info!("Redox OS starting...");
args.print();
if let Some(dtb) = &dtb {
@@ -14,10 +14,6 @@ pub struct IoApicRegs {
pointer: *const u32,
}
impl IoApicRegs {
fn redirection_index_valid(&mut self, idx: u8) -> bool {
idx <= self.max_redirection_table_entries()
}
fn ioregsel(&self) -> *const u32 {
self.pointer
}
@@ -48,28 +44,21 @@ impl IoApicRegs {
pub fn read_ioapicver(&mut self) -> u32 {
self.read_reg(0x01)
}
pub fn read_ioredtbl(&mut self, idx: u8) -> Option<u64> {
if !self.redirection_index_valid(idx) {
warn!("IOAPIC read_ioredtbl index {} out of range", idx);
return None;
}
pub fn read_ioredtbl(&mut self, idx: u8) -> u64 {
assert!(idx < 24);
let lo = self.read_reg(0x10 + idx * 2);
let hi = self.read_reg(0x10 + idx * 2 + 1);
Some(u64::from(lo) | (u64::from(hi) << 32))
u64::from(lo) | (u64::from(hi) << 32)
}
pub fn write_ioredtbl(&mut self, idx: u8, value: u64) -> bool {
if !self.redirection_index_valid(idx) {
warn!("IOAPIC write_ioredtbl index {} out of range", idx);
return false;
}
pub fn write_ioredtbl(&mut self, idx: u8, value: u64) {
assert!(idx < 24);
let lo = value as u32;
let hi = (value >> 32) as u32;
self.write_reg(0x10 + idx * 2, lo);
self.write_reg(0x10 + idx * 2 + 1, hi);
true
}
pub fn max_redirection_table_entries(&mut self) -> u8 {
@@ -103,37 +92,17 @@ impl IoApic {
}
/// Map an interrupt vector to a physical local APIC ID of a processor (thus physical mode).
#[allow(dead_code)]
pub fn map(&self, idx: u8, info: MapInfo) -> bool {
let Some(raw) = info.as_raw() else {
return false;
};
self.regs.lock().write_ioredtbl(idx, raw)
pub fn map(&self, idx: u8, info: MapInfo) {
self.regs.lock().write_ioredtbl(idx, info.as_raw())
}
pub fn set_mask(&self, gsi: u32, mask: bool) {
let idx = (gsi - self.gsi_start) as u8;
let mut guard = self.regs.lock();
let Some(mut reg) = guard.read_ioredtbl(idx) else {
return;
};
let mut reg = guard.read_ioredtbl(idx);
reg &= !(1 << 16);
reg |= u64::from(mask) << 16;
let _ = guard.write_ioredtbl(idx, reg);
}
/// Change the destination APIC for a GSI by reprogramming the redirection table entry.
/// Preserves all other fields (vector, polarity, trigger mode, delivery mode, mask).
/// Returns true if the entry was successfully updated.
pub fn set_irq_affinity(&self, gsi: u32, dest: ApicId) -> bool {
let idx = (gsi - self.gsi_start) as u8;
let mut guard = self.regs.lock();
let Some(mut entry) = guard.read_ioredtbl(idx) else {
return false;
};
// Clear destination field (bits 63:56 for xAPIC physical mode)
// and set new destination APIC ID
entry &= !(0xFF_u64 << 56);
entry |= u64::from(dest.get()) << 56;
guard.write_ioredtbl(idx, entry)
guard.write_ioredtbl(idx, reg);
}
}
@@ -180,26 +149,19 @@ pub struct MapInfo {
}
impl MapInfo {
pub fn as_raw(&self) -> Option<u64> {
if !(0x20..=0xFE).contains(&self.vector) {
warn!(
"Refusing to map IOAPIC vector outside valid range: {:#x}",
self.vector
);
return None;
}
pub fn as_raw(&self) -> u64 {
assert!(self.vector >= 0x20);
assert!(self.vector <= 0xFE);
// TODO: Check for reserved fields.
Some(
(u64::from(self.dest.get()) << 56)
(u64::from(self.dest.get()) << 56)
| (u64::from(self.mask) << 16)
| ((self.trigger_mode as u64) << 15)
| ((self.polarity as u64) << 13)
| ((self.dest_mode as u64) << 11)
| ((self.delivery_mode as u64) << 8)
| u64::from(self.vector),
)
| u64::from(self.vector)
}
}
@@ -213,7 +175,7 @@ impl fmt::Debug for IoApic {
let count = guard.max_redirection_table_entries();
f.debug_list()
.entries((0..=count).filter_map(|i| guard.read_ioredtbl(i)))
.entries((0..count).map(|i| guard.read_ioredtbl(i)))
.finish()
}
}
@@ -275,14 +237,11 @@ pub unsafe fn handle_ioapic(madt_ioapic: &'static MadtIoApic) {
let ioapic_registers = virt.data() as *const u32;
let ioapic = IoApic::new(ioapic_registers, madt_ioapic.gsi_base);
let detected_id = ioapic.regs.lock().id();
if detected_id != madt_ioapic.id {
warn!(
"mismatched ACPI MADT I/O APIC ID: MADT={}, IOAPIC={}; continuing with detected hardware",
madt_ioapic.id,
detected_id
);
}
assert_eq!(
ioapic.regs.lock().id(),
madt_ioapic.id,
"mismatched ACPI MADT I/O APIC ID, and the ID reported by the I/O APIC"
);
(*IOAPICS.get()).get_or_insert_with(Vec::new).push(ioapic);
}
@@ -351,11 +310,11 @@ pub unsafe fn init() {
}
}
}
for ioapic in ioapics() {
for idx in 0..=ioapic.count {
ioapic.set_mask(ioapic.gsi_start + u32::from(idx), true);
}
}
println!(
"I/O APICs: {:?}, overrides: {:?}",
ioapics(),
src_overrides()
);
// map the legacy PC-compatible IRQs (0-15) to 32-47, just like we did with 8259 PIC (if it
// wouldn't have been disabled due to this I/O APIC)
@@ -370,6 +329,7 @@ pub unsafe fn init() {
.iter()
.any(|over| over.bus_irq == legacy_irq)
{
// there's an IRQ conflict, making this legacy IRQ inaccessible.
continue;
}
(
@@ -389,6 +349,7 @@ pub unsafe fn init() {
let redir_tbl_index = (gsi - apic.gsi_start) as u8;
let map_info = MapInfo {
// only send to the BSP
dest: bsp_apic_id,
dest_mode: DestinationMode::Physical,
delivery_mode: DeliveryMode::Fixed,
@@ -405,32 +366,7 @@ pub unsafe fn init() {
},
vector: 32 + legacy_irq,
};
if !apic.map(redir_tbl_index, map_info) {
warn!(
"Unable to map legacy IRQ {} (GSI {}) through IOAPIC index {}",
legacy_irq,
gsi,
redir_tbl_index
);
}
if legacy_irq == 0 && gsi != u32::from(legacy_irq) {
if let Some(apic0) = find_ioapic(u32::from(legacy_irq)) {
let idx0 = (u32::from(legacy_irq) - apic0.gsi_start) as u8;
let _ = apic0.map(
idx0,
MapInfo {
dest: bsp_apic_id,
dest_mode: DestinationMode::Physical,
delivery_mode: DeliveryMode::Fixed,
mask: false,
polarity: ApicPolarity::ActiveHigh,
trigger_mode: ApicTriggerMode::Edge,
vector: 32,
},
);
}
}
apic.map(redir_tbl_index, map_info);
}
println!(
"I/O APICs: {:?}, overrides: {:?}",
@@ -470,7 +406,7 @@ fn resolve(irq: u8) -> u32 {
fn find_ioapic(gsi: u32) -> Option<&'static IoApic> {
ioapics()
.iter()
.find(|apic| gsi >= apic.gsi_start && gsi <= apic.gsi_start + u32::from(apic.count))
.find(|apic| gsi >= apic.gsi_start && gsi < apic.gsi_start + u32::from(apic.count))
}
pub unsafe fn mask(irq: u8) {
@@ -489,14 +425,3 @@ pub unsafe fn unmask(irq: u8) {
};
apic.set_mask(gsi, false);
}
/// Redirects a legacy IRQ to another CPU.
///
/// The IRQ is resolved to its GSI and the I/O APIC serving that GSI is looked
/// up. That I/O APIC is then asked to retarget the GSI's redirection entry to
/// `dest`.
/// Returns `false` when no I/O APIC serves the GSI, otherwise whatever
/// `set_irq_affinity` reports.
pub unsafe fn set_affinity(irq: u8, dest: ApicId) -> bool {
    let gsi = resolve(irq);
    let Some(apic) = find_ioapic(gsi) else {
        return false;
    };
    apic.set_irq_affinity(gsi, dest)
}
@@ -0,0 +1,312 @@
use core::{
cell::SyncUnsafeCell,
ptr::{read_volatile, write_volatile},
};
use x86::msr::*;
use crate::{
arch::{cpuid::cpuid, ipi::IpiKind},
memory::{map_device_memory, PhysicalAddress},
percpu::PercpuBlock,
};
/// Identifier of a local APIC, kept as a raw 32-bit value.
#[derive(Clone, Copy, Debug)]
pub struct ApicId(u32);

impl ApicId {
    /// Wraps a raw APIC identifier.
    pub fn new(inner: u32) -> Self {
        ApicId(inner)
    }

    /// Returns the wrapped raw identifier.
    pub fn get(&self) -> u32 {
        let ApicId(raw) = *self;
        raw
    }
}
/// The single [`LocalApic`] descriptor; it starts with no mapped address and
/// xAPIC mode assumed, until `LocalApic::init` fills it in.
static LOCAL_APIC: SyncUnsafeCell<LocalApic> = SyncUnsafeCell::new(LocalApic {
    x2: false,
    address: 0,
});

/// Hands out a mutable reference to the global local-APIC descriptor.
///
/// # Safety
///
/// The returned `&'static mut` is not synchronized: the caller must make sure
/// no other reference obtained from this function is used at the same time.
pub unsafe fn the_local_apic() -> &'static mut LocalApic {
    let apic_ptr: *mut LocalApic = LOCAL_APIC.get();
    unsafe { &mut *apic_ptr }
}
/// Runs first-time initialization of the global local-APIC descriptor.
///
/// # Safety
///
/// Same aliasing requirements as [`the_local_apic`]; this also programs the
/// local APIC of the CPU it runs on.
pub unsafe fn init() {
    let apic = unsafe { the_local_apic() };
    unsafe { apic.init() };
}
/// Runs the per-CPU part of local-APIC initialization on the calling CPU.
///
/// # Safety
///
/// Same aliasing requirements as [`the_local_apic`]; this also programs the
/// local APIC of the CPU it runs on.
pub unsafe fn init_ap() {
    let apic = unsafe { the_local_apic() };
    unsafe { apic.init_ap() };
}
/// Local APIC
///
/// Describes how the local APIC registers are reached: through a
/// memory-mapped window (`address`) in xAPIC mode, or through MSRs in x2APIC
/// mode (`x2`).
pub struct LocalApic {
// Virtual address of the mapped register page; only set in xAPIC mode.
pub address: usize,
// `true` when CPUID reports x2APIC support and MSR access is used.
pub x2: bool,
}
// Every accessor below has two paths: an MSR read/write when `x2` is set, and
// a memory-mapped register access at a fixed offset from `address` otherwise.
impl LocalApic {
/// One-time setup: detects x2APIC support, maps the xAPIC register page when
/// needed, then runs the per-CPU setup in `init_ap`.
unsafe fn init(&mut self) {
unsafe {
// NOTE(review): the mask keeps only bits 16..=31 of IA32_APIC_BASE; the
// architectural base field starts at bit 12 — confirm this is intended.
let physaddr = PhysicalAddress::new(rdmsr(IA32_APIC_BASE) as usize & 0xFFFF_0000);
self.x2 = cpuid()
.get_feature_info()
.is_some_and(|feature_info| feature_info.has_x2apic());
if !self.x2 {
info!("Detected xAPIC at {:#x}", physaddr.data());
self.address = map_device_memory(physaddr, 4096).data();
} else {
info!("Detected x2APIC");
}
self.init_ap();
}
}
/// Per-CPU setup: enables the APIC, installs the error interrupt vector and
/// records this CPU's APIC id in its per-CPU block.
unsafe fn init_ap(&mut self) {
unsafe {
if self.x2 {
// Bit 10 of IA32_APIC_BASE switches the APIC into x2APIC mode.
wrmsr(IA32_APIC_BASE, rdmsr(IA32_APIC_BASE) | (1 << 10));
// 0x100 sets bit 8 (APIC software enable) of the spurious-interrupt vector register.
wrmsr(IA32_X2APIC_SIVR, 0x100);
} else {
// Same spurious-interrupt vector register, at MMIO offset 0xF0.
self.write(0xF0, 0x100);
}
self.setup_error_int();
//self.setup_timer();
PercpuBlock::current()
.misc_arch_info
.apic_id_opt
.set(Some(self.id()));
}
}
/// Volatile read of the 32-bit xAPIC register at byte offset `reg`.
/// Only meaningful in xAPIC mode (checked in debug builds).
unsafe fn read(&self, reg: u32) -> u32 {
debug_assert!(!self.x2);
unsafe { read_volatile((self.address + reg as usize) as *const u32) }
}
/// Volatile write of the 32-bit xAPIC register at byte offset `reg`.
/// Only meaningful in xAPIC mode (checked in debug builds).
unsafe fn write(&mut self, reg: u32, value: u32) {
debug_assert!(!self.x2);
unsafe {
write_volatile((self.address + reg as usize) as *mut u32, value);
}
}
/// Returns the APIC id of the calling CPU.
pub fn id(&self) -> ApicId {
ApicId::new(if self.x2 {
unsafe { rdmsr(IA32_X2APIC_APICID) as u32 }
} else {
// In xAPIC mode the id sits in the top byte of the register at 0x20.
unsafe { self.read(0x20) >> 24 }
})
}
/// Returns the raw version register.
pub fn version(&self) -> u32 {
if self.x2 {
unsafe { rdmsr(IA32_X2APIC_VERSION) as u32 }
} else {
unsafe { self.read(0x30) }
}
}
/// Returns the 64-bit interrupt command register (ICR).
pub fn icr(&self) -> u64 {
if self.x2 {
unsafe { rdmsr(IA32_X2APIC_ICR) }
} else {
// xAPIC splits the ICR: high half at 0x310, low half at 0x300.
unsafe { ((self.read(0x310) as u64) << 32) | self.read(0x300) as u64 }
}
}
/// Writes the ICR, spinning before and after the write while bit 12
/// (named `PENDING` here) is set.
pub fn set_icr(&mut self, value: u64) {
if self.x2 {
unsafe {
const PENDING: u32 = 1 << 12;
while (rdmsr(IA32_X2APIC_ICR) as u32) & PENDING == PENDING {
core::hint::spin_loop();
}
wrmsr(IA32_X2APIC_ICR, value);
while (rdmsr(IA32_X2APIC_ICR) as u32) & PENDING == PENDING {
core::hint::spin_loop();
}
}
} else {
unsafe {
const PENDING: u32 = 1 << 12;
while self.read(0x300) & PENDING == PENDING {
core::hint::spin_loop();
}
// The high half is written first, then the low half.
// NOTE(review): presumably because the low-half write is what sends the IPI — confirm.
self.write(0x310, (value >> 32) as u32);
self.write(0x300, value as u32);
while self.read(0x300) & PENDING == PENDING {
core::hint::spin_loop();
}
}
}
}
/// Sends an inter-processor interrupt of the given kind to `apic_id`.
pub fn ipi(&mut self, apic_id: ApicId, kind: IpiKind) {
// The destination field starts at bit 32 in x2APIC mode and bit 56 in xAPIC mode.
let shift = if self.x2 { 32 } else { 56 };
// NOTE(review): 0x40 is OR-ed with the `IpiKind` discriminant to form the
// vector; presumably IPI vectors start at 0x40 — confirm.
self.set_icr((u64::from(apic_id.get()) << shift) | 0x40 | kind as u64);
}
/// Sends a non-maskable interrupt to `apic_id`.
pub fn ipi_nmi(&mut self, apic_id: ApicId) {
let shift = if self.x2 { 32 } else { 56 };
// Bit 14 is the level-assert bit; 0b100 in bits 8..=10 selects NMI delivery.
self.set_icr((u64::from(apic_id.get()) << shift) | (1 << 14) | (0b100 << 8));
}
/// Signals end-of-interrupt by writing 0 to the EOI register.
pub unsafe fn eoi(&mut self) {
unsafe {
if self.x2 {
wrmsr(IA32_X2APIC_EOI, 0);
} else {
self.write(0xB0, 0);
}
}
}
/// Reads the Error Status Register.
pub unsafe fn esr(&mut self) -> u32 {
unsafe {
if self.x2 {
// update the ESR to the current state of the local apic.
wrmsr(IA32_X2APIC_ESR, 0);
// read the updated value
rdmsr(IA32_X2APIC_ESR) as u32
} else {
// Same write-then-read sequence on the memory-mapped register.
self.write(0x280, 0);
self.read(0x280)
}
}
}
/// Reads the LVT timer register.
pub unsafe fn lvt_timer(&mut self) -> u32 {
unsafe {
if self.x2 {
rdmsr(IA32_X2APIC_LVT_TIMER) as u32
} else {
self.read(0x320)
}
}
}
/// Writes the LVT timer register.
pub unsafe fn set_lvt_timer(&mut self, value: u32) {
unsafe {
if self.x2 {
wrmsr(IA32_X2APIC_LVT_TIMER, u64::from(value));
} else {
self.write(0x320, value);
}
}
}
/// Reads the timer initial-count register.
pub unsafe fn init_count(&mut self) -> u32 {
unsafe {
if self.x2 {
rdmsr(IA32_X2APIC_INIT_COUNT) as u32
} else {
self.read(0x380)
}
}
}
/// Writes the timer initial-count register.
pub unsafe fn set_init_count(&mut self, initial_count: u32) {
unsafe {
if self.x2 {
wrmsr(IA32_X2APIC_INIT_COUNT, u64::from(initial_count));
} else {
self.write(0x380, initial_count);
}
}
}
/// Reads the timer current-count register.
pub unsafe fn cur_count(&mut self) -> u32 {
unsafe {
if self.x2 {
rdmsr(IA32_X2APIC_CUR_COUNT) as u32
} else {
self.read(0x390)
}
}
}
/// Reads the timer divide-configuration register.
pub unsafe fn div_conf(&mut self) -> u32 {
unsafe {
if self.x2 {
rdmsr(IA32_X2APIC_DIV_CONF) as u32
} else {
self.read(0x3E0)
}
}
}
/// Writes the timer divide-configuration register.
pub unsafe fn set_div_conf(&mut self, div_conf: u32) {
unsafe {
if self.x2 {
wrmsr(IA32_X2APIC_DIV_CONF, u64::from(div_conf));
} else {
self.write(0x3E0, div_conf);
}
}
}
/// Reads the LVT error register.
pub unsafe fn lvt_error(&mut self) -> u32 {
unsafe {
if self.x2 {
rdmsr(IA32_X2APIC_LVT_ERROR) as u32
} else {
self.read(0x370)
}
}
}
/// Writes the LVT error register.
pub unsafe fn set_lvt_error(&mut self, lvt_error: u32) {
unsafe {
if self.x2 {
wrmsr(IA32_X2APIC_LVT_ERROR, u64::from(lvt_error));
} else {
self.write(0x370, lvt_error);
}
}
}
/// Programs LINT0 (`pin == 0`) or LINT1 (`pin == 1`) for NMI delivery.
/// Any other `pin` value is ignored.
///
/// `flags` carries two 2-bit fields: bits 0..=1 select polarity and bits
/// 2..=3 select trigger mode. Only the value `0b11` of each field sets the
/// corresponding LVT bit.
/// NOTE(review): presumably these are the ACPI MADT interrupt flags (0b11 =
/// active low / level triggered) — confirm against the caller.
pub unsafe fn set_lvt_nmi(&mut self, pin: u8, flags: u16) {
let polarity = match flags & 0b11 {
0b11 => 1 << 13,
_ => 0,
};
let trigger_mode = match (flags >> 2) & 0b11 {
0b11 => 1 << 15,
_ => 0,
};
// 0b100 in bits 8..=10 selects NMI delivery mode.
let lvt_value = (0b100 << 8) | polarity | trigger_mode;
unsafe {
match pin {
0 => {
if self.x2 {
wrmsr(IA32_X2APIC_LVT_LINT0, u64::from(lvt_value));
} else {
self.write(0x350, lvt_value);
}
}
1 => {
if self.x2 {
wrmsr(IA32_X2APIC_LVT_LINT1, u64::from(lvt_value));
} else {
self.write(0x360, lvt_value);
}
}
_ => {}
}
}
}
/// Routes APIC error interrupts to vector 49.
unsafe fn setup_error_int(&mut self) {
unsafe {
let vector = 49u32;
self.set_lvt_error(vector);
}
}
}
/// Operating modes of the local APIC timer, with their raw 2-bit encodings.
// NOTE(review): nothing visible here uses this enum; presumably the values are
// meant for the mode field of the LVT timer register — confirm at the use site.
#[repr(u8)]
pub enum LvtTimerMode {
OneShot = 0b00,
Periodic = 0b01,
TscDeadline = 0b10,
}
@@ -0,0 +1,14 @@
--- src/arch/x86_shared/device/local_apic.rs
+++ src/arch/x86_shared/device/local_apic.rs
@@ -61,9 +61,9 @@
if !self.x2 {
- info!("Detected xAPIC at {:#x}", physaddr.data());
+ debug!("Detected xAPIC at {:#x}", physaddr.data());
self.address = map_device_memory(physaddr, 4096).data();
} else {
- info!("Detected x2APIC");
+ debug!("Detected x2APIC");
}
@@ -4,11 +4,9 @@ pub mod cpu;
pub mod hpet;
pub mod ioapic;
pub mod local_apic;
pub mod msi;
pub mod pic;
pub mod pit;
pub mod serial;
pub mod vector;
#[cfg(feature = "system76_ec_debug")]
pub mod system76_ec;
@@ -25,7 +23,8 @@ pub unsafe fn init() {
}
}
pub unsafe fn init_after_acpi() {
unsafe { ioapic::init() };
// this will disable the IOAPIC if needed.
//ioapic::init(mapper);
}
unsafe fn init_hpet() -> bool {
@@ -1,183 +0,0 @@
// MSI/MSI-X support for x86 — kernel-level message composition and validation
// Cross-referenced from Linux 7.0: arch/x86/kernel/apic/msi.c (391 lines)
use crate::arch::device::local_apic::ApicId;
/// Base of the x86 MSI address window (`0xFEEx_xxxx`).
pub const MSI_ADDRESS_BASE: u64 = 0xFEE0_0000;
/// Address bits that must equal [`MSI_ADDRESS_BASE`] for a valid MSI address.
///
/// Only the low 20 bits vary (destination id in bits 12..=19, redirection hint
/// and destination mode in bits 3 and 2), so every bit above them is checked.
/// That includes the upper 32 bits, which must be zero.
pub const MSI_ADDRESS_MASK: u64 = 0xFFFF_FFFF_FFF0_0000;
/// Address bit 2: logical (rather than physical) destination mode.
const MSI_DEST_MODE_LOGICAL: u64 = 1 << 2;
/// Address bit 3: redirection hint.
const MSI_REDIRECTION_HINT: u64 = 1 << 3;

/// Raw MSI message address.
#[derive(Debug, Clone, Copy)]
pub struct MsiAddress {
    pub raw: u64,
}

/// Raw MSI message data word.
#[derive(Debug, Clone, Copy)]
pub struct MsiData {
    pub raw: u32,
}

/// A complete MSI message: the address to write to and the data to write.
#[derive(Debug, Clone)]
pub struct MsiMessage {
    pub address: MsiAddress,
    pub data: MsiData,
}

impl MsiAddress {
    /// Builds the address that targets `dest_apic_id`, optionally with the
    /// redirection hint and logical destination mode bits set.
    pub fn new(dest_apic_id: u8, redirection_hint: bool, dest_mode_logical: bool) -> Self {
        let mut addr = MSI_ADDRESS_BASE;
        addr |= u64::from(dest_apic_id) << 12;
        if redirection_hint {
            addr |= MSI_REDIRECTION_HINT;
        }
        if dest_mode_logical {
            addr |= MSI_DEST_MODE_LOGICAL;
        }
        Self { raw: addr }
    }

    /// Returns `true` when `addr` lies inside the MSI address window,
    /// whatever destination id and mode bits it carries.
    pub fn validate(addr: u64) -> bool {
        (addr & MSI_ADDRESS_MASK) == MSI_ADDRESS_BASE
    }

    /// Extracts the destination APIC id (address bits 12..=19).
    pub fn dest_apic_id(&self) -> u8 {
        ((self.raw >> 12) & 0xFF) as u8
    }
}

impl MsiData {
    /// Packs the vector (bits 0..=7), delivery mode (bits 8..=10) and trigger
    /// mode (bit 15) into a data word.
    ///
    /// `delivery_mode` and `trigger_mode` are masked to their field widths so
    /// an oversized value cannot spill into a neighbouring field.
    pub fn new(vector: u8, delivery_mode: u8, trigger_mode: u8) -> Self {
        let mut data = u32::from(vector);
        data |= u32::from(delivery_mode & 0x7) << 8;
        data |= u32::from(trigger_mode & 0x1) << 15;
        Self { raw: data }
    }

    /// Interrupt vector (bits 0..=7).
    pub fn vector(&self) -> u8 {
        (self.raw & 0xFF) as u8
    }

    /// Delivery mode (bits 8..=10).
    pub fn delivery_mode(&self) -> u8 {
        ((self.raw >> 8) & 0x7) as u8
    }

    /// Trigger mode (bit 15).
    pub fn trigger_mode(&self) -> u8 {
        ((self.raw >> 15) & 0x1) as u8
    }
}
impl MsiMessage {
    /// Builds a message for `dest` with the given vector, delivery mode and
    /// trigger mode; redirection hint and logical destination mode are off.
    ///
    /// Only the low 8 bits of the APIC id are used.
    pub fn compose(dest: ApicId, vector: u8, delivery_mode: u8, trigger_mode: u8) -> Self {
        Self {
            address: MsiAddress::new(dest.get() as u8, false, false),
            data: MsiData::new(vector, delivery_mode, trigger_mode),
        }
    }

    /// Returns `true` when the address passes [`MsiAddress::validate`] and the
    /// vector lies in `32..255`.
    pub fn validate(&self) -> bool {
        if !MsiAddress::validate(self.address.raw) {
            return false;
        }
        let vector = self.data.vector();
        (32..255).contains(&vector)
    }
}
/// Free-function form of [`MsiAddress::validate`].
pub fn is_valid_msi_address(addr: u64) -> bool {
    let in_window = MsiAddress::validate(addr);
    in_window
}
/// Returns `true` for vectors 32 through 254 inclusive.
pub fn is_valid_msi_vector(vector: u8) -> bool {
    matches!(vector, 32..=254)
}
/// Decoded PCI MSI capability structure.
#[derive(Debug)]
pub struct MsiCapability {
    pub msg_ctl: u16,
    pub msg_addr_lo: u32,
    pub msg_addr_hi: u32,
    pub msg_data: u16,
    pub mask_bits: u32,
    pub pending_bits: u32,
    pub is_64bit: bool,
    pub is_maskable: bool,
    pub multiple_message_capable: u8,
}

impl MsiCapability {
    /// Decodes the capability from its first six dwords and the Message
    /// Control word.
    ///
    /// The layout depends on Message Control bit 7 (64-bit address capable)
    /// and bit 8 (per-vector masking capable):
    ///
    /// | dword | 32-bit       | 64-bit       |
    /// |-------|--------------|--------------|
    /// | 1     | address low  | address low  |
    /// | 2     | data         | address high |
    /// | 3     | mask bits    | data         |
    /// | 4     | pending bits | mask bits    |
    /// | 5     | -            | pending bits |
    ///
    /// Mask and pending bits are reported as 0 when masking is not supported.
    pub fn parse(raw: &[u32; 6], msg_ctl: u16) -> Self {
        let is_64bit = msg_ctl & (1 << 7) != 0;
        let is_maskable = msg_ctl & (1 << 8) != 0;

        // With 64-bit addressing every field after the low address moves up
        // by one dword to make room for the high address.
        let data_index = if is_64bit { 3 } else { 2 };
        let (mask_bits, pending_bits) = if is_maskable {
            (raw[data_index + 1], raw[data_index + 2])
        } else {
            (0, 0)
        };

        Self {
            msg_ctl,
            msg_addr_lo: raw[1],
            msg_addr_hi: if is_64bit { raw[2] } else { 0 },
            msg_data: (raw[data_index] & 0xFFFF) as u16,
            mask_bits,
            pending_bits,
            is_64bit,
            is_maskable,
            // Message Control bits 1..=3.
            multiple_message_capable: ((msg_ctl >> 1) & 0x7) as u8,
        }
    }
}
/// Decoded PCI MSI-X capability structure.
#[derive(Debug)]
pub struct MsixCapability {
    pub msg_ctl: u16,
    pub table_offset: u32,
    pub table_bar: u8,
    pub pba_offset: u32,
    pub pba_bar: u8,
    pub table_size: u16,
}

impl MsixCapability {
    /// Decodes the capability from its three dwords and the Message Control
    /// word.
    ///
    /// Dwords 1 and 2 each hold a BAR index in bits 0..=2 and an 8-byte
    /// aligned offset in the remaining bits (vector table and pending-bit
    /// array respectively). Message Control bits 0..=10 hold the table size
    /// encoded as N-1, so the real entry count is that field plus one.
    pub fn parse(raw: &[u32; 3], msg_ctl: u16) -> Self {
        Self {
            msg_ctl,
            table_offset: raw[1] & !0x7,
            table_bar: (raw[1] & 0x7) as u8,
            pba_offset: raw[2] & !0x7,
            pba_bar: (raw[2] & 0x7) as u8,
            // The size field starts at bit 0. Shifting first would drop its
            // low bit and pull in reserved bit 11.
            table_size: (msg_ctl & 0x7FF) + 1,
        }
    }
}
// Unit tests for MSI message composition, address validation and capability parsing.
#[cfg(test)]
mod tests {
use super::*;
// A composed message must validate and return every field it was built from.
#[test]
fn test_compose_message() {
let msg = MsiMessage::compose(ApicId::new(3), 48, 0b101, 1);
assert!(msg.validate());
assert_eq!(msg.address.dest_apic_id(), 3);
assert_eq!(msg.data.vector(), 48);
assert_eq!(msg.data.delivery_mode(), 0b101);
assert_eq!(msg.data.trigger_mode(), 1);
}
// An address outside the MSI window is rejected; the window base is accepted.
#[test]
fn test_invalid_address() {
assert!(!is_valid_msi_address(0xDEAD_BEEF));
assert!(is_valid_msi_address(0xFEE0_0000));
}
// An all-zero capability decodes as 32-bit and not maskable.
#[test]
fn test_msi_parse() {
let raw = [0u32; 6];
let cap = MsiCapability::parse(&raw, 0);
assert!(!cap.is_64bit);
assert!(!cap.is_maskable);
}
}
@@ -1,53 +0,0 @@
use crate::cpu_set::LogicalCpuId;
/// Number of allocatable vectors; they map to interrupt vectors 32 and up.
const VECTOR_COUNT: usize = 224;
/// Allocation bitmap: 7 words of 32 bits, one bit per allocatable vector.
/// A set bit means the vector is in use (`allocate_vector` sets bits,
/// `free_vector` clears them).
static VECTORS: [core::sync::atomic::AtomicU32; 7] = [
core::sync::atomic::AtomicU32::new(0),
core::sync::atomic::AtomicU32::new(0),
core::sync::atomic::AtomicU32::new(0),
core::sync::atomic::AtomicU32::new(0),
core::sync::atomic::AtomicU32::new(0),
core::sync::atomic::AtomicU32::new(0),
core::sync::atomic::AtomicU32::new(0),
];
/// Claims the lowest free interrupt vector and returns it (32 or above).
///
/// Banks are scanned in order. Within a bank the lowest clear bit is claimed
/// with a compare-and-swap, retrying whenever another CPU changed the word in
/// the meantime. Returns `None` when every bank is full. The `_cpu` argument
/// is currently unused: the bitmap is shared by all CPUs.
pub fn allocate_vector(_cpu: LogicalCpuId) -> Option<u8> {
    use core::sync::atomic::Ordering;

    for (bank, slot) in VECTORS.iter().enumerate() {
        let mut snapshot = slot.load(Ordering::Acquire);
        // A word of all ones has no free bit left; move on to the next bank.
        while snapshot != u32::MAX {
            let free = snapshot.trailing_ones() as usize;
            let bit = 1u32 << free;
            let claim = slot.compare_exchange_weak(
                snapshot,
                snapshot | bit,
                Ordering::AcqRel,
                Ordering::Acquire,
            );
            match claim {
                Err(current) => snapshot = current,
                Ok(_) => {
                    let index = (bank * 32 + free) as u8;
                    if index >= VECTOR_COUNT as u8 {
                        // Out of range: release the bit that was just claimed.
                        slot.fetch_and(!bit, Ordering::Release);
                        return None;
                    }
                    return Some(index + 32);
                }
            }
        }
    }
    None
}
/// Marks `vector` as free again.
///
/// Vectors outside the allocatable range (below 32, or at or above
/// `32 + VECTOR_COUNT`) are ignored. The `_cpu` argument is currently unused.
pub fn free_vector(_cpu: LogicalCpuId, vector: u8) {
    let allocatable = vector >= 32 && (vector as usize) < 32 + VECTOR_COUNT;
    if !allocatable {
        return;
    }
    let index = usize::from(vector - 32);
    let (bank, offset) = (index / 32, index % 32);
    let clear_mask = !(1u32 << offset);
    VECTORS[bank].fetch_and(clear_mask, core::sync::atomic::Ordering::Release);
}
@@ -192,15 +192,6 @@ impl ProcessorControlRegion {
}
}
/// Reports that the Processor Control Region could not be allocated, then
/// spins forever. Never returns.
#[cold]
fn halt_pcr_init() -> ! {
println!("FATAL: failed to allocate physical memory for Processor Control Region");
println!("Processor startup cannot continue. Halting.");
// Busy-wait forever, issuing the CPU spin-loop hint on each pass.
loop {
core::hint::spin_loop();
}
}
pub unsafe fn pcr() -> *mut ProcessorControlRegion {
unsafe {
// Primitive benchmarking of RDFSBASE and RDGSBASE in userspace, appears to indicate that
@@ -384,10 +375,7 @@ pub fn allocate_and_init_pcr(
.next_power_of_two()
.trailing_zeros();
let pcr_frame = match crate::memory::allocate_p2frame(alloc_order) {
Some(frame) => frame,
None => halt_pcr_init(),
};
let pcr_frame = crate::memory::allocate_p2frame(alloc_order).expect("failed to allocate PCR");
let pcr_ptr = RmmA::phys_to_virt(pcr_frame.base()).data() as *mut ProcessorControlRegion;
unsafe { core::ptr::write(pcr_ptr, ProcessorControlRegion::new_partial_init(cpu_id)) };
@@ -78,15 +78,6 @@ static INIT_BSP_IDT: SyncUnsafeCell<Idt> = SyncUnsafeCell::new(Idt::new());
pub(crate) static IDTS: RwLock<HashMap<LogicalCpuId, &'static mut Idt>> =
RwLock::new(HashMap::with_hasher(DefaultHashBuilder::new()));
/// Reports that the backup interrupt stack could not be allocated, then spins
/// forever. Never returns.
#[cold]
fn halt_idt_init() -> ! {
println!("FATAL: failed to allocate physical pages for backup interrupt stack");
println!("Interrupt setup cannot continue. Halting.");
// Busy-wait forever, issuing the CPU spin-loop hint on each pass.
loop {
core::hint::spin_loop();
}
}
#[inline]
pub fn is_reserved(cpu_id: LogicalCpuId, index: u8) -> bool {
if cpu_id == LogicalCpuId::BSP {
@@ -110,8 +101,6 @@ pub fn set_reserved(cpu_id: LogicalCpuId, index: u8, reserved: bool) {
}
pub fn available_irqs_iter(cpu_id: LogicalCpuId) -> impl Iterator<Item = u8> + 'static {
let count = (32..=254).filter(|&index| !is_reserved(cpu_id, index)).count();
info!("available_irqs_iter: cpu_id={} count={}", cpu_id.get(), count);
(32..=254).filter(move |&index| !is_reserved(cpu_id, index))
}
@@ -172,10 +161,8 @@ pub fn allocate_and_init_idt(cpu_id: LogicalCpuId) -> *mut Idt {
.or_insert_with(|| Box::leak(Box::new(Idt::new())));
use crate::memory::{RmmA, RmmArch};
let frames = match crate::memory::allocate_p2frame(4) {
Some(frames) => frames,
None => halt_idt_init(),
};
let frames = crate::memory::allocate_p2frame(4)
.expect("failed to allocate pages for backup interrupt stack");
// Physical pages are mapped linearly. So is the linearly mapped virtual memory.
let base_address = RmmA::phys_to_virt(frames.base());
@@ -1,5 +1,3 @@
use core::sync::atomic::{AtomicBool, Ordering};
use syscall::Exception;
use x86::irq::PageFaultError;
@@ -12,22 +10,6 @@ use crate::{
syscall::flag::*,
};
static NMI_IN_PROGRESS: AtomicBool = AtomicBool::new(false);
/// Writes `bytes` straight to the serial port at I/O base 0x3F8, one at a time.
///
/// Before each byte the function polls the register at base + 5 until bit 5
/// is set.
/// NOTE(review): on a 16550-style UART that is the line status register's
/// "transmit holding register empty" bit — confirm for the target hardware.
///
/// # Safety
///
/// Performs raw port I/O with no locking; the caller must tolerate interleaving
/// with any other user of the same port.
unsafe fn nmi_raw_serial_write(bytes: &[u8]) {
    use crate::syscall::io::{Io, Pio};

    const TRANSMIT_READY: u8 = 1 << 5;

    let mut data_port = Pio::<u8>::new(0x3F8);
    let status_port = Pio::<u8>::new(0x3F8 + 5);
    for byte in bytes.iter().copied() {
        while status_port.read() & TRANSMIT_READY == 0 {
            core::hint::spin_loop();
        }
        data_port.write(byte);
    }
}
interrupt_stack!(divide_by_zero, |stack| {
println!("Divide by zero");
stack.trace();
@@ -73,35 +55,9 @@ interrupt_stack!(non_maskable, @paranoid, |stack| {
#[cfg(not(all(target_arch = "x86_64", feature = "profiling")))]
{
if NMI_IN_PROGRESS.swap(true, Ordering::SeqCst) {
return;
}
unsafe {
nmi_raw_serial_write(b"Non-maskable interrupt\n");
nmi_raw_serial_write(b" RIP: ");
#[cfg(target_arch = "x86")]
let instruction_pointer = u64::from(stack.iret.eip);
#[cfg(target_arch = "x86_64")]
let instruction_pointer = stack.iret.rip;
let mut buf = [0u8; 19];
buf[0] = b'0';
buf[1] = b'x';
for i in 0..16 {
let nibble = ((instruction_pointer >> (60 - i * 4)) & 0xF) as u8;
buf[2 + i] = if nibble < 10 {
b'0' + nibble
} else {
b'a' + nibble - 10
};
}
buf[18] = b'\n';
nmi_raw_serial_write(&buf);
}
NMI_IN_PROGRESS.store(false, Ordering::SeqCst);
// TODO: This will likely deadlock
println!("Non-maskable interrupt");
stack.dump();
}
});
@@ -28,8 +28,6 @@ pub mod pti;
/// Initialization and start function
pub mod start;
pub mod sleep;
/// Stop function
pub mod stop;
@@ -1,712 +0,0 @@
use alloc::{sync::Arc, vec::Vec};
use core::{
ptr::NonNull,
str::FromStr,
sync::atomic::{AtomicU32, Ordering},
};
use acpi_ext::{
aml::{namespace::AmlName, object::Object, Interpreter},
registers::FixedRegisters,
sdt::{facs::Facs, fadt::Fadt, SdtHeader},
AcpiTables, Handle, Handler, PhysicalMapping,
};
use spin::Mutex;
use syscall::error::{Error, EINVAL, EIO};
use x86::{segmentation::SegmentSelector, task, Ring};
use crate::{
acpi::ACPI_ROOT_INFO,
arch::interrupt,
memory::{
round_down_pages, round_up_pages, KernelMapper, Page, PageFlags, PhysicalAddress, RmmA,
RmmArch, VirtualAddress, PAGE_SIZE,
},
syscall::io::{Io, Pio},
};
// NOTE(review): the three ACPI_SLP_* values presumably describe the SLP_TYP
// field (bits 10..=12) and the SLP_EN bit (bit 13) of the ACPI PM1 control
// register — confirm against the code that writes it.
const ACPI_SLP_TYP_SHIFT: u16 = 10;
const ACPI_SLP_TYP_MASK: u16 = 0x1C00;
const ACPI_SLP_EN: u16 = 1 << 13;
// Physical address associated with the wake trampoline.
// NOTE(review): presumably where WAKE_TRAMPOLINE_DATA is copied — confirm.
const WAKE_TRAMPOLINE_PHYS: usize = 0x8000;
const SLEEP_RETURN_OK: usize = 0;
// Raw contents of the `s3_wakeup` file found in the build's OUT_DIR (x86_64 only).
#[cfg(target_arch = "x86_64")]
static WAKE_TRAMPOLINE_DATA: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/s3_wakeup"));
/// A 16-bit limit followed by a 64-bit base, packed with no padding (10 bytes).
// NOTE(review): presumably the in-memory operand format of the GDTR/IDTR
// store/load instructions — confirm at the use site.
#[repr(C, packed)]
#[derive(Clone, Copy, Debug, Default)]
struct DescriptorTableRegister {
limit: u16,
base: u64,
}
/// A 4096-byte, 64-byte-aligned buffer for saved FPU state.
// NOTE(review): presumably the save area for FXSAVE/XSAVE — confirm at the use site.
#[repr(C, align(64))]
#[derive(Clone, Copy, Debug)]
struct FpuState {
    bytes: [u8; 4096],
}

impl Default for FpuState {
    /// Returns an all-zero buffer.
    fn default() -> Self {
        FpuState { bytes: [0u8; 4096] }
    }
}
/// Sleep states this module distinguishes.
// NOTE(review): presumably ACPI S3 (suspend to RAM) and S5 (soft off) — confirm.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum SleepState {
S3,
S5,
}
/// Reasons a sleep request can fail.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum SleepError {
    UnsupportedArch,
    MissingAcpi,
    MissingFadt,
    MissingFacs,
    MissingSleepObject,
    InvalidSleepObject,
    UnsupportedPmControl,
    UnsupportedAmlOperation,
    SleepDidNotEnter,
}

impl SleepError {
    /// Maps the error to an errno value: the "missing ..." variants and
    /// `UnsupportedAmlOperation` become `EIO`, everything else `EINVAL`.
    fn code(self) -> usize {
        let errno = match self {
            Self::MissingAcpi
            | Self::MissingFadt
            | Self::MissingFacs
            | Self::MissingSleepObject
            | Self::UnsupportedAmlOperation => EIO,
            Self::UnsupportedArch
            | Self::InvalidSleepObject
            | Self::UnsupportedPmControl
            | Self::SleepDidNotEnter => EINVAL,
        };
        errno as usize
    }

    /// Lossy inverse of [`Self::code`]: `EINVAL` yields `InvalidSleepObject`,
    /// any other value yields `MissingAcpi`.
    fn from_code(code: usize) -> Self {
        if code as i32 == EINVAL {
            Self::InvalidSleepObject
        } else {
            Self::MissingAcpi
        }
    }
}
/// Snapshot of CPU state: stack pointers, control registers, flags, descriptor
/// table registers, selected MSR values and the FPU buffer.
// NOTE(review): presumably captured before entering S3 and restored on wake —
// the code that fills and consumes it is outside this view; confirm.
#[derive(Clone, Copy, Debug, Default)]
struct SavedCpuContext {
entry_rsp: usize,
runtime_rsp: usize,
facs_address: usize,
cr0: usize,
cr2: usize,
cr3: usize,
cr4: usize,
rflags: usize,
gdtr: DescriptorTableRegister,
idtr: DescriptorTableRegister,
efer: u64,
fs_base: u64,
gs_base: u64,
kernel_gs_base: u64,
fpu: FpuState,
}
// Lock-protected slot for a saved CPU context; `None` until one is stored.
static SAVED_CONTEXT: Mutex<Option<SavedCpuContext>> = Mutex::new(None);
// Counter starting at 1.
// NOTE(review): presumably hands out ids for AML mutexes — confirm at the use site.
static AML_MUTEX_IDS: AtomicU32 = AtomicU32::new(1);
/// A pair of 16-bit sleep-type values.
// NOTE(review): presumably the SLP_TYPa / SLP_TYPb values for the two PM1
// control blocks — confirm where it is built.
#[derive(Clone, Copy, Debug)]
struct SleepTypeData {
a: u16,
b: u16,
}
/// Zero-sized handler type through which ACPI code reaches kernel memory and I/O.
#[derive(Clone, Copy)]
struct KernelAcpiHandler;
impl KernelAcpiHandler {
/// Maps every page overlapping `[physical_address, physical_address + size)`
/// into the kernel's linear mapping.
///
/// Returns the virtual pointer corresponding to `physical_address` and the
/// page-rounded number of bytes mapped. Panics if a page cannot be mapped.
fn map_range(physical_address: usize, size: usize) -> (*mut u8, usize) {
// Widen the request to whole pages: start at the page containing the address...
let map_base = round_down_pages(physical_address);
let map_offset = physical_address - map_base;
// ...and cover the in-page offset plus the requested size.
let mapped_length = round_up_pages(size + map_offset);
// SAFETY: The ACPI interpreter only requests firmware-described physical regions.
unsafe {
let mut mapper = KernelMapper::lock_rw();
for page_index in 0..mapped_length / PAGE_SIZE {
let (_, flush) = mapper
.map_linearly(
PhysicalAddress::new(map_base + page_index * PAGE_SIZE),
PageFlags::new(),
)
.expect("failed to linearly map ACPI physical region");
// Flush each page's mapping as soon as it is installed.
flush.flush();
}
}
let virtual_base = RmmA::phys_to_virt(PhysicalAddress::new(map_base)).data();
// Re-apply the in-page offset so the pointer matches the requested address.
((virtual_base + map_offset) as *mut u8, mapped_length)
}
}
impl Handler for KernelAcpiHandler {
unsafe fn map_physical_region<T>(&self, physical_address: usize, size: usize) -> PhysicalMapping<Self, T> {
let (virtual_start, mapped_length) = Self::map_range(physical_address, size);
PhysicalMapping {
physical_start: physical_address,
virtual_start: NonNull::new(virtual_start.cast::<T>())
.expect("expected mapped ACPI virtual address to be non-null"),
region_length: size,
mapped_length,
handler: *self,
}
}
fn unmap_physical_region<T>(_region: &PhysicalMapping<Self, T>) {}
fn read_u8(&self, address: usize) -> u8 {
// SAFETY: AML system-memory accesses are byte-addressable firmware regions.
unsafe { core::ptr::read_volatile(RmmA::phys_to_virt(PhysicalAddress::new(address)).data() as *const u8) }
}
fn read_u16(&self, address: usize) -> u16 {
// SAFETY: AML system-memory accesses are word-addressable firmware regions.
unsafe {
core::ptr::read_volatile(RmmA::phys_to_virt(PhysicalAddress::new(address)).data() as *const u16)
}
}
fn read_u32(&self, address: usize) -> u32 {
// SAFETY: AML system-memory accesses are dword-addressable firmware regions.
unsafe {
core::ptr::read_volatile(RmmA::phys_to_virt(PhysicalAddress::new(address)).data() as *const u32)
}
}
fn read_u64(&self, address: usize) -> u64 {
// SAFETY: AML system-memory accesses are qword-addressable firmware regions.
unsafe {
core::ptr::read_volatile(RmmA::phys_to_virt(PhysicalAddress::new(address)).data() as *const u64)
}
}
fn write_u8(&self, address: usize, value: u8) {
// SAFETY: AML system-memory accesses are byte-addressable firmware regions.
unsafe {
core::ptr::write_volatile(RmmA::phys_to_virt(PhysicalAddress::new(address)).data() as *mut u8, value)
}
}
fn write_u16(&self, address: usize, value: u16) {
// SAFETY: AML system-memory accesses are word-addressable firmware regions.
unsafe {
core::ptr::write_volatile(
RmmA::phys_to_virt(PhysicalAddress::new(address)).data() as *mut u16,
value,
)
}
}
fn write_u32(&self, address: usize, value: u32) {
// SAFETY: AML system-memory accesses are dword-addressable firmware regions.
unsafe {
core::ptr::write_volatile(
RmmA::phys_to_virt(PhysicalAddress::new(address)).data() as *mut u32,
value,
)
}
}
fn write_u64(&self, address: usize, value: u64) {
// SAFETY: AML system-memory accesses are qword-addressable firmware regions.
unsafe {
core::ptr::write_volatile(
RmmA::phys_to_virt(PhysicalAddress::new(address)).data() as *mut u64,
value,
)
}
}
fn read_io_u8(&self, port: u16) -> u8 {
Pio::<u8>::new(port).read()
}
fn read_io_u16(&self, port: u16) -> u16 {
Pio::<u16>::new(port).read()
}
fn read_io_u32(&self, port: u16) -> u32 {
Pio::<u32>::new(port).read()
}
fn write_io_u8(&self, port: u16, value: u8) {
Pio::<u8>::new(port).write(value)
}
fn write_io_u16(&self, port: u16, value: u16) {
Pio::<u16>::new(port).write(value)
}
fn write_io_u32(&self, port: u16, value: u32) {
Pio::<u32>::new(port).write(value)
}
fn read_pci_u8(&self, _address: acpi_ext::PciAddress, _offset: u16) -> u8 {
0
}
fn read_pci_u16(&self, _address: acpi_ext::PciAddress, _offset: u16) -> u16 {
0
}
fn read_pci_u32(&self, _address: acpi_ext::PciAddress, _offset: u16) -> u32 {
0
}
fn write_pci_u8(&self, _address: acpi_ext::PciAddress, _offset: u16, _value: u8) {}
fn write_pci_u16(&self, _address: acpi_ext::PciAddress, _offset: u16, _value: u16) {}
fn write_pci_u32(&self, _address: acpi_ext::PciAddress, _offset: u16, _value: u32) {}
fn nanos_since_boot(&self) -> u64 {
0
}
fn stall(&self, microseconds: u64) {
for _ in 0..(microseconds.saturating_mul(64)) {
core::hint::spin_loop();
}
}
fn sleep(&self, milliseconds: u64) {
for _ in 0..(milliseconds.saturating_mul(64_000)) {
core::hint::spin_loop();
}
}
fn create_mutex(&self) -> Handle {
Handle(AML_MUTEX_IDS.fetch_add(1, Ordering::Relaxed))
}
fn acquire(&self, _mutex: Handle, _timeout: u16) -> Result<(), acpi_ext::aml::AmlError> {
Ok(())
}
fn release(&self, _mutex: Handle) {}
}
/// Returns the absolute AML namespace path of the `_Sx` package describing `state`.
fn sleep_state_name(state: SleepState) -> &'static str {
    const S3_OBJECT: &str = "\\_S3";
    const S5_OBJECT: &str = "\\_S5";

    match state {
        SleepState::S3 => S3_OBJECT,
        SleepState::S5 => S5_OBJECT,
    }
}
/// Places a sleep-type value into the SLP_TYP field position of a PM1 control word.
///
/// Values up to 0x7 are treated as a raw field value and shifted into place by
/// `ACPI_SLP_TYP_SHIFT`; anything larger is treated as already positioned and is only masked
/// with `ACPI_SLP_TYP_MASK`.
fn encode_sleep_type(value: u16) -> u16 {
    let already_positioned = value > 0x7;
    if already_positioned {
        return value & ACPI_SLP_TYP_MASK;
    }
    value << ACPI_SLP_TYP_SHIFT
}
/// Maps one AML table (DSDT or SSDT) and feeds its bytecode to `interpreter`.
///
/// `length` is the full table length including the `SdtHeader`; the AML stream handed to the
/// interpreter is everything after the header. A `length` shorter than the header is rejected
/// with `SleepError::UnsupportedAmlOperation` instead of underflowing the stream length.
///
/// # Safety
/// `phys_address` and `length` must describe a firmware-provided table that is valid to map and
/// read for `length` bytes.
unsafe fn load_aml_table(
    handler: KernelAcpiHandler,
    interpreter: &Interpreter<KernelAcpiHandler>,
    phys_address: usize,
    length: usize,
) -> Result<(), SleepError> {
    let header_len = core::mem::size_of::<SdtHeader>();
    // A corrupt length must not wrap around into a huge slice.
    let Some(stream_len) = length.checked_sub(header_len) else {
        return Err(SleepError::UnsupportedAmlOperation);
    };
    // SAFETY: The caller vouches for the table location; the stream starts right after the header
    // and `stream_len` keeps it inside the mapped `length` bytes.
    unsafe {
        let mapping = handler.map_physical_region::<SdtHeader>(phys_address, length);
        let stream = core::slice::from_raw_parts(
            mapping.virtual_start.as_ptr().byte_add(header_len) as *const u8,
            stream_len,
        );
        interpreter
            .load_table(stream)
            .map_err(|_| SleepError::UnsupportedAmlOperation)?;
    }
    Ok(())
}

/// Builds everything needed to evaluate sleep objects and program the sleep registers.
///
/// Returns, in order: the fixed ACPI registers derived from the FADT, a mapping of the FACS, and
/// an AML interpreter that has loaded the DSDT followed by every SSDT.
///
/// # Errors
/// * `MissingAcpi` - no ACPI root info was recorded, or the root table could not be parsed.
/// * `MissingFadt` - the FADT (or the DSDT it points to) is unavailable.
/// * `UnsupportedPmControl` - the fixed registers could not be constructed.
/// * `MissingFacs` - the FADT does not yield a FACS address.
/// * `UnsupportedAmlOperation` - a table has an implausible length or fails to load.
fn load_interpreter() -> Result<(
    Arc<FixedRegisters<KernelAcpiHandler>>,
    PhysicalMapping<KernelAcpiHandler, Facs>,
    Interpreter<KernelAcpiHandler>,
), SleepError> {
    let root = *ACPI_ROOT_INFO.get().ok_or(SleepError::MissingAcpi)?;
    let handler = KernelAcpiHandler;
    // SAFETY: ACPI root info is captured from the firmware-provided, already validated root table.
    let tables = unsafe {
        AcpiTables::from_rsdt(handler, root.revision, root.root_sdt_address.data())
            .map_err(|_| SleepError::MissingAcpi)?
    };
    let fadt = tables.find_table::<Fadt>().ok_or(SleepError::MissingFadt)?;
    let registers = Arc::new(
        FixedRegisters::new(&fadt, handler).map_err(|_| SleepError::UnsupportedPmControl)?,
    );
    let facs_address = fadt.facs_address().map_err(|_| SleepError::MissingFacs)?;
    // SAFETY: The FADT-supplied FACS address is used exactly as described by the ACPI spec.
    let facs = unsafe { handler.map_physical_region::<Facs>(facs_address, core::mem::size_of::<Facs>()) };
    // SAFETY: The AML interpreter only needs an owned mapping of the same firmware FACS table.
    let interpreter_facs = unsafe {
        handler.map_physical_region::<Facs>(facs_address, core::mem::size_of::<Facs>())
    };
    let dsdt = tables.dsdt().map_err(|_| SleepError::MissingFadt)?;
    let interpreter = Interpreter::new(handler, dsdt.revision, Arc::clone(&registers), Some(interpreter_facs));
    // SAFETY: Table addresses and lengths come straight from the parsed firmware tables.
    unsafe {
        // The DSDT must be loaded first; SSDTs extend the namespace it creates.
        load_aml_table(handler, &interpreter, dsdt.phys_address, dsdt.length as usize)?;
        for ssdt in tables.ssdts() {
            load_aml_table(handler, &interpreter, ssdt.phys_address, ssdt.length as usize)?;
        }
    }
    Ok((registers, facs, interpreter))
}
/// Evaluates the `\_Sx` object for `state` and converts it into PM1 control SLP_TYP values.
///
/// The object must be a package whose first two elements are integers (SLP_TYPa and SLP_TYPb).
///
/// # Errors
/// * `MissingSleepObject` - the name cannot be built or the object cannot be evaluated.
/// * `InvalidSleepObject` - the object is not a package, has fewer than two elements, an element
///   is not an integer, or an integer does not fit in 16 bits (previously such values were
///   silently truncated before being written to hardware).
fn sleep_type_data_from_interpreter(
    interpreter: &Interpreter<KernelAcpiHandler>,
    state: SleepState,
) -> Result<SleepTypeData, SleepError> {
    let name = AmlName::from_str(sleep_state_name(state)).map_err(|_| SleepError::MissingSleepObject)?;
    let object = interpreter
        .evaluate(name, Vec::new())
        .map_err(|_| SleepError::MissingSleepObject)?;
    let Object::Package(package) = &*object else {
        return Err(SleepError::InvalidSleepObject);
    };
    let (Some(typa_object), Some(typb_object)) = (package.first(), package.get(1)) else {
        return Err(SleepError::InvalidSleepObject);
    };
    let (Object::Integer(typa), Object::Integer(typb)) = (&**typa_object, &**typb_object) else {
        return Err(SleepError::InvalidSleepObject);
    };
    // Reject values that cannot be a PM1 control field instead of truncating them.
    let typa = u16::try_from(*typa).map_err(|_| SleepError::InvalidSleepObject)?;
    let typb = u16::try_from(*typb).map_err(|_| SleepError::InvalidSleepObject)?;
    Ok(SleepTypeData {
        a: encode_sleep_type(typa),
        b: encode_sleep_type(typb),
    })
}
/// Loads a fresh interpreter and looks up the SLP_TYP values for `state`.
///
/// The registers and FACS mapping produced along the way are not needed here and are dropped.
fn sleep_type_data(state: SleepState) -> Result<SleepTypeData, SleepError> {
    load_interpreter().and_then(|(_registers, _facs, interpreter)| {
        sleep_type_data_from_interpreter(&interpreter, state)
    })
}
/// Copies the S3 wake stub to physical `WAKE_TRAMPOLINE_PHYS` and patches its parameter slots.
///
/// * `stack_rsp` - stack pointer value stored in the stub's first qword slot.
/// * `cr3` - page-table base stored in the stub's second qword slot.
///
/// The third slot receives the address of `resume_from_s3_trampoline`. The page is mapped
/// at a virtual address equal to its physical address for the duration of the copy and is
/// unmapped again before returning.
///
/// Panics if the page cannot be mapped or unmapped.
///
/// NOTE(review): the mapping is removed before the system sleeps. If the wake stub turns on
/// paging while still executing from this low page, the page tables it loads must map that
/// page - confirm against the stub in `s3_wakeup.asm`.
fn install_wake_trampoline(stack_rsp: usize, cr3: usize) {
// Virtual page and physical frame share the same numeric address (identity mapping).
let trampoline_page = Page::containing_address(VirtualAddress::new(WAKE_TRAMPOLINE_PHYS));
let trampoline_frame = PhysicalAddress::new(WAKE_TRAMPOLINE_PHYS);
// SAFETY: The 0x8000 low-memory trampoline page is reserved by the kernel for bootstrap stubs.
// NOTE(review): the second tuple element (physical address of the active table) is computed
// but discarded by the `_` pattern.
let (result, _) = unsafe {
let mut mapper = KernelMapper::lock_rw();
let result = mapper
.map_phys(
trampoline_page.start_address(),
trampoline_frame,
PageFlags::new().execute(true).write(true),
)
.expect("failed to map S3 wake trampoline page");
(result, mapper.table().phys().data())
};
// Make the new mapping visible before writing through it.
result.flush();
// Byte-wise volatile copy of the assembled stub into the page.
for (index, value) in WAKE_TRAMPOLINE_DATA.iter().enumerate() {
// SAFETY: The trampoline page is mapped writable at the same virtual address as the physical page.
unsafe {
core::ptr::write_volatile((WAKE_TRAMPOLINE_PHYS as *mut u8).add(index), *value);
}
}
// SAFETY: The wake trampoline layout reserves three qword fields immediately after the jump.
unsafe {
// Slots start 8 bytes into the page: stack pointer, page table, 64-bit entry point.
let stack_slot = (WAKE_TRAMPOLINE_PHYS + 8) as *mut u64;
let page_table_slot = stack_slot.add(1);
let code_slot = stack_slot.add(2);
stack_slot.write(stack_rsp as u64);
page_table_slot.write(cr3 as u64);
#[expect(clippy::fn_to_numeric_cast)]
code_slot.write(resume_from_s3_trampoline as usize as u64);
}
// SAFETY: The trampoline mapping is no longer needed once the physical page has been populated.
let (_frame, _, flush) = unsafe {
KernelMapper::lock_rw()
.unmap_phys(trampoline_page.start_address())
.expect("failed to unmap S3 wake trampoline page")
};
flush.flush();
}
/// Stores the current GDTR and IDTR contents into `context.gdtr` and `context.idtr`.
fn save_descriptor_tables(context: &mut SavedCpuContext) {
    // SAFETY: SGDT/SIDT only read the current CPU descriptor-table registers into the provided storage.
    unsafe {
        core::arch::asm!(
            "sgdt [{gdtr}]",
            "sidt [{idtr}]",
            gdtr = in(reg) &mut context.gdtr,
            idtr = in(reg) &mut context.idtr,
            options(nostack, preserves_flags),
        );
    }
}
/// Dumps the x87/SSE register state into `context.fpu` using `fxsave64`.
///
/// NOTE(review): `fxsave64` needs a suitably aligned save area; the alignment of
/// `context.fpu.bytes` is declared outside this function - confirm.
fn save_fpu_state(context: &mut SavedCpuContext) {
    let save_area = context.fpu.bytes.as_mut_ptr();
    // SAFETY: The kernel owns the current CPU at suspend entry and the FXSAVE buffer is 64-byte aligned.
    unsafe {
        core::arch::asm!("fxsave64 [{area}]", area = in(reg) save_area);
    }
}
/// Reloads the x87/SSE register state from `context.fpu` using `fxrstor64`.
fn restore_fpu_state(context: &SavedCpuContext) {
    let save_area = context.fpu.bytes.as_ptr();
    // SAFETY: The saved FXSAVE image belongs to the same CPU context and matches the restore instruction.
    unsafe {
        core::arch::asm!("fxrstor64 [{area}]", area = in(reg) save_area);
    }
}
/// Captures the CPU state that must be rebuilt after an S3 wake.
///
/// Recorded: `entry_rsp` (as passed in), CR0/CR2/CR3/CR4, RFLAGS, the stack pointer at the time
/// of capture (`runtime_rsp`), the EFER / FS base / GS base / kernel GS base MSRs, the GDTR and
/// IDTR, and the FXSAVE image. All remaining fields keep their `Default` values.
fn save_cpu_context(entry_rsp: usize) -> SavedCpuContext {
    let mut snapshot = SavedCpuContext {
        entry_rsp,
        ..SavedCpuContext::default()
    };
    // SAFETY: Reading control registers and MSRs is required to reconstruct the CPU execution state on wake.
    unsafe {
        // Plain register-to-register moves: no stack use, flags untouched.
        core::arch::asm!(
            "mov {cr0}, cr0",
            "mov {cr2}, cr2",
            "mov {cr3}, cr3",
            "mov {cr4}, cr4",
            "mov {rsp_now}, rsp",
            cr0 = out(reg) snapshot.cr0,
            cr2 = out(reg) snapshot.cr2,
            cr3 = out(reg) snapshot.cr3,
            cr4 = out(reg) snapshot.cr4,
            rsp_now = out(reg) snapshot.runtime_rsp,
            options(nostack, preserves_flags)
        );
        // RFLAGS can only be read via the stack, so this block must not claim `nostack`.
        core::arch::asm!(
            "pushfq",
            "pop {flags}",
            flags = out(reg) snapshot.rflags,
            options(preserves_flags)
        );
        snapshot.efer = x86::msr::rdmsr(x86::msr::IA32_EFER);
        snapshot.fs_base = x86::msr::rdmsr(x86::msr::IA32_FS_BASE);
        snapshot.gs_base = x86::msr::rdmsr(x86::msr::IA32_GS_BASE);
        snapshot.kernel_gs_base = x86::msr::rdmsr(x86::msr::IA32_KERNEL_GSBASE);
    }
    save_descriptor_tables(&mut snapshot);
    save_fpu_state(&mut snapshot);
    snapshot
}
/// Writes `vector` into both waking-vector fields of the mapped FACS.
///
/// The legacy field receives the value truncated to 32 bits, the extended field the value
/// widened to 64 bits. Passing 0 clears both.
fn set_firmware_waking_vector(facs: &mut PhysicalMapping<KernelAcpiHandler, Facs>, vector: usize) {
    let legacy_vector = vector as u32;
    let extended_vector = vector as u64;
    facs.firmware_waking_vector = legacy_vector;
    facs.x_firmware_waking_vector = extended_vector;
}
/// Programs the PM1 control block(s) to enter the sleep state described by `sleep_type`.
///
/// Sequence:
/// 1. Read PM1a (and PM1b when present) and compute the new control words with the old
///    SLP_TYP / SLP_EN bits cleared and the requested SLP_TYP inserted.
/// 2. Write those words to PM1a and PM1b without SLP_EN.
/// 3. Flush the caches with WBINVD.
/// 4. Write the words again with SLP_EN set, PM1a first and PM1b second.
///
/// SLP_EN used to be set on PM1b before the cache flush and before PM1a, so a platform that
/// reacts to PM1b could power down with dirty cache lines. No SLP_EN write now happens until
/// after the flush.
///
/// # Errors
/// Returns `SleepError::UnsupportedPmControl` if any register read or write fails. Returning
/// `Ok(())` means the writes were issued, not that the platform actually went to sleep.
fn write_pm1_control_block(
    registers: &FixedRegisters<KernelAcpiHandler>,
    sleep_type: SleepTypeData,
) -> Result<(), SleepError> {
    let pm1a = &registers.pm1_control_registers.pm1a;
    let current_a = pm1a.read().map_err(|_| SleepError::UnsupportedPmControl)? as u16;
    let armed_a = (current_a & !(ACPI_SLP_TYP_MASK | ACPI_SLP_EN)) | sleep_type.a;

    // PM1b is optional; keep the register and its precomputed control word together.
    let pm1b = match &registers.pm1_control_registers.pm1b {
        Some(register) => {
            let current_b = register.read().map_err(|_| SleepError::UnsupportedPmControl)? as u16;
            let armed_b = (current_b & !(ACPI_SLP_TYP_MASK | ACPI_SLP_EN)) | sleep_type.b;
            Some((register, armed_b))
        }
        None => None,
    };

    // Phase 1: select the sleep type on both blocks without triggering the transition.
    pm1a.write(u64::from(armed_a))
        .map_err(|_| SleepError::UnsupportedPmControl)?;
    if let Some((register, armed_b)) = pm1b {
        register
            .write(u64::from(armed_b))
            .map_err(|_| SleepError::UnsupportedPmControl)?;
    }

    // SAFETY: WBINVD is required here to flush dirty cache lines before firmware powers down the CPU package.
    unsafe {
        core::arch::asm!("wbinvd", options(nostack, preserves_flags));
    }

    // Phase 2: assert SLP_EN, PM1a first. On most platforms execution stops at the first write.
    pm1a.write(u64::from(armed_a | ACPI_SLP_EN))
        .map_err(|_| SleepError::UnsupportedPmControl)?;
    if let Some((register, armed_b)) = pm1b {
        register
            .write(u64::from(armed_b | ACPI_SLP_EN))
            .map_err(|_| SleepError::UnsupportedPmControl)?;
    }
    Ok(())
}
/// Naked entry shim for suspend: forwards `state` plus the caller's stack pointer to
/// `enter_sleep_raw_inner`.
///
/// On entry RSP points at this function's return address. That value is copied into RSI (the
/// second sysv64 integer argument) and control is transferred with a `jmp`, so the inner
/// function returns directly to this function's caller. The saved RSP lets
/// `resume_from_s3_trampoline` later `ret` to the same caller.
///
/// Returns whatever status code `enter_sleep_raw_inner` (or the resume path) leaves in RAX.
///
/// # Safety
/// NOTE(review): the exact preconditions are not spelled out in this file. The resume path
/// returns to the caller through the saved RSP without this shim having saved any
/// callee-saved registers - confirm that callers tolerate this.
#[unsafe(naked)]
unsafe extern "sysv64" fn enter_sleep_raw(state: usize) -> usize {
core::arch::naked_asm!(
// RSI = RSP at entry (address of the return address slot).
"mov rsi, rsp",
// Tail-jump: no new return address is pushed.
"jmp {inner}",
inner = sym enter_sleep_raw_inner,
);
}
/// Performs the actual suspend sequence; reached only via the `enter_sleep_raw` shim.
///
/// * `state` - 3 for S3, 5 for S5; anything else yields `SleepError::InvalidSleepObject`.
/// * `entry_rsp` - stack pointer captured by the shim, stored for the resume path.
///
/// Returns a `SleepError` code. If the platform really sleeps, this function does not return
/// normally; the resume path returns to the original caller instead.
///
/// NOTE(review): when `write_pm1_control_block` fails, this function returns with interrupts
/// still disabled, `SAVED_CONTEXT` still populated and the FACS waking vector still set -
/// nothing visible here undoes them. The same holds for the final `SleepDidNotEnter` return.
/// NOTE(review): the wake trampoline and waking vector are also installed for S5, where no
/// resume is expected.
extern "C" fn enter_sleep_raw_inner(state: usize, entry_rsp: usize) -> usize {
let state = match state {
3 => SleepState::S3,
5 => SleepState::S5,
_ => return SleepError::InvalidSleepObject.code(),
};
// Parse the ACPI tables and load all AML; failures are reported as their error code.
let (registers, mut facs, interpreter) = match load_interpreter() {
Ok(tuple) => tuple,
Err(error) => return error.code(),
};
// Resolve the SLP_TYP values for the requested state from the `\_Sx` package.
let sleep_type = match sleep_type_data_from_interpreter(&interpreter, state) {
Ok(data) => data,
Err(error) => return error.code(),
};
// Snapshot CPU state, then remember which FACS to clean up after wake.
let mut context = save_cpu_context(entry_rsp);
context.facs_address = facs.physical_start;
// The wake stub runs on the stack pointer captured inside `save_cpu_context` and with the
// CR3 that was live at capture time.
install_wake_trampoline(context.runtime_rsp, context.cr3);
set_firmware_waking_vector(&mut facs, WAKE_TRAMPOLINE_PHYS);
{
// Publish the snapshot for `resume_from_s3_trampoline`; the lock is released at block end.
let mut saved = SAVED_CONTEXT.lock();
*saved = Some(context);
}
// SAFETY: Suspend entry must not be interrupted while the wake vector and PM1 control block are being armed.
unsafe {
interrupt::disable();
}
if let Err(error) = write_pm1_control_block(registers.as_ref(), sleep_type) {
return error.code();
}
// SAFETY: The final CLI+HLT sequence is the architectural handoff point after asserting SLP_EN.
unsafe {
core::arch::asm!("cli; hlt", options(nostack));
}
// Only reached if execution continued past the HLT without the platform having slept.
SleepError::SleepDidNotEnter.code()
}
/// 64-bit landing point of the S3 wake trampoline; never returns to its own caller.
///
/// Takes the snapshot out of `SAVED_CONTEXT`, clears the FACS waking vector, restores control
/// registers, descriptor tables, the task register, segment-base MSRs and FPU state, and
/// finally switches to the saved `entry_rsp` and executes `ret` with RAX = 0. The original
/// `enter_sleep_raw` call therefore appears to return 0 to its caller.
///
/// Panics if no saved context is present.
///
/// NOTE(review): only the state listed above is restored. General-purpose callee-saved
/// registers are not part of the visible restore sequence - confirm whether callers rely on them.
extern "C" fn resume_from_s3_trampoline() -> ! {
// Take ownership of the snapshot and release the lock before doing anything else.
let mut saved = SAVED_CONTEXT.lock();
let context = saved.take().expect("S3 wake trampoline resumed without saved CPU context");
drop(saved);
// SAFETY: The saved FACS physical address was captured from the validated FADT during suspend entry.
// (This justification applies to the `unsafe` mapping call inside the `if` below.)
if context.facs_address != 0 {
let mut facs = unsafe {
KernelAcpiHandler.map_physical_region::<Facs>(
context.facs_address,
core::mem::size_of::<Facs>(),
)
};
// Clear both waking-vector fields so a later wake does not reuse the stale address.
set_firmware_waking_vector(&mut facs, 0);
}
// SAFETY: The wake trampoline already switched to the saved kernel CR3 and long mode, so the remaining restores are architectural register state only.
unsafe {
x86::msr::wrmsr(x86::msr::IA32_EFER, context.efer);
core::arch::asm!("mov cr3, {}", in(reg) context.cr3, options(nostack));
core::arch::asm!("mov cr4, {}", in(reg) context.cr4, options(nostack));
core::arch::asm!("mov cr2, {}", in(reg) context.cr2, options(nostack));
core::arch::asm!("mov cr0, {}", in(reg) context.cr0, options(nostack));
// Descriptor tables first, then the task register that indexes into the GDT.
core::arch::asm!("lgdt [{}]", in(reg) &context.gdtr, options(nostack));
core::arch::asm!("lidt [{}]", in(reg) &context.idtr, options(nostack));
task::load_tr(SegmentSelector::new(crate::arch::gdt::GDT_TSS as u16, Ring::Ring0));
x86::msr::wrmsr(x86::msr::IA32_FS_BASE, context.fs_base);
x86::msr::wrmsr(x86::msr::IA32_GS_BASE, context.gs_base);
x86::msr::wrmsr(x86::msr::IA32_KERNEL_GSBASE, context.kernel_gs_base);
}
restore_fpu_state(&context);
// SAFETY: Returning with the original entry stack and RFLAGS completes the suspend call as a successful function return.
unsafe {
core::arch::asm!(
// Switch to the stack slot holding enter_sleep_raw's return address.
"mov rsp, {entry_rsp}",
// Restore RFLAGS through a temporary push/pop just below that slot.
"push {rflags}",
"popfq",
// RAX = 0 is the status code the caller receives.
"xor eax, eax",
"ret",
entry_rsp = in(reg) context.entry_rsp,
rflags = in(reg) context.rflags,
options(noreturn)
);
}
}
/// Requests a transition into the given ACPI sleep state.
///
/// On x86_64 the request is forwarded to `enter_sleep_raw` as the numeric selector 3 (S3) or
/// 5 (S5). `Ok(())` is returned when the raw call reports `SLEEP_RETURN_OK`.
///
/// # Errors
/// * `SleepError::UnsupportedArch` on every architecture other than x86_64.
/// * Otherwise the error decoded from the raw status via `SleepError::from_code`.
pub fn enter_sleep_state(state: SleepState) -> core::result::Result<(), SleepError> {
    #[cfg(not(target_arch = "x86_64"))]
    {
        let _ = state;
        return Err(SleepError::UnsupportedArch);
    }
    #[cfg(target_arch = "x86_64")]
    {
        let selector = match state {
            SleepState::S3 => 3,
            SleepState::S5 => 5,
        };
        // SAFETY: The selector is one of the two values the raw entry point understands.
        // NOTE(review): further preconditions of `enter_sleep_raw` are not documented here.
        let status = unsafe { enter_sleep_raw(selector) };
        if status != SLEEP_RETURN_OK {
            return Err(SleepError::from_code(status));
        }
        Ok(())
    }
}
/// Lists the sleep states that can be requested, one per line.
///
/// S5 is always listed; S3 is listed in front of it only when its sleep-type data can be
/// resolved. Each call loads a fresh interpreter through `sleep_type_data`.
pub fn available_sleep_states() -> &'static [u8] {
    match sleep_type_data(SleepState::S3) {
        Ok(_) => b"S3\nS5\n",
        Err(_) => b"S5\n",
    }
}
pub fn trigger_sleep_request(request: &str) -> Result<(), Error> {
match request.trim() {
"S3" => enter_sleep_state(SleepState::S3).map_err(|_| Error::new(EIO)),
"S5" => enter_sleep_state(SleepState::S5).map_err(|_| Error::new(EIO)),
_ => Err(Error::new(EINVAL)),
}
}
@@ -82,15 +82,6 @@ extern "C" fn kstart() {
/// The entry to Rust, all things must be initialized
unsafe extern "C" fn start(args_ptr: *const KernelArgs, stack_end: usize) -> ! {
unsafe {
// EARLY CANARY: write 'R' to COM1 before any kernel init.
// This proves the serial hardware works and the kernel reached Rust entry.
// If this character appears but "RedBear OS starting..." does not,
// the hang is in args_ptr.read(), serial::init(), or graphical_debug::init().
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
{
core::arch::asm!("out dx, al", in("dx") 0x3F8u16, in("al") b'R', options(nostack, preserves_flags));
}
let bootstrap = {
let args = args_ptr.read();
@@ -100,49 +91,27 @@ unsafe extern "C" fn start(args_ptr: *const KernelArgs, stack_end: usize) -> ! {
// Set up graphical debug
graphical_debug::init(args.env());
// SECOND CANARY: write 'S' to COM1 after serial init.
// If 'R' appears but 'S' does not, the hang is in serial::init() or graphical_debug::init().
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
{
core::arch::asm!("out dx, al", in("dx") 0x3F8u16, in("al") b'S', options(nostack, preserves_flags));
}
info!("RedBear OS starting...");
info!("Redox OS starting...");
args.print();
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
{ core::arch::asm!("out dx, al", in("dx") 0x3F8u16, in("al") b'1', options(nostack, preserves_flags)); }
// Set up GDT
gdt::init_bsp(stack_end);
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
{ core::arch::asm!("out dx, al", in("dx") 0x3F8u16, in("al") b'2', options(nostack, preserves_flags)); }
// Set up IDT
idt::init_bsp();
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
{ core::arch::asm!("out dx, al", in("dx") 0x3F8u16, in("al") b'3', options(nostack, preserves_flags)); }
// Initialize RMM
#[cfg(target_arch = "x86")]
crate::startup::memory::init(&args, Some(0x100000), Some(0x40000000));
#[cfg(target_arch = "x86_64")]
crate::startup::memory::init(&args, Some(0x100000), None);
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
{ core::arch::asm!("out dx, al", in("dx") 0x3F8u16, in("al") b'4', options(nostack, preserves_flags)); }
// Initialize paging
paging::init();
#[cfg(target_arch = "x86_64")]
crate::arch::alternative::early_init(true);
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
{ core::arch::asm!("out dx, al", in("dx") 0x3F8u16, in("al") b'5', options(nostack, preserves_flags)); }
// Set up syscall instruction
interrupt::syscall::init();
@@ -152,9 +121,6 @@ unsafe extern "C" fn start(args_ptr: *const KernelArgs, stack_end: usize) -> ! {
// Activate memory logging
crate::log::init();
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
{ core::arch::asm!("out dx, al", in("dx") 0x3F8u16, in("al") b'6', options(nostack, preserves_flags)); }
// Initialize miscellaneous processor features
#[cfg(target_arch = "x86_64")]
crate::arch::misc::init(LogicalCpuId::BSP);
@@ -162,9 +128,6 @@ unsafe extern "C" fn start(args_ptr: *const KernelArgs, stack_end: usize) -> ! {
// Initialize devices
device::init();
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
{ core::arch::asm!("out dx, al", in("dx") 0x3F8u16, in("al") b'7', options(nostack, preserves_flags)); }
// Read ACPI tables, starts APs
if cfg!(feature = "acpi") {
crate::acpi::init(args.acpi_rsdp());
@@ -1,110 +0,0 @@
; ACPI S3 wake trampoline
; compiled with nasm by build.rs, copied to physical 0x8000 before S3 entry
;
; Layout: a short jump padded to 8 bytes, followed by three qword parameter slots
; (.stack, .page_table, .code) that the kernel fills in before suspending.
; Flow: 16-bit entry -> load CR3 -> configure CR0/CR4 -> load GDT -> enable long mode
; and paging -> far jump into 64-bit code -> load stack -> jump to the address in .code.
ORG 0x8000
SECTION .text
USE16
trampoline:
jmp short startup_wake
; Pad so that the parameter slots begin exactly 8 bytes into the page.
times 8 - ($ - trampoline) nop
.stack: dq 0
.page_table: dq 0
.code: dq 0
startup_wake:
cli
; Flat zero-based segments and a zeroed stack pointer for the 16-bit phase.
xor ax, ax
mov ds, ax
mov es, ax
mov ss, ax
mov sp, 0
; Only the low 32 bits of the .page_table slot are loaded into CR3 here.
mov edi, [trampoline.page_table]
mov cr3, edi
; CR0: clear bits 2 and 3 (EM, TS), set bits 1 and 5 (MP, NE).
mov eax, cr0
and al, 11110011b
or al, 00100010b
mov cr0, eax
; CR4: set bits 9 (OSFXSR), 7 (PGE), 5 (PAE) and 4 (PSE).
mov eax, cr4
or eax, 1 << 9 | 1 << 7 | 1 << 5 | 1 << 4
mov cr4, eax
fninit
lgdt [gdtr]
; EFER (MSR 0xC0000080): set bits 11 (NXE) and 8 (LME).
mov ecx, 0xC0000080
rdmsr
or eax, 1 << 11 | 1 << 8
wrmsr
; CR0: set bits 31 (PG), 16 (WP) and 0 (PE) - paging and protected mode on together.
mov ebx, cr0
or ebx, 1 << 31 | 1 << 16 | 1
mov cr0, ebx
; Far jump loads CS with the 64-bit code selector.
jmp gdt.kernel_code:long_mode_wake
USE64
long_mode_wake:
; Load every data segment register with the kernel data selector.
mov rax, gdt.kernel_data
mov ds, rax
mov es, rax
mov fs, rax
mov gs, rax
mov ss, rax
; Switch to the kernel-provided stack and continue at the kernel-provided entry point.
mov rsp, [trampoline.stack]
mov rax, [trampoline.code]
jmp rax
; Field layout of one 8-byte GDT descriptor.
struc GDTEntry
.limitl resw 1
.basel resw 1
.basem resb 1
.attribute resb 1
.flags__limith resb 1
.baseh resb 1
endstruc
; Bit values for the descriptor attribute byte.
attrib:
.present equ 1 << 7
.user equ 1 << 4
.code equ 1 << 3
.writable equ 1 << 1
; Bit values for the descriptor flags nibble.
flags:
.long_mode equ 1 << 5
; Operand for LGDT: 16-bit limit followed by the 64-bit base of the table below.
; NOTE(review): the limit is written as `gdt.end + 1`; a GDTR limit is conventionally
; table size minus one - confirm this is intentional.
gdtr:
dw gdt.end + 1
dq gdt
; Minimal GDT: null descriptor, 64-bit code segment, writable data segment.
gdt:
.null equ $ - gdt
dq 0
.kernel_code equ $ - gdt
istruc GDTEntry
at GDTEntry.limitl, dw 0
at GDTEntry.basel, dw 0
at GDTEntry.basem, db 0
at GDTEntry.attribute, db attrib.present | attrib.user | attrib.code
at GDTEntry.flags__limith, db flags.long_mode
at GDTEntry.baseh, db 0
iend
.kernel_data equ $ - gdt
istruc GDTEntry
at GDTEntry.limitl, dw 0
at GDTEntry.basel, dw 0
at GDTEntry.basem, db 0
at GDTEntry.attribute, db attrib.present | attrib.user | attrib.writable
at GDTEntry.flags__limith, db 0
at GDTEntry.baseh, db 0
iend
.end equ $ - gdt
@@ -4,10 +4,16 @@ use crate::{
percpu::PercpuBlock,
syscall::FloatRegisters,
};
use core::{mem::offset_of, ptr};
use core::{mem::offset_of, ptr, sync::atomic::AtomicBool};
use spin::Once;
use syscall::{EnvRegisters, Result};
/// This must be used by the kernel to ensure that context switches are done atomically
/// Compare and exchange this to true when beginning a context switch on any CPU
/// The `Context::switch_to` function will set it back to false, allowing other CPUs to switch
/// This must be done, as no locks can be held on the stack during switch
pub static CONTEXT_SWITCH_LOCK: AtomicBool = AtomicBool::new(false);
// 512 bytes for registers, extra bytes for fpcr and fpsr
pub const KFX_ALIGN: usize = 16;
@@ -2,11 +2,13 @@ use crate::{
arch::interrupt::InterruptStack, context::context::Kstack, memory::RmmA, percpu::PercpuBlock,
syscall::FloatRegisters,
};
use core::mem::offset_of;
use core::{mem::offset_of, sync::atomic::AtomicBool};
use rmm::{Arch, VirtualAddress};
use spin::Once;
use syscall::{error::*, EnvRegisters};
pub static CONTEXT_SWITCH_LOCK: AtomicBool = AtomicBool::new(false);
pub const KFX_ALIGN: usize = 16;
#[derive(Clone, Debug, Default)]
@@ -1,4 +1,4 @@
use core::mem::offset_of;
use core::{mem::offset_of, sync::atomic::AtomicBool};
use rmm::{Arch, VirtualAddress};
use spin::Once;
use syscall::{error::*, EnvRegisters};
@@ -14,6 +14,12 @@ use crate::{
syscall::FloatRegisters,
};
/// This must be used by the kernel to ensure that context switches are done atomically
/// Compare and exchange this to true when beginning a context switch on any CPU
/// The `Context::switch_to` function will set it back to false, allowing other CPUs to switch
/// This must be done, as no locks can be held on the stack during switch
pub static CONTEXT_SWITCH_LOCK: AtomicBool = AtomicBool::new(false);
const ST_RESERVED: u128 = 0xFFFF_FFFF_FFFF_0000_0000_0000_0000_0000;
pub const KFX_ALIGN: usize = 16;
@@ -1,5 +1,6 @@
use core::{
ptr::{addr_of, addr_of_mut},
sync::atomic::AtomicBool,
};
use crate::syscall::FloatRegisters;
@@ -11,6 +12,12 @@ use spin::Once;
use syscall::{error::*, EnvRegisters};
use x86::msr;
/// This must be used by the kernel to ensure that context switches are done atomically
/// Compare and exchange this to true when beginning a context switch on any CPU
/// The `Context::switch_to` function will set it back to false, allowing other CPUs to switch
/// This must be done, as no locks can be held on the stack during switch
pub static CONTEXT_SWITCH_LOCK: AtomicBool = AtomicBool::new(false);
const ST_RESERVED: u128 = 0xFFFF_FFFF_FFFF_0000_0000_0000_0000_0000;
#[cfg(cpu_feature_never = "xsave")]
@@ -148,8 +148,6 @@ pub struct Context {
pub euid: u32,
pub egid: u32,
pub pid: usize,
/// Supplementary group IDs for access control decisions.
pub groups: Vec<u32>,
// See [`PreemptGuard`]
//
@@ -206,7 +204,6 @@ impl Context {
euid: 0,
egid: 0,
pid: 0,
groups: Vec::new(),
#[cfg(feature = "syscall_debug")]
syscall_debug_info: crate::syscall::debug::SyscallDebugInfo::default(),
@@ -482,7 +479,6 @@ impl Context {
uid: self.euid,
gid: self.egid,
pid: self.pid,
groups: self.groups.clone(),
}
}
}
+5 -52
View File
@@ -4,7 +4,7 @@ use crate::{
event,
scheme::{self, SchemeId},
sync::{CleanLockToken, RwLock, L6},
syscall::error::{Error, Result, ESTALE},
syscall::error::Result,
};
use alloc::sync::Arc;
use syscall::{schemev2::NewFdFlags, RwFlags, O_APPEND, O_NONBLOCK};
@@ -18,7 +18,6 @@ pub struct FileDescription {
pub offset: u64,
/// The scheme that this file refers to
pub scheme: SchemeId,
pub scheme_generation: Option<u64>,
/// The number the scheme uses to refer to this file
pub number: usize,
/// The flags passed to open or fcntl(SETFL)
@@ -33,52 +32,6 @@ bitflags! {
}
}
impl FileDescription {
pub fn with_generation(
scheme: SchemeId,
scheme_generation: Option<u64>,
number: usize,
offset: u64,
flags: u32,
internal_flags: InternalFlags,
) -> Self {
Self {
offset,
scheme,
scheme_generation,
number,
flags,
internal_flags,
}
}
pub fn new(
scheme: SchemeId,
number: usize,
offset: u64,
flags: u32,
internal_flags: InternalFlags,
token: &mut CleanLockToken,
) -> Self {
Self::with_generation(
scheme,
Some(scheme::current_scheme_generation(token.token(), scheme)),
number,
offset,
flags,
internal_flags,
)
}
pub fn get_scheme(&self, token: &mut CleanLockToken) -> Result<scheme::KernelSchemes> {
if let Some(expected_generation) = self.scheme_generation
&& expected_generation != scheme::current_scheme_generation(token.token(), self.scheme)
{
return Err(Error::new(ESTALE));
}
scheme::get_scheme(token.token(), self.scheme)
}
pub fn rw_flags(&self, rw: RwFlags) -> u32 {
let mut ret = self.flags & !(O_NONBLOCK | O_APPEND) as u32;
if rw.contains(RwFlags::APPEND) {
@@ -123,7 +76,7 @@ impl FileDescription {
pub fn try_close(self, token: &mut CleanLockToken) -> Result<()> {
event::unregister_file(self.scheme, self.number, token);
let scheme = self.get_scheme(token)?;
let scheme = scheme::get_scheme(token.token(), self.scheme)?;
scheme.close(self.number, token)
}
@@ -132,12 +85,12 @@ impl FileDescription {
impl FileDescriptor {
pub fn close(self, token: &mut CleanLockToken) -> Result<()> {
{
let (desc, number, internal_flags) = {
let (scheme_id, number, internal_flags) = {
let desc = self.description.read(token.token());
(*desc, desc.number, desc.internal_flags)
(desc.scheme, desc.number, desc.internal_flags)
};
if internal_flags.contains(InternalFlags::NOTIFY_ON_NEXT_DETACH) {
let scheme = desc.get_scheme(token)?;
let scheme = scheme::get_scheme(token.token(), scheme_id)?;
scheme.detach(number, token)?;
}
}
@@ -64,13 +64,14 @@ impl UnmapResult {
return Ok(());
};
let (scheme, number) = {
let desc = *description.read(token.token());
(desc.get_scheme(token)?, desc.number)
let (scheme_id, number) = {
let desc = description.write(token.token());
(desc.scheme, desc.number)
};
let funmap_result = scheme
.kfunmap(number, base_offset, self.size, self.flags, token);
let scheme_opt = scheme::get_scheme(token.token(), scheme_id);
let funmap_result = scheme_opt
.and_then(|scheme| scheme.kfunmap(number, base_offset, self.size, self.flags, token));
if let Ok(fd) = Arc::try_unwrap(description) {
fd.into_inner().try_close(token)?;
@@ -2686,13 +2687,20 @@ fn correct_inner<'l>(
// XXX: This is cheating, but guaranteed we won't deadlock because we've dropped addr_space_guard
let mut token = unsafe { CleanLockToken::new() };
let desc = *file_ref.description.read(token.token());
let scheme = desc.get_scheme(&mut token).map_err(|_| PfError::Segv)?;
let scheme_number = desc.number;
let user_inner = match scheme {
KernelSchemes::User(user) => user.inner,
_ => return Err(PfError::Segv),
let (scheme_id, scheme_number) = {
let desc = &file_ref.description.read(token.token());
(desc.scheme, desc.number)
};
let user_inner = scheme::get_scheme(token.token(), scheme_id)
.ok()
.and_then(|s| {
if let KernelSchemes::User(user) = s {
Some(user.inner)
} else {
None
}
})
.ok_or(PfError::Segv)?;
let offset = file_ref.base_offset as u64 + (pages_from_grant_start * PAGE_SIZE) as u64;
user_inner
@@ -14,8 +14,8 @@ use crate::{
memory::{RmmA, RmmArch, TableKind},
percpu::PercpuBlock,
sync::{
ArcRwLockWriteGuard, CleanLockToken, LockToken, McsMutex, McsMutexGuard, Mutex,
MutexGuard, RwLock, RwLockReadGuard, RwLockWriteGuard, L0, L1, L2, L4,
ArcRwLockWriteGuard, CleanLockToken, LockToken, Mutex, MutexGuard, RwLock, RwLockReadGuard,
RwLockWriteGuard, L0, L1, L2, L4,
},
syscall::error::Result,
};
@@ -74,12 +74,10 @@ pub use self::arch::empty_cr3;
// the context file descriptors.
static CONTEXTS: RwLock<L2, BTreeSet<ContextRef>> = RwLock::new(BTreeSet::new());
// Actual context store for the scheduler — uses MCS fair spinlock to
// eliminate cache-line bouncing under multi-CPU contention.
static RUN_CONTEXTS: McsMutex<L1, RunContextData> = McsMutex::new(RunContextData::new());
// Actual context store for the scheduler
static RUN_CONTEXTS: Mutex<L1, RunContextData> = Mutex::new(RunContextData::new());
// Context that has been pushed out from RUN_CONTEXTS after being idle.
// Uses regular Mutex (lower contention; wakeup_contexts uses try_lock).
// Context that has been pushed out from RUN_CONTEXTS after being idle
static IDLE_CONTEXTS: Mutex<L2, VecDeque<WeakContextRef>> = Mutex::new(VecDeque::new());
pub struct RunContextData {
@@ -115,7 +113,7 @@ pub fn idle_contexts_try(
IDLE_CONTEXTS.try_lock(token)
}
pub fn run_contexts(token: LockToken<'_, L0>) -> McsMutexGuard<'_, L1, RunContextData> {
pub fn run_contexts(token: LockToken<'_, L0>) -> MutexGuard<'_, L1, RunContextData> {
RUN_CONTEXTS.lock(token)
}
@@ -15,7 +15,7 @@ use crate::{
use alloc::{sync::Arc, vec::Vec};
use core::{
cell::{Cell, RefCell},
mem,
hint, mem,
sync::atomic::Ordering,
};
use syscall::PtraceFlags;
@@ -26,11 +26,6 @@ enum UpdateResult {
Blocked,
}
/// Default number of PIT ticks before triggering a context switch.
/// At ~2.25 ms per tick, 3 ticks ≈ 6.75 ms timeslice.
/// Configurable per-CPU via `ContextSwitchPercpu::preempt_interval`.
const DEFAULT_PREEMPT_INTERVAL: usize = 3;
// A simple geometric series where value[i] ~= value[i - 1] * 1.25
const SCHED_PRIO_TO_WEIGHT: [usize; 40] = [
88761, 71755, 56483, 46273, 36291, 29154, 23254, 18705, 14949, 11916, 9548, 7620, 6100, 4904,
@@ -95,15 +90,13 @@ struct SwitchResultInner {
///
/// The function also calls the signal handler after switching contexts.
pub fn tick(token: &mut CleanLockToken) {
let percpu = PercpuBlock::current();
let ticks_cell = &percpu.switch_internals.pit_ticks;
let ticks_cell = &PercpuBlock::current().switch_internals.pit_ticks;
let new_ticks = ticks_cell.get() + 1;
ticks_cell.set(new_ticks);
// Trigger a context switch when the per-CPU preempt interval is reached.
let interval = percpu.switch_internals.preempt_interval.get();
if new_ticks >= interval {
// Trigger a context switch after every 3 ticks (approx. 6.75 ms).
if new_ticks >= 3 {
switch(token);
crate::context::signal::signal_handler(token);
}
@@ -127,10 +120,7 @@ pub unsafe extern "C" fn switch_finish_hook() {
crate::arch::stop::emergency_reset();
}
}
PercpuBlock::current()
.switch_internals
.in_context_switch
.set(false);
arch::CONTEXT_SWITCH_LOCK.store(false, Ordering::SeqCst);
crate::percpu::switch_arch_hook();
}
}
@@ -160,15 +150,16 @@ pub fn switch(token: &mut CleanLockToken) -> SwitchResult {
//set PIT Interrupt counter to 0, giving each process same amount of PIT ticks
percpu.switch_internals.pit_ticks.set(0);
// Acquire the per-CPU context switch flag. Each CPU can only be in one context
// switch at a time. The per-context write locks provide cross-CPU safety; this
// flag catches re-entrant switches on the same CPU (a kernel bug).
debug_assert!(
!percpu.switch_internals.in_context_switch.get(),
"context switch re-entry on CPU {}",
percpu.cpu_id
);
percpu.switch_internals.in_context_switch.set(true);
// Acquire the global lock to ensure exclusive access during context switch and avoid
// issues that would be caused by the unsafe operations below
// TODO: Better memory orderings?
while arch::CONTEXT_SWITCH_LOCK
.compare_exchange_weak(false, true, Ordering::SeqCst, Ordering::Relaxed)
.is_err()
{
hint::spin_loop();
percpu.maybe_handle_tlb_shootdown();
}
// Lock the previous context.
let prev_context_lock = crate::context::current();
@@ -176,8 +167,8 @@ pub fn switch(token: &mut CleanLockToken) -> SwitchResult {
let mut prev_context_guard = unsafe { prev_context_lock.write_arc() };
if !prev_context_guard.is_preemptable() {
// Unset per-CPU context switch flag
percpu.switch_internals.in_context_switch.set(false);
// Unset global lock
arch::CONTEXT_SWITCH_LOCK.store(false, Ordering::SeqCst);
// Pretend to have finished switching, so CPU is not idled
return SwitchResult::Switched;
@@ -301,8 +292,8 @@ pub fn switch(token: &mut CleanLockToken) -> SwitchResult {
SwitchResult::Switched
}
_ => {
// No target was found, unset per-CPU context switch flag and return
percpu.switch_internals.in_context_switch.set(false);
// No target was found, unset global lock and return
arch::CONTEXT_SWITCH_LOCK.store(false, Ordering::SeqCst);
percpu.stats.set_state(cpu_stats::CpuState::Idle);
@@ -361,7 +352,6 @@ fn wakeup_contexts(token: &mut CleanLockToken, switch_time: u128) -> Vec<(usize,
}
/// This is the scheduler function which currently utilises Deficit Weighted Round Robin Scheduler
/// with NUMA-aware context selection preference.
fn select_next_context(
token: &mut CleanLockToken,
percpu: &PercpuBlock,
@@ -387,10 +377,6 @@ fn select_next_context(
let total_contexts: usize = contexts_list.iter().map(|q| q.len()).sum();
let mut skipped_contexts = 0;
// NUMA-aware selection: remember cross-node fallback candidate.
let my_numa_node = percpu.numa_node.get();
let mut cross_node_fallback: Option<(usize, ArcContextLockWriteGuard)> = None;
'priority: loop {
i = (i + 1) % 40;
total_iters += 1;
@@ -455,44 +441,9 @@ fn select_next_context(
// Is this context runnable on this CPU?
let sw = unsafe { update_runnable(&mut next_context_guard, cpu_id, switch_time) };
if let UpdateResult::CanSwitch = sw {
// NUMA-aware selection: check if this context's last CPU was on the same node.
let same_node = if my_numa_node != u8::MAX {
next_context_guard.cpu_id
.map(|cid| {
crate::percpu::get_for_cpu(cid)
.map(|p| p.numa_node.get() == my_numa_node)
.unwrap_or(false)
})
.unwrap_or(true) // New context (no last CPU) — treat as same node
} else {
true // No NUMA info — treat all as same node
};
if same_node {
// Cache-warm: select immediately
percpu.current_prio.set(next_context_guard.prio);
next_context_guard_opt = Some(next_context_guard);
balance[i] -= SCHED_PRIO_TO_WEIGHT[20];
break 'priority;
} else {
// Cross-node candidate: save as fallback, keep scanning for same-node
if cross_node_fallback.is_none() {
// Cache the priority and balance for later
cross_node_fallback =
Some((next_context_guard.prio, next_context_guard));
balance[i] -= SCHED_PRIO_TO_WEIGHT[20];
// Don't break — keep looking for a same-node context
continue;
} else {
// Already have a cross-node fallback; push this one back
contexts.push_back(next_context_ref);
skipped_contexts += 1;
if skipped_contexts >= total_contexts {
break 'priority;
}
continue;
}
}
next_context_guard_opt = Some(next_context_guard);
balance[i] -= SCHED_PRIO_TO_WEIGHT[20];
break 'priority;
} else {
if matches!(sw, UpdateResult::Blocked) {
idle_contexts(token.token()).push_back(next_context_ref);
@@ -507,15 +458,6 @@ fn select_next_context(
}
}
}
// If we found a cross-node fallback but no same-node context, use it
if next_context_guard_opt.is_none() {
if let Some((prio, guard)) = cross_node_fallback {
percpu.current_prio.set(prio);
next_context_guard_opt = Some(guard);
}
}
percpu.balance.set(balance);
percpu.last_queue.set(i);
@@ -523,10 +465,7 @@ fn select_next_context(
// Send the old process to the back of the line (if it is still runnable)
let prev_ctx = WeakContextRef(Arc::downgrade(&prev_context_lock));
if prev_context_guard.status.is_runnable() {
let raw_prio = prev_context_guard.prio;
let prio = percpu.effective_prio(raw_prio);
// Clear PI donation — previous context is being re-queued
percpu.pi_donated_prio.store(u32::MAX, Ordering::Relaxed);
let prio = prev_context_guard.prio;
contexts_list[prio].push_back(prev_ctx);
} else {
idle_contexts(token.token()).push_back(prev_ctx);
@@ -538,8 +477,7 @@ fn select_next_context(
return Ok(Some(next_context_guard));
} else {
if !was_idle && !Arc::ptr_eq(&prev_context_lock, &idle_context) {
// Switching to idle context — cache lowest priority
percpu.current_prio.set(39);
// We switch into the idle context
Ok(Some(unsafe { idle_context.write_arc() }))
} else {
// We found no other process to run.
@@ -556,13 +494,6 @@ pub struct ContextSwitchPercpu {
switch_result: Cell<Option<SwitchResultInner>>,
switch_time: Cell<u128>,
pit_ticks: Cell<usize>,
/// Per-CPU context switch flag. Set to true during a context switch on this CPU.
/// Replaced the global CONTEXT_SWITCH_LOCK to eliminate cross-CPU serialization.
in_context_switch: Cell<bool>,
/// Number of PIT ticks before triggering a context switch.
/// Default: 3 (≈6.75 ms). Lower values improve interactive responsiveness;
/// higher values improve throughput for batch/compute workloads.
preempt_interval: Cell<usize>,
current_ctxt: RefCell<Option<Arc<ContextLock>>>,
@@ -577,8 +508,6 @@ impl ContextSwitchPercpu {
switch_result: Cell::new(None),
switch_time: Cell::new(0),
pit_ticks: Cell::new(0),
in_context_switch: Cell::new(false),
preempt_interval: Cell::new(DEFAULT_PREEMPT_INTERVAL),
current_ctxt: RefCell::new(None),
idle_ctxt: RefCell::new(None),
being_sigkilled: Cell::new(false),
+3 -4
View File
@@ -42,18 +42,17 @@ impl core::fmt::Display for LogicalCpuId {
}
#[cfg(target_pointer_width = "64")]
pub const MAX_CPU_COUNT: u32 = 256;
pub const MAX_CPU_COUNT: u32 = 128;
#[cfg(target_pointer_width = "32")]
pub const MAX_CPU_COUNT: u32 = 32;
const SET_WORDS: usize = (MAX_CPU_COUNT / usize::BITS) as usize;
// TODO: Support more than 256 CPUs.
// TODO: Support more than 128 CPUs.
// The maximum number of CPUs on Linux is configurable, and the type for LogicalCpuSet and
// LogicalCpuId may be optimized accordingly. In that case, box the mask if it's larger than some
// base size (probably 256 bytes). AMD EPYC has 128C/256T, Threadripper PRO 96C/192T —
// 256 covers current hardware.
// base size (probably 256 bytes).
#[derive(Debug)]
pub struct LogicalCpuSet([AtomicUsize; SET_WORDS]);
+1 -11
View File
@@ -1,5 +1,5 @@
use alloc::sync::Arc;
use core::sync::atomic::{AtomicU64, AtomicUsize, Ordering};
use core::sync::atomic::{AtomicUsize, Ordering};
use hashbrown::{hash_map::DefaultHashBuilder, HashMap};
use smallvec::SmallVec;
use syscall::data::GlobalSchemes;
@@ -23,7 +23,6 @@ int_like!(EventQueueId, AtomicEventQueueId, usize, AtomicUsize);
pub struct EventQueue {
id: EventQueueId,
queue: WaitQueue<Event>,
pub eventfd: Option<(AtomicU64, bool)>, // (counter, semaphore_mode)
}
impl EventQueue {
@@ -31,15 +30,6 @@ impl EventQueue {
EventQueue {
id,
queue: WaitQueue::new(),
eventfd: None,
}
}
/// Builds an `EventQueue` whose `eventfd` field is populated.
///
/// `initval` seeds the atomic counter and `semaphore` is stored alongside it
/// as the second tuple element (the field's own comment calls it
/// "semaphore_mode").
pub fn new_eventfd(id: EventQueueId, initval: u64, semaphore: bool) -> EventQueue {
EventQueue {
id,
queue: WaitQueue::new(),
// (counter, semaphore_mode)
eventfd: Some((AtomicU64::new(initval), semaphore)),
}
}
-3
View File
@@ -70,9 +70,6 @@ mod log;
/// Memory management
mod memory;
/// NUMA topology
mod numa;
/// Panic
mod panic;
-81
View File
@@ -1,81 +0,0 @@
/// NUMA topology hints for the kernel scheduler.
///
/// NUMA discovery (SRAT/SLIT parsing) is performed during kernel ACPI init
/// (`acpi::init()`). The kernel stores a lightweight copy for O(1) scheduling
/// lookups. If no SRAT is found, `init_default()` creates a single-node topology.
use crate::acpi::srat;
use crate::cpu_set::{LogicalCpuId, LogicalCpuSet};
use core::sync::atomic::{AtomicBool, Ordering};
const MAX_NUMA_NODES: usize = 8;
/// CPU membership record for a single NUMA node.
#[derive(Debug)]
pub struct NumaHint {
/// Node identifier. `init_from_srat` stores the value returned by
/// `srat::numa_node_for_apic`; the single-node fallback stores 0.
pub node_id: u8,
/// Logical CPUs that have been assigned to this node.
pub cpus: LogicalCpuSet,
}
/// Fixed-size table of NUMA nodes plus a one-shot initialization flag.
pub struct NumaTopology {
/// One optional slot per node, at most `MAX_NUMA_NODES`. The initializers in
/// this module place a node at the index equal to its `node_id`.
pub nodes: [Option<NumaHint>; MAX_NUMA_NODES],
/// Swapped to `true` by the first initializer that runs; later initializers
/// observe `true` and return early. Note that the flag is set *before*
/// `nodes` is filled in.
pub initialized: AtomicBool,
}
impl NumaTopology {
    /// Creates a topology with every node slot empty and `initialized` cleared.
    pub const fn new() -> Self {
        // A named constant is needed because `Option<NumaHint>` is not `Copy`,
        // so the array-repeat expression must repeat a `const` item.
        const EMPTY_SLOT: Option<NumaHint> = None;
        Self {
            nodes: [EMPTY_SLOT; MAX_NUMA_NODES],
            initialized: AtomicBool::new(false),
        }
    }

    /// Returns the `node_id` of the first populated node whose CPU set
    /// contains `cpu`, or `None` when no node lists it.
    pub fn node_for_cpu(&self, cpu: LogicalCpuId) -> Option<u8> {
        self.nodes
            .iter()
            .flatten()
            .find(|hint| hint.cpus.contains(cpu))
            .map(|hint| hint.node_id)
    }

    /// Reports whether both CPUs resolve to the same node.
    ///
    /// Two CPUs that are *both* absent from every node compare equal
    /// (`None == None`) and therefore yield `true`.
    pub fn same_node(&self, cpu1: LogicalCpuId, cpu2: LogicalCpuId) -> bool {
        let first = self.node_for_cpu(cpu1);
        let second = self.node_for_cpu(cpu2);
        first == second
    }
}
/// Global NUMA topology table. Written only by the init functions in this
/// module, which gate on `NumaTopology::initialized`.
static mut NUMA_TOPOLOGY: NumaTopology = NumaTopology::new();

/// Returns a shared reference to the global NUMA topology.
pub fn topology() -> &'static NumaTopology {
    // Go through a raw pointer instead of `&NUMA_TOPOLOGY`: taking a direct
    // reference to a `static mut` trips the `static_mut_refs` lint, and the
    // init functions in this module already use `addr_of_mut!` for the same
    // static.
    //
    // SAFETY: the static lives for the whole program, so the pointer is always
    // valid and aligned. NOTE(review): the init functions create a `&mut` to
    // the same static; this is only sound if no reader dereferences the
    // returned reference's `nodes` while initialization is in progress —
    // confirm that init runs before any scheduler lookups.
    unsafe { &*core::ptr::addr_of!(NUMA_TOPOLOGY) }
}
/// Initialize NUMA topology from SRAT data parsed during ACPI init.
///
/// `apic_ids` pairs each APIC id with its logical CPU id. Only the first
/// initializer to run has any effect; subsequent calls return immediately.
/// When SRAT is unavailable, or no listed CPU maps to a usable node, a single
/// node 0 containing all CPUs is installed instead.
pub fn init_from_srat(apic_ids: &[(u32, LogicalCpuId)]) {
    // First caller wins: `swap` returns the previous flag value.
    if topology().initialized.swap(true, Ordering::AcqRel) {
        return;
    }
    if !srat::is_available() {
        init_default_inner();
        return;
    }

    // SAFETY (as far as this file shows): the `initialized` swap above lets
    // only one caller reach this point, so there is a single writer.
    // NOTE(review): concurrent readers via `topology()` are not excluded here —
    // confirm this runs before any other CPU consults the topology.
    unsafe {
        let table = &mut *core::ptr::addr_of_mut!(NUMA_TOPOLOGY);

        for &(apic_id, cpu_id) in apic_ids {
            let Some(node) = srat::numa_node_for_apic(apic_id) else {
                continue;
            };
            let slot = node as usize;
            // CPUs whose node does not fit in the fixed table are left unassigned.
            if slot >= MAX_NUMA_NODES {
                continue;
            }
            table.nodes[slot]
                .get_or_insert_with(|| NumaHint {
                    node_id: node,
                    cpus: LogicalCpuSet::empty(),
                })
                .cpus
                .atomic_set(cpu_id);
        }

        // Nothing matched: fall back to one node that owns every CPU.
        if table.nodes.iter().all(Option::is_none) {
            table.nodes[0] = Some(NumaHint {
                node_id: 0,
                cpus: LogicalCpuSet::all(),
            });
        }
    }

    let node_count = topology().nodes.iter().flatten().count();
    debug!("NUMA: {node_count} node(s) from SRAT");
}
/// Fallback: single-node topology.
///
/// Does nothing if any initializer has already claimed the `initialized` flag.
pub fn init_default() {
    // `swap` hands back the previous value, so `true` means somebody else
    // already initialized (or is initializing) the table.
    let already_initialized = topology().initialized.swap(true, Ordering::AcqRel);
    if !already_initialized {
        init_default_inner();
    }
}
/// Installs node 0 containing all CPUs. Callers are expected to have claimed
/// the `initialized` flag first; this function does not check it.
fn init_default_inner() {
    let single_node = NumaHint {
        node_id: 0,
        cpus: LogicalCpuSet::all(),
    };
    // NOTE(review): relies on the caller's `initialized` swap to guarantee a
    // single writer to the static — confirm no reader is active yet.
    unsafe {
        let table = &mut *core::ptr::addr_of_mut!(NUMA_TOPOLOGY);
        table.nodes[0] = Some(single_node);
    }
    debug!("NUMA: single-node topology (no SRAT)");
}
+7 -184
View File
@@ -4,14 +4,9 @@ use alloc::{
};
use core::{
cell::{Cell, RefCell},
hint,
sync::atomic::{AtomicBool, AtomicPtr, AtomicU32, AtomicU64, Ordering},
sync::atomic::{AtomicBool, AtomicPtr, Ordering},
};
/// Maximum number of pages to flush individually using INVLPG before falling
/// back to a full TLB flush (CR3 reload).
const TLB_RANGE_THRESHOLD: u32 = 32;
use rmm::Arch;
use syscall::PtraceFlags;
@@ -21,7 +16,7 @@ use crate::{
cpu_set::{LogicalCpuId, MAX_CPU_COUNT},
cpu_stats::{CpuStats, CpuStatsData},
ptrace::Session,
sync::{mcs::McsNode, mcs::McsRawLock, CleanLockToken},
sync::CleanLockToken,
syscall::debug::SyscallDebugInfo,
};
@@ -39,38 +34,6 @@ pub struct PercpuBlock {
pub balance: Cell<[usize; 40]>,
pub last_queue: Cell<usize>,
/// Per-CPU MCS node for the scheduler run-queue lock (RUN_CONTEXTS).
pub mcs_sched_node: McsNode,
/// Counts how many times the scheduler MCS lock acquisition was contended.
pub mcs_contention_count: Cell<u64>,
/// TLB shootdown range: start virtual address (page-aligned).
/// Set to 0 for a full flush. Only valid when `wants_tlb_shootdown` is true.
pub tlb_flush_start: AtomicU64,
/// TLB shootdown range: number of pages to invalidate.
pub tlb_flush_count: AtomicU32,
/// Priority inheritance donation. When another CPU is blocked waiting on a
/// lock this CPU holds, the blocked CPU may donate its priority here.
/// `u32::MAX` means no donation; otherwise it's a priority level (0-39).
pub pi_donated_prio: AtomicU32,
/// Cached priority of the currently-running context on this CPU.
/// Set by the scheduler when selecting a new context. Read by the MCS
/// lock during priority donation — avoids acquiring the context RwLock
/// from the spin loop. Default 39 (lowest priority).
pub current_prio: Cell<usize>,
/// NUMA proximity domain for this CPU. Set during ACPI init from SRAT.
/// `u8::MAX` means unknown (no SRAT or APIC ID not listed).
pub numa_node: Cell<u8>,
/// Pointer to the MCS lock this CPU is currently spinning on (for transitive PI).
/// `null` when not waiting on any lock. Set in McsRawLock::acquire() before
/// entering the spin loop, cleared upon acquisition.
pub waiting_on_lock: AtomicPtr<McsRawLock>,
// TODO: Put mailbox queues here, e.g. for TLB shootdown? Just be sure to 128-byte align it
// first to avoid cache invalidation.
pub profiling: Option<&'static crate::profiling::RingBuffer>,
@@ -94,15 +57,6 @@ pub unsafe fn init_tlb_shootdown(id: LogicalCpuId, block: *mut PercpuBlock) {
ALL_PERCPU_BLOCKS[id.get() as usize].store(block, Ordering::Release)
}
/// Get a reference to another CPU's PercpuBlock by logical CPU ID.
///
/// Returns `None` when `id` lies outside `ALL_PERCPU_BLOCKS` or when no block
/// has been registered for that CPU yet (the slot still holds a null pointer).
pub fn get_for_cpu(id: LogicalCpuId) -> Option<&'static PercpuBlock> {
    // Checked lookup: an out-of-range id yields `None` instead of an
    // index-out-of-bounds panic, matching the function's `Option` contract.
    let slot = ALL_PERCPU_BLOCKS.get(id.get() as usize)?;
    // SAFETY: slots are written by `init_tlb_shootdown` with `Release`, paired
    // with the `Acquire` load here. NOTE(review): presumably every stored
    // pointer stays valid for the rest of the kernel's lifetime — confirm
    // against the callers of `init_tlb_shootdown`.
    unsafe { slot.load(Ordering::Acquire).as_ref() }
}
pub fn get_all_stats() -> Vec<(LogicalCpuId, CpuStatsData)> {
let mut res = ALL_PERCPU_BLOCKS
.iter()
@@ -147,148 +101,25 @@ pub fn shootdown_tlb_ipi(target: Option<LogicalCpuId>) {
core::hint::spin_loop();
}
}
// Full flush — clear range info (Release ordering ensures the flag
// swap and these stores are visible to the handler before the IPI).
percpublock.tlb_flush_start.store(0, Ordering::Release);
percpublock.tlb_flush_count.store(0, Ordering::Release);
crate::ipi::ipi_single(crate::ipi::IpiKind::Tlb, percpublock);
} else {
// Broadcast TLB shootdown: set flag on all other CPUs, then send a single
// IPI with "all except self" destination shorthand instead of N individual IPIs.
let my_percpublock = PercpuBlock::current();
for id in 0..crate::cpu_count() {
let target_id = LogicalCpuId::new(id);
if target_id == my_percpublock.cpu_id {
continue;
}
let Some(percpublock) = (unsafe {
ALL_PERCPU_BLOCKS[id as usize]
.load(Ordering::Acquire)
.as_ref()
}) else {
continue;
};
// Wait if this CPU still has a pending shootdown from a previous request
#[expect(clippy::bool_comparison)]
while percpublock
.wants_tlb_shootdown
.swap(true, Ordering::Release)
== true
{
while percpublock.wants_tlb_shootdown.load(Ordering::Relaxed) == true {
my_percpublock.maybe_handle_tlb_shootdown();
hint::spin_loop();
}
}
// Full flush — clear range info (Release ordering)
percpublock.tlb_flush_start.store(0, Ordering::Release);
percpublock.tlb_flush_count.store(0, Ordering::Release);
// TODO: Optimize: use global counter and percpu ack counters, send IPI using
// destination shorthand "all CPUs".
shootdown_tlb_ipi(Some(LogicalCpuId::new(id)));
}
// Single broadcast IPI to all other CPUs using destination shorthand
crate::ipi::ipi(crate::ipi::IpiKind::Tlb, crate::ipi::IpiTarget::Other);
}
}
/// Range-based TLB shootdown IPI. Only invalidates the specified virtual address
/// range using INVLPG per page for ranges up to TLB_RANGE_THRESHOLD pages.
/// Falls back to full flush for larger ranges.
///
/// * `target` — `Some(cpu)` to notify one other CPU (must not be the current
///   CPU; asserted), `None` to notify every other registered CPU.
/// * `start` — first virtual address of the range; rounded down to a 4 KiB
///   boundary before being published.
/// * `count` — number of pages. `0`, anything above `TLB_RANGE_THRESHOLD`, and
///   anything that does not fit in a `u32` all request a full flush.
///
/// Does nothing when the `multi_core` feature is disabled.
pub fn shootdown_tlb_ipi_range(target: Option<LogicalCpuId>, start: usize, count: usize) {
    if cfg!(not(feature = "multi_core")) {
        return;
    }

    let start_aligned = start as u64 & !0xFFF;
    // Checked conversion: a plain `count as u32` would wrap a huge page count
    // (e.g. 2^32 + 1) down to a small one and wrongly select the cheap
    // per-page path, leaving most of the range un-invalidated.
    let range_pages = u32::try_from(count)
        .ok()
        .filter(|pages| (1..=TLB_RANGE_THRESHOLD).contains(pages));

    // Publishes either the page range or the (0, 0) "full flush" marker.
    // NOTE(review): this runs *after* `wants_tlb_shootdown` has been set, so a
    // target that polls the flag before these stores land may read the
    // previous range — confirm whether the IPI ordering makes that benign.
    let set_range = |percpublock: &PercpuBlock| {
        let (flush_start, flush_count) = match range_pages {
            Some(pages) => (start_aligned, pages),
            None => (0, 0),
        };
        percpublock
            .tlb_flush_start
            .store(flush_start, Ordering::Release);
        percpublock
            .tlb_flush_count
            .store(flush_count, Ordering::Release);
    };

    if let Some(target) = target {
        let my_percpublock = PercpuBlock::current();
        assert_ne!(target, my_percpublock.cpu_id);

        let Some(percpublock) = (unsafe {
            ALL_PERCPU_BLOCKS[target.get() as usize]
                .load(Ordering::Acquire)
                .as_ref()
        }) else {
            return;
        };

        // Claim the target's shootdown flag. While an earlier request is still
        // pending, keep servicing our own shootdowns so two CPUs targeting
        // each other cannot deadlock.
        #[expect(clippy::bool_comparison)]
        while percpublock.wants_tlb_shootdown.swap(true, Ordering::Release) == true {
            while percpublock.wants_tlb_shootdown.load(Ordering::Relaxed) == true {
                my_percpublock.maybe_handle_tlb_shootdown();
                hint::spin_loop();
            }
        }

        set_range(percpublock);
        crate::ipi::ipi_single(crate::ipi::IpiKind::Tlb, percpublock);
    } else {
        let my_percpublock = PercpuBlock::current();
        for id in 0..crate::cpu_count() {
            let target_id = LogicalCpuId::new(id);
            if target_id == my_percpublock.cpu_id {
                continue;
            }
            // CPUs without a registered block are skipped.
            let Some(percpublock) = (unsafe {
                ALL_PERCPU_BLOCKS[id as usize]
                    .load(Ordering::Acquire)
                    .as_ref()
            }) else {
                continue;
            };

            #[expect(clippy::bool_comparison)]
            while percpublock.wants_tlb_shootdown.swap(true, Ordering::Release) == true {
                while percpublock.wants_tlb_shootdown.load(Ordering::Relaxed) == true {
                    my_percpublock.maybe_handle_tlb_shootdown();
                    hint::spin_loop();
                }
            }

            set_range(percpublock);
        }
        // One broadcast IPI after every flag and range has been published.
        crate::ipi::ipi(crate::ipi::IpiKind::Tlb, crate::ipi::IpiTarget::Other);
    }
}
impl PercpuBlock {
/// Return the effective scheduling priority, accounting for priority inheritance.
/// Lower number = higher priority (0-39 range).
///
/// Yields the donated priority when it is numerically lower than
/// `context_prio`; otherwise `context_prio` is returned unchanged.
pub fn effective_prio(&self, context_prio: usize) -> usize {
    match self.pi_donated_prio.load(Ordering::Relaxed) {
        donated if donated < context_prio as u32 => donated as usize,
        _ => context_prio,
    }
}
pub fn maybe_handle_tlb_shootdown(&self) {
#[expect(clippy::bool_comparison)]
if self.wants_tlb_shootdown.swap(false, Ordering::Relaxed) == false {
return;
}
let start = self.tlb_flush_start.load(Ordering::Acquire);
let count = self.tlb_flush_count.load(Ordering::Acquire);
if start != 0 && count > 0 && count <= TLB_RANGE_THRESHOLD {
// Range-based flush using INVLPG per page — cheaper than full CR3 reload.
for i in 0..count {
let addr = start + (i as u64) * 4096;
crate::memory::RmmA::invalidate(rmm::VirtualAddress::new(addr as usize));
}
} else {
// Full TLB flush (CR3 reload) for large ranges or global shootdowns.
crate::memory::RmmA::invalidate_all();
}
// TODO: Finer-grained flush
crate::memory::RmmA::invalidate_all();
if let Some(addrsp) = &*self.current_addrsp.borrow() {
addrsp.tlb_ack.fetch_add(1, Ordering::Release);
@@ -358,14 +189,6 @@ impl PercpuBlock {
wants_tlb_shootdown: AtomicBool::new(false),
balance: Cell::new([0; 40]),
last_queue: Cell::new(39),
mcs_sched_node: McsNode::new(),
mcs_contention_count: Cell::new(0),
tlb_flush_start: AtomicU64::new(0),
tlb_flush_count: AtomicU32::new(0),
pi_donated_prio: AtomicU32::new(u32::MAX),
current_prio: Cell::new(39),
numa_node: Cell::new(u8::MAX),
waiting_on_lock: AtomicPtr::new(core::ptr::null_mut()),
ptrace_flags: Cell::new(PtraceFlags::empty()),
ptrace_session: RefCell::new(None),
inside_syscall: Cell::new(false),
+3 -65
View File
@@ -10,7 +10,6 @@ use syscall::{
use crate::{
acpi::{RxsdtEnum, RXSDT_ENUM},
arch::sleep,
context::file::InternalFlags,
event,
sync::{CleanLockToken, RwLock, WaitCondition, L1},
@@ -41,7 +40,6 @@ enum HandleKind {
TopLevel,
Rxsdt,
ShutdownPipe,
SleepControl,
SchemeRoot,
}
@@ -148,11 +146,11 @@ impl KernelScheme for AcpiScheme {
if flags & O_EXCL == O_EXCL || flags & O_SYMLINK == O_SYMLINK {
return Err(Error::new(EINVAL));
}
if flags & O_ACCMODE != O_RDONLY && flags & O_STAT != O_STAT {
return Err(Error::new(EROFS));
}
let (handle_kind, int_flags) = match path {
"" => {
if flags & O_ACCMODE != O_RDONLY && flags & O_STAT != O_STAT {
return Err(Error::new(EROFS));
}
if flags & O_DIRECTORY != O_DIRECTORY && flags & O_STAT != O_STAT {
return Err(Error::new(EISDIR));
}
@@ -160,36 +158,17 @@ impl KernelScheme for AcpiScheme {
(HandleKind::TopLevel, InternalFlags::POSITIONED)
}
"rxsdt" => {
if flags & O_ACCMODE != O_RDONLY && flags & O_STAT != O_STAT {
return Err(Error::new(EROFS));
}
if flags & O_DIRECTORY == O_DIRECTORY && flags & O_STAT != O_STAT {
return Err(Error::new(ENOTDIR));
}
(HandleKind::Rxsdt, InternalFlags::POSITIONED)
}
"kstop" => {
if flags & O_ACCMODE != O_RDONLY && flags & O_STAT != O_STAT {
return Err(Error::new(EROFS));
}
if flags & O_DIRECTORY == O_DIRECTORY && flags & O_STAT != O_STAT {
return Err(Error::new(ENOTDIR));
}
(HandleKind::ShutdownPipe, InternalFlags::empty())
}
"sleep" => {
if flags & O_ACCMODE == O_RDONLY || flags & O_STAT == O_STAT {
// allowed
} else if flags & O_ACCMODE != syscall::flag::O_WRONLY
&& flags & O_ACCMODE != syscall::flag::O_RDWR
{
return Err(Error::new(EINVAL));
}
if flags & O_DIRECTORY == O_DIRECTORY && flags & O_STAT != O_STAT {
return Err(Error::new(ENOTDIR));
}
(HandleKind::SleepControl, InternalFlags::POSITIONED)
}
_ => return Err(Error::new(ENOENT)),
};
@@ -212,7 +191,6 @@ impl KernelScheme for AcpiScheme {
Ok(match handle.kind {
HandleKind::Rxsdt => DATA.get().ok_or(Error::new(EBADFD))?.len() as u64,
HandleKind::ShutdownPipe => 1,
HandleKind::SleepControl => sleep::available_sleep_states().len() as u64,
HandleKind::TopLevel => 0,
HandleKind::SchemeRoot => return Err(Error::new(EBADF))?,
})
@@ -275,7 +253,6 @@ impl KernelScheme for AcpiScheme {
return dst_buf.copy_exactly(&[0x42]).map(|()| 1);
}
HandleKind::SleepControl => sleep::available_sleep_states(),
HandleKind::Rxsdt => DATA.get().ok_or(Error::new(EBADFD))?,
HandleKind::TopLevel => return Err(Error::new(EISDIR)),
HandleKind::SchemeRoot => return Err(Error::new(EBADF)),
@@ -318,45 +295,11 @@ impl KernelScheme for AcpiScheme {
kind: DirentKind::Socket,
name: "kstop",
inode: 0,
next_opaque_id: 2,
})?;
}
if opaque <= 2 {
buf.entry(DirEntry {
kind: DirentKind::Regular,
name: "sleep",
inode: 0,
next_opaque_id: u64::MAX,
})?;
}
Ok(buf.finalize())
}
fn kwrite(
&self,
id: usize,
buf: crate::syscall::usercopy::UserSliceRo,
_flags: u32,
_stored_flags: u32,
token: &mut CleanLockToken,
) -> Result<usize> {
let handle = *HANDLES.read(token.token()).get(id)?;
if handle.stat {
return Err(Error::new(EBADF));
}
match handle.kind {
HandleKind::SleepControl => {
let mut tmp = [0_u8; 16];
let len = buf.copy_common_bytes_to_slice(&mut tmp)?;
let request = core::str::from_utf8(&tmp[..len]).map_err(|_| Error::new(EINVAL))?;
sleep::trigger_sleep_request(request)?;
Ok(len)
}
HandleKind::SchemeRoot => Err(Error::new(EBADF)),
_ => Err(Error::new(EBADF)),
}
}
fn kfpath(&self, _id: usize, buf: UserSliceWo, _token: &mut CleanLockToken) -> Result<usize> {
//TODO: construct useful path?
buf.copy_common_bytes_from_slice("/scheme/kernel.acpi/".as_bytes())
@@ -385,11 +328,6 @@ impl KernelScheme for AcpiScheme {
st_size: 1,
..Default::default()
},
HandleKind::SleepControl => Stat {
st_mode: MODE_FILE,
st_size: sleep::available_sleep_states().len().try_into().unwrap_or(u64::MAX),
..Default::default()
},
HandleKind::SchemeRoot => return Err(Error::new(EBADF)),
})?;
@@ -22,10 +22,9 @@ struct Handle {
static HANDLES: RwLock<L1, HandleMap<Handle>> = RwLock::new(HandleMap::new());
/// Add to the input queue, translating CR to NL (ICRNL) for serial console compatibility.
/// Add to the input queue
pub fn debug_input(data: u8, token: &mut CleanLockToken) {
let translated = if data == b'\r' { b'\n' } else { data };
INPUT.send(translated, token);
INPUT.send(data, token);
}
// Notify readers of input updates
@@ -107,16 +106,12 @@ impl KernelScheme for DebugScheme {
fn fevent(
&self,
id: usize,
flags: EventFlags,
_flags: EventFlags,
token: &mut CleanLockToken,
) -> Result<EventFlags> {
let _handle = *HANDLES.read(token.token()).get(id)?;
let mut ready = EventFlags::empty();
if flags.contains(EventFlags::EVENT_READ) {
ready |= EventFlags::EVENT_READ;
}
Ok(ready)
Ok(EventFlags::empty())
}
fn fsync(&self, id: usize, token: &mut CleanLockToken) -> Result<()> {
+1 -53
View File
@@ -1,5 +1,4 @@
use alloc::sync::Arc;
use core::sync::atomic::Ordering;
use syscall::{EventFlags, O_NONBLOCK};
use crate::{
@@ -26,25 +25,12 @@ impl KernelScheme for EventScheme {
fn kopenat(
&self,
id: usize,
user_buf: StrOrBytes,
_user_buf: StrOrBytes,
_flags: usize,
_fcntl_flags: u32,
_ctx: CallerCtx,
token: &mut CleanLockToken,
) -> Result<OpenResult> {
let path = match &user_buf {
StrOrBytes::Str(s) => s,
StrOrBytes::Bytes(b) => core::str::from_utf8(b).unwrap_or(""),
};
if path.starts_with("eventfd/") {
let rest = &path[8..]; // after "eventfd/"
let mut parts = rest.split('/');
let initval: u64 = parts.next().and_then(|s| s.parse().ok()).unwrap_or(0);
let sem: bool = parts.next().and_then(|s| s.parse().ok()).unwrap_or(false);
let id = next_queue_id();
queues_mut(token.token()).insert(id, Arc::new(EventQueue::new_eventfd(id, initval, sem)));
return Ok(OpenResult::SchemeLocal(id.get(), InternalFlags::empty()));
}
if id != SCHEME_ROOT_ID {
return Err(Error::new(EACCES));
}
@@ -81,31 +67,6 @@ impl KernelScheme for EventScheme {
handle.clone()
};
if let Some((ref counter, semaphore)) = queue.eventfd {
let is_nonblock = flags & O_NONBLOCK as u32 != 0;
if semaphore {
let val = counter.load(Ordering::Acquire);
if val == 0 {
if is_nonblock { return Err(Error::new(EAGAIN)); }
// Blocking wait not implemented for eventfd in kernel
return Err(Error::new(EAGAIN));
}
if counter.compare_exchange(val, val - 1, Ordering::AcqRel, Ordering::Relaxed).is_ok() {
let one: u64 = 1;
buf.copy_from_slice(unsafe { core::slice::from_raw_parts(&one as *const u64 as *const u8, 8) })?;
return Ok(8);
}
return Err(Error::new(EAGAIN));
} else {
let val = counter.swap(0, Ordering::AcqRel);
if val == 0 && is_nonblock {
return Err(Error::new(EAGAIN));
}
buf.copy_from_slice(unsafe { core::slice::from_raw_parts(&val as *const u64 as *const u8, 8) })?;
return Ok(8);
}
}
queue.read(buf, flags & O_NONBLOCK as u32 == 0, token)
}
@@ -124,19 +85,6 @@ impl KernelScheme for EventScheme {
let handle = handles.get(&id).ok_or(Error::new(EBADF))?;
handle.clone()
};
if let Some((ref counter, _semaphore)) = queue.eventfd {
if buf.len() >= 8 {
let mut bytes = [0u8; 8];
buf.copy_to_slice(&mut bytes)?;
let val = u64::from_ne_bytes(bytes);
if val == u64::MAX { return Err(Error::new(EINVAL)); }
counter.fetch_add(val, Ordering::AcqRel);
return Ok(8);
}
return Err(Error::new(EINVAL));
}
let mut events_written = 0;
for chunk in buf.in_exact_chunks(size_of::<Event>()) {
+8 -88
View File
@@ -18,9 +18,6 @@ use syscall::{
use crate::context::file::InternalFlags;
use super::{CallerCtx, HandleMap, OpenResult, SchemeExt, StrOrBytes};
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
use crate::arch::device::{ioapic, local_apic::ApicId};
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
use crate::arch::interrupt::{available_irqs_iter, irq::acknowledge, is_reserved, set_reserved};
#[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))]
@@ -59,11 +56,8 @@ const INO_AVAIL: u64 = 0x8000_0000_0000_0000;
const INO_BSP: u64 = 0x8001_0000_0000_0000;
const INO_PHANDLE: u64 = 0x8003_0000_0000_0000;
/// Add to the input queue, with iommu validation gate for MSI vectors
/// Add to the input queue
pub fn irq_trigger(irq: u8, token: &mut CleanLockToken) {
if irq >= 16 && !iommu_validate_msi_irq(irq) {
return;
}
COUNTS.lock()[irq as usize] += 1;
let fds: SmallVec<[usize; 8]> = {
HANDLES
@@ -83,17 +77,16 @@ pub fn irq_trigger(irq: u8, token: &mut CleanLockToken) {
#[allow(dead_code)]
enum Handle {
SchemeRoot,
Irq { ack: AtomicUsize, irq: u8, cpu_id: LogicalCpuId },
Irq { ack: AtomicUsize, irq: u8 },
Avail(LogicalCpuId),
TopLevel,
Phandle(u8, Vec<u8>),
Bsp,
IrqAffinity { irq: u8, mask: AtomicUsize },
}
impl Handle {
fn as_irq_handle(&self) -> Option<(&AtomicUsize, u8)> {
match self {
&Self::Irq { ref ack, irq, cpu_id: _ } => Some((ack, irq)),
&Self::Irq { ref ack, irq } => Some((ack, irq)),
_ => None,
}
}
@@ -147,7 +140,6 @@ impl IrqScheme {
Handle::Irq {
ack: AtomicUsize::new(0),
irq: irq_number,
cpu_id: LogicalCpuId::BSP,
},
InternalFlags::empty(),
)
@@ -166,7 +158,6 @@ impl IrqScheme {
Handle::Irq {
ack: AtomicUsize::new(0),
irq: irq_number,
cpu_id,
},
InternalFlags::empty(),
)
@@ -208,7 +199,6 @@ impl IrqScheme {
Handle::Irq {
ack: AtomicUsize::new(0),
irq: irq_number as u8,
cpu_id: LogicalCpuId::new(0),
},
InternalFlags::empty(),
)
@@ -224,14 +214,6 @@ const fn vector_to_irq(vector: u8) -> u8 {
vector - 32
}
/// Returns `true` when `vector` lies in the half-open range `32..0xEF`,
/// i.e. `32` through `0xEE` inclusive.
const fn msi_vector_is_valid(vector: u8) -> bool {
    matches!(vector, 32..=0xEE)
}
/// Validation gate consulted by `irq_trigger` for IRQs numbered 16 and above.
///
/// Currently a stub: the argument is ignored and every IRQ is accepted.
fn iommu_validate_msi_irq(_irq: u8) -> bool {
true
}
impl crate::scheme::KernelScheme for IrqScheme {
fn scheme_root(&self, token: &mut CleanLockToken) -> Result<usize> {
let id = HANDLES.write(token.token()).insert(Handle::SchemeRoot);
@@ -298,21 +280,7 @@ impl crate::scheme::KernelScheme for IrqScheme {
InternalFlags::POSITIONED,
)
} else if let Some(path_str) = path_str.strip_prefix('/') {
let (irq_str, affinity) = path_str
.trim_end_matches('/')
.rsplit_once('/')
.map(|(a, b)| (a, Some(b)))
.unwrap_or((path_str.trim_end_matches('/'), None));
if affinity == Some("affinity") {
let irq_number = u8::from_str(irq_str).or(Err(Error::new(ENOENT)))?;
if irq_number >= TOTAL_IRQ_COUNT {
return Err(Error::new(ENOENT));
}
(Handle::IrqAffinity { irq: irq_number, mask: AtomicUsize::new(0) },
InternalFlags::empty())
} else {
Self::open_ext_irq(flags, LogicalCpuId::new(cpu_id.into()), path_str)?
}
Self::open_ext_irq(flags, LogicalCpuId::new(cpu_id.into()), path_str)?
} else {
return Err(Error::new(ENOENT));
}
@@ -339,20 +307,12 @@ impl crate::scheme::KernelScheme for IrqScheme {
}
#[cfg(not(dtb))]
panic!("")
} else if let Some(rest) = path_str.strip_suffix("/affinity") {
let irq_number = u8::from_str(rest).or(Err(Error::new(ENOENT)))?;
if irq_number >= TOTAL_IRQ_COUNT {
return Err(Error::new(ENOENT));
}
(Handle::IrqAffinity { irq: irq_number, mask: AtomicUsize::new(0) },
InternalFlags::empty())
} else if let Ok(plain_irq_number) = u8::from_str(path_str) {
if plain_irq_number < BASE_IRQ_COUNT {
(
Handle::Irq {
ack: AtomicUsize::new(0),
irq: plain_irq_number,
cpu_id: LogicalCpuId::BSP,
},
InternalFlags::empty(),
)
@@ -408,7 +368,6 @@ impl crate::scheme::KernelScheme for IrqScheme {
}
}
Handle::Avail(cpu_id) => {
let mut listed = 0;
for vector in available_irqs_iter(cpu_id).skip(opaque) {
let irq = vector_to_irq(vector);
if cpu_id == LogicalCpuId::BSP && irq < BASE_IRQ_COUNT {
@@ -422,9 +381,7 @@ impl crate::scheme::KernelScheme for IrqScheme {
name: &intermediate,
next_opaque_id: u64::from(vector) + 1,
})?;
listed += 1;
}
info!("irq getdents Avail: cpu_id={} opaque={} listed={}", cpu_id.get(), opaque, listed);
}
_ => return Err(Error::new(ENOTDIR)),
}
@@ -459,14 +416,11 @@ impl crate::scheme::KernelScheme for IrqScheme {
let handle = handles_guard.get(id)?;
if let &Handle::Irq {
irq: handle_irq,
cpu_id: handle_cpu_id,
..
irq: handle_irq, ..
} = handle
&& handle_irq > BASE_IRQ_COUNT
{
info!("irq close: unreserving vector {} on cpu_id={}", irq_to_vector(handle_irq), handle_cpu_id.get());
set_reserved(handle_cpu_id, irq_to_vector(handle_irq), false);
set_reserved(LogicalCpuId::BSP, irq_to_vector(handle_irq), false);
}
Ok(())
}
@@ -482,32 +436,9 @@ impl crate::scheme::KernelScheme for IrqScheme {
let handle = handles_guard.get(file)?;
match handle {
&Handle::IrqAffinity { irq: _handle_irq, ref mask } => {
if buffer.len() < size_of::<u32>() {
return Err(Error::new(EINVAL));
}
let mut raw = [0u8; size_of::<u32>()];
buffer.copy_to_slice(&mut raw)?;
let cpu_id = u32::from_ne_bytes(raw);
let cpus = CPUS.get().ok_or(Error::new(EIO))?;
if !cpus.contains(&(cpu_id as u8)) {
return Err(Error::new(EINVAL));
}
// Reprogram the IOAPIC redirection entry for x86 targets.
// Non-IOAPIC IRQs (e.g. MSI) will return false -> EIO.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
{
if !unsafe { ioapic::set_affinity(_handle_irq, ApicId::new(cpu_id)) } {
return Err(Error::new(EIO));
}
}
mask.store(cpu_id as usize, Ordering::Release);
Ok(size_of::<u32>())
}
&Handle::Irq {
irq: handle_irq,
ack: ref handle_ack,
cpu_id: _,
} => {
if buffer.len() < size_of::<usize>() {
return Err(Error::new(EINVAL));
@@ -544,15 +475,6 @@ impl crate::scheme::KernelScheme for IrqScheme {
st_nlink: 1,
..Default::default()
},
Handle::IrqAffinity { irq, .. } => Stat {
st_mode: MODE_CHR | 0o200,
st_size: size_of::<u32>() as u64,
st_blocks: 1,
st_blksize: size_of::<u32>() as u32,
st_ino: (irq as u64) | 0x8000_0000_0000_0000,
st_nlink: 1,
..Default::default()
},
Handle::Bsp => Stat {
st_mode: MODE_CHR | 0o400,
st_size: size_of::<usize>() as u64,
@@ -594,9 +516,8 @@ impl crate::scheme::KernelScheme for IrqScheme {
let scheme_path = match handle {
Handle::Irq { irq, .. } => format!("irq:{}", irq),
Handle::IrqAffinity { irq, .. } => format!("irq:{}/affinity", irq),
Handle::Bsp => "irq:bsp".to_owned(),
Handle::Avail(cpu_id) => format!("irq:cpu-{:02x}", cpu_id.get()),
Handle::Avail(cpu_id) => format!("irq:cpu-{:2x}", cpu_id.get()),
Handle::Phandle(phandle, _) => format!("irq:phandle-{}", phandle),
Handle::TopLevel => "irq:".to_owned(),
_ => return Err(Error::new(EBADF)),
@@ -622,7 +543,6 @@ impl crate::scheme::KernelScheme for IrqScheme {
Handle::Irq {
irq: handle_irq,
ack: ref handle_ack,
cpu_id: _,
} => {
if buffer.len() < size_of::<usize>() {
return Err(Error::new(EINVAL));
@@ -642,7 +562,7 @@ impl crate::scheme::KernelScheme for IrqScheme {
buffer.write_u32(LogicalCpuId::BSP.get())?;
Ok(size_of::<usize>())
}
Handle::Avail(_) | Handle::TopLevel | Handle::Phandle(_, _) | Handle::SchemeRoot | Handle::IrqAffinity { .. } => {
Handle::Avail(_) | Handle::TopLevel | Handle::Phandle(_, _) | Handle::SchemeRoot => {
Err(Error::new(EISDIR))
}
}
+25 -60
View File
@@ -14,7 +14,7 @@ use alloc::{
};
use core::{
str,
sync::atomic::{AtomicU64, AtomicUsize, Ordering},
sync::atomic::{AtomicUsize, Ordering},
};
use hashbrown::hash_map::{self, DefaultHashBuilder, HashMap};
use spin::Once;
@@ -169,7 +169,6 @@ enum Handle {
/// Schemes list
static HANDLES: Once<RwLock<L1, HashMap<SchemeId, Handle>>> = Once::new();
static SCHEME_GENERATIONS: Once<RwLock<L1, HashMap<SchemeId, AtomicU64>>> = Once::new();
static SCHEME_LIST_NEXT_ID: AtomicUsize = AtomicUsize::new(MAX_GLOBAL_SCHEMES);
static SCHEME_LIST_ID: AtomicUsize = AtomicUsize::new(0);
@@ -205,10 +204,6 @@ fn init_schemes() -> RwLock<L1, HashMap<SchemeId, Handle>> {
RwLock::new(handles)
}
/// Builds the initial value of the scheme-generation table: an empty map
/// wrapped in the lock that guards it.
fn init_scheme_generations() -> RwLock<L1, HashMap<SchemeId, AtomicU64>> {
    let generations = HashMap::new();
    RwLock::new(generations)
}
/// Get a handle to a scheme.
pub fn get_scheme(token: LockToken<'_, L0>, scheme_id: SchemeId) -> Result<KernelSchemes> {
match handles().read(token).get(&scheme_id) {
@@ -217,33 +212,10 @@ pub fn get_scheme(token: LockToken<'_, L0>, scheme_id: SchemeId) -> Result<Kerne
}
}
/// Returns the generation counter currently recorded for `scheme_id`.
///
/// A scheme without an entry in the generation table reports `0`.
pub fn current_scheme_generation(token: LockToken<'_, L0>, scheme_id: SchemeId) -> u64 {
    let generations = scheme_generations().read(token);
    match generations.get(&scheme_id) {
        Some(counter) => counter.load(Ordering::Acquire),
        None => 0,
    }
}
/// Returns the global scheme handle table, running `init_schemes` to build it
/// the first time it is requested.
fn handles<'a>() -> &'a RwLock<L1, HashMap<SchemeId, Handle>> {
HANDLES.call_once(init_schemes)
}
/// Returns the global per-scheme generation table, running
/// `init_scheme_generations` to build it the first time it is requested.
fn scheme_generations<'a>() -> &'a RwLock<L1, HashMap<SchemeId, AtomicU64>> {
SCHEME_GENERATIONS.call_once(init_scheme_generations)
}
/// Advances the generation counter of `scheme_id` by one.
///
/// An existing counter is incremented in place; a scheme that has no counter
/// yet gets one initialised to `1`.
fn increment_scheme_generation(scheme_id: SchemeId, token: &mut CleanLockToken) {
    scheme_generations()
        .write(token.token())
        .entry(scheme_id)
        .and_modify(|counter| {
            counter.fetch_add(1, Ordering::AcqRel);
        })
        .or_insert_with(|| AtomicU64::new(1));
}
/// Scheme list type
pub struct SchemeList;
@@ -288,14 +260,9 @@ impl SchemeList {
/// Remove a scheme
fn remove(&self, id: usize, token: &mut CleanLockToken) {
let scheme_id = SchemeId(id);
let scheme = handles().write(token.token()).remove(&scheme_id);
let scheme = handles().write(token.token()).remove(&SchemeId(id));
assert!(scheme.is_some());
if let Some(Handle::Scheme(KernelSchemes::User(user))) = scheme.as_ref() {
user.inner.fail_pending_calls(token);
}
increment_scheme_generation(scheme_id, token);
if let Some(Handle::Scheme(KernelSchemes::User(user))) = scheme
&& let Some(user) = Arc::into_inner(user.inner)
{
@@ -320,32 +287,32 @@ impl KernelScheme for SchemeList {
token: &mut CleanLockToken,
) -> Result<OpenResult> {
let scheme_id = SchemeId(scheme_id);
let maybe_inner = {
let handles = handles().read(token.token());
match handles.get(&scheme_id).ok_or(Error::new(EBADF))? {
Handle::Scheme(KernelSchemes::User(UserScheme { inner })) => Some(inner.clone()),
Handle::SchemeCreationCapability => None,
_ => return Err(Error::new(EBADF)),
match handles()
.read(token.token())
.get(&scheme_id)
.ok_or(Error::new(EBADF))?
{
Handle::Scheme(KernelSchemes::User(UserScheme { inner })) => {
let inner = inner.clone();
assert!(scheme_id == inner.scheme_id);
let scheme = scheme_id;
let params = unsafe { user_buf.read_exact::<NewFdParams>()? };
return Ok(OpenResult::External(Arc::new(RwLock::new(
FileDescription {
scheme,
number: params.number,
offset: params.offset,
flags: params.flags as u32,
internal_flags: InternalFlags::from_extra0(params.internal_flags)
.ok_or(Error::new(EINVAL))?,
},
))));
}
Handle::SchemeCreationCapability => (),
_ => return Err(Error::new(EBADF)),
};
if let Some(inner) = maybe_inner {
assert!(scheme_id == inner.scheme_id);
let params = unsafe { user_buf.read_exact::<NewFdParams>()? };
return Ok(OpenResult::External(Arc::new(RwLock::new(
FileDescription::new(
scheme_id,
params.number,
params.offset,
params.flags as u32,
InternalFlags::from_extra0(params.internal_flags)
.ok_or(Error::new(EINVAL))?,
token,
),
))));
}
const EXPECTED: &[u8] = b"create-scheme";
let mut buf = [0u8; EXPECTED.len()];
@@ -810,7 +777,6 @@ pub struct CallerCtx {
pub pid: usize,
pub uid: u32,
pub gid: u32,
pub groups: alloc::vec::Vec<u32>,
}
impl CallerCtx {
pub fn filter_uid_gid(self, euid: u32, egid: u32) -> Self {
@@ -819,7 +785,6 @@ impl CallerCtx {
pid: self.pid,
uid: euid,
gid: egid,
groups: self.groups,
}
} else {
self
+194 -385
View File
@@ -1,10 +1,5 @@
use alloc::{
collections::VecDeque,
string::{String, ToString},
sync::Arc,
vec::Vec,
};
use core::sync::atomic::{AtomicUsize, Ordering};
use alloc::{collections::VecDeque, sync::Arc, vec::Vec};
use core::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
use syscall::{data::GlobalSchemes, CallFlags};
@@ -19,228 +14,67 @@ use crate::{
sync::{CleanLockToken, Mutex, RwLock, WaitCondition, L1},
syscall::{
data::Stat,
error::{
Error, Result, EAGAIN, EBADF, EEXIST, EINVAL, EINTR, ENOENT, ENOTDIR, EPIPE,
},
flag::{
EventFlags, EVENT_READ, EVENT_WRITE, MODE_FIFO, O_ACCMODE, O_DIRECTORY,
O_NONBLOCK, O_RDONLY, O_RDWR, O_STAT, O_WRONLY,
},
error::{Error, Result, EAGAIN, EBADF, EINTR, EINVAL, ENOENT, EPIPE},
flag::{EventFlags, EVENT_READ, EVENT_WRITE, MODE_FIFO, O_NONBLOCK},
usercopy::{UserSliceRo, UserSliceRw, UserSliceWo},
},
};
use super::{CallerCtx, KernelScheme, OpenResult, SchemeExt, StrOrBytes};
static PIPE_NEXT_ID: AtomicUsize = AtomicUsize::new(1);
// TODO: Preallocate a number of scheme IDs, since there can only be *one* root namespace, and
// therefore only *one* pipe scheme.
static PIPE_NEXT_ID: AtomicUsize = AtomicUsize::new(0);
#[derive(Clone)]
enum Handle {
Endpoint(EndpointHandle),
Pipe(Arc<Pipe>),
SchemeRoot,
}
/// Direction(s) in which a pipe endpoint handle may be used.
#[derive(Clone, Copy, Eq, PartialEq)]
enum EndpointKind {
    /// The endpoint may only read.
    Read,
    /// The endpoint may only write.
    Write,
    /// The endpoint may both read and write.
    ReadWrite,
}

impl EndpointKind {
    /// Whether this endpoint is allowed to read from the pipe.
    fn can_read(self) -> bool {
        match self {
            Self::Read | Self::ReadWrite => true,
            Self::Write => false,
        }
    }

    /// Whether this endpoint is allowed to write to the pipe.
    fn can_write(self) -> bool {
        match self {
            Self::Write | Self::ReadWrite => true,
            Self::Read => false,
        }
    }
}
/// One open endpoint of a pipe, as stored in the handle table.
#[derive(Clone)]
struct EndpointHandle {
// The pipe this endpoint reads from and/or writes to; shared with the
// other endpoints of the same pipe.
pipe: Arc<Pipe>,
// Which directions this endpoint may use.
kind: EndpointKind,
// The named-pipe registration this endpoint was opened through, if any;
// `None` for endpoints created by `pipe()`.
named: Option<Arc<NamedPipe>>,
}
/// Registration record of a named pipe (FIFO).
struct NamedPipe {
// Path stored at creation time (the `display_path` argument of
// `create_named_pipe`).
path: String,
// Mode bits supplied when the named pipe was created.
mode: u16,
// The pipe currently backing this name, or `None` when no pipe is
// attached; `open_named_pipe` creates a new one in that case.
active: Mutex<L1, Option<Arc<Pipe>>>,
}
static HANDLES: RwLock<L1, HashMap<usize, Handle>> =
RwLock::new(HashMap::with_hasher(DefaultHashBuilder::new()));
static NAMED_PIPES: RwLock<L1, HashMap<String, Arc<NamedPipe>>> =
// TODO: SLOB?
static PIPES: RwLock<L1, HashMap<usize, Handle>> =
RwLock::new(HashMap::with_hasher(DefaultHashBuilder::new()));
const MAX_QUEUE_SIZE: usize = 65536;
/// Hands out a fresh handle id by post-incrementing `PIPE_NEXT_ID`.
/// `fetch_add` returns the previous value, so each call sees a distinct id.
fn next_id() -> usize {
PIPE_NEXT_ID.fetch_add(1, Ordering::Relaxed)
}
// In almost all places where Rust (and LLVM) uses pointers, they are limited to nonnegative isize,
// so this is fine.
const WRITE_NOT_READ_BIT: usize = 1;
fn endpoint_kind_from_flags(flags: usize) -> Result<EndpointKind> {
match flags & O_ACCMODE {
O_RDONLY => Ok(EndpointKind::Read),
O_WRONLY => Ok(EndpointKind::Write),
O_RDWR => Ok(EndpointKind::ReadWrite),
_ => Err(Error::new(EINVAL)),
}
}
/// Checks that `flags` are acceptable for opening a named FIFO.
///
/// # Errors
///
/// * `ENOTDIR` when `O_DIRECTORY` is requested without `O_STAT`.
/// * Whatever `endpoint_kind_from_flags` reports for an invalid access mode.
fn validate_named_fifo_open(flags: usize) -> Result<()> {
    let wants_directory = flags & O_DIRECTORY == O_DIRECTORY;
    let stat_only = flags & O_STAT == O_STAT;
    if wants_directory && !stat_only {
        return Err(Error::new(ENOTDIR));
    }

    // The parsed kind is not needed here, only the fact that it parses.
    endpoint_kind_from_flags(flags).map(|_| ())
}
/// Triggers `flags` on every endpoint handle that refers to `pipe` and passes
/// the direction filter.
///
/// * `require_read` - when set, endpoints that cannot read are skipped.
/// * `require_write` - when set, endpoints that cannot write are skipped.
fn trigger_matching(
    pipe: &Arc<Pipe>,
    require_read: bool,
    require_write: bool,
    flags: EventFlags,
    token: &mut CleanLockToken,
) {
    // Gather the ids inside their own scope so the handle-table guard is gone
    // before any event is triggered.
    let matching_ids: Vec<usize> = {
        let handles = HANDLES.read(token.token());
        let mut collected = Vec::new();
        for (id, handle) in handles.iter() {
            let Handle::Endpoint(endpoint) = handle else {
                continue;
            };
            if !Arc::ptr_eq(&endpoint.pipe, pipe) {
                continue;
            }
            if require_read && !endpoint.kind.can_read() {
                continue;
            }
            if require_write && !endpoint.kind.can_write() {
                continue;
            }
            collected.push(*id);
        }
        collected
    };

    let scheme_id = GlobalSchemes::Pipe.scheme_id();
    for id in matching_ids {
        event::trigger(scheme_id, id, flags, token);
    }
}
/// Registers a new endpoint of `pipe` in the handle table and returns its id.
///
/// The pipe's reader and/or writer counters are bumped according to `kind`
/// before the handle is inserted.
fn open_endpoint(
    pipe: Arc<Pipe>,
    kind: EndpointKind,
    named: Option<Arc<NamedPipe>>,
    token: &mut CleanLockToken,
) -> usize {
    if kind.can_read() {
        pipe.reader_count.fetch_add(1, Ordering::SeqCst);
    }
    if kind.can_write() {
        pipe.writer_count.fetch_add(1, Ordering::SeqCst);
    }

    let id = next_id();
    let endpoint = EndpointHandle { pipe, kind, named };
    HANDLES
        .write(token.token())
        .insert(id, Handle::Endpoint(endpoint));
    id
}
fn drop_wait_conditions_if_possible(pipe: Arc<Pipe>, token: &mut CleanLockToken) {
if let Some(pipe) = Arc::into_inner(pipe) {
{
pipe.read_condition.into_drop(token);
}
{
pipe.write_condition.into_drop(token);
}
}
fn from_raw_id(id: usize) -> (bool, usize) {
(id & WRITE_NOT_READ_BIT != 0, id & !WRITE_NOT_READ_BIT)
}
pub fn pipe(token: &mut CleanLockToken) -> Result<(usize, usize)> {
let pipe = Arc::new(Pipe::new());
let read_id = open_endpoint(Arc::clone(&pipe), EndpointKind::Read, None, token);
let write_id = open_endpoint(pipe, EndpointKind::Write, None, token);
// Bit 0 is used for WRITE_NOT_READ_BIT
let id = PIPE_NEXT_ID.fetch_add(2, Ordering::Relaxed);
Ok((read_id, write_id))
}
PIPES.write(token.token()).insert(
id,
Handle::Pipe(Arc::new(Pipe {
queue: Mutex::new(VecDeque::new()),
read_condition: WaitCondition::new(),
write_condition: WaitCondition::new(),
writer_is_alive: AtomicBool::new(true),
reader_is_alive: AtomicBool::new(true),
has_run_dup: AtomicBool::new(false),
fd_queue: Mutex::new(VecDeque::new()),
})),
);
/// Reports whether a named pipe is currently registered under `path`.
pub fn named_pipe_exists(path: &str, token: &mut CleanLockToken) -> bool {
NAMED_PIPES.read(token.token()).contains_key(path)
}
/// Registers a new named pipe under `path` and opens its first endpoint.
///
/// * `path` - key under which the pipe is registered.
/// * `display_path` - path string stored in the registration record.
/// * `mode` - mode bits stored in the registration record.
/// * `flags` - open flags; their access mode selects the endpoint direction.
///
/// Returns the id of the newly opened endpoint.
///
/// # Errors
///
/// * Any error from `validate_named_fifo_open`.
/// * `EEXIST` when `path` is already registered.
pub fn create_named_pipe(
    path: &str,
    display_path: &str,
    mode: u16,
    flags: usize,
    token: &mut CleanLockToken,
) -> Result<usize> {
    validate_named_fifo_open(flags)?;

    // The existence check and the insertion share one write guard, so they
    // cannot be separated by another registration of the same path.
    let registration = {
        let mut registry = NAMED_PIPES.write(token.token());
        if registry.contains_key(path) {
            return Err(Error::new(EEXIST));
        }
        let record = Arc::new(NamedPipe {
            path: display_path.to_string(),
            mode,
            active: Mutex::new(None),
        });
        registry.insert(path.to_string(), Arc::clone(&record));
        record
    };

    let kind = endpoint_kind_from_flags(flags)?;
    let fresh_pipe = Arc::new(Pipe::new());
    {
        // Attach the new pipe to the registration before opening the endpoint.
        let mut active = registration.active.lock(token.token());
        *active = Some(Arc::clone(&fresh_pipe));
    }

    Ok(open_endpoint(fresh_pipe, kind, Some(registration), token))
}
/// Opens an endpoint on the named pipe registered under `path`.
///
/// Returns `Ok(None)` when no named pipe is registered under `path`,
/// otherwise `Ok(Some(id))` with the id of the new endpoint. If the
/// registration has no pipe attached, a fresh one is created and attached.
///
/// # Errors
///
/// Any error from `validate_named_fifo_open`.
pub fn open_named_pipe(
    path: &str,
    flags: usize,
    token: &mut CleanLockToken,
) -> Result<Option<usize>> {
    validate_named_fifo_open(flags)?;

    let Some(registration) = NAMED_PIPES.read(token.token()).get(path).cloned() else {
        return Ok(None);
    };

    let kind = endpoint_kind_from_flags(flags)?;

    // Reuse the attached pipe, or attach a new one while holding the lock.
    let backing_pipe = {
        let mut active = registration.active.lock(token.token());
        Arc::clone(active.get_or_insert_with(|| Arc::new(Pipe::new())))
    };

    let id = open_endpoint(backing_pipe, kind, Some(registration), token);
    Ok(Some(id))
}
pub fn unlink_named_pipe(path: &str, token: &mut CleanLockToken) -> bool {
NAMED_PIPES.write(token.token()).remove(path).is_some()
Ok((id, id | WRITE_NOT_READ_BIT))
}
pub struct PipeScheme;
impl PipeScheme {
fn get_endpoint(id: usize, token: &mut CleanLockToken) -> Result<EndpointHandle> {
HANDLES
fn get_pipe(key: usize, token: &mut CleanLockToken) -> Result<Arc<Pipe>> {
PIPES
.read(token.token())
.get(&id)
.get(&key)
.and_then(|handle| match handle {
Handle::Endpoint(endpoint) => Some(endpoint.clone()),
Handle::SchemeRoot => None,
Handle::Pipe(pipe) => Some(Arc::clone(pipe)),
_ => None,
})
.ok_or(Error::new(EBADF))
}
@@ -248,33 +82,32 @@ impl PipeScheme {
impl KernelScheme for PipeScheme {
fn scheme_root(&self, token: &mut CleanLockToken) -> Result<usize> {
let id = next_id();
HANDLES.write(token.token()).insert(id, Handle::SchemeRoot);
let id = PIPE_NEXT_ID.fetch_add(2, Ordering::Relaxed);
PIPES.write(token.token()).insert(id, Handle::SchemeRoot);
Ok(id)
}
fn fevent(
&self,
id: usize,
flags: EventFlags,
token: &mut CleanLockToken,
) -> Result<EventFlags> {
let endpoint = Self::get_endpoint(id, token)?;
let (is_writer_not_reader, key) = from_raw_id(id);
let pipe = Self::get_pipe(key, token)?;
let mut ready = EventFlags::empty();
if endpoint.kind.can_write()
if is_writer_not_reader
&& flags.contains(EVENT_WRITE)
&& (endpoint.pipe.queue.lock(token.token()).len() <= MAX_QUEUE_SIZE
|| endpoint.pipe.reader_count.load(Ordering::Acquire) == 0)
&& (pipe.queue.lock(token.token()).len() <= MAX_QUEUE_SIZE
|| !pipe.reader_is_alive.load(Ordering::Acquire))
{
ready |= EventFlags::EVENT_WRITE;
}
if endpoint.kind.can_read()
if !is_writer_not_reader
&& flags.contains(EVENT_READ)
&& (!endpoint.pipe.queue.lock(token.token()).is_empty()
|| endpoint.pipe.writer_count.load(Ordering::Acquire) == 0)
&& (!pipe.queue.lock(token.token()).is_empty()
|| !pipe.writer_is_alive.load(Ordering::Acquire))
{
ready |= EventFlags::EVENT_READ;
}
@@ -283,48 +116,46 @@ impl KernelScheme for PipeScheme {
}
fn close(&self, id: usize, token: &mut CleanLockToken) -> Result<()> {
let handle = HANDLES
.write(token.token())
.remove(&id)
.ok_or(Error::new(EBADF))?;
let (is_write_not_read, key) = from_raw_id(id);
let Handle::Endpoint(endpoint) = handle else {
return Ok(());
let pipe = Self::get_pipe(key, token)?;
let scheme_id = GlobalSchemes::Pipe.scheme_id();
let can_remove = if is_write_not_read {
pipe.writer_is_alive.store(false, Ordering::SeqCst);
event::trigger(scheme_id, key, EVENT_READ, token);
pipe.read_condition.notify(token);
!pipe.reader_is_alive.load(Ordering::SeqCst)
} else {
pipe.reader_is_alive.store(false, Ordering::SeqCst);
event::trigger(scheme_id, key | WRITE_NOT_READ_BIT, EVENT_WRITE, token);
pipe.write_condition.notify(token);
!pipe.writer_is_alive.load(Ordering::SeqCst)
};
let mut last_reader = false;
let mut last_writer = false;
if endpoint.kind.can_read() {
last_reader = endpoint.pipe.reader_count.fetch_sub(1, Ordering::SeqCst) == 1;
}
if endpoint.kind.can_write() {
last_writer = endpoint.pipe.writer_count.fetch_sub(1, Ordering::SeqCst) == 1;
}
if last_writer {
trigger_matching(&endpoint.pipe, true, false, EVENT_READ, token);
endpoint.pipe.read_condition.notify(token);
}
if last_reader {
trigger_matching(&endpoint.pipe, false, true, EVENT_WRITE, token);
endpoint.pipe.write_condition.notify(token);
}
let no_readers = endpoint.pipe.reader_count.load(Ordering::SeqCst) == 0;
let no_writers = endpoint.pipe.writer_count.load(Ordering::SeqCst) == 0;
if no_readers && no_writers {
if let Some(named) = endpoint.named {
let mut active = named.active.lock(token.token());
if active
.as_ref()
.is_some_and(|active_pipe| Arc::ptr_eq(active_pipe, &endpoint.pipe))
if can_remove {
let handle = PIPES.write(token.token()).remove(&key);
if let Some(Handle::Pipe(pipe)) = handle
&& let Some(pipe) = Arc::into_inner(pipe)
{
{
*active = None;
pipe.read_condition.into_drop(token);
}
{
pipe.write_condition.into_drop(token);
}
}
}
drop_wait_conditions_if_possible(endpoint.pipe, token);
if let Some(pipe) = Arc::into_inner(pipe) {
{
pipe.read_condition.into_drop(token);
}
{
pipe.write_condition.into_drop(token);
}
}
Ok(())
@@ -337,9 +168,9 @@ impl KernelScheme for PipeScheme {
_ctx: CallerCtx,
token: &mut CleanLockToken,
) -> Result<OpenResult> {
let endpoint = Self::get_endpoint(old_id, token)?;
let (is_writer_not_reader, key) = from_raw_id(old_id);
if !endpoint.kind.can_read() {
if is_writer_not_reader {
return Err(Error::new(EBADF));
}
@@ -349,17 +180,17 @@ impl KernelScheme for PipeScheme {
return Err(Error::new(EINVAL));
}
let pipe = Self::get_pipe(key, token)?;
if pipe.has_run_dup.swap(true, Ordering::SeqCst) {
return Err(Error::new(EBADF));
}
Ok(OpenResult::SchemeLocal(
open_endpoint(
Arc::clone(&endpoint.pipe),
EndpointKind::Write,
endpoint.named,
token,
),
key | WRITE_NOT_READ_BIT,
InternalFlags::empty(),
))
}
fn kopenat(
&self,
id: usize,
@@ -369,47 +200,40 @@ impl KernelScheme for PipeScheme {
_ctx: CallerCtx,
token: &mut CleanLockToken,
) -> Result<OpenResult> {
let is_scheme_root = {
let handles = HANDLES.read(token.token());
match handles.get(&id) {
Some(Handle::SchemeRoot) => true,
Some(Handle::Endpoint(_)) => false,
None => return Err(Error::new(EBADF)),
}
};
let (_, key) = from_raw_id(id);
if is_scheme_root {
let path = user_buf.as_str().or(Err(Error::new(EINVAL)))?;
if !path.trim_start_matches('/').is_empty() {
return Err(Error::new(ENOENT));
{
let guard = PIPES.read(token.token());
if let Some(Handle::SchemeRoot) = guard.get(&key) {
} else if let Some(Handle::Pipe(pipe_arc)) = guard.get(&key) {
let pipe = Arc::clone(pipe_arc);
drop(guard);
if user_buf.as_bytes() == b"write" {
return Err(Error::new(EINVAL));
}
if pipe.has_run_dup.swap(true, Ordering::SeqCst) {
return Err(Error::new(EBADF));
}
let pipe = Arc::new(Pipe::new());
return Ok(OpenResult::SchemeLocal(
open_endpoint(pipe, EndpointKind::Read, None, token),
key | WRITE_NOT_READ_BIT,
InternalFlags::empty(),
));
} else {
return Err(Error::new(EBADF));
}
}
let endpoint = Self::get_endpoint(id, token)?;
if !endpoint.kind.can_read() {
return Err(Error::new(EBADF));
let path = user_buf.as_str().or(Err(Error::new(EINVAL)))?;
if !path.trim_start_matches('/').is_empty() {
return Err(Error::new(ENOENT));
}
let path = user_buf.as_bytes();
if !path.is_empty() && path != b"write" {
return Err(Error::new(EINVAL));
}
let (read_id, _) = pipe(token)?;
Ok(OpenResult::SchemeLocal(
open_endpoint(
Arc::clone(&endpoint.pipe),
EndpointKind::Write,
endpoint.named,
token,
),
InternalFlags::empty(),
))
Ok(OpenResult::SchemeLocal(read_id, InternalFlags::empty()))
}
fn kread(
@@ -420,15 +244,16 @@ impl KernelScheme for PipeScheme {
_stored_flags: u32,
token: &mut CleanLockToken,
) -> Result<usize> {
let endpoint = Self::get_endpoint(id, token)?;
let (is_write_not_read, key) = from_raw_id(id);
if !endpoint.kind.can_read() {
if is_write_not_read {
return Err(Error::new(EBADF));
}
let pipe = Self::get_pipe(key, token)?;
loop {
let vec = endpoint.pipe.queue.lock(token.token());
let (mut vec, mut lock_token) = vec.into_split();
let vec = pipe.queue.lock(token.token());
let (mut vec, mut token) = vec.into_split();
let (s1, s2) = vec.as_slices();
let s1_count = core::cmp::min(user_buf.len(), s1.len());
@@ -448,34 +273,28 @@ impl KernelScheme for PipeScheme {
let _ = vec.drain(..bytes_read);
if bytes_read > 0 {
drop(vec);
drop(lock_token);
trigger_matching(&endpoint.pipe, false, true, EVENT_WRITE, token);
endpoint.pipe.write_condition.notify(token);
event::trigger_locked(
GlobalSchemes::Pipe.scheme_id(),
key | WRITE_NOT_READ_BIT,
EVENT_WRITE,
token.token(),
);
pipe.write_condition.notify_locked(token.token());
return Ok(bytes_read);
}
if user_buf.is_empty() {
} else if user_buf.is_empty() {
return Ok(0);
}
if endpoint.pipe.writer_count.load(Ordering::SeqCst) == 0 {
if !pipe.writer_is_alive.load(Ordering::SeqCst) {
return Ok(0);
}
if fcntl_flags & O_NONBLOCK as u32 != 0 {
} else if fcntl_flags & O_NONBLOCK as u32 != 0 {
return Err(Error::new(EAGAIN));
}
if !endpoint
.pipe
.read_condition
.wait(vec, "PipeRead::read", &mut lock_token)
{
} else if !pipe.read_condition.wait(vec, "PipeRead::read", &mut token) {
return Err(Error::new(EINTR));
}
}
}
fn kwrite(
&self,
id: usize,
@@ -484,17 +303,18 @@ impl KernelScheme for PipeScheme {
_stored_flags: u32,
token: &mut CleanLockToken,
) -> Result<usize> {
let endpoint = Self::get_endpoint(id, token)?;
let (is_write_not_read, key) = from_raw_id(id);
if !endpoint.kind.can_write() {
if !is_write_not_read {
return Err(Error::new(EBADF));
}
let pipe = Self::get_pipe(key, token)?;
loop {
let vec = endpoint.pipe.queue.lock(token.token());
let (mut vec, mut lock_token) = vec.into_split();
let vec = pipe.queue.lock(token.token());
let (mut vec, mut token) = vec.into_split();
if endpoint.pipe.reader_count.load(Ordering::Relaxed) == 0 {
if !pipe.reader_is_alive.load(Ordering::Relaxed) {
return Err(Error::new(EPIPE));
}
@@ -509,6 +329,7 @@ impl KernelScheme for PipeScheme {
let mut bytes_written = 0;
// TODO: Modify VecDeque so that the unwritten portions can be accessed directly?
for (idx, chunk) in src_buf.in_variable_chunks(TMPBUF_SIZE).enumerate() {
let chunk_byte_count = match chunk.copy_common_bytes_to_slice(&mut tmp_buf) {
Ok(c) => c,
@@ -520,52 +341,41 @@ impl KernelScheme for PipeScheme {
}
if bytes_written > 0 {
drop(vec);
drop(lock_token);
trigger_matching(&endpoint.pipe, true, false, EVENT_READ, token);
endpoint.pipe.read_condition.notify(token);
event::trigger_locked(
GlobalSchemes::Pipe.scheme_id(),
key,
EVENT_READ,
token.token(),
);
pipe.read_condition.notify_locked(token.token());
return Ok(bytes_written);
}
if user_buf.is_empty() {
} else if user_buf.is_empty() {
return Ok(0);
}
if fcntl_flags & O_NONBLOCK as u32 != 0 {
return Err(Error::new(EAGAIN));
}
if !endpoint
.pipe
} else if !pipe
.write_condition
.wait(vec, "PipeWrite::write", &mut lock_token)
.wait(vec, "PipeWrite::write", &mut token)
{
return Err(Error::new(EINTR));
}
}
}
fn kfpath(&self, id: usize, buf: UserSliceWo, token: &mut CleanLockToken) -> Result<usize> {
let endpoint = Self::get_endpoint(id, token)?;
if let Some(named) = endpoint.named {
buf.copy_common_bytes_from_slice(named.path.as_bytes())
} else {
buf.copy_common_bytes_from_slice("/scheme/pipe/".as_bytes())
}
fn kfpath(&self, _id: usize, buf: UserSliceWo, _token: &mut CleanLockToken) -> Result<usize> {
//TODO: construct useful path?
buf.copy_common_bytes_from_slice("/scheme/pipe/".as_bytes())
}
fn kfstat(&self, id: usize, buf: UserSliceWo, token: &mut CleanLockToken) -> Result<()> {
let endpoint = Self::get_endpoint(id, token)?;
let mode = endpoint.named.map_or(0o666, |named| named.mode);
fn kfstat(&self, _id: usize, buf: UserSliceWo, _token: &mut CleanLockToken) -> Result<()> {
buf.copy_exactly(&Stat {
st_mode: MODE_FIFO | mode,
st_mode: MODE_FIFO | 0o666,
..Default::default()
})?;
Ok(())
}
fn kfdwrite(
&self,
id: usize,
@@ -575,17 +385,23 @@ impl KernelScheme for PipeScheme {
_metadata: &[u64],
token: &mut CleanLockToken,
) -> Result<usize> {
let endpoint = Self::get_endpoint(id, token)?;
let (is_write_not_read, key) = from_raw_id(id);
if !endpoint.kind.can_write() {
if !is_write_not_read {
return Err(Error::new(EBADF));
}
let pipe = match Self::get_pipe(key, token) {
Ok(p) => p,
Err(e) => {
return Err(e);
}
};
loop {
let vec = endpoint.pipe.fd_queue.lock(token.token());
let (mut vec, mut lock_token) = vec.into_split();
let vec = pipe.fd_queue.lock(token.token());
let (mut vec, mut token) = vec.into_split();
if endpoint.pipe.reader_count.load(Ordering::Relaxed) == 0 {
if !pipe.reader_is_alive.load(Ordering::Relaxed) {
return Err(Error::new(EPIPE));
}
if descs.is_empty() {
@@ -605,24 +421,25 @@ impl KernelScheme for PipeScheme {
let fds_written = vec.len() - before_len;
if fds_written > 0 {
drop(vec);
drop(lock_token);
trigger_matching(&endpoint.pipe, true, false, EVENT_READ, token);
endpoint.pipe.read_condition.notify(token);
event::trigger_locked(
GlobalSchemes::Pipe.scheme_id(),
key,
EVENT_READ,
token.token(),
);
pipe.read_condition.notify_locked(token.token());
return Ok(fds_written);
}
if !endpoint
.pipe
if !pipe
.write_condition
.wait(vec, "PipeWrite::write", &mut lock_token)
.wait(vec, "PipeWrite::write", &mut token)
{
return Err(Error::new(EINTR));
}
}
}
fn kfdread(
&self,
id: usize,
@@ -631,19 +448,25 @@ impl KernelScheme for PipeScheme {
_metadata: &[u64],
token: &mut CleanLockToken,
) -> Result<usize> {
let endpoint = Self::get_endpoint(id, token)?;
let (is_write_not_read, key) = from_raw_id(id);
if !endpoint.kind.can_read() {
if is_write_not_read {
return Err(Error::new(EBADF));
}
let pipe = match Self::get_pipe(key, token) {
Ok(p) => p,
Err(e) => {
return Err(e);
}
};
if payload.is_empty() {
return Ok(0);
}
loop {
let vec = endpoint.pipe.fd_queue.lock(token.token());
let (mut vec, mut lock_token) = vec.into_split();
let vec = pipe.fd_queue.lock(token.token());
let (mut vec, mut token) = vec.into_split();
let fds_available = vec.len();
let max_fds_read = payload.len() / size_of::<usize>();
@@ -656,33 +479,31 @@ impl KernelScheme for PipeScheme {
fds_to_transfer,
payload,
flags.contains(CallFlags::FD_CLOEXEC),
&mut lock_token,
&mut token,
)?;
} else {
bulk_add_fds(
fds_to_transfer,
payload,
flags.contains(CallFlags::FD_CLOEXEC),
&mut lock_token,
&mut token,
)?;
}
drop(vec);
drop(lock_token);
trigger_matching(&endpoint.pipe, false, true, EVENT_WRITE, token);
endpoint.pipe.write_condition.notify(token);
event::trigger_locked(
GlobalSchemes::Pipe.scheme_id(),
key | WRITE_NOT_READ_BIT,
EVENT_WRITE,
token.token(),
);
pipe.write_condition.notify_locked(token.token());
return Ok(fds_to_read);
}
if endpoint.pipe.writer_count.load(Ordering::SeqCst) == 0 {
if !pipe.writer_is_alive.load(Ordering::SeqCst) {
return Ok(0);
}
if !endpoint
.pipe
.read_condition
.wait(vec, "PipeRead::read", &mut lock_token)
{
} else if !pipe.read_condition.wait(vec, "PipeRead::read", &mut token) {
return Err(Error::new(EINTR));
}
}
@@ -690,23 +511,11 @@ impl KernelScheme for PipeScheme {
}
pub struct Pipe {
read_condition: WaitCondition,
write_condition: WaitCondition,
read_condition: WaitCondition, // signals whether there are available bytes to read
write_condition: WaitCondition, // signals whether there is room for additional bytes
queue: Mutex<L1, VecDeque<u8>>,
reader_count: AtomicUsize,
writer_count: AtomicUsize,
reader_is_alive: AtomicBool, // starts set, unset when reader closes
writer_is_alive: AtomicBool, // starts set, unset when writer closes
has_run_dup: AtomicBool,
fd_queue: Mutex<L1, VecDeque<Arc<LockedFileDescription>>>,
}
impl Pipe {
/// Creates an empty pipe: no queued bytes, no queued file descriptions,
/// and reader/writer counters both at zero.
fn new() -> Self {
Self {
read_condition: WaitCondition::new(),
write_condition: WaitCondition::new(),
queue: Mutex::new(VecDeque::new()),
// Both counters start at zero.
reader_count: AtomicUsize::new(0),
writer_count: AtomicUsize::new(0),
fd_queue: Mutex::new(VecDeque::new()),
}
}
}
+12 -59
View File
@@ -105,7 +105,6 @@ enum ContextHandle {
// Attr handles, to set ens/euid/egid/pid.
Authority,
Attr,
Groups,
Status {
privileged: bool,
@@ -262,7 +261,6 @@ impl ProcScheme {
let handle = match actual_name {
"attrs" => ContextHandle::Attr,
"status" => ContextHandle::Status { privileged: true },
"groups" => ContextHandle::Groups,
_ => return Err(Error::new(ENOENT)),
};
@@ -308,11 +306,6 @@ impl ProcScheme {
let id = NonZeroUsize::new(NEXT_ID.fetch_add(1, Ordering::Relaxed))
.ok_or(Error::new(EMFILE))?;
let context = context::spawn(true, Some(id), ret, token)?;
{
let parent_groups =
context::current().read(token.token()).groups.clone();
context.write(token.token()).groups = parent_groups;
}
HANDLES.write(token.token()).insert(
id.get(),
Handle {
@@ -432,7 +425,6 @@ impl KernelScheme for ProcScheme {
}
fn close(&self, id: usize, token: &mut CleanLockToken) -> Result<()> {
let mut inner_token = unsafe { CleanLockToken::new() };
let handle = HANDLES
.write(token.token())
.remove(&id)
@@ -460,7 +452,9 @@ impl KernelScheme for ProcScheme {
))]
regs.set_arg1(arg1);
Ok(context.set_addr_space(Some(new), inner_token.downgrade()))
// TODO: Lock ordering violation
let mut token = unsafe { CleanLockToken::new() };
Ok(context.set_addr_space(Some(new), token.downgrade()))
})?;
if let Some(old_ctx) = old_ctx
&& let Some(addrspace) = Arc::into_inner(old_ctx)
@@ -499,7 +493,6 @@ impl KernelScheme for ProcScheme {
consume: bool,
token: &mut CleanLockToken,
) -> Result<usize> {
let mut inner_token = unsafe { CleanLockToken::new() };
let handle = HANDLES
.read(token.token())
.get(&id)
@@ -590,7 +583,9 @@ impl KernelScheme for ProcScheme {
};
// TODO: Allocated or AllocatedShared?
let addrsp = AddrSpace::current()?;
let page = addrsp.acquire_write(inner_token.downgrade()).mmap_anywhere(
// TODO: Lock ordering violation
let mut token = unsafe { CleanLockToken::new() };
let page = addrsp.acquire_write(token.downgrade()).mmap_anywhere(
&addrsp,
NonZeroUsize::new(1).unwrap(),
MapFlags::PROT_READ | MapFlags::PROT_WRITE,
@@ -854,17 +849,17 @@ impl KernelScheme for ProcScheme {
}
}
fn extract_scheme_number(fd: usize, token: &mut CleanLockToken) -> Result<(KernelSchemes, usize)> {
let desc = {
let (scheme_id, number) = {
let current_lock = context::current();
let mut current = current_lock.read(token.token());
let (context, mut context_token) = current.token_split();
let (context, mut token) = current.token_split();
let file_descriptor = context
.get_file(FileHandle::from(fd), &mut context_token)
.get_file(FileHandle::from(fd), &mut token)
.ok_or(Error::new(EBADF))?;
*file_descriptor.description.read(context_token.token())
let desc = file_descriptor.description.read(token.token());
(desc.scheme, desc.number)
};
let scheme = desc.get_scheme(token)?;
let number = desc.number;
let scheme = scheme::get_scheme(token.token(), scheme_id)?;
Ok((scheme, number))
}
@@ -1276,39 +1271,6 @@ impl ContextHandle {
guard.prio = (info.prio as usize).min(39);
Ok(size_of::<ProcSchemeAttrs>())
}
Self::Groups => {
const NGROUPS_MAX: usize = 65536;
if buf.len() % size_of::<u32>() != 0 {
return Err(Error::new(EINVAL));
}
let count = buf.len() / size_of::<u32>();
if count > NGROUPS_MAX {
return Err(Error::new(EINVAL));
}
let mut groups = Vec::with_capacity(count);
for chunk in buf.in_exact_chunks(size_of::<u32>()).take(count) {
groups.push(chunk.read_u32()?);
}
let proc_id = {
let guard = context.read(token.token());
guard.owner_proc_id
};
{
let mut guard = context.write(token.token());
guard.groups = groups.clone();
}
if let Some(pid) = proc_id {
let mut contexts = context::contexts(token.downgrade());
let (contexts, mut t) = contexts.token_split();
for context_ref in contexts.iter() {
let mut ctx = context_ref.write(t.token());
if ctx.owner_proc_id == Some(pid) {
ctx.groups = groups.clone();
}
}
}
Ok(count * size_of::<u32>())
}
ContextHandle::OpenViaDup => {
let mut args = buf.usizes();
@@ -1513,15 +1475,6 @@ impl ContextHandle {
debug_name,
})
}
Self::Groups => {
let c = &context.read(token.token());
let max = buf.len() / size_of::<u32>();
let count = c.groups.len().min(max);
for (chunk, gid) in buf.in_exact_chunks(size_of::<u32>()).zip(&c.groups).take(count) {
chunk.copy_from_slice(&gid.to_ne_bytes())?;
}
Ok(count * size_of::<u32>())
}
ContextHandle::Sighandler => {
let data = match context.read(token.token()).sig {
Some(ref sig) => SetSighandlerData {
+63 -157
View File
@@ -80,7 +80,6 @@ const ONE: NonZeroUsize = match NonZeroUsize::new(1) {
Some(one) => one,
None => unreachable!(),
};
const MAX_SPURIOUS_WAKEUPS: usize = 100;
enum ParsedCqe {
TriggerFevent {
@@ -210,8 +209,6 @@ impl UserInner {
caller_responsible: &mut PageSpan,
token: &mut CleanLockToken,
) -> Result<Response> {
let mut remaining_spurious_wakeups = MAX_SPURIOUS_WAKEUPS;
{
// Disable preemption to avoid context switches between setting the
// process state and sending the scheme request. The process is made
@@ -264,10 +261,7 @@ impl UserInner {
};
let states = self.states.lock(token.token());
let (mut states, mut state_token) = states.into_split();
let mut timed_out_descriptions = None;
let mut remove_state = false;
let mut timed_out = false;
let (mut states, mut token) = states.into_split();
match states.get_mut(sqe.tag as usize) {
// invalid state
None => return Err(Error::new(EBADFD)),
@@ -280,35 +274,24 @@ impl UserInner {
fds,
} => {
let maybe_eintr =
eintr_if_sigkill(&mut callee_responsible, &mut state_token.token());
if maybe_eintr.is_ok() {
remaining_spurious_wakeups =
remaining_spurious_wakeups.saturating_sub(1);
}
if maybe_eintr.is_ok() && remaining_spurious_wakeups == 0 {
timed_out_descriptions = Some(Self::collect_descriptions_to_close(fds));
remove_state = true;
} else {
*o = State::Waiting {
canceling: true,
callee_responsible,
context,
fds,
};
}
eintr_if_sigkill(&mut callee_responsible, &mut token.token());
*o = State::Waiting {
canceling: true,
callee_responsible,
context,
fds,
};
maybe_eintr?;
if remove_state {
states.remove(sqe.tag as usize);
timed_out = true;
} else {
context::current()
.write(state_token.token())
.block("UserInner::call (woken up after cancelation request)");
}
context::current()
.write(token.token())
.block("UserInner::call (woken up after cancelation request)");
// We do not want to drop the lock before blocking
// as if we get preempted in between we might miss a
// wakeup.
drop(states);
}
// spurious wakeup
State::Waiting {
@@ -317,76 +300,60 @@ impl UserInner {
context,
mut callee_responsible,
} => {
let maybe_eintr = eintr_if_sigkill(&mut callee_responsible, &mut token);
let current_context = context::current();
let maybe_eintr =
eintr_if_sigkill(&mut callee_responsible, &mut state_token);
if maybe_eintr.is_ok() {
remaining_spurious_wakeups =
remaining_spurious_wakeups.saturating_sub(1);
}
if maybe_eintr.is_ok() && remaining_spurious_wakeups == 0 {
timed_out_descriptions = Some(Self::collect_descriptions_to_close(fds));
remove_state = true;
} else {
*o = State::Waiting {
// Currently we treat all spurious wakeups to have the same behavior
// as signals (i.e., we send a cancellation request). It is not something
// that should happen, but it certainly can happen, for example if a context
// is awoken through its thread handle without setting any sig bits, or if the
// caller clears its own sig bits. If it actually is a signal, then it is the
// intended behavior.
canceling: true,
fds,
context,
callee_responsible,
};
}
*o = State::Waiting {
// Currently we treat all spurious wakeups to have the same behavior
// as signals (i.e., we send a cancellation request). It is not something
// that should happen, but it certainly can happen, for example if a context
// is awoken through its thread handle without setting any sig bits, or if the
// caller clears its own sig bits. If it actually is a signal, then it is the
// intended behavior.
canceling: true,
fds,
context,
callee_responsible,
};
maybe_eintr?;
if remove_state {
states.remove(sqe.tag as usize);
timed_out = true;
} else {
// We do not want to preempt between sending the
// cancellation and blocking again where we might
// miss a wakeup.
let mut preempt =
PreemptGuardL1::new(&current_context, &mut state_token);
let token = preempt.token();
// We do not want to preempt between sending the
// cancellation and blocking again where we might
// miss a wakeup.
let mut preempt = PreemptGuardL1::new(&current_context, &mut token);
let token = preempt.token();
self.todo.send_locked(
Sqe {
opcode: Opcode::Cancel as u8,
sqe_flags: SqeFlags::ONEWAY,
tag: sqe.tag,
..Default::default()
},
token.token(),
);
event::trigger_locked(
self.root_id,
self.scheme_id.get(),
EVENT_READ,
token.token(),
);
self.todo.send_locked(
Sqe {
opcode: Opcode::Cancel as u8,
sqe_flags: SqeFlags::ONEWAY,
tag: sqe.tag,
..Default::default()
},
token.token(),
);
event::trigger_locked(
self.root_id,
self.scheme_id.get(),
EVENT_READ,
token.token(),
);
// 1. If cancellation was requested and arrived
// before the scheme processed the request, an
// acknowledgement will be sent back after the
// cancellation is processed and we will be woken up
// again. State will be State::Responded then.
//
// 2. If cancellation was requested but the scheme
// already processed the request, we will receive
// the actual response next and woken up again.
// State will be State::Responded then.
context::current()
.write(token.token())
.block("UserInner::call (spurious wakeup)");
}
// 1. If cancellation was requested and arrived
// before the scheme processed the request, an
// acknowledgement will be sent back after the
// cancellation is processed and we will be woken up
// again. State will be State::Responded then.
//
// 2. If cancellation was requested but the scheme
// already processed the request, we will receive
// the actual response next and woken up again.
// State will be State::Responded then.
context::current()
.write(token.token())
.block("UserInner::call (spurious wakeup)");
drop(states);
}
// invalid state
@@ -401,70 +368,10 @@ impl UserInner {
}
},
}
if let Some(descriptions) = timed_out_descriptions {
drop(states);
for desc in descriptions {
let _ = desc.try_close(token);
}
}
if timed_out {
return Err(Error::new(ETIMEDOUT));
}
}
}
}
fn collect_descriptions_to_close(
fds: Vec<Arc<LockedFileDescription>>,
) -> Vec<FileDescription> {
fds.into_iter()
.filter_map(|fd| Arc::try_unwrap(fd).ok())
.map(RwLock::into_inner)
.collect()
}
/// Fails every call that is still in `State::Waiting`.
///
/// For each waiting entry whose context is still alive, the state is replaced
/// by an `ENODEV` response and the context is unblocked. Entries whose context
/// can no longer be upgraded are removed. The file descriptions attached to the
/// waiting calls are collected and closed once the block that takes the
/// `states` lock has ended.
pub fn fail_pending_calls(&self, token: &mut CleanLockToken) {
let descriptions_to_close = {
let mut states_lock = self.states.lock(token.token());
let (states, mut lock_token) = states_lock.token_split();
let mut descriptions_to_close = Vec::new();
// Removal is deferred: ids are recorded here and removed after the
// `iter_mut()` loop below has finished.
let mut states_to_remove = Vec::new();
for (id, state) in states.iter_mut() {
// Temporarily take the state out so its fields can be moved.
match mem::replace(state, State::Placeholder) {
State::Waiting { context, fds, .. } => {
descriptions_to_close.extend(Self::collect_descriptions_to_close(fds));
match context.upgrade() {
Some(context) => {
// The caller is still around: hand it an ENODEV response and wake it.
*state = State::Responded(Response::Regular(
Err(Error::new(ENODEV)),
0,
false,
));
context.write(lock_token.token()).unblock();
}
// The waiting context is gone; drop its slot.
None => states_to_remove.push(id),
}
}
// Any non-waiting state is put back unchanged.
old_state => *state = old_state,
}
}
for id in states_to_remove {
states.remove(id);
}
descriptions_to_close
};
// `states_lock` went out of scope with the block above; close errors are
// deliberately ignored (best effort).
for desc in descriptions_to_close {
let _ = desc.try_close(token);
}
}
/// Map a readable structure to the scheme's userspace and return the
/// pointer
#[must_use = "copying back to head/tail buffers can fail"]
@@ -1376,7 +1283,6 @@ impl UserInner {
}
/// Consumes this `UserInner`: first fails all calls that are still pending,
/// then calls `into_drop` on the `todo` queue's condition.
pub fn into_drop(self, token: &mut CleanLockToken) {
self.fail_pending_calls(token);
self.todo.condition.into_drop(token);
}
}
@@ -74,16 +74,14 @@ impl MemoryEntry {
}
struct MemoryMap {
entries: [MemoryEntry; 1024],
entries: [MemoryEntry; 512],
size: usize,
}
impl MemoryMap {
fn register(&mut self, base: usize, size: usize, kind: BootloaderMemoryKind) {
if self.size >= self.entries.len() {
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
unsafe { core::arch::asm!("out dx, al", in("dx") 0x3F8u16, in("al") b'!', options(nostack, preserves_flags)); }
panic!("Early memory map overflow at entry {} (max {})", self.size, self.entries.len());
panic!("Early memory map overflow!");
}
let start = if kind == BootloaderMemoryKind::Free {
align_up(base)
@@ -136,7 +134,7 @@ static MEMORY_MAP: SyncUnsafeCell<MemoryMap> = SyncUnsafeCell::new(MemoryMap {
start: 0,
end: 0,
kind: BootloaderMemoryKind::Null,
}; 1024],
}; 512],
size: 0,
});
@@ -325,16 +323,7 @@ unsafe fn map_memory<A: Arch>(areas: &[MemoryArea], mut bump_allocator: &mut Bum
}
}
let kernel_area = match (*MEMORY_MAP.get()).kernel() {
Some(area) => area,
None => {
println!("FATAL: kernel memory area not found in boot memory map");
println!("Cannot determine kernel base address. Halting.");
loop {
core::hint::spin_loop();
}
}
};
let kernel_area = (*MEMORY_MAP.get()).kernel().unwrap();
let kernel_base = kernel_area.start;
let kernel_size = kernel_area.end.saturating_sub(kernel_area.start);
// Map kernel at KERNEL_OFFSET
+3 -10
View File
@@ -149,15 +149,6 @@ static BOOTSTRAP: spin::Once<Bootstrap> = spin::Once::new();
pub(crate) static AP_READY: AtomicBool = AtomicBool::new(false);
static BSP_READY: AtomicBool = AtomicBool::new(false);
/// Prints `message` followed by a fixed "cannot continue" line, then spins
/// forever. Never returns.
///
/// `message` is printed with `print!` (no newline is appended), so callers
/// supply their own trailing newline.
#[cold]
fn halt_boot(message: &str) -> ! {
print!("{message}");
println!("Kernel boot cannot continue. Halting.");
// Park this CPU in a busy loop; `spin_loop` is only a hint to the processor.
loop {
hint::spin_loop();
}
}
/// This is the kernel entry point for the primary CPU. The arch crate is responsible for calling this
pub(crate) fn kmain(bootstrap: Bootstrap) -> ! {
let mut token = unsafe { CleanLockToken::new() };
@@ -189,7 +180,9 @@ pub(crate) fn kmain(bootstrap: Bootstrap) -> ! {
context.euid = 0;
context.egid = 0;
}
Err(_err) => halt_boot("FATAL: failed to spawn first userspace process userspace_init\n"),
Err(err) => {
panic!("failed to spawn userspace_init: {:?}", err);
}
}
run_userspace(&mut token)
-188
View File
@@ -1,188 +0,0 @@
//! MCS (Mellor-Crummey Scott) fair spinlock.
//!
//! Each waiter spins on its own local `locked` flag instead of a shared lock
//! word, eliminating cache-line bouncing under contention. FIFO ordering
//! guarantees fairness. O(1) cache-line transfers on unlock.
//!
//! Supports transitive priority inheritance: when CPU A waits on a lock held
//! by CPU B, and CPU B waits on a lock held by CPU C, A's priority is
//! propagated through the chain to C (up to MAX_PI_CHAIN_DEPTH hops).
use core::sync::atomic::{AtomicBool, AtomicPtr, AtomicU32, Ordering};
use core::{hint, ptr};
use crate::percpu::PercpuBlock;
/// Maximum depth for transitive priority inheritance chain following.
/// Prevents infinite loops from theoretical lock cycles and bounds latency.
/// Linux uses 20; 8 is conservative for a microkernel with fewer nesting levels.
const MAX_PI_CHAIN_DEPTH: u32 = 8;
/// Queue entry used by one waiter of an MCS lock.
pub struct McsNode {
    /// Successor in the wait queue; null while no node is queued behind this one.
    pub next: AtomicPtr<McsNode>,
    /// Flag the owner of this node spins on while it waits for the lock.
    pub locked: AtomicBool,
}

impl McsNode {
    /// Creates a detached node: no successor and the `locked` flag cleared.
    pub const fn new() -> Self {
        let next = AtomicPtr::new(ptr::null_mut());
        let locked = AtomicBool::new(false);
        Self { next, locked }
    }
}
/// Raw MCS spinlock primitive.
pub struct McsRawLock {
    /// Last node in the wait queue, or null when nobody holds or waits for the lock.
    tail: AtomicPtr<McsNode>,
    /// CPU ID of the current lock holder (for priority inheritance).
    /// `u32::MAX` means no holder.
    holder_cpu: AtomicU32,
}

impl McsRawLock {
    /// Creates an unlocked lock with no recorded holder.
    pub const fn new() -> Self {
        Self {
            tail: AtomicPtr::new(ptr::null_mut()),
            holder_cpu: AtomicU32::new(u32::MAX),
        }
    }

    /// Acquires the lock, queueing `node` and spinning on it if the lock is taken.
    ///
    /// Returns `false` when the lock was free (uncontended) and `true` when the
    /// caller had to wait behind another holder.
    ///
    /// `node` must stay valid and must not be reused for another acquisition
    /// until the matching [`release`](Self::release): other CPUs dereference raw
    /// pointers to it while it is queued.
    #[inline]
    pub fn acquire(&self, node: &McsNode) -> bool {
        node.next.store(ptr::null_mut(), Ordering::Relaxed);
        node.locked.store(true, Ordering::Relaxed);

        // Become the new tail; the previous tail (if any) is our predecessor.
        let prev = self.tail.swap((node as *const McsNode).cast_mut(), Ordering::AcqRel);

        if prev.is_null() {
            // Uncontended — record ourselves as holder
            let cpu_id = PercpuBlock::current().cpu_id.get();
            self.holder_cpu.store(cpu_id, Ordering::Release);
            return false;
        }

        // SAFETY: `prev` was published through `tail` by a queued acquirer whose
        // `release` waits for this link before handing the lock over, so the node
        // is still in use by that acquirer. Only its atomic `next` is touched.
        unsafe {
            (*prev).next.store((node as *const McsNode).cast_mut(), Ordering::Release);
        }

        let percpu = PercpuBlock::current();

        // Record which lock we're spinning on (for transitive PI chain following)
        percpu.waiting_on_lock.store(
            (self as *const McsRawLock).cast_mut(),
            Ordering::Release,
        );

        let mut donated = false;
        while node.locked.load(Ordering::Acquire) {
            percpu.maybe_handle_tlb_shootdown();

            // Donate priority to the lock holder (transitively) once per acquisition
            if !donated {
                self.maybe_donate_priority(percpu);
                donated = true;
            }

            hint::spin_loop();
        }

        // Clear waiting_on_lock before proceeding — we now hold the lock
        percpu.waiting_on_lock.store(ptr::null_mut(), Ordering::Release);
        self.holder_cpu.store(percpu.cpu_id.get(), Ordering::Release);
        true
    }

    /// Releases the lock previously acquired with `node`, handing it to the next
    /// queued waiter if there is one.
    #[inline]
    pub fn release(&self, node: &McsNode) {
        // Clear priority inheritance donation — we no longer hold the lock
        PercpuBlock::current().pi_donated_prio.store(u32::MAX, Ordering::Release);

        // Clear holder CPU
        self.holder_cpu.store(u32::MAX, Ordering::Release);

        let next = node.next.load(Ordering::Acquire);
        if next.is_null() {
            // No visible successor: if we are still the tail the queue is empty
            // and the lock becomes free.
            if self
                .tail
                .compare_exchange(
                    (node as *const McsNode).cast_mut(),
                    ptr::null_mut(),
                    Ordering::AcqRel,
                    Ordering::Acquire,
                )
                .is_ok()
            {
                return;
            }
            // Someone swapped themselves into `tail` but has not linked to us
            // yet; wait until their node shows up in `next`.
            while node.next.load(Ordering::Acquire).is_null() {
                hint::spin_loop();
            }
        }

        // SAFETY: `node.next` is non-null here (either loaded non-null above or
        // awaited in the loop) and points at the successor's node, which that
        // waiter keeps alive while it spins on `locked`.
        unsafe {
            (*node.next.load(Ordering::Acquire)).locked.store(false, Ordering::Release);
        }
    }

    /// Tries to take the lock without waiting.
    ///
    /// Succeeds only when the queue is empty; returns `true` if the lock was
    /// acquired with `node` and `false` otherwise.
    #[inline]
    pub fn try_acquire(&self, node: &McsNode) -> bool {
        node.next.store(ptr::null_mut(), Ordering::Relaxed);
        node.locked.store(true, Ordering::Relaxed);
        let ok = self
            .tail
            .compare_exchange(
                ptr::null_mut(),
                (node as *const McsNode).cast_mut(),
                Ordering::AcqRel,
                Ordering::Acquire,
            )
            .is_ok();
        if ok {
            let cpu_id = PercpuBlock::current().cpu_id.get();
            self.holder_cpu.store(cpu_id, Ordering::Release);
        }
        ok
    }

    /// Donate current CPU's context priority to the lock holder's CPU,
    /// following the PI chain transitively (A→B→C).
    ///
    /// Reads priority from PercpuBlock::current_prio (cached by the scheduler)
    /// to avoid acquiring any lock in the MCS spin loop.
    ///
    /// Chain following: if the holder is itself waiting on another lock,
    /// we propagate our priority to that lock's holder too, up to
    /// MAX_PI_CHAIN_DEPTH hops.
    fn maybe_donate_priority(&self, my_percpu: &PercpuBlock) {
        let my_prio = my_percpu.current_prio.get() as u32;
        let mut current_holder_cpu = self.holder_cpu.load(Ordering::Relaxed);

        for _ in 0..MAX_PI_CHAIN_DEPTH {
            if current_holder_cpu == u32::MAX {
                return;
            }

            let holder_percpu = crate::percpu::get_for_cpu(
                crate::cpu_set::LogicalCpuId::new(current_holder_cpu),
            );
            let Some(holder) = holder_percpu else {
                return;
            };

            // Donate if our priority is higher (lower number) than the current
            // donation. `fetch_min` performs the compare and the update as one
            // atomic step, so a concurrent donor with a less urgent priority can
            // never overwrite a more urgent donation (a separate load followed
            // by a store could lose that race).
            holder.pi_donated_prio.fetch_min(my_prio, Ordering::Release);

            // Follow the chain: is this holder also waiting on another lock?
            let next_lock_ptr = holder.waiting_on_lock.load(Ordering::Relaxed);
            if next_lock_ptr.is_null() {
                return;
            }

            // SAFETY: The pointed-to McsRawLock is a long-lived struct field
            // (e.g., part of the run queue). The holder is currently spinning
            // in acquire(), so the pointer is valid. We only read holder_cpu
            // (an atomic u32) — no mutable access needed.
            let next_holder_cpu =
                unsafe { (*next_lock_ptr).holder_cpu.load(Ordering::Relaxed) };

            // Cycle detection: if the next holder is the same CPU we just visited, stop
            if next_holder_cpu == current_holder_cpu {
                return;
            }

            current_holder_cpu = next_holder_cpu;
        }
        // Chain depth exhausted — stop to bound latency
    }
}
@@ -1,6 +1,5 @@
pub use self::{ordered::*, wait_condition::WaitCondition, wait_queue::WaitQueue};
pub mod mcs;
pub mod ordered;
pub mod wait_condition;
pub mod wait_queue;
@@ -52,9 +52,7 @@
//! *g1 = 12;
//! ```
use alloc::sync::Arc;
use core::cell::UnsafeCell;
use core::marker::PhantomData;
use core::ptr;
use crate::percpu::PercpuBlock;
@@ -734,143 +732,3 @@ impl<L: Level, T> Drop for ArcRwLockWriteGuard<L, T> {
/// This function can only be called if no lock is held by the calling thread/task
// The body is intentionally empty: the function only consumes an `L0` lock
// token by value. NOTE(review): presumably `L0` is the level a token has when
// nothing is locked, which would make this a compile-time check — confirm
// against the `Level` definitions.
#[inline]
pub fn check_no_locks(_: LockToken<'_, L0>) {}
// ---------------------------------------------------------------------------
// MCS-based fair mutex (McsMutex)
// ---------------------------------------------------------------------------
/// A mutual exclusion lock using the MCS fair spinlock algorithm.
///
/// Unlike `Mutex<L, T>` which uses a simple spinlock (no fairness under
/// contention), `McsMutex` uses Mellor-Crummey Scott queue-based spinning:
///
/// - Each waiter spins on its **own** local flag — no shared cache-line bouncing.
/// - FIFO ordering prevents starvation.
/// - O(1) cache-line transfers on unlock.
///
/// The MCS node is stored in [`crate::percpu::PercpuBlock::mcs_sched_node`], so
/// this type is suitable for scheduler-internal locks where the holder is always
/// the current CPU.
pub struct McsMutex<L: Level, T> {
// Queue-based raw lock that serialises access to `data`.
raw: crate::sync::mcs::McsRawLock,
// The protected value; in this file it is only dereferenced through the guard types.
data: UnsafeCell<T>,
// Carries the lock level `L` in the type without storing a value of it.
_phantom: PhantomData<L>,
}
// SAFETY (review note): these impls rely on `data` being reachable only through
// guards created after `raw` has been acquired, so at most one CPU accesses the
// value at a time; `T: Send` is required because the value is then used from
// whichever CPU holds the lock. Confirm no other accessor to `data` exists.
unsafe impl<L: Level, T: Send> Sync for McsMutex<L, T> {}
unsafe impl<L: Level, T: Send> Send for McsMutex<L, T> {}
impl<L: Level, T> McsMutex<L, T> {
/// Creates an unlocked mutex that owns `val`.
pub const fn new(val: T) -> Self {
Self {
raw: crate::sync::mcs::McsRawLock::new(),
data: UnsafeCell::new(val),
_phantom: PhantomData,
}
}
}
impl<L: Level, T> McsMutex<L, T> {
    /// Acquires the mutex using the current CPU's `mcs_sched_node`, waiting if
    /// it is already held, and returns a guard for the protected value.
    ///
    /// When the acquisition had to wait, the per-CPU `mcs_contention_count` is
    /// incremented by one.
    pub fn lock<'a, LP: Lower<L> + 'a>(
        &'a self,
        lock_token: LockToken<'a, LP>,
    ) -> McsMutexGuard<'a, L, T> {
        let cpu = PercpuBlock::current();
        if self.raw.acquire(&cpu.mcs_sched_node) {
            let counter = &cpu.mcs_contention_count;
            counter.set(counter.get() + 1);
        }
        let lock_token = LockToken::downgraded(lock_token);
        McsMutexGuard {
            lock: self,
            lock_token,
        }
    }

    /// Attempts to acquire the mutex without waiting.
    ///
    /// Returns `Some(guard)` when the lock was free and `None` when it is
    /// currently held or has waiters.
    pub fn try_lock<'a, LP: Lower<L> + 'a>(
        &'a self,
        lock_token: LockToken<'a, LP>,
    ) -> Option<McsMutexGuard<'a, L, T>> {
        let cpu = PercpuBlock::current();
        if !self.raw.try_acquire(&cpu.mcs_sched_node) {
            return None;
        }
        Some(McsMutexGuard {
            lock: self,
            lock_token: LockToken::downgraded(lock_token),
        })
    }
}
/// Guard returned by `McsMutex::lock` / `try_lock`.
///
/// Gives access to the protected value through `Deref`/`DerefMut` and releases
/// the underlying raw lock when dropped.
pub struct McsMutexGuard<'a, L: Level, T: 'a> {
// The mutex this guard keeps locked.
lock: &'a McsMutex<L, T>,
// Lock token at level `L`, produced by downgrading the caller's token.
lock_token: LockToken<'a, L>,
}
impl<'a, L: Level, T: 'a> McsMutexGuard<'a, L, T> {
/// Borrows the protected value mutably together with a level-`L` token, so
/// that further locks can be taken while the value is in use.
pub fn token_split(&mut self) -> (&mut T, LockToken<'_, L>) {
// Review note: the `&mut T` is derived from the `UnsafeCell` while this guard
// is borrowed mutably; soundness relies on the guard implying the raw lock is held.
unsafe { (&mut *self.lock.data.get(), self.lock_token.token()) }
}
/// Splits the guard into a token-less `McsRawGuard` (which still unlocks on
/// drop) and the owned lock token.
pub fn into_split(self) -> (McsRawGuard<'a, L, T>, LockToken<'a, L>) {
let lock_ref = self.lock;
// The token is moved out bitwise; `forget(self)` below prevents both a second
// use of the token and this guard's `Drop` (which would release the lock).
let token = unsafe { core::ptr::read(&self.lock_token) };
core::mem::forget(self);
(McsRawGuard { lock: lock_ref }, token)
}
/// Rebuilds a full guard from the two halves produced by `into_split`.
pub fn from_split(raw: McsRawGuard<'a, L, T>, token: LockToken<'a, L>) -> Self {
let lock_ref = raw.lock;
// Skip `McsRawGuard::drop` so the lock stays held by the new guard.
core::mem::forget(raw);
Self {
lock: lock_ref,
lock_token: token,
}
}
}
impl<L: Level, T> core::ops::Deref for McsMutexGuard<'_, L, T> {
type Target = T;
fn deref(&self) -> &Self::Target {
// Shared access to the protected value for as long as the guard is borrowed.
unsafe { &*self.lock.data.get() }
}
}
impl<L: Level, T> core::ops::DerefMut for McsMutexGuard<'_, L, T> {
fn deref_mut(&mut self) -> &mut Self::Target {
// Exclusive access to the protected value for as long as the guard is mutably borrowed.
unsafe { &mut *self.lock.data.get() }
}
}
impl<L: Level, T> Drop for McsMutexGuard<'_, L, T> {
fn drop(&mut self) {
// Releases with the *current* CPU's node. NOTE(review): this assumes the guard
// is dropped on the same CPU that acquired the lock — confirm with callers.
let percpu = PercpuBlock::current();
self.lock.raw.release(&percpu.mcs_sched_node);
}
}
/// Token-less half of a split `McsMutexGuard`.
///
/// Still dereferences to the protected value and still releases the raw lock
/// when dropped; it just no longer carries the `LockToken`.
pub struct McsRawGuard<'a, L: Level, T: 'a> {
// The mutex this guard keeps locked.
lock: &'a McsMutex<L, T>,
}
impl<L: Level, T> core::ops::Deref for McsRawGuard<'_, L, T> {
type Target = T;
fn deref(&self) -> &Self::Target {
// Shared access to the protected value for as long as the guard is borrowed.
unsafe { &*self.lock.data.get() }
}
}
impl<L: Level, T> core::ops::DerefMut for McsRawGuard<'_, L, T> {
fn deref_mut(&mut self) -> &mut Self::Target {
// Exclusive access to the protected value for as long as the guard is mutably borrowed.
unsafe { &mut *self.lock.data.get() }
}
}
impl<L: Level, T> Drop for McsRawGuard<'_, L, T> {
fn drop(&mut self) {
// Releases with the *current* CPU's node. NOTE(review): this assumes the guard
// is dropped on the same CPU that acquired the lock — confirm with callers.
let percpu = PercpuBlock::current();
self.lock.raw.release(&percpu.mcs_sched_node);
}
}
+102 -180
View File
@@ -2,7 +2,7 @@
use core::num::NonZeroUsize;
use alloc::{format, string::{String, ToString}, sync::Arc, vec::Vec};
use alloc::{string::String, sync::Arc, vec::Vec};
use redox_path::RedoxPath;
use crate::{
@@ -12,9 +12,9 @@ use crate::{
memory::{AddrSpace, GenericFlusher, Grant, PageSpan, TlbShootdownActions},
},
memory::{Page, VirtualAddress, PAGE_SIZE},
scheme::{self, pipe, FileHandle, KernelScheme, OpenResult, SchemeExt, StrOrBytes},
scheme::{self, FileHandle, KernelScheme, OpenResult, StrOrBytes},
sync::{CleanLockToken, RwLock},
syscall::{data::{GlobalSchemes, Stat}, error::*, flag::*},
syscall::{data::Stat, error::*, flag::*},
};
use super::usercopy::{UserSlice, UserSliceRo, UserSliceRw, UserSliceWo};
@@ -45,7 +45,7 @@ pub fn file_op_generic_ext<T>(
(file, desc)
};
let scheme = desc.get_scheme(token)?;
let scheme = scheme::get_scheme(token.token(), desc.scheme)?;
op(&*scheme, file.description, desc, token)
}
@@ -62,32 +62,55 @@ pub fn copy_path_to_buf(raw_path: UserSliceRo, max_len: usize) -> Result<String>
// TODO: Define elsewhere
const PATH_MAX: usize = PAGE_SIZE;
/// Builds the lookup key under which a named pipe is registered.
///
/// A path that begins with `/` is used verbatim. Any other path is qualified
/// with the scheme id and handle number it is relative to, giving
/// `@fifo:<scheme id>:<number>:<path>`.
fn fifo_path_key(scheme_id: scheme::SchemeId, number: usize, path: &str) -> String {
    if !path.starts_with('/') {
        return format!("@fifo:{}:{}:{}", scheme_id.get(), number, path);
    }
    path.to_string()
}
fn install_open_result(
scheme_id: scheme::SchemeId,
pub fn openat(
fh: FileHandle,
raw_path: UserSliceRo,
flags: usize,
open_result: OpenResult,
fcntl_flags: u32,
euid: u32,
egid: u32,
token: &mut CleanLockToken,
) -> Result<FileHandle> {
let new_description = match open_result {
OpenResult::SchemeLocal(number, internal_flags) => Arc::new(RwLock::new(
FileDescription::new(
scheme_id,
number,
0,
(flags & !O_CLOEXEC) as u32,
internal_flags,
token,
),
)),
OpenResult::External(desc) => desc,
let path_buf = copy_path_to_buf(raw_path, PATH_MAX)?;
let (scheme_id, number) = {
let current_lock = context::current();
let mut current = current_lock.read(token.token());
let (context, mut token) = current.token_split();
let pipe = context.get_file(fh, &mut token).ok_or(Error::new(EBADF))?;
let desc = pipe.description.read(token.token());
(desc.scheme, desc.number)
};
let caller_ctx = context::current()
.read(token.token())
.caller_ctx()
.filter_uid_gid(euid, egid);
let new_description = {
let scheme = scheme::get_scheme(token.token(), scheme_id)?;
let res = scheme.kopenat(
number,
StrOrBytes::from_str(&path_buf),
flags,
fcntl_flags,
caller_ctx,
token,
);
match res? {
OpenResult::SchemeLocal(number, internal_flags) => {
Arc::new(RwLock::new(FileDescription {
offset: 0,
internal_flags,
scheme: scheme_id,
number,
flags: (flags & !O_CLOEXEC) as u32,
}))
}
OpenResult::External(desc) => desc,
}
};
let current_lock = context::current();
@@ -103,102 +126,6 @@ fn install_open_result(
)
.ok_or(Error::new(EMFILE))
}
/// Checks whether `path` (relative to handle `number`) can be opened in `scheme`.
///
/// The path is probed with an `O_STAT` open. `ENOENT` maps to `Ok(false)`, any
/// other error is returned unchanged, and a successful open yields `Ok(true)`.
/// A scheme-local handle produced by the probe is closed again; a failure of
/// that close is ignored.
fn path_exists_in_scheme(
    scheme: &dyn KernelScheme,
    number: usize,
    path: &str,
    caller_ctx: scheme::CallerCtx,
    token: &mut CleanLockToken,
) -> Result<bool> {
    let probe = scheme.kopenat(number, StrOrBytes::from_str(path), O_STAT, 0, caller_ctx, token);
    let opened = match probe {
        Ok(opened) => opened,
        Err(err) if err.errno == ENOENT => return Ok(false),
        Err(err) => return Err(err),
    };
    // Only scheme-local results leave a handle behind that has to be closed.
    if let OpenResult::SchemeLocal(handle, _) = opened {
        let _ = scheme.close(handle, token);
    }
    Ok(true)
}
/// Opens `raw_path` relative to the file handle `fh` and installs the result
/// as a new file handle in the current context.
///
/// Named pipes are handled before the scheme is consulted:
/// - if a named pipe is registered under the path's FIFO key, it is opened
///   (or `EEXIST` is returned for `O_CREAT` combined with `O_EXCL` or with the
///   FIFO mode bits);
/// - otherwise, if the FIFO mode bits and `O_CREAT` are both set, a new named
///   pipe is created, unless the path already exists in the scheme (`EEXIST`).
///
/// All other opens are forwarded to the scheme's `kopenat`.
///
/// Returns `EBADF` when `fh` is not an open file of the current context.
pub fn openat(
fh: FileHandle,
raw_path: UserSliceRo,
flags: usize,
fcntl_flags: u32,
euid: u32,
egid: u32,
token: &mut CleanLockToken,
) -> Result<FileHandle> {
let path_buf = copy_path_to_buf(raw_path, PATH_MAX)?;
// Copy the description of `fh` out so the context lock is not held afterwards.
let desc = {
let current_lock = context::current();
let mut current = current_lock.read(token.token());
let (context, mut context_token) = current.token_split();
let pipe = context
.get_file(fh, &mut context_token)
.ok_or(Error::new(EBADF))?;
*pipe.description.read(context_token.token())
};
let scheme = desc.get_scheme(token)?;
let number = desc.number;
let scheme_id = desc.scheme;
// Caller identity with the uid/gid filtered by the `euid`/`egid` arguments.
let caller_ctx = context::current()
.read(token.token())
.caller_ctx()
.filter_uid_gid(euid, egid);
// True when all `MODE_FIFO` bits are present in `flags`.
let fifo_mode_requested = flags & MODE_FIFO as usize == MODE_FIFO as usize;
let fifo_key = fifo_path_key(scheme_id, number, &path_buf);
// Case 1: a named pipe is already registered under this key.
if pipe::named_pipe_exists(&fifo_key, token) {
if flags & O_EXCL == O_EXCL && flags & O_CREAT == O_CREAT {
return Err(Error::new(EEXIST));
}
// Creating a FIFO where one already exists is refused as well.
if fifo_mode_requested && flags & O_CREAT == O_CREAT {
return Err(Error::new(EEXIST));
}
// `open_named_pipe` returning `None` is reported as `ENOENT`.
// NOTE(review): presumably the pipe was unlinked since the existence check — confirm.
let pipe_number = pipe::open_named_pipe(&fifo_key, flags, token)?
.ok_or(Error::new(ENOENT))?;
return install_open_result(
GlobalSchemes::Pipe.scheme_id(),
flags,
OpenResult::SchemeLocal(pipe_number, InternalFlags::empty()),
token,
);
}
// Case 2: create a new named pipe.
if fifo_mode_requested && flags & O_CREAT == O_CREAT {
// Refuse to shadow something that already exists in the scheme.
if path_exists_in_scheme(&*scheme, number, &path_buf, caller_ctx, token)? {
return Err(Error::new(EEXIST));
}
// The permission bits are taken from the low 12 bits of `flags`.
let mode = u16::try_from(flags & 0o7777).map_err(|_| Error::new(EINVAL))?;
let pipe_number = pipe::create_named_pipe(&fifo_key, &path_buf, mode, flags, token)?;
return install_open_result(
GlobalSchemes::Pipe.scheme_id(),
flags,
OpenResult::SchemeLocal(pipe_number, InternalFlags::empty()),
token,
);
}
// Case 3: ordinary open, delegated to the scheme.
let open_result = scheme.kopenat(
number,
StrOrBytes::from_str(&path_buf),
flags,
fcntl_flags,
caller_ctx,
token,
)?;
install_open_result(scheme_id, flags, open_result, token)
}
/// Unlinkat syscall
pub fn unlinkat(
fh: FileHandle,
@@ -210,27 +137,22 @@ pub fn unlinkat(
) -> Result<()> {
let path_buf = copy_path_to_buf(raw_path, PATH_MAX)?;
let desc = {
let (number, scheme_id) = {
let current_lock = context::current();
let mut current = current_lock.read(token.token());
let (context, mut context_token) = current.token_split();
let pipe = context
.get_file(fh, &mut context_token)
.ok_or(Error::new(EBADF))?;
*pipe.description.read(context_token.token())
let (context, mut token) = current.token_split();
let pipe = context.get_file(fh, &mut token).ok_or(Error::new(EBADF))?;
let desc = pipe.description.read(token.token());
(desc.number, desc.scheme)
};
let number = desc.number;
let scheme = desc.get_scheme(token)?;
let scheme = scheme::get_scheme(token.token(), scheme_id)?;
let caller_ctx = context::current()
.read(token.token())
.caller_ctx()
.filter_uid_gid(euid, egid);
if pipe::unlink_named_pipe(&fifo_path_key(desc.scheme, number, &path_buf), token) {
return Ok(());
}
/*
let mut path_buf = BorrowedHtBuf::head()?;
let path = path_buf.use_for_string(raw_path)?;
@@ -277,18 +199,17 @@ fn duplicate_file(
let description = { *file.description.read(token.token()) };
let new_description = {
let scheme = description.get_scheme(token)?;
let scheme = scheme::get_scheme(token.token(), description.scheme)?;
match scheme.kdup(description.number, user_buf, caller_ctx, token)? {
OpenResult::SchemeLocal(number, internal_flags) => {
Arc::new(RwLock::new(FileDescription::new(
description.scheme,
number,
0,
description.flags,
Arc::new(RwLock::new(FileDescription {
offset: 0,
internal_flags,
token,
)))
scheme: description.scheme,
number,
flags: description.flags,
}))
}
OpenResult::External(desc) => desc,
}
@@ -375,10 +296,11 @@ fn call_normal(
}
.ok_or(Error::new(EBADF))?;
let (scheme, number) = {
let desc = *file.description.read(token.token());
(desc.get_scheme(token)?, desc.number)
let (scheme_id, number) = {
let desc = file.description.read(token.token());
(desc.scheme, desc.number)
};
let scheme = scheme::get_scheme(token.token(), scheme_id)?;
if flags.contains(CallFlags::STD_FS) {
scheme.translate_std_fs_call(number, file.description, payload, flags, metadata, token)
@@ -419,28 +341,28 @@ fn fdwrite_inner(
) -> Result<usize> {
// TODO: Ensure deadlocks can't happen
let (scheme, number, descs_to_send) = {
let desc = {
let (scheme, number) = {
let current_lock = context::current();
let mut current = current_lock.read(token.token());
let (context, mut context_token) = current.token_split();
let (context, mut token) = current.token_split();
let file_descriptor = context
.get_file(socket, &mut context_token)
.get_file(socket, &mut token)
.ok_or(Error::new(EBADF))?;
*file_descriptor.description.read(context_token.token())
let desc = &file_descriptor.description.read(token.token());
(desc.scheme, desc.number)
};
let scheme = desc.get_scheme(token)?;
let number = desc.number;
let scheme = scheme::get_scheme(token.token(), scheme)?;
let current_lock = context::current();
let mut current = current_lock.read(token.token());
let (context, mut context_token) = current.token_split();
let (context, mut token) = current.token_split();
(
scheme,
number,
if flags.contains(CallFlags::FD_CLONE) {
context.bulk_get_files(&target_fds, &mut context_token)
context.bulk_get_files(&target_fds, &mut token)
} else {
context.bulk_remove_files(&target_fds, &mut context_token)
context.bulk_remove_files(&target_fds, &mut token)
}?
.into_iter()
.map(|f| f.description)
@@ -473,22 +395,18 @@ fn call_fdread(
metadata: &[u64],
token: &mut CleanLockToken,
) -> Result<usize> {
let desc = {
let current_lock = context::current();
let mut current = current_lock.read(token.token());
let (context, mut context_token) = current.token_split();
let file_descriptor = context
.get_file(fd, &mut context_token)
.ok_or(Error::new(EBADF))?;
*file_descriptor.description.read(context_token.token())
};
let (scheme, number) = {
let scheme = desc.get_scheme(token)?;
let number = desc.number;
(
scheme,
number,
)
let (scheme, number) = {
let current_lock = context::current();
let mut current = current_lock.read(token.token());
let (context, mut token) = current.token_split();
let file_descriptor = context.get_file(fd, &mut token).ok_or(Error::new(EBADF))?;
let desc = file_descriptor.description.read(token.token());
(desc.scheme, desc.number)
};
let scheme = scheme::get_scheme(token.token(), scheme)?;
(scheme, number)
};
scheme.kfdread(number, payload, flags, metadata, token)
@@ -522,9 +440,9 @@ pub fn fcntl(fd: FileHandle, cmd: usize, arg: usize, token: &mut CleanLockToken)
}
.ok_or(Error::new(EBADF))?;
let (number, flags, desc) = {
let desc = *file.description.read(token.token());
(desc.number, desc.flags, desc)
let (scheme_id, number, flags) = {
let desc = file.description.write(token.token());
(desc.scheme, desc.number, desc.flags)
};
if cmd == F_DUPFD || cmd == F_DUPFD_CLOEXEC {
@@ -542,7 +460,7 @@ pub fn fcntl(fd: FileHandle, cmd: usize, arg: usize, token: &mut CleanLockToken)
// Communicate fcntl with scheme
if cmd != F_GETFD && cmd != F_SETFD {
let scheme = desc.get_scheme(token)?;
let scheme = scheme::get_scheme(token.token(), scheme_id)?;
scheme.fcntl(number, cmd, arg, token)?;
};
@@ -600,11 +518,13 @@ pub fn flink(fd: FileHandle, raw_path: UserSliceRo, token: &mut CleanLockToken)
let path = RedoxPath::from_absolute(&path_buf).ok_or(Error::new(EINVAL))?;
let (_, reference) = path.as_parts().ok_or(Error::new(EINVAL))?;
let (number, scheme) = {
let desc = *file.description.read(token.token());
(desc.number, desc.get_scheme(token)?)
let (number, scheme_id) = {
let desc = file.description.read(token.token());
(desc.number, desc.scheme)
};
let scheme = scheme::get_scheme(token.token(), scheme_id)?;
// TODO: Check EXDEV.
/*
if scheme_id != description.scheme {
@@ -634,11 +554,13 @@ pub fn frename(fd: FileHandle, raw_path: UserSliceRo, token: &mut CleanLockToken
let path = RedoxPath::from_absolute(&path_buf).ok_or(Error::new(EINVAL))?;
let (_, reference) = path.as_parts().ok_or(Error::new(EINVAL))?;
let (number, scheme) = {
let desc = *file.description.read(token.token());
(desc.number, desc.get_scheme(token)?)
let (number, scheme_id) = {
let desc = file.description.read(token.token());
(desc.number, desc.scheme)
};
let scheme = scheme::get_scheme(token.token(), scheme_id)?;
// TODO: Check EXDEV.
/*
if scheme_id != description.scheme {
@@ -28,11 +28,6 @@ use crate::{
sync::CleanLockToken,
};
/// Local syscall numbers not yet in the redox_syscall crate.
/// These are allocated from the 987+ range to avoid collisions with crate numbers.
pub const SYS_SCHED_SETAFFINITY: usize = 987;
pub const SYS_SCHED_GETAFFINITY: usize = 988;
/// Debug
pub mod debug;
@@ -225,10 +220,6 @@ pub fn syscall(
unlinkat(fd, UserSlice::ro(c, d)?, e, f as _, g as _, token).map(|()| 0)
}
SYS_YIELD => sched_yield(token).map(|()| 0),
// P17-3: CPU affinity syscalls. Numbers allocated locally (not yet in redox_syscall crate).
SYS_SCHED_SETAFFINITY => sched_setaffinity(b, UserSlice::ro(c, d)?, token),
SYS_SCHED_GETAFFINITY => sched_getaffinity(b, UserSlice::wo(c, d)?, token),
SYS_NANOSLEEP => nanosleep(
UserSlice::ro(b, size_of::<TimeSpec>())?,
UserSlice::wo(c, size_of::<TimeSpec>())?.none_if_null(),
@@ -11,7 +11,6 @@ use crate::{
memory::{AddrSpace, Grant, PageSpan},
ContextRef,
},
cpu_set::RawMask,
event,
sync::{CleanLockToken, RwLock},
syscall::flag::{EventFlags, O_CREAT, O_RDWR},
@@ -272,95 +271,24 @@ unsafe fn bootstrap_mem(bootstrap: &crate::startup::Bootstrap) -> &'static [u8]
}
fn insert_fd(scheme: SchemeId, number: usize, cloexec: bool, token: &mut CleanLockToken) -> usize {
let description = Arc::new(RwLock::new(FileDescription::new(
scheme,
number,
0,
(O_CREAT | O_RDWR) as u32,
InternalFlags::empty(),
token,
)));
let current_lock = context::current();
let mut current = current_lock.read(token.token());
let (context, mut context_token) = current.token_split();
let (context, mut token) = current.token_split();
context
.add_file_min(
FileDescriptor {
description,
description: Arc::new(RwLock::new(FileDescription {
scheme,
number,
offset: 0,
flags: (O_CREAT | O_RDWR) as u32,
internal_flags: InternalFlags::empty(),
})),
cloexec,
},
syscall::flag::UPPER_FDTBL_TAG + scheme.get(),
&mut context_token,
&mut token,
)
.expect("failed to insert fd to current context")
.get()
}
/// Set CPU affinity mask for a process.
///
/// # Arguments (syscall ABI)
/// - `pid`: Process ID (0 = current process; other PIDs not yet supported)
/// - `mask_ptr`: Pointer to a `RawMask` (32 bytes on 64-bit, 256-bit bitmap)
/// - `mask_len`: Length of mask in bytes (must equal `size_of::<RawMask>()`)
pub fn sched_setaffinity(
pid: usize,
mask_ptr: super::usercopy::UserSliceRo,
token: &mut CleanLockToken,
) -> Result<usize> {
// Validate mask size
if mask_ptr.len() != core::mem::size_of::<RawMask>() {
return Err(Error::new(super::error::EINVAL));
}
// pid == 0 means current process
let target = if pid == 0 {
context::current()
} else {
// TODO: Support PID-based lookup (requires context list iteration
// with lock token downgrades). For now, only pid=0 is supported.
return Err(Error::new(super::error::ESRCH));
};
// Read mask from userspace
let raw_mask: RawMask = unsafe { mask_ptr.read_exact() }?;
// Apply to context's affinity mask
let mut ctx = target.write(token.token());
ctx.sched_affinity.override_from(&raw_mask);
Ok(0)
}
/// Copy the CPU affinity mask of a process out to userspace.
///
/// # Arguments (syscall ABI)
/// - `pid`: Process ID; only `0` (the calling process) is supported
/// - `mask_ptr`: Pointer to a `RawMask` buffer (32 bytes on 64-bit)
/// - `mask_len`: Length of the buffer in bytes (must equal `size_of::<RawMask>()`)
///
/// # Returns
/// `size_of::<RawMask>()` on success.
///
/// # Errors
/// - `EINVAL` if the buffer length is not `size_of::<RawMask>()`.
/// - `ESRCH` if `pid` is non-zero.
/// - Any error returned while copying the mask to userspace.
pub fn sched_getaffinity(
    pid: usize,
    mask_ptr: super::usercopy::UserSliceWo,
    token: &mut CleanLockToken,
) -> Result<usize> {
    let mask_size = core::mem::size_of::<RawMask>();

    // The buffer must be exactly one mask wide.
    if mask_ptr.len() != mask_size {
        return Err(Error::new(super::error::EINVAL));
    }

    // Only the calling process can be queried for now.
    if pid != 0 {
        return Err(Error::new(super::error::ESRCH));
    }
    let target = context::current();

    let ctx = target.read(token.token());
    let raw_mask = ctx.sched_affinity.to_raw();
    mask_ptr.copy_common_bytes_from_slice(crate::cpu_set::mask_as_bytes(&raw_mask))?;

    Ok(mask_size)
}