0cbad35638
Kernel source (ephemeral — changes durable in local/patches/kernel/): - P20 x2apic ICR mode fix, P21 x2apic SMP fix applied - ACPI MADT, RSDP, SDT improvements - Context switch, percpu, event, IRQ scheme updates - MSI/vector allocation, NUMA/SLIT/SRAT support Local recipe source updates: - redox-driver-acpi: bus/prt hardening - redox-drm: Intel display, KMS connector improvements - driver-manager: config/scheme hardening - thermald: main.rs fix - uutils-tar, ninja-build: source updates Other: - bootloader, installer, redoxfs, relibc, userutils source updates - recipe.toml.backup, libxcvt source directory
383 lines
14 KiB
Rust
383 lines
14 KiB
Rust
use alloc::{
|
|
sync::{Arc, Weak},
|
|
vec::Vec,
|
|
};
|
|
use core::{
|
|
cell::{Cell, RefCell},
|
|
hint,
|
|
sync::atomic::{AtomicBool, AtomicPtr, AtomicU32, AtomicU64, Ordering},
|
|
};
|
|
|
|
/// Largest ranged TLB shootdown, in pages, that is serviced page by page.
///
/// Requests covering at most this many pages are invalidated one page at a
/// time (INVLPG); anything larger falls back to a full TLB flush (CR3 reload).
const TLB_RANGE_THRESHOLD: u32 = 32;
|
|
|
|
use rmm::Arch;
|
|
use syscall::PtraceFlags;
|
|
|
|
use crate::{
|
|
arch::device::ArchPercpuMisc,
|
|
context::{empty_cr3, memory::AddrSpaceWrapper, switch::ContextSwitchPercpu},
|
|
cpu_set::{LogicalCpuId, MAX_CPU_COUNT},
|
|
cpu_stats::{CpuStats, CpuStatsData},
|
|
ptrace::Session,
|
|
sync::{mcs::McsNode, mcs::McsRawLock, CleanLockToken},
|
|
syscall::debug::SyscallDebugInfo,
|
|
};
|
|
|
|
/// The percpu block, that stored all percpu variables.
|
|
pub struct PercpuBlock {
|
|
/// A unique immutable number that identifies the current CPU - used for scheduling
|
|
pub cpu_id: LogicalCpuId,
|
|
|
|
/// Context management
|
|
pub switch_internals: ContextSwitchPercpu,
|
|
|
|
pub current_addrsp: RefCell<Option<Arc<AddrSpaceWrapper>>>,
|
|
pub new_addrsp_tmp: Cell<Option<Arc<AddrSpaceWrapper>>>,
|
|
pub wants_tlb_shootdown: AtomicBool,
|
|
pub balance: Cell<[usize; 40]>,
|
|
pub last_queue: Cell<usize>,
|
|
|
|
/// Per-CPU MCS node for the scheduler run-queue lock (RUN_CONTEXTS).
|
|
pub mcs_sched_node: McsNode,
|
|
|
|
/// Counts how many times the scheduler MCS lock acquisition was contended.
|
|
pub mcs_contention_count: Cell<u64>,
|
|
|
|
/// TLB shootdown range: start virtual address (page-aligned).
|
|
/// Set to 0 for a full flush. Only valid when `wants_tlb_shootdown` is true.
|
|
pub tlb_flush_start: AtomicU64,
|
|
/// TLB shootdown range: number of pages to invalidate.
|
|
pub tlb_flush_count: AtomicU32,
|
|
|
|
/// Priority inheritance donation. When another CPU is blocked waiting on a
|
|
/// lock this CPU holds, the blocked CPU may donate its priority here.
|
|
/// `u32::MAX` means no donation; otherwise it's a priority level (0-39).
|
|
pub pi_donated_prio: AtomicU32,
|
|
|
|
/// Cached priority of the currently-running context on this CPU.
|
|
/// Set by the scheduler when selecting a new context. Read by the MCS
|
|
/// lock during priority donation — avoids acquiring the context RwLock
|
|
/// from the spin loop. Default 39 (lowest priority).
|
|
pub current_prio: Cell<usize>,
|
|
|
|
/// NUMA proximity domain for this CPU. Set during ACPI init from SRAT.
|
|
/// `u8::MAX` means unknown (no SRAT or APIC ID not listed).
|
|
pub numa_node: Cell<u8>,
|
|
|
|
/// Pointer to the MCS lock this CPU is currently spinning on (for transitive PI).
|
|
/// `null` when not waiting on any lock. Set in McsRawLock::acquire() before
|
|
/// entering the spin loop, cleared upon acquisition.
|
|
pub waiting_on_lock: AtomicPtr<McsRawLock>,
|
|
|
|
// TODO: Put mailbox queues here, e.g. for TLB shootdown? Just be sure to 128-byte align it
|
|
// first to avoid cache invalidation.
|
|
pub profiling: Option<&'static crate::profiling::RingBuffer>,
|
|
|
|
pub ptrace_flags: Cell<PtraceFlags>,
|
|
pub ptrace_session: RefCell<Option<Weak<Session>>>,
|
|
pub inside_syscall: Cell<bool>,
|
|
|
|
pub syscall_debug_info: Cell<SyscallDebugInfo>,
|
|
|
|
pub misc_arch_info: crate::arch::device::ArchPercpuMisc,
|
|
|
|
pub stats: CpuStats,
|
|
}
|
|
|
|
/// Registry of every CPU's [`PercpuBlock`], indexed by logical CPU id.
///
/// A slot stays null until `init_tlb_shootdown` publishes the block of the
/// corresponding CPU.
static ALL_PERCPU_BLOCKS: [AtomicPtr<PercpuBlock>; MAX_CPU_COUNT as usize] =
    [const { AtomicPtr::new(core::ptr::null_mut::<PercpuBlock>()) }; MAX_CPU_COUNT as usize];
|
|
|
|
#[allow(unused)]
|
|
pub unsafe fn init_tlb_shootdown(id: LogicalCpuId, block: *mut PercpuBlock) {
|
|
ALL_PERCPU_BLOCKS[id.get() as usize].store(block, Ordering::Release)
|
|
}
|
|
|
|
/// Get a reference to another CPU's PercpuBlock by logical CPU ID.
|
|
pub fn get_for_cpu(id: LogicalCpuId) -> Option<&'static PercpuBlock> {
|
|
unsafe {
|
|
ALL_PERCPU_BLOCKS[id.get() as usize]
|
|
.load(Ordering::Acquire)
|
|
.as_ref()
|
|
}
|
|
}
|
|
|
|
pub fn get_all_stats() -> Vec<(LogicalCpuId, CpuStatsData)> {
|
|
let mut res = ALL_PERCPU_BLOCKS
|
|
.iter()
|
|
.filter_map(|block| unsafe { block.load(Ordering::Relaxed).as_ref() })
|
|
.map(|block| {
|
|
let stats = &block.stats;
|
|
(block.cpu_id, stats.into())
|
|
})
|
|
.collect::<Vec<_>>();
|
|
res.sort_unstable_by_key(|(id, _stats)| id.get());
|
|
res
|
|
}
|
|
|
|
// PercpuBlock::current() is implemented somewhere in the arch-specific modules
|
|
|
|
pub fn shootdown_tlb_ipi(target: Option<LogicalCpuId>) {
|
|
if cfg!(not(feature = "multi_core")) {
|
|
return;
|
|
}
|
|
|
|
if let Some(target) = target {
|
|
let my_percpublock = PercpuBlock::current();
|
|
assert_ne!(target, my_percpublock.cpu_id);
|
|
|
|
let Some(percpublock) = (unsafe {
|
|
ALL_PERCPU_BLOCKS[target.get() as usize]
|
|
.load(Ordering::Acquire)
|
|
.as_ref()
|
|
}) else {
|
|
warn!("Trying to TLB shootdown a CPU that doesn't exist or isn't initialized.");
|
|
return;
|
|
};
|
|
#[expect(clippy::bool_comparison)]
|
|
while percpublock
|
|
.wants_tlb_shootdown
|
|
.swap(true, Ordering::Release)
|
|
== true
|
|
{
|
|
// Load is faster than CAS or on x86, LOCK BTS
|
|
while percpublock.wants_tlb_shootdown.load(Ordering::Relaxed) == true {
|
|
my_percpublock.maybe_handle_tlb_shootdown();
|
|
core::hint::spin_loop();
|
|
}
|
|
}
|
|
// Full flush — clear range info (Release ordering ensures the flag
|
|
// swap and these stores are visible to the handler before the IPI).
|
|
percpublock.tlb_flush_start.store(0, Ordering::Release);
|
|
percpublock.tlb_flush_count.store(0, Ordering::Release);
|
|
|
|
crate::ipi::ipi_single(crate::ipi::IpiKind::Tlb, percpublock);
|
|
} else {
|
|
// Broadcast TLB shootdown: set flag on all other CPUs, then send a single
|
|
// IPI with "all except self" destination shorthand instead of N individual IPIs.
|
|
let my_percpublock = PercpuBlock::current();
|
|
for id in 0..crate::cpu_count() {
|
|
let target_id = LogicalCpuId::new(id);
|
|
if target_id == my_percpublock.cpu_id {
|
|
continue;
|
|
}
|
|
let Some(percpublock) = (unsafe {
|
|
ALL_PERCPU_BLOCKS[id as usize]
|
|
.load(Ordering::Acquire)
|
|
.as_ref()
|
|
}) else {
|
|
continue;
|
|
};
|
|
// Wait if this CPU still has a pending shootdown from a previous request
|
|
#[expect(clippy::bool_comparison)]
|
|
while percpublock
|
|
.wants_tlb_shootdown
|
|
.swap(true, Ordering::Release)
|
|
== true
|
|
{
|
|
while percpublock.wants_tlb_shootdown.load(Ordering::Relaxed) == true {
|
|
my_percpublock.maybe_handle_tlb_shootdown();
|
|
hint::spin_loop();
|
|
}
|
|
}
|
|
// Full flush — clear range info (Release ordering)
|
|
percpublock.tlb_flush_start.store(0, Ordering::Release);
|
|
percpublock.tlb_flush_count.store(0, Ordering::Release);
|
|
}
|
|
// Single broadcast IPI to all other CPUs using destination shorthand
|
|
crate::ipi::ipi(crate::ipi::IpiKind::Tlb, crate::ipi::IpiTarget::Other);
|
|
}
|
|
}
|
|
|
|
/// Range-based TLB shootdown IPI. Only invalidates the specified virtual address
|
|
/// range using INVLPG per page for ranges up to TLB_RANGE_THRESHOLD pages.
|
|
/// Falls back to full flush for larger ranges.
|
|
pub fn shootdown_tlb_ipi_range(target: Option<LogicalCpuId>, start: usize, count: usize) {
|
|
if cfg!(not(feature = "multi_core")) {
|
|
return;
|
|
}
|
|
|
|
let start_aligned = start as u64 & !0xFFF;
|
|
let count_u32 = count as u32;
|
|
let use_range = count_u32 > 0 && count_u32 <= TLB_RANGE_THRESHOLD;
|
|
|
|
let set_range = |percpublock: &PercpuBlock| {
|
|
if use_range {
|
|
percpublock.tlb_flush_start.store(start_aligned, Ordering::Release);
|
|
percpublock.tlb_flush_count.store(count_u32, Ordering::Release);
|
|
} else {
|
|
percpublock.tlb_flush_start.store(0, Ordering::Release);
|
|
percpublock.tlb_flush_count.store(0, Ordering::Release);
|
|
}
|
|
};
|
|
|
|
if let Some(target) = target {
|
|
let my_percpublock = PercpuBlock::current();
|
|
assert_ne!(target, my_percpublock.cpu_id);
|
|
|
|
let Some(percpublock) = (unsafe {
|
|
ALL_PERCPU_BLOCKS[target.get() as usize]
|
|
.load(Ordering::Acquire)
|
|
.as_ref()
|
|
}) else {
|
|
return;
|
|
};
|
|
#[expect(clippy::bool_comparison)]
|
|
while percpublock.wants_tlb_shootdown.swap(true, Ordering::Release) == true {
|
|
while percpublock.wants_tlb_shootdown.load(Ordering::Relaxed) == true {
|
|
my_percpublock.maybe_handle_tlb_shootdown();
|
|
hint::spin_loop();
|
|
}
|
|
}
|
|
set_range(percpublock);
|
|
crate::ipi::ipi_single(crate::ipi::IpiKind::Tlb, percpublock);
|
|
} else {
|
|
let my_percpublock = PercpuBlock::current();
|
|
for id in 0..crate::cpu_count() {
|
|
let target_id = LogicalCpuId::new(id);
|
|
if target_id == my_percpublock.cpu_id {
|
|
continue;
|
|
}
|
|
let Some(percpublock) = (unsafe {
|
|
ALL_PERCPU_BLOCKS[id as usize]
|
|
.load(Ordering::Acquire)
|
|
.as_ref()
|
|
}) else {
|
|
continue;
|
|
};
|
|
#[expect(clippy::bool_comparison)]
|
|
while percpublock.wants_tlb_shootdown.swap(true, Ordering::Release) == true {
|
|
while percpublock.wants_tlb_shootdown.load(Ordering::Relaxed) == true {
|
|
my_percpublock.maybe_handle_tlb_shootdown();
|
|
hint::spin_loop();
|
|
}
|
|
}
|
|
set_range(percpublock);
|
|
}
|
|
crate::ipi::ipi(crate::ipi::IpiKind::Tlb, crate::ipi::IpiTarget::Other);
|
|
}
|
|
}
|
|
impl PercpuBlock {
|
|
/// Return the effective scheduling priority, accounting for priority inheritance.
|
|
/// Lower number = higher priority (0-39 range).
|
|
pub fn effective_prio(&self, context_prio: usize) -> usize {
|
|
let donated = self.pi_donated_prio.load(Ordering::Relaxed);
|
|
if donated < context_prio as u32 {
|
|
donated as usize
|
|
} else {
|
|
context_prio
|
|
}
|
|
}
|
|
|
|
pub fn maybe_handle_tlb_shootdown(&self) {
|
|
#[expect(clippy::bool_comparison)]
|
|
if self.wants_tlb_shootdown.swap(false, Ordering::Relaxed) == false {
|
|
return;
|
|
}
|
|
|
|
let start = self.tlb_flush_start.load(Ordering::Acquire);
|
|
let count = self.tlb_flush_count.load(Ordering::Acquire);
|
|
|
|
if start != 0 && count > 0 && count <= TLB_RANGE_THRESHOLD {
|
|
// Range-based flush using INVLPG per page — cheaper than full CR3 reload.
|
|
for i in 0..count {
|
|
let addr = start + (i as u64) * 4096;
|
|
crate::memory::RmmA::invalidate(rmm::VirtualAddress::new(addr as usize));
|
|
}
|
|
} else {
|
|
// Full TLB flush (CR3 reload) for large ranges or global shootdowns.
|
|
crate::memory::RmmA::invalidate_all();
|
|
}
|
|
|
|
if let Some(addrsp) = &*self.current_addrsp.borrow() {
|
|
addrsp.tlb_ack.fetch_add(1, Ordering::Release);
|
|
}
|
|
}
|
|
}
|
|
pub unsafe fn switch_arch_hook() {
|
|
unsafe {
|
|
let percpu = PercpuBlock::current();
|
|
|
|
let cur_addrsp = percpu.current_addrsp.borrow();
|
|
let next_addrsp = percpu.new_addrsp_tmp.take();
|
|
|
|
let retain_pgtbl = match (&*cur_addrsp, &next_addrsp) {
|
|
(Some(p), Some(n)) => Arc::ptr_eq(p, n),
|
|
(Some(_), None) | (None, Some(_)) => false,
|
|
(None, None) => true,
|
|
};
|
|
if retain_pgtbl {
|
|
// If we are not switching to a different address space, we can simply return early.
|
|
return;
|
|
}
|
|
if let Some(prev_addrsp) = &*cur_addrsp {
|
|
prev_addrsp.used_by.atomic_clear(percpu.cpu_id);
|
|
|
|
// See [`Flusher::flush`].
|
|
//
|
|
// Without the fence, `wants_tlb_shootdown` check *may* happen
|
|
// before the CPU is removed from the `used_by` set. Hence, if a
|
|
// shootdown request arises *after* the check and *before* removing
|
|
// the CPU from the set, it would be missed and the CPU who
|
|
// requested the shootdown would spin forever since the request was
|
|
// never ACKed.
|
|
core::sync::atomic::fence(Ordering::SeqCst);
|
|
|
|
percpu.maybe_handle_tlb_shootdown();
|
|
}
|
|
|
|
drop(cur_addrsp);
|
|
|
|
// Tell future TLB shootdown handlers that old_addrsp_tmp is no longer the current address
|
|
// space.
|
|
*percpu.current_addrsp.borrow_mut() = next_addrsp;
|
|
|
|
match &*percpu.current_addrsp.borrow() {
|
|
Some(next_addrsp) => {
|
|
next_addrsp.used_by.atomic_set(percpu.cpu_id);
|
|
let mut token = CleanLockToken::new();
|
|
let mut token = token.token();
|
|
let next = next_addrsp.acquire_read(token.downgrade());
|
|
|
|
next.table.utable.make_current();
|
|
}
|
|
_ => {
|
|
crate::memory::RmmA::set_table(rmm::TableKind::User, empty_cr3());
|
|
}
|
|
}
|
|
}
|
|
}
|
|
impl PercpuBlock {
|
|
pub const fn init(cpu_id: LogicalCpuId) -> Self {
|
|
Self {
|
|
cpu_id,
|
|
switch_internals: ContextSwitchPercpu::default(),
|
|
current_addrsp: RefCell::new(None),
|
|
new_addrsp_tmp: Cell::new(None),
|
|
wants_tlb_shootdown: AtomicBool::new(false),
|
|
balance: Cell::new([0; 40]),
|
|
last_queue: Cell::new(39),
|
|
mcs_sched_node: McsNode::new(),
|
|
mcs_contention_count: Cell::new(0),
|
|
tlb_flush_start: AtomicU64::new(0),
|
|
tlb_flush_count: AtomicU32::new(0),
|
|
pi_donated_prio: AtomicU32::new(u32::MAX),
|
|
current_prio: Cell::new(39),
|
|
numa_node: Cell::new(u8::MAX),
|
|
waiting_on_lock: AtomicPtr::new(core::ptr::null_mut()),
|
|
ptrace_flags: Cell::new(PtraceFlags::empty()),
|
|
ptrace_session: RefCell::new(None),
|
|
inside_syscall: Cell::new(false),
|
|
|
|
syscall_debug_info: Cell::new(SyscallDebugInfo::default()),
|
|
|
|
profiling: None,
|
|
|
|
misc_arch_info: ArchPercpuMisc::default(),
|
|
|
|
stats: CpuStats::default(),
|
|
}
|
|
}
|
|
}
|