Files
RedBear-OS/recipes/core/kernel/source/src/percpu.rs
T
vasilito 0cbad35638 chore: kernel source patches, local recipe updates, and build artifacts
Kernel source (ephemeral — changes durable in local/patches/kernel/):
- P20 x2apic ICR mode fix, P21 x2apic SMP fix applied
- ACPI MADT, RSDP, SDT improvements
- Context switch, percpu, event, IRQ scheme updates
- MSI/vector allocation, NUMA/SLIT/SRAT support

Local recipe source updates:
- redox-driver-acpi: bus/prt hardening
- redox-drm: Intel display, KMS connector improvements
- driver-manager: config/scheme hardening
- thermald: main.rs fix
- uutils-tar, ninja-build: source updates

Other:
- bootloader, installer, redoxfs, relibc, userutils source updates
- recipe.toml.backup, libxcvt source directory
2026-05-18 14:20:54 +03:00

383 lines
14 KiB
Rust

use alloc::{
sync::{Arc, Weak},
vec::Vec,
};
use core::{
cell::{Cell, RefCell},
hint,
sync::atomic::{AtomicBool, AtomicPtr, AtomicU32, AtomicU64, Ordering},
};
/// Maximum number of pages to flush individually using INVLPG before falling
/// back to a full TLB flush (CR3 reload).
///
/// `shootdown_tlb_ipi_range` only posts a ranged request when the page count
/// is between 1 and this value, and `maybe_handle_tlb_shootdown` applies the
/// same bound before walking the range page by page.
const TLB_RANGE_THRESHOLD: u32 = 32;
use rmm::Arch;
use syscall::PtraceFlags;
use crate::{
arch::device::ArchPercpuMisc,
context::{empty_cr3, memory::AddrSpaceWrapper, switch::ContextSwitchPercpu},
cpu_set::{LogicalCpuId, MAX_CPU_COUNT},
cpu_stats::{CpuStats, CpuStatsData},
ptrace::Session,
sync::{mcs::McsNode, mcs::McsRawLock, CleanLockToken},
syscall::debug::SyscallDebugInfo,
};
/// The per-CPU block: one instance per logical CPU, storing all per-CPU
/// variables.
///
/// Blocks registered through `init_tlb_shootdown` can be reached from other
/// CPUs via `get_for_cpu`; the TLB-shootdown fields below are the ones this
/// file accesses cross-CPU.
pub struct PercpuBlock {
/// A unique immutable number that identifies the current CPU - used for scheduling
pub cpu_id: LogicalCpuId,
/// Context management
pub switch_internals: ContextSwitchPercpu,
/// Address space currently installed on this CPU, or `None`. Replaced by
/// `switch_arch_hook`; `maybe_handle_tlb_shootdown` bumps its `tlb_ack`
/// counter after flushing.
pub current_addrsp: RefCell<Option<Arc<AddrSpaceWrapper>>>,
/// Address space to switch to next; `switch_arch_hook` takes the value out
/// (leaving `None`) and installs it as `current_addrsp`.
pub new_addrsp_tmp: Cell<Option<Arc<AddrSpaceWrapper>>>,
/// Set by `shootdown_tlb_ipi` / `shootdown_tlb_ipi_range` on the target CPU's
/// block to request a flush; cleared by `maybe_handle_tlb_shootdown` when the
/// request is serviced.
pub wants_tlb_shootdown: AtomicBool,
// NOTE(review): presumably per-priority scheduler bookkeeping (40 slots would
// match the 0-39 priority range mentioned below) — confirm against the
// scheduler.
pub balance: Cell<[usize; 40]>,
// NOTE(review): initialised to 39 in `init`; presumably the index of the last
// run queue that was served — confirm against the scheduler.
pub last_queue: Cell<usize>,
/// Per-CPU MCS node for the scheduler run-queue lock (RUN_CONTEXTS).
pub mcs_sched_node: McsNode,
/// Counts how many times the scheduler MCS lock acquisition was contended.
pub mcs_contention_count: Cell<u64>,
/// TLB shootdown range: start virtual address (page-aligned).
/// Set to 0 for a full flush. Only valid when `wants_tlb_shootdown` is true.
pub tlb_flush_start: AtomicU64,
/// TLB shootdown range: number of pages to invalidate.
/// The handler treats 0, or anything above `TLB_RANGE_THRESHOLD`, as a
/// request for a full flush.
pub tlb_flush_count: AtomicU32,
/// Priority inheritance donation. When another CPU is blocked waiting on a
/// lock this CPU holds, the blocked CPU may donate its priority here.
/// `u32::MAX` means no donation; otherwise it's a priority level (0-39).
pub pi_donated_prio: AtomicU32,
/// Cached priority of the currently-running context on this CPU.
/// Set by the scheduler when selecting a new context. Read by the MCS
/// lock during priority donation — avoids acquiring the context RwLock
/// from the spin loop. Default 39 (lowest priority).
pub current_prio: Cell<usize>,
/// NUMA proximity domain for this CPU. Set during ACPI init from SRAT.
/// `u8::MAX` means unknown (no SRAT or APIC ID not listed).
pub numa_node: Cell<u8>,
/// Pointer to the MCS lock this CPU is currently spinning on (for transitive PI).
/// `null` when not waiting on any lock. Set in McsRawLock::acquire() before
/// entering the spin loop, cleared upon acquisition.
pub waiting_on_lock: AtomicPtr<McsRawLock>,
// TODO: Put mailbox queues here, e.g. for TLB shootdown? Just be sure to 128-byte align it
// first to avoid cache invalidation.
/// Optional profiling ring buffer for this CPU; `None` until one is attached
/// (`init` leaves it unset).
pub profiling: Option<&'static crate::profiling::RingBuffer>,
// NOTE(review): the ptrace/syscall fields below are not touched in this file;
// presumably they mirror the state of the context currently running on this
// CPU — confirm against the ptrace and syscall code.
pub ptrace_flags: Cell<PtraceFlags>,
pub ptrace_session: RefCell<Option<Weak<Session>>>,
pub inside_syscall: Cell<bool>,
pub syscall_debug_info: Cell<SyscallDebugInfo>,
/// Architecture-specific per-CPU data.
pub misc_arch_info: crate::arch::device::ArchPercpuMisc,
/// Per-CPU statistics; snapshotted by `get_all_stats`.
pub stats: CpuStats,
}
/// Table of every registered per-CPU block, indexed by logical CPU id.
///
/// All entries start out null; `init_tlb_shootdown` publishes a CPU's block
/// with a `Release` store, and readers in this file load the entry and treat
/// a null pointer as "CPU not present or not yet initialised".
static ALL_PERCPU_BLOCKS: [AtomicPtr<PercpuBlock>; MAX_CPU_COUNT as usize] =
[const { AtomicPtr::new(core::ptr::null_mut()) }; MAX_CPU_COUNT as usize];
/// Registers `block` as the per-CPU block of logical CPU `id`, making it
/// reachable from other CPUs (TLB shootdown requests, `get_for_cpu`,
/// `get_all_stats`).
///
/// Panics if `id` is not a valid index into the per-CPU table.
///
/// # Safety
///
/// Readers of the table dereference the stored pointer and `get_for_cpu`
/// hands it out as `&'static PercpuBlock`, so `block` must point to a
/// `PercpuBlock` that is initialised and stays valid for the rest of the
/// kernel's lifetime.
#[allow(unused)]
pub unsafe fn init_tlb_shootdown(id: LogicalCpuId, block: *mut PercpuBlock) {
    let slot = &ALL_PERCPU_BLOCKS[id.get() as usize];
    // Release pairs with the Acquire loads done by readers of the table.
    slot.store(block, Ordering::Release);
}
/// Looks up the `PercpuBlock` belonging to the logical CPU `id`.
///
/// Returns `None` while no block has been registered for that CPU. Panics if
/// `id` is not a valid index into the per-CPU table.
pub fn get_for_cpu(id: LogicalCpuId) -> Option<&'static PercpuBlock> {
    let registered = ALL_PERCPU_BLOCKS[id.get() as usize].load(Ordering::Acquire);
    // SAFETY: relies on every non-null entry having been stored through the
    // unsafe `init_tlb_shootdown`, whose caller vouches for the pointer.
    unsafe { registered.as_ref() }
}
/// Collects a statistics snapshot from every registered CPU.
///
/// Unregistered (null) table entries are skipped. The result is sorted by
/// logical CPU id.
pub fn get_all_stats() -> Vec<(LogicalCpuId, CpuStatsData)> {
    let mut snapshot: Vec<(LogicalCpuId, CpuStatsData)> = Vec::new();

    for slot in ALL_PERCPU_BLOCKS.iter() {
        // SAFETY: relies on every non-null entry having been stored through
        // the unsafe `init_tlb_shootdown`, whose caller vouches for the pointer.
        let Some(block) = (unsafe { slot.load(Ordering::Relaxed).as_ref() }) else {
            continue;
        };
        snapshot.push((block.cpu_id, (&block.stats).into()));
    }

    snapshot.sort_unstable_by_key(|entry| entry.0.get());
    snapshot
}
// PercpuBlock::current() is implemented somewhere in the arch-specific modules
pub fn shootdown_tlb_ipi(target: Option<LogicalCpuId>) {
if cfg!(not(feature = "multi_core")) {
return;
}
if let Some(target) = target {
let my_percpublock = PercpuBlock::current();
assert_ne!(target, my_percpublock.cpu_id);
let Some(percpublock) = (unsafe {
ALL_PERCPU_BLOCKS[target.get() as usize]
.load(Ordering::Acquire)
.as_ref()
}) else {
warn!("Trying to TLB shootdown a CPU that doesn't exist or isn't initialized.");
return;
};
#[expect(clippy::bool_comparison)]
while percpublock
.wants_tlb_shootdown
.swap(true, Ordering::Release)
== true
{
// Load is faster than CAS or on x86, LOCK BTS
while percpublock.wants_tlb_shootdown.load(Ordering::Relaxed) == true {
my_percpublock.maybe_handle_tlb_shootdown();
core::hint::spin_loop();
}
}
// Full flush — clear range info (Release ordering ensures the flag
// swap and these stores are visible to the handler before the IPI).
percpublock.tlb_flush_start.store(0, Ordering::Release);
percpublock.tlb_flush_count.store(0, Ordering::Release);
crate::ipi::ipi_single(crate::ipi::IpiKind::Tlb, percpublock);
} else {
// Broadcast TLB shootdown: set flag on all other CPUs, then send a single
// IPI with "all except self" destination shorthand instead of N individual IPIs.
let my_percpublock = PercpuBlock::current();
for id in 0..crate::cpu_count() {
let target_id = LogicalCpuId::new(id);
if target_id == my_percpublock.cpu_id {
continue;
}
let Some(percpublock) = (unsafe {
ALL_PERCPU_BLOCKS[id as usize]
.load(Ordering::Acquire)
.as_ref()
}) else {
continue;
};
// Wait if this CPU still has a pending shootdown from a previous request
#[expect(clippy::bool_comparison)]
while percpublock
.wants_tlb_shootdown
.swap(true, Ordering::Release)
== true
{
while percpublock.wants_tlb_shootdown.load(Ordering::Relaxed) == true {
my_percpublock.maybe_handle_tlb_shootdown();
hint::spin_loop();
}
}
// Full flush — clear range info (Release ordering)
percpublock.tlb_flush_start.store(0, Ordering::Release);
percpublock.tlb_flush_count.store(0, Ordering::Release);
}
// Single broadcast IPI to all other CPUs using destination shorthand
crate::ipi::ipi(crate::ipi::IpiKind::Tlb, crate::ipi::IpiTarget::Other);
}
}
/// Range-based TLB shootdown IPI. Only invalidates the specified virtual address
/// range using INVLPG per page for ranges up to TLB_RANGE_THRESHOLD pages.
/// Falls back to full flush for larger ranges.
pub fn shootdown_tlb_ipi_range(target: Option<LogicalCpuId>, start: usize, count: usize) {
if cfg!(not(feature = "multi_core")) {
return;
}
let start_aligned = start as u64 & !0xFFF;
let count_u32 = count as u32;
let use_range = count_u32 > 0 && count_u32 <= TLB_RANGE_THRESHOLD;
let set_range = |percpublock: &PercpuBlock| {
if use_range {
percpublock.tlb_flush_start.store(start_aligned, Ordering::Release);
percpublock.tlb_flush_count.store(count_u32, Ordering::Release);
} else {
percpublock.tlb_flush_start.store(0, Ordering::Release);
percpublock.tlb_flush_count.store(0, Ordering::Release);
}
};
if let Some(target) = target {
let my_percpublock = PercpuBlock::current();
assert_ne!(target, my_percpublock.cpu_id);
let Some(percpublock) = (unsafe {
ALL_PERCPU_BLOCKS[target.get() as usize]
.load(Ordering::Acquire)
.as_ref()
}) else {
return;
};
#[expect(clippy::bool_comparison)]
while percpublock.wants_tlb_shootdown.swap(true, Ordering::Release) == true {
while percpublock.wants_tlb_shootdown.load(Ordering::Relaxed) == true {
my_percpublock.maybe_handle_tlb_shootdown();
hint::spin_loop();
}
}
set_range(percpublock);
crate::ipi::ipi_single(crate::ipi::IpiKind::Tlb, percpublock);
} else {
let my_percpublock = PercpuBlock::current();
for id in 0..crate::cpu_count() {
let target_id = LogicalCpuId::new(id);
if target_id == my_percpublock.cpu_id {
continue;
}
let Some(percpublock) = (unsafe {
ALL_PERCPU_BLOCKS[id as usize]
.load(Ordering::Acquire)
.as_ref()
}) else {
continue;
};
#[expect(clippy::bool_comparison)]
while percpublock.wants_tlb_shootdown.swap(true, Ordering::Release) == true {
while percpublock.wants_tlb_shootdown.load(Ordering::Relaxed) == true {
my_percpublock.maybe_handle_tlb_shootdown();
hint::spin_loop();
}
}
set_range(percpublock);
}
crate::ipi::ipi(crate::ipi::IpiKind::Tlb, crate::ipi::IpiTarget::Other);
}
}
impl PercpuBlock {
    /// Return the effective scheduling priority, accounting for priority inheritance.
    ///
    /// Lower number = higher priority (0-39 range). The result is the
    /// numerically smaller of `context_prio` and the priority donated to this
    /// CPU, or `context_prio` unchanged when nothing is donated
    /// (`pi_donated_prio == u32::MAX`).
    pub fn effective_prio(&self, context_prio: usize) -> usize {
        match self.pi_donated_prio.load(Ordering::Relaxed) {
            // Sentinel: no donation is active.
            u32::MAX => context_prio,
            // Widen the donated value instead of narrowing `context_prio`, so
            // an out-of-range `context_prio` is never compared after truncation.
            donated => context_prio.min(donated as usize),
        }
    }

    /// Services a pending TLB shootdown request for this CPU, if there is one.
    ///
    /// Clears `wants_tlb_shootdown`. If it was set, this invalidates either
    /// the requested page range (non-zero start and a count between 1 and
    /// `TLB_RANGE_THRESHOLD`) or the whole TLB, and then acknowledges by
    /// incrementing `tlb_ack` on the current address space, if any.
    // NOTE(review): requesters set the flag before they write the range
    // fields, so it looks like this can observe the flag together with a
    // previous request's range — confirm against `shootdown_tlb_ipi*`.
    pub fn maybe_handle_tlb_shootdown(&self) {
        /// Page stride used when walking the requested range.
        const PAGE_BYTES: u64 = 4096;

        if !self.wants_tlb_shootdown.swap(false, Ordering::Relaxed) {
            // Nothing was requested.
            return;
        }

        let start = self.tlb_flush_start.load(Ordering::Acquire);
        let count = self.tlb_flush_count.load(Ordering::Acquire);

        let ranged = start != 0 && count > 0 && count <= TLB_RANGE_THRESHOLD;
        if ranged {
            // Range-based flush using INVLPG per page — cheaper than full CR3 reload.
            for page in 0..count {
                let addr = start + u64::from(page) * PAGE_BYTES;
                crate::memory::RmmA::invalidate(rmm::VirtualAddress::new(addr as usize));
            }
        } else {
            // Full TLB flush (CR3 reload) for large ranges or global shootdowns.
            crate::memory::RmmA::invalidate_all();
        }

        // Acknowledge so whoever requested the shootdown can observe completion.
        if let Some(addrsp) = &*self.current_addrsp.borrow() {
            addrsp.tlb_ack.fetch_add(1, Ordering::Release);
        }
    }
}
/// Switches this CPU from its current address space to the one staged in
/// `new_addrsp_tmp`.
///
/// If the staged address space is the same `Arc` as the current one (or both
/// are `None`), nothing is changed. Otherwise the CPU is removed from the old
/// address space's `used_by` set, any pending TLB shootdown is serviced, the
/// new address space becomes `current_addrsp`, and its user page table (or
/// `empty_cr3()` when there is none) is made current.
///
/// # Safety
///
/// NOTE(review): the contract is not stated in this file. Presumably this
/// must only be called on the context-switch path of the current CPU, after
/// the scheduler has staged the next address space — confirm with the caller.
pub unsafe fn switch_arch_hook() {
unsafe {
let percpu = PercpuBlock::current();
let cur_addrsp = percpu.current_addrsp.borrow();
// Taking the staged value leaves `None` behind in `new_addrsp_tmp`.
let next_addrsp = percpu.new_addrsp_tmp.take();
// The page table can be kept only if both sides are the very same address
// space object, or if neither side has one.
let retain_pgtbl = match (&*cur_addrsp, &next_addrsp) {
(Some(p), Some(n)) => Arc::ptr_eq(p, n),
(Some(_), None) | (None, Some(_)) => false,
(None, None) => true,
};
if retain_pgtbl {
// If we are not switching to a different address space, we can simply return early.
return;
}
if let Some(prev_addrsp) = &*cur_addrsp {
prev_addrsp.used_by.atomic_clear(percpu.cpu_id);
// See [`Flusher::flush`].
//
// Without the fence, `wants_tlb_shootdown` check *may* happen
// before the CPU is removed from the `used_by` set. Hence, if a
// shootdown request arises *after* the check and *before* removing
// the CPU from the set, it would be missed and the CPU who
// requested the shootdown would spin forever since the request was
// never ACKed.
core::sync::atomic::fence(Ordering::SeqCst);
percpu.maybe_handle_tlb_shootdown();
}
// Release the shared borrow before `borrow_mut` below; holding it would make
// the `RefCell` panic.
drop(cur_addrsp);
// Tell future TLB shootdown handlers that the previous address space is no
// longer the current address space.
*percpu.current_addrsp.borrow_mut() = next_addrsp;
match &*percpu.current_addrsp.borrow() {
Some(next_addrsp) => {
// Mark this CPU as a user of the new address space before activating its
// page table.
next_addrsp.used_by.atomic_set(percpu.cpu_id);
let mut token = CleanLockToken::new();
let mut token = token.token();
let next = next_addrsp.acquire_read(token.downgrade());
next.table.utable.make_current();
}
_ => {
// No address space to run in: install the empty user page table.
crate::memory::RmmA::set_table(rmm::TableKind::User, empty_cr3());
}
}
}
}
impl PercpuBlock {
pub const fn init(cpu_id: LogicalCpuId) -> Self {
Self {
cpu_id,
switch_internals: ContextSwitchPercpu::default(),
current_addrsp: RefCell::new(None),
new_addrsp_tmp: Cell::new(None),
wants_tlb_shootdown: AtomicBool::new(false),
balance: Cell::new([0; 40]),
last_queue: Cell::new(39),
mcs_sched_node: McsNode::new(),
mcs_contention_count: Cell::new(0),
tlb_flush_start: AtomicU64::new(0),
tlb_flush_count: AtomicU32::new(0),
pi_donated_prio: AtomicU32::new(u32::MAX),
current_prio: Cell::new(39),
numa_node: Cell::new(u8::MAX),
waiting_on_lock: AtomicPtr::new(core::ptr::null_mut()),
ptrace_flags: Cell::new(PtraceFlags::empty()),
ptrace_session: RefCell::new(None),
inside_syscall: Cell::new(false),
syscall_debug_info: Cell::new(SyscallDebugInfo::default()),
profiling: None,
misc_arch_info: ArchPercpuMisc::default(),
stats: CpuStats::default(),
}
}
}