use alloc::{ sync::{Arc, Weak}, vec::Vec, }; use core::{ cell::{Cell, RefCell}, hint, sync::atomic::{AtomicBool, AtomicPtr, AtomicU32, AtomicU64, Ordering}, }; /// Maximum number of pages to flush individually using INVLPG before falling /// back to a full TLB flush (CR3 reload). const TLB_RANGE_THRESHOLD: u32 = 32; use rmm::Arch; use syscall::PtraceFlags; use crate::{ arch::device::ArchPercpuMisc, context::{empty_cr3, memory::AddrSpaceWrapper, switch::ContextSwitchPercpu}, cpu_set::{LogicalCpuId, MAX_CPU_COUNT}, cpu_stats::{CpuStats, CpuStatsData}, ptrace::Session, sync::{mcs::McsNode, mcs::McsRawLock, CleanLockToken}, syscall::debug::SyscallDebugInfo, }; /// The percpu block, that stored all percpu variables. pub struct PercpuBlock { /// A unique immutable number that identifies the current CPU - used for scheduling pub cpu_id: LogicalCpuId, /// Context management pub switch_internals: ContextSwitchPercpu, pub current_addrsp: RefCell>>, pub new_addrsp_tmp: Cell>>, pub wants_tlb_shootdown: AtomicBool, pub balance: Cell<[usize; 40]>, pub last_queue: Cell, /// Per-CPU MCS node for the scheduler run-queue lock (RUN_CONTEXTS). pub mcs_sched_node: McsNode, /// Counts how many times the scheduler MCS lock acquisition was contended. pub mcs_contention_count: Cell, /// TLB shootdown range: start virtual address (page-aligned). /// Set to 0 for a full flush. Only valid when `wants_tlb_shootdown` is true. pub tlb_flush_start: AtomicU64, /// TLB shootdown range: number of pages to invalidate. pub tlb_flush_count: AtomicU32, /// Priority inheritance donation. When another CPU is blocked waiting on a /// lock this CPU holds, the blocked CPU may donate its priority here. /// `u32::MAX` means no donation; otherwise it's a priority level (0-39). pub pi_donated_prio: AtomicU32, /// Cached priority of the currently-running context on this CPU. /// Set by the scheduler when selecting a new context. Read by the MCS /// lock during priority donation — avoids acquiring the context RwLock /// from the spin loop. Default 39 (lowest priority). pub current_prio: Cell, /// NUMA proximity domain for this CPU. Set during ACPI init from SRAT. /// `u8::MAX` means unknown (no SRAT or APIC ID not listed). pub numa_node: Cell, /// Pointer to the MCS lock this CPU is currently spinning on (for transitive PI). /// `null` when not waiting on any lock. Set in McsRawLock::acquire() before /// entering the spin loop, cleared upon acquisition. pub waiting_on_lock: AtomicPtr, // TODO: Put mailbox queues here, e.g. for TLB shootdown? Just be sure to 128-byte align it // first to avoid cache invalidation. pub profiling: Option<&'static crate::profiling::RingBuffer>, pub ptrace_flags: Cell, pub ptrace_session: RefCell>>, pub inside_syscall: Cell, pub syscall_debug_info: Cell, pub misc_arch_info: crate::arch::device::ArchPercpuMisc, pub stats: CpuStats, } static ALL_PERCPU_BLOCKS: [AtomicPtr; MAX_CPU_COUNT as usize] = [const { AtomicPtr::new(core::ptr::null_mut()) }; MAX_CPU_COUNT as usize]; #[allow(unused)] pub unsafe fn init_tlb_shootdown(id: LogicalCpuId, block: *mut PercpuBlock) { ALL_PERCPU_BLOCKS[id.get() as usize].store(block, Ordering::Release) } /// Get a reference to another CPU's PercpuBlock by logical CPU ID. pub fn get_for_cpu(id: LogicalCpuId) -> Option<&'static PercpuBlock> { unsafe { ALL_PERCPU_BLOCKS[id.get() as usize] .load(Ordering::Acquire) .as_ref() } } pub fn get_all_stats() -> Vec<(LogicalCpuId, CpuStatsData)> { let mut res = ALL_PERCPU_BLOCKS .iter() .filter_map(|block| unsafe { block.load(Ordering::Relaxed).as_ref() }) .map(|block| { let stats = &block.stats; (block.cpu_id, stats.into()) }) .collect::>(); res.sort_unstable_by_key(|(id, _stats)| id.get()); res } // PercpuBlock::current() is implemented somewhere in the arch-specific modules pub fn shootdown_tlb_ipi(target: Option) { if cfg!(not(feature = "multi_core")) { return; } if let Some(target) = target { let my_percpublock = PercpuBlock::current(); assert_ne!(target, my_percpublock.cpu_id); let Some(percpublock) = (unsafe { ALL_PERCPU_BLOCKS[target.get() as usize] .load(Ordering::Acquire) .as_ref() }) else { warn!("Trying to TLB shootdown a CPU that doesn't exist or isn't initialized."); return; }; #[expect(clippy::bool_comparison)] while percpublock .wants_tlb_shootdown .swap(true, Ordering::Release) == true { // Load is faster than CAS or on x86, LOCK BTS while percpublock.wants_tlb_shootdown.load(Ordering::Relaxed) == true { my_percpublock.maybe_handle_tlb_shootdown(); core::hint::spin_loop(); } } // Full flush — clear range info (Release ordering ensures the flag // swap and these stores are visible to the handler before the IPI). percpublock.tlb_flush_start.store(0, Ordering::Release); percpublock.tlb_flush_count.store(0, Ordering::Release); crate::ipi::ipi_single(crate::ipi::IpiKind::Tlb, percpublock); } else { // Broadcast TLB shootdown: set flag on all other CPUs, then send a single // IPI with "all except self" destination shorthand instead of N individual IPIs. let my_percpublock = PercpuBlock::current(); for id in 0..crate::cpu_count() { let target_id = LogicalCpuId::new(id); if target_id == my_percpublock.cpu_id { continue; } let Some(percpublock) = (unsafe { ALL_PERCPU_BLOCKS[id as usize] .load(Ordering::Acquire) .as_ref() }) else { continue; }; // Wait if this CPU still has a pending shootdown from a previous request #[expect(clippy::bool_comparison)] while percpublock .wants_tlb_shootdown .swap(true, Ordering::Release) == true { while percpublock.wants_tlb_shootdown.load(Ordering::Relaxed) == true { my_percpublock.maybe_handle_tlb_shootdown(); hint::spin_loop(); } } // Full flush — clear range info (Release ordering) percpublock.tlb_flush_start.store(0, Ordering::Release); percpublock.tlb_flush_count.store(0, Ordering::Release); } // Single broadcast IPI to all other CPUs using destination shorthand crate::ipi::ipi(crate::ipi::IpiKind::Tlb, crate::ipi::IpiTarget::Other); } } /// Range-based TLB shootdown IPI. Only invalidates the specified virtual address /// range using INVLPG per page for ranges up to TLB_RANGE_THRESHOLD pages. /// Falls back to full flush for larger ranges. pub fn shootdown_tlb_ipi_range(target: Option, start: usize, count: usize) { if cfg!(not(feature = "multi_core")) { return; } let start_aligned = start as u64 & !0xFFF; let count_u32 = count as u32; let use_range = count_u32 > 0 && count_u32 <= TLB_RANGE_THRESHOLD; let set_range = |percpublock: &PercpuBlock| { if use_range { percpublock.tlb_flush_start.store(start_aligned, Ordering::Release); percpublock.tlb_flush_count.store(count_u32, Ordering::Release); } else { percpublock.tlb_flush_start.store(0, Ordering::Release); percpublock.tlb_flush_count.store(0, Ordering::Release); } }; if let Some(target) = target { let my_percpublock = PercpuBlock::current(); assert_ne!(target, my_percpublock.cpu_id); let Some(percpublock) = (unsafe { ALL_PERCPU_BLOCKS[target.get() as usize] .load(Ordering::Acquire) .as_ref() }) else { return; }; #[expect(clippy::bool_comparison)] while percpublock.wants_tlb_shootdown.swap(true, Ordering::Release) == true { while percpublock.wants_tlb_shootdown.load(Ordering::Relaxed) == true { my_percpublock.maybe_handle_tlb_shootdown(); hint::spin_loop(); } } set_range(percpublock); crate::ipi::ipi_single(crate::ipi::IpiKind::Tlb, percpublock); } else { let my_percpublock = PercpuBlock::current(); for id in 0..crate::cpu_count() { let target_id = LogicalCpuId::new(id); if target_id == my_percpublock.cpu_id { continue; } let Some(percpublock) = (unsafe { ALL_PERCPU_BLOCKS[id as usize] .load(Ordering::Acquire) .as_ref() }) else { continue; }; #[expect(clippy::bool_comparison)] while percpublock.wants_tlb_shootdown.swap(true, Ordering::Release) == true { while percpublock.wants_tlb_shootdown.load(Ordering::Relaxed) == true { my_percpublock.maybe_handle_tlb_shootdown(); hint::spin_loop(); } } set_range(percpublock); } crate::ipi::ipi(crate::ipi::IpiKind::Tlb, crate::ipi::IpiTarget::Other); } } impl PercpuBlock { /// Return the effective scheduling priority, accounting for priority inheritance. /// Lower number = higher priority (0-39 range). pub fn effective_prio(&self, context_prio: usize) -> usize { let donated = self.pi_donated_prio.load(Ordering::Relaxed); if donated < context_prio as u32 { donated as usize } else { context_prio } } pub fn maybe_handle_tlb_shootdown(&self) { #[expect(clippy::bool_comparison)] if self.wants_tlb_shootdown.swap(false, Ordering::Relaxed) == false { return; } let start = self.tlb_flush_start.load(Ordering::Acquire); let count = self.tlb_flush_count.load(Ordering::Acquire); if start != 0 && count > 0 && count <= TLB_RANGE_THRESHOLD { // Range-based flush using INVLPG per page — cheaper than full CR3 reload. for i in 0..count { let addr = start + (i as u64) * 4096; crate::memory::RmmA::invalidate(rmm::VirtualAddress::new(addr as usize)); } } else { // Full TLB flush (CR3 reload) for large ranges or global shootdowns. crate::memory::RmmA::invalidate_all(); } if let Some(addrsp) = &*self.current_addrsp.borrow() { addrsp.tlb_ack.fetch_add(1, Ordering::Release); } } } pub unsafe fn switch_arch_hook() { unsafe { let percpu = PercpuBlock::current(); let cur_addrsp = percpu.current_addrsp.borrow(); let next_addrsp = percpu.new_addrsp_tmp.take(); let retain_pgtbl = match (&*cur_addrsp, &next_addrsp) { (Some(p), Some(n)) => Arc::ptr_eq(p, n), (Some(_), None) | (None, Some(_)) => false, (None, None) => true, }; if retain_pgtbl { // If we are not switching to a different address space, we can simply return early. return; } if let Some(prev_addrsp) = &*cur_addrsp { prev_addrsp.used_by.atomic_clear(percpu.cpu_id); // See [`Flusher::flush`]. // // Without the fence, `wants_tlb_shootdown` check *may* happen // before the CPU is removed from the `used_by` set. Hence, if a // shootdown request arises *after* the check and *before* removing // the CPU from the set, it would be missed and the CPU who // requested the shootdown would spin forever since the request was // never ACKed. core::sync::atomic::fence(Ordering::SeqCst); percpu.maybe_handle_tlb_shootdown(); } drop(cur_addrsp); // Tell future TLB shootdown handlers that old_addrsp_tmp is no longer the current address // space. *percpu.current_addrsp.borrow_mut() = next_addrsp; match &*percpu.current_addrsp.borrow() { Some(next_addrsp) => { next_addrsp.used_by.atomic_set(percpu.cpu_id); let mut token = CleanLockToken::new(); let mut token = token.token(); let next = next_addrsp.acquire_read(token.downgrade()); next.table.utable.make_current(); } _ => { crate::memory::RmmA::set_table(rmm::TableKind::User, empty_cr3()); } } } } impl PercpuBlock { pub const fn init(cpu_id: LogicalCpuId) -> Self { Self { cpu_id, switch_internals: ContextSwitchPercpu::default(), current_addrsp: RefCell::new(None), new_addrsp_tmp: Cell::new(None), wants_tlb_shootdown: AtomicBool::new(false), balance: Cell::new([0; 40]), last_queue: Cell::new(39), mcs_sched_node: McsNode::new(), mcs_contention_count: Cell::new(0), tlb_flush_start: AtomicU64::new(0), tlb_flush_count: AtomicU32::new(0), pi_donated_prio: AtomicU32::new(u32::MAX), current_prio: Cell::new(39), numa_node: Cell::new(u8::MAX), waiting_on_lock: AtomicPtr::new(core::ptr::null_mut()), ptrace_flags: Cell::new(PtraceFlags::empty()), ptrace_session: RefCell::new(None), inside_syscall: Cell::new(false), syscall_debug_info: Cell::new(SyscallDebugInfo::default()), profiling: None, misc_arch_info: ArchPercpuMisc::default(), stats: CpuStats::default(), } } }