// RedBear-OS/recipes/core/kernel/source/src/percpu.rs

use alloc::{
    sync::{Arc, Weak},
    vec::Vec,
};
use core::{
    cell::{Cell, RefCell},
    hint,
    sync::atomic::{AtomicBool, AtomicPtr, AtomicU32, AtomicU64, Ordering},
};

/// Maximum number of pages to flush individually using INVLPG before falling
/// back to a full TLB flush (CR3 reload).
///
/// A ranged shootdown is only used for `1..=TLB_RANGE_THRESHOLD` pages; a
/// count of zero or anything above the threshold is treated as a full flush,
/// both when a request is posted and when it is handled.
const TLB_RANGE_THRESHOLD: u32 = 32;

use rmm::Arch;
use syscall::PtraceFlags;

use crate::{
    arch::device::ArchPercpuMisc,
    context::{empty_cr3, memory::AddrSpaceWrapper, switch::ContextSwitchPercpu},
    cpu_set::{LogicalCpuId, MAX_CPU_COUNT},
    cpu_stats::{CpuStats, CpuStatsData},
    ptrace::Session,
    sync::{mcs::McsNode, mcs::McsRawLock, CleanLockToken},
    syscall::debug::SyscallDebugInfo,
};

/// The per-CPU block, which stores all per-CPU variables.
///
/// One block exists per logical CPU. Other CPUs can reach it through
/// `ALL_PERCPU_BLOCKS`, so every field that is written remotely (the TLB
/// shootdown request fields, for instance) is an atomic.
pub struct PercpuBlock {
    /// A unique immutable number that identifies the current CPU - used for scheduling
    pub cpu_id: LogicalCpuId,

    /// Context management
    pub switch_internals: ContextSwitchPercpu,

    /// Address space currently active on this CPU, if any. Replaced by
    /// `switch_arch_hook`; `maybe_handle_tlb_shootdown` bumps its `tlb_ack`
    /// counter after flushing.
    pub current_addrsp: RefCell<Option<Arc<AddrSpaceWrapper>>>,
    /// Address space staged for the next switch. `switch_arch_hook` takes it
    /// (leaving `None`) and installs it as `current_addrsp`.
    pub new_addrsp_tmp: Cell<Option<Arc<AddrSpaceWrapper>>>,
    /// Set by a CPU that requests a TLB shootdown of this CPU; cleared by
    /// `maybe_handle_tlb_shootdown` when the request is serviced.
    pub wants_tlb_shootdown: AtomicBool,
    /// NOTE(review): not touched in this file. The 40 slots match the 0-39
    /// priority range, so this is presumably per-priority scheduler
    /// bookkeeping — confirm against the scheduler.
    pub balance: Cell<[usize; 40]>,
    /// NOTE(review): not touched in this file; initialised to 39 (the lowest
    /// priority level). Presumably the last run queue the scheduler served —
    /// confirm against the scheduler.
    pub last_queue: Cell<usize>,

    /// Per-CPU MCS node for the scheduler run-queue lock (RUN_CONTEXTS).
    pub mcs_sched_node: McsNode,

    /// Counts how many times the scheduler MCS lock acquisition was contended.
    pub mcs_contention_count: Cell<u64>,

    /// TLB shootdown range: start virtual address (page-aligned).
    /// Set to 0 for a full flush. Only valid when `wants_tlb_shootdown` is true.
    pub tlb_flush_start: AtomicU64,
    /// TLB shootdown range: number of pages to invalidate.
    /// A value of 0, or one above `TLB_RANGE_THRESHOLD`, makes the handler
    /// perform a full flush.
    pub tlb_flush_count: AtomicU32,

    /// Priority inheritance donation. When another CPU is blocked waiting on a
    /// lock this CPU holds, the blocked CPU may donate its priority here.
    /// `u32::MAX` means no donation; otherwise it's a priority level (0-39).
    pub pi_donated_prio: AtomicU32,

    /// Cached priority of the currently-running context on this CPU.
    /// Set by the scheduler when selecting a new context. Read by the MCS
    /// lock during priority donation — avoids acquiring the context RwLock
    /// from the spin loop. Default 39 (lowest priority).
    pub current_prio: Cell<usize>,

    /// NUMA proximity domain for this CPU. Set during ACPI init from SRAT.
    /// `u8::MAX` means unknown (no SRAT or APIC ID not listed).
    pub numa_node: Cell<u8>,

    /// Pointer to the MCS lock this CPU is currently spinning on (for transitive PI).
    /// `null` when not waiting on any lock. Set in McsRawLock::acquire() before
    /// entering the spin loop, cleared upon acquisition.
    pub waiting_on_lock: AtomicPtr<McsRawLock>,

    // TODO: Put mailbox queues here, e.g. for TLB shootdown? Just be sure to 128-byte align it
    // first to avoid cache invalidation.
    /// Optional profiling ring buffer for this CPU; `None` after `init`.
    pub profiling: Option<&'static crate::profiling::RingBuffer>,

    /// Ptrace flags for this CPU; empty after `init`.
    /// NOTE(review): who updates them is not visible in this file.
    pub ptrace_flags: Cell<PtraceFlags>,
    /// Weak handle to a ptrace session, if one is attached; `None` after `init`.
    pub ptrace_session: RefCell<Option<Weak<Session>>>,
    /// Starts out `false`. Presumably set while this CPU executes a syscall —
    /// confirm against the syscall entry path.
    pub inside_syscall: Cell<bool>,

    /// Debug information about a syscall; default-initialised by `init`.
    pub syscall_debug_info: Cell<SyscallDebugInfo>,

    /// Architecture-specific per-CPU state.
    pub misc_arch_info: crate::arch::device::ArchPercpuMisc,

    /// Per-CPU statistics; snapshotted for every CPU by `get_all_stats`.
    pub stats: CpuStats,
}

/// Table of pointers to every CPU's [`PercpuBlock`], indexed by logical CPU id.
///
/// Every slot starts out null. `init_tlb_shootdown` publishes a block with a
/// `Release` store; readers that dereference the pointer are expected to load
/// it with `Acquire` and to treat a null slot as "CPU not initialised".
static ALL_PERCPU_BLOCKS: [AtomicPtr<PercpuBlock>; MAX_CPU_COUNT as usize] =
    [const { AtomicPtr::new(core::ptr::null_mut()) }; MAX_CPU_COUNT as usize];

/// Publishes `block` as the per-CPU block of logical CPU `id`, making it
/// reachable from other CPUs (TLB shootdown requests, statistics, lookups).
///
/// # Panics
///
/// Panics if `id` does not fit into the `MAX_CPU_COUNT`-sized table.
///
/// # Safety
///
/// Readers in this file turn the stored pointer into a `&'static PercpuBlock`,
/// so `block` must point to a fully initialised block that stays valid, and is
/// never mutated through a unique reference, for as long as it is published.
#[allow(unused)]
pub unsafe fn init_tlb_shootdown(id: LogicalCpuId, block: *mut PercpuBlock) {
    let slot = &ALL_PERCPU_BLOCKS[id.get() as usize];
    // Release: the block's contents must be visible before the pointer is.
    slot.store(block, Ordering::Release);
}

/// Get a reference to another CPU's PercpuBlock by logical CPU ID.
///
/// Returns `None` when no block has been published for `id` yet, or when `id`
/// lies outside the `MAX_CPU_COUNT`-sized table (instead of panicking on an
/// out-of-bounds index).
pub fn get_for_cpu(id: LogicalCpuId) -> Option<&'static PercpuBlock> {
    let slot = ALL_PERCPU_BLOCKS.get(id.get() as usize)?;

    // SAFETY: slots are either null or hold a pointer published through
    // `init_tlb_shootdown`, whose caller vouches for its validity. The Acquire
    // load pairs with that Release store, so the block's initialisation is
    // visible before it is dereferenced.
    unsafe { slot.load(Ordering::Acquire).as_ref() }
}

/// Collects a statistics snapshot for every CPU whose block has been
/// published, sorted by logical CPU id.
///
/// CPUs whose slot is still null are skipped.
pub fn get_all_stats() -> Vec<(LogicalCpuId, CpuStatsData)> {
    let mut res = ALL_PERCPU_BLOCKS
        .iter()
        .filter_map(|slot| {
            // The pointer is dereferenced right away, so the load must be
            // Acquire to pair with the Release store in `init_tlb_shootdown`.
            // A Relaxed load gives no guarantee that the block's
            // initialisation is visible to this CPU.
            //
            // SAFETY: non-null slots only ever hold pointers published through
            // `init_tlb_shootdown`, whose caller vouches for their validity.
            unsafe { slot.load(Ordering::Acquire).as_ref() }
        })
        .map(|block| {
            let stats = &block.stats;
            (block.cpu_id, stats.into())
        })
        .collect::<Vec<_>>();
    res.sort_unstable_by_key(|(id, _stats)| id.get());
    res
}

// PercpuBlock::current() is implemented somewhere in the arch-specific modules

pub fn shootdown_tlb_ipi(target: Option<LogicalCpuId>) {
    if cfg!(not(feature = "multi_core")) {
        return;
    }

    if let Some(target) = target {
        let my_percpublock = PercpuBlock::current();
        assert_ne!(target, my_percpublock.cpu_id);

        let Some(percpublock) = (unsafe {
            ALL_PERCPU_BLOCKS[target.get() as usize]
                .load(Ordering::Acquire)
                .as_ref()
        }) else {
            warn!("Trying to TLB shootdown a CPU that doesn't exist or isn't initialized.");
            return;
        };
        #[expect(clippy::bool_comparison)]
        while percpublock
            .wants_tlb_shootdown
            .swap(true, Ordering::Release)
            == true
        {
            // Load is faster than CAS or on x86, LOCK BTS
            while percpublock.wants_tlb_shootdown.load(Ordering::Relaxed) == true {
                my_percpublock.maybe_handle_tlb_shootdown();
                core::hint::spin_loop();
            }
        }
        // Full flush — clear range info (Release ordering ensures the flag
        // swap and these stores are visible to the handler before the IPI).
        percpublock.tlb_flush_start.store(0, Ordering::Release);
        percpublock.tlb_flush_count.store(0, Ordering::Release);

        crate::ipi::ipi_single(crate::ipi::IpiKind::Tlb, percpublock);
    } else {
        // Broadcast TLB shootdown: set flag on all other CPUs, then send a single
        // IPI with "all except self" destination shorthand instead of N individual IPIs.
        let my_percpublock = PercpuBlock::current();
        for id in 0..crate::cpu_count() {
            let target_id = LogicalCpuId::new(id);
            if target_id == my_percpublock.cpu_id {
                continue;
            }
            let Some(percpublock) = (unsafe {
                ALL_PERCPU_BLOCKS[id as usize]
                    .load(Ordering::Acquire)
                    .as_ref()
            }) else {
                continue;
            };
            // Wait if this CPU still has a pending shootdown from a previous request
            #[expect(clippy::bool_comparison)]
            while percpublock
                .wants_tlb_shootdown
                .swap(true, Ordering::Release)
                == true
            {
                while percpublock.wants_tlb_shootdown.load(Ordering::Relaxed) == true {
                    my_percpublock.maybe_handle_tlb_shootdown();
                    hint::spin_loop();
                }
            }
            // Full flush — clear range info (Release ordering)
            percpublock.tlb_flush_start.store(0, Ordering::Release);
            percpublock.tlb_flush_count.store(0, Ordering::Release);
        }
        // Single broadcast IPI to all other CPUs using destination shorthand
        crate::ipi::ipi(crate::ipi::IpiKind::Tlb, crate::ipi::IpiTarget::Other);
    }
}

/// Range-based TLB shootdown IPI. Only invalidates the specified virtual address
/// range using INVLPG per page for ranges up to TLB_RANGE_THRESHOLD pages.
/// Falls back to full flush for larger ranges.
///
/// * `target` — CPU to shoot down, or `None` for every other initialised CPU.
/// * `start` — first virtual address of the range; rounded down to a 4 KiB
///   boundary before it is posted.
/// * `count` — number of pages. Zero, or more than `TLB_RANGE_THRESHOLD`,
///   requests a full flush.
///
/// Does nothing when the `multi_core` feature is disabled. A CPU whose block
/// has not been published is skipped silently.
///
/// # Panics
///
/// Panics if `target` is the calling CPU.
pub fn shootdown_tlb_ipi_range(target: Option<LogicalCpuId>, start: usize, count: usize) {
    if cfg!(not(feature = "multi_core")) {
        return;
    }

    // Decide on the untruncated `usize` count. Narrowing to `u32` first would
    // let a huge request (e.g. 2^32 + 1 pages) wrap around to a small number
    // and be serviced as a tiny ranged flush, leaving stale TLB entries.
    let use_range = count > 0 && count <= TLB_RANGE_THRESHOLD as usize;
    let (flush_start, flush_count) = if use_range {
        // `count` is at most TLB_RANGE_THRESHOLD here, so the cast is lossless.
        (start as u64 & !0xFFF, count as u32)
    } else {
        // Zero start and count mean "flush everything" to the handler.
        (0, 0)
    };

    let this_cpu = PercpuBlock::current();

    // Claims `remote`'s request slot and records the requested range in it.
    //
    // NOTE(review): the range is written after the flag is set, so a handler
    // that runs without waiting for the IPI may still see the range of the
    // previous request — confirm whether that window matters.
    let post_request = |remote: &PercpuBlock| {
        // The flag doubles as a lock: whoever flips it from false to true
        // owns the request slot until the remote CPU clears it again.
        while remote.wants_tlb_shootdown.swap(true, Ordering::Release) {
            while remote.wants_tlb_shootdown.load(Ordering::Relaxed) {
                // Keep servicing requests aimed at this CPU while waiting, so
                // two CPUs shooting each other down cannot deadlock.
                this_cpu.maybe_handle_tlb_shootdown();
                core::hint::spin_loop();
            }
        }
        remote.tlb_flush_start.store(flush_start, Ordering::Release);
        remote.tlb_flush_count.store(flush_count, Ordering::Release);
    };

    match target {
        Some(target) => {
            assert_ne!(target, this_cpu.cpu_id);

            let slot = &ALL_PERCPU_BLOCKS[target.get() as usize];
            let Some(remote) = (unsafe { slot.load(Ordering::Acquire).as_ref() }) else {
                return;
            };

            post_request(remote);
            crate::ipi::ipi_single(crate::ipi::IpiKind::Tlb, remote);
        }
        None => {
            // Post the request on every other CPU, then send one broadcast IPI.
            for id in 0..crate::cpu_count() {
                if LogicalCpuId::new(id) == this_cpu.cpu_id {
                    continue;
                }
                let slot = &ALL_PERCPU_BLOCKS[id as usize];
                let Some(remote) = (unsafe { slot.load(Ordering::Acquire).as_ref() }) else {
                    continue;
                };
                post_request(remote);
            }

            crate::ipi::ipi(crate::ipi::IpiKind::Tlb, crate::ipi::IpiTarget::Other);
        }
    }
}
impl PercpuBlock {
    /// Return the effective scheduling priority, accounting for priority inheritance.
    /// Lower number = higher priority (0-39 range).
    ///
    /// The result is the smaller of `context_prio` and the donated priority.
    /// With no donation (`u32::MAX`), `context_prio` is returned unchanged.
    pub fn effective_prio(&self, context_prio: usize) -> usize {
        let donated = self.pi_donated_prio.load(Ordering::Relaxed);
        // Widen the donation rather than narrowing `context_prio`: the
        // u32 -> usize cast is lossless on the 32/64-bit targets this kernel
        // builds for, while `context_prio as u32` would silently truncate.
        context_prio.min(donated as usize)
    }

    /// Services a pending TLB shootdown request for this CPU, if there is one.
    ///
    /// Clears `wants_tlb_shootdown`; if it was set, flushes either the posted
    /// page range or the whole TLB, then bumps `tlb_ack` on the current
    /// address space (when there is one).
    ///
    /// NOTE(review): requesters set the flag before they write the range
    /// fields. When this runs without waiting for the IPI (from a requester's
    /// spin loop or from `switch_arch_hook`), it may therefore observe the
    /// range of the previous request — confirm whether that window matters.
    pub fn maybe_handle_tlb_shootdown(&self) {
        // Consume the request flag; nothing to do when no request is pending.
        if !self.wants_tlb_shootdown.swap(false, Ordering::Relaxed) {
            return;
        }

        let start = self.tlb_flush_start.load(Ordering::Acquire);
        let count = self.tlb_flush_count.load(Ordering::Acquire);

        let is_ranged = start != 0 && (1..=TLB_RANGE_THRESHOLD).contains(&count);
        if is_ranged {
            // Range-based flush using INVLPG per page — cheaper than full CR3 reload.
            for page in 0..u64::from(count) {
                let addr = start + page * 4096;
                crate::memory::RmmA::invalidate(rmm::VirtualAddress::new(addr as usize));
            }
        } else {
            // Full TLB flush (CR3 reload) for large ranges or global shootdowns.
            // A start address of 0 also lands here.
            crate::memory::RmmA::invalidate_all();
        }

        // Acknowledge on the address space this CPU currently has active.
        if let Some(addrsp) = &*self.current_addrsp.borrow() {
            addrsp.tlb_ack.fetch_add(1, Ordering::Release);
        }
    }
}
/// Makes the address space staged in `new_addrsp_tmp` the active one on the
/// current CPU. Presumably called from the context-switch path — confirm
/// against the caller.
///
/// Steps, in order:
/// 1. Take the staged address space out of `new_addrsp_tmp`.
/// 2. Return early if it is the same `Arc` as the current one, or if both
///    are `None`.
/// 3. Remove this CPU from the previous address space's `used_by` set and
///    service any pending TLB shootdown.
/// 4. Store the new address space in `current_addrsp`, add this CPU to its
///    `used_by` set and make its user page table current. With no new
///    address space, the user table is set to `empty_cr3()` instead.
///
/// # Safety
///
/// NOTE(review): the exact contract is not stated in this file. The function
/// replaces the active user page table of the current CPU, so the caller
/// presumably must be in a state where that is sound — confirm against the
/// caller.
pub unsafe fn switch_arch_hook() {
    unsafe {
        let percpu = PercpuBlock::current();

        // Shared borrow of the current address space; it must be dropped
        // before `current_addrsp` is mutably borrowed further down.
        let cur_addrsp = percpu.current_addrsp.borrow();
        // Taking the staged value leaves `None` behind in `new_addrsp_tmp`.
        let next_addrsp = percpu.new_addrsp_tmp.take();

        // Compare by `Arc` identity, not by contents.
        let retain_pgtbl = match (&*cur_addrsp, &next_addrsp) {
            (Some(p), Some(n)) => Arc::ptr_eq(p, n),
            (Some(_), None) | (None, Some(_)) => false,
            (None, None) => true,
        };
        if retain_pgtbl {
            // If we are not switching to a different address space, we can simply return early.
            return;
        }
        if let Some(prev_addrsp) = &*cur_addrsp {
            prev_addrsp.used_by.atomic_clear(percpu.cpu_id);

            // See [`Flusher::flush`].
            //
            // Without the fence, `wants_tlb_shootdown` check *may* happen
            // before the CPU is removed from the `used_by` set. Hence, if a
            // shootdown request arises *after* the check and *before* removing
            // the CPU from the set, it would be missed and the CPU who
            // requested the shootdown would spin forever since the request was
            // never ACKed.
            core::sync::atomic::fence(Ordering::SeqCst);

            percpu.maybe_handle_tlb_shootdown();
        }

        // Release the shared borrow so the `borrow_mut` below cannot panic.
        drop(cur_addrsp);

        // Tell future TLB shootdown handlers that the previous address space is no longer the
        // current one.
        *percpu.current_addrsp.borrow_mut() = next_addrsp;

        match &*percpu.current_addrsp.borrow() {
            Some(next_addrsp) => {
                // Mark this CPU as a user of the new address space before its
                // page table is activated.
                next_addrsp.used_by.atomic_set(percpu.cpu_id);
                let mut token = CleanLockToken::new();
                let mut token = token.token();
                let next = next_addrsp.acquire_read(token.downgrade());

                next.table.utable.make_current();
            }
            _ => {
                // No address space to switch to: install the empty user table.
                crate::memory::RmmA::set_table(rmm::TableKind::User, empty_cr3());
            }
        }
    }
}
impl PercpuBlock {
    /// Builds the initial per-CPU block for `cpu_id`.
    ///
    /// The block starts with no address space, no pending TLB shootdown, no
    /// priority donation (`u32::MAX`), the lowest priority (39), an unknown
    /// NUMA node (`u8::MAX`), no ptrace state and no profiling buffer.
    /// Fields are listed in struct declaration order.
    pub const fn init(cpu_id: LogicalCpuId) -> Self {
        Self {
            cpu_id,
            switch_internals: ContextSwitchPercpu::default(),

            // Address-space tracking and scheduler bookkeeping.
            current_addrsp: RefCell::new(None),
            new_addrsp_tmp: Cell::new(None),
            wants_tlb_shootdown: AtomicBool::new(false),
            balance: Cell::new([0; 40]),
            last_queue: Cell::new(39),

            // Scheduler run-queue lock state.
            mcs_sched_node: McsNode::new(),
            mcs_contention_count: Cell::new(0),

            // Zero start and count mean "full flush".
            tlb_flush_start: AtomicU64::new(0),
            tlb_flush_count: AtomicU32::new(0),

            // Priority inheritance: nothing donated, lowest own priority.
            pi_donated_prio: AtomicU32::new(u32::MAX),
            current_prio: Cell::new(39),

            numa_node: Cell::new(u8::MAX),
            waiting_on_lock: AtomicPtr::new(core::ptr::null_mut()),

            profiling: None,

            // Ptrace and syscall tracking.
            ptrace_flags: Cell::new(PtraceFlags::empty()),
            ptrace_session: RefCell::new(None),
            inside_syscall: Cell::new(false),
            syscall_debug_info: Cell::new(SyscallDebugInfo::default()),

            misc_arch_info: ArchPercpuMisc::default(),
            stats: CpuStats::default(),
        }
    }
}