RedBear-OS/local/patches/kernel/P12-range-tlb-flush.patch

--- a/src/percpu.rs
+++ b/src/percpu.rs
@@ -5,9 +5,13 @@
 use core::{
     cell::{Cell, RefCell},
     hint,
-    sync::atomic::{AtomicBool, AtomicPtr, Ordering},
+    sync::atomic::{AtomicBool, AtomicPtr, AtomicU32, AtomicU64, Ordering},
 };

+/// Largest page count (inclusive) that is flushed page by page using INVLPG;
+/// larger ranges fall back to a full TLB flush (CR3 reload).
+const TLB_RANGE_THRESHOLD: u32 = 32;
+
 use rmm::Arch;
 use syscall::PtraceFlags;

@@ -41,6 +45,23 @@
     /// Counts how many times the scheduler MCS lock acquisition was contended.
     pub mcs_contention_count: Cell<u64>,

+    /// TLB shootdown range: start virtual address (page-aligned).
+    /// Set to 0 for a full flush. Only valid when `wants_tlb_shootdown` is true.
+    pub tlb_flush_start: AtomicU64,
+    /// TLB shootdown range: number of pages to invalidate.
+    pub tlb_flush_count: AtomicU32,
+
+    /// Priority inheritance donation. When another CPU is blocked waiting on a
+    /// lock this CPU holds, the blocked CPU may donate its priority here.
+    /// `u32::MAX` means no donation; otherwise it's a priority level (0-39).
+    pub pi_donated_prio: AtomicU32,
+
+    /// Cached priority of the currently-running context on this CPU.
+    /// Set by the scheduler when selecting a new context. Read by the MCS
+    /// lock during priority donation — avoids acquiring the context RwLock
+    /// from the spin loop. Default 39 (lowest priority).
+    pub current_prio: Cell<usize>,
+
     // TODO: Put mailbox queues here, e.g. for TLB shootdown? Just be sure to 128-byte align it
     // first to avoid cache invalidation.
     pub profiling: Option<&'static crate::profiling::RingBuffer>,
@@ -64,6 +85,15 @@
     ALL_PERCPU_BLOCKS[id.get() as usize].store(block, Ordering::Release)
 }

+/// Get a reference to another CPU's PercpuBlock by logical CPU ID.
+///
+/// Returns `None` for an out-of-range ID or a CPU with no registered block.
+pub fn get_for_cpu(id: LogicalCpuId) -> Option<&'static PercpuBlock> {
+    let slot = ALL_PERCPU_BLOCKS.get(id.get() as usize)?;
+    // SAFETY: assumes every pointer stored in the table stays valid — confirm.
+    unsafe { slot.load(Ordering::Acquire).as_ref() }
+}
+
 pub fn get_all_stats() -> Vec<(LogicalCpuId, CpuStatsData)> {
     let mut res = ALL_PERCPU_BLOCKS
         .iter()
@@ -108,6 +138,9 @@
                 core::hint::spin_loop();
             }
         }
+        // Full flush — clear range info
+        percpublock.tlb_flush_start.store(0, Ordering::Relaxed);
+        percpublock.tlb_flush_count.store(0, Ordering::Relaxed);

         crate::ipi::ipi_single(crate::ipi::IpiKind::Tlb, percpublock);
     } else {
@@ -138,20 +171,114 @@
                     hint::spin_loop();
                 }
             }
+            // Full flush — clear range info
+            percpublock.tlb_flush_start.store(0, Ordering::Relaxed);
+            percpublock.tlb_flush_count.store(0, Ordering::Relaxed);
         }
         // Single broadcast IPI to all other CPUs using destination shorthand
         crate::ipi::ipi(crate::ipi::IpiKind::Tlb, crate::ipi::IpiTarget::Other);
     }
 }
+
+/// Range-based TLB shootdown IPI.
+///
+/// Asks `target` (every other CPU when `None`) to invalidate `count` pages
+/// starting at `start`, which is rounded down to a 4 KiB boundary. When
+/// `1 <= count <= TLB_RANGE_THRESHOLD` the range is published for per-page
+/// invalidation; any other count requests a full flush. No-op unless the
+/// `multi_core` feature is enabled.
+///
+/// # Panics
+///
+/// Panics if `target` is the calling CPU.
+pub fn shootdown_tlb_ipi_range(target: Option<LogicalCpuId>, start: usize, count: usize) {
+    if cfg!(not(feature = "multi_core")) {
+        return;
+    }
+
+    // Decide on the full-width `count` before narrowing: `count as u32` wraps,
+    // so on 64-bit targets 2^32 + 1 pages would otherwise be published as a
+    // one-page flush and leave stale translations behind.
+    let use_range = count > 0 && count <= TLB_RANGE_THRESHOLD as usize;
+    // (0, 0) is what `maybe_handle_tlb_shootdown` treats as a full flush.
+    let (flush_start, flush_count) = if use_range {
+        // NOTE(review): assumes callers count pages from the aligned base.
+        (start as u64 & !0xFFF, count as u32)
+    } else {
+        (0, 0)
+    };
+
+    let my_percpublock = PercpuBlock::current();
+
+    // Claims a CPU's shootdown slot, then publishes the requested range in it.
+    let request = |percpublock: &PercpuBlock| {
+        // Slot still holds an unconsumed request: wait for it to clear, handling
+        // shootdowns aimed at this CPU meanwhile.
+        while percpublock.wants_tlb_shootdown.swap(true, Ordering::Release) {
+            while percpublock.wants_tlb_shootdown.load(Ordering::Relaxed) {
+                my_percpublock.maybe_handle_tlb_shootdown();
+                hint::spin_loop();
+            }
+        }
+        // NOTE(review): the range is written after the flag is raised, so a
+        // target polling the flag early may act on the previous range — confirm.
+        percpublock.tlb_flush_start.store(flush_start, Ordering::Release);
+        percpublock.tlb_flush_count.store(flush_count, Ordering::Release);
+    };
+
+    if let Some(target) = target {
+        assert_ne!(target, my_percpublock.cpu_id);
+        // No block registered for the target: nothing to signal.
+        let Some(percpublock) = get_for_cpu(target) else {
+            return;
+        };
+        request(percpublock);
+        crate::ipi::ipi_single(crate::ipi::IpiKind::Tlb, percpublock);
+    } else {
+        for id in 0..crate::cpu_count() {
+            let target_id = LogicalCpuId::new(id);
+            if target_id == my_percpublock.cpu_id {
+                continue;
+            }
+            if let Some(percpublock) = get_for_cpu(target_id) {
+                request(percpublock);
+            }
+        }
+        // Slots are claimed; a single broadcast IPI notifies all other CPUs.
+        crate::ipi::ipi(crate::ipi::IpiKind::Tlb, crate::ipi::IpiTarget::Other);
+    }
+}
 impl PercpuBlock {
+    /// Effective scheduling priority of a context on this CPU once any
+    /// priority-inheritance donation is applied (lower value = higher priority).
+    /// A donation wins only when it is numerically below `context_prio`.
+    pub fn effective_prio(&self, context_prio: usize) -> usize {
+        match self.pi_donated_prio.load(Ordering::Relaxed) {
+            // The `u32::MAX` "no donation" value can never pass this guard.
+            boost if boost < context_prio as u32 => boost as usize,
+            _ => context_prio,
+        }
+    }
+
     pub fn maybe_handle_tlb_shootdown(&self) {
         #[expect(clippy::bool_comparison)]
         if self.wants_tlb_shootdown.swap(false, Ordering::Relaxed) == false {
             return;
         }

-        // TODO: Finer-grained flush
-        crate::memory::RmmA::invalidate_all();
+        let start = self.tlb_flush_start.load(Ordering::Acquire);
+        let count = self.tlb_flush_count.load(Ordering::Acquire);
+
+        if start != 0 && count > 0 && count <= TLB_RANGE_THRESHOLD {
+            // Range-based flush using INVLPG per page — cheaper than full CR3 reload.
+            for i in 0..count {
+                let addr = start + (i as u64) * 4096;
+                crate::memory::RmmA::invalidate(rmm::VirtualAddress::new(addr as usize));
+            }
+        } else {
+            // Full TLB flush (CR3 reload) for large ranges or global shootdowns.
+            crate::memory::RmmA::invalidate_all();
+        }

         if let Some(addrsp) = &*self.current_addrsp.borrow() {
             addrsp.tlb_ack.fetch_add(1, Ordering::Release);
@@ -223,6 +350,10 @@
             last_queue: Cell::new(39),
             mcs_sched_node: McsNode::new(),
             mcs_contention_count: Cell::new(0),
+            tlb_flush_start: AtomicU64::new(0),
+            tlb_flush_count: AtomicU32::new(0),
+            pi_donated_prio: AtomicU32::new(u32::MAX),
+            current_prio: Cell::new(39),
             ptrace_flags: Cell::new(PtraceFlags::empty()),
             ptrace_session: RefCell::new(None),
             inside_syscall: Cell::new(false),