RedBear-OS/local/patches/kernel/P17-2b-transitive-pi.patch

--- a/src/sync/mcs.rs
+++ b/src/sync/mcs.rs
@@ -3,12 +3,21 @@
 //! Each waiter spins on its own local `locked` flag instead of a shared lock
 //! word, eliminating cache-line bouncing under contention. FIFO ordering
 //! guarantees fairness. O(1) cache-line transfers on unlock.
+//!
+//! Supports transitive priority inheritance: when CPU A waits on a lock held
+//! by CPU B, and CPU B waits on a lock held by CPU C, A's priority is
+//! propagated through the chain to C (up to MAX_PI_CHAIN_DEPTH hops).

 use core::sync::atomic::{AtomicBool, AtomicPtr, AtomicU32, Ordering};
 use core::{hint, ptr};

 use crate::percpu::PercpuBlock;

+/// Maximum depth for transitive priority inheritance chain following.
+/// Prevents infinite loops from theoretical lock cycles and bounds latency.
+/// Linux defaults to 1024 (`max_lock_depth`); 8 is conservative for a microkernel with fewer nesting levels.
+const MAX_PI_CHAIN_DEPTH: u32 = 8;
+
 /// A node in the MCS lock queue.
 pub struct McsNode {
     pub next: AtomicPtr<McsNode>,
@@ -55,17 +64,23 @@
             (*prev).next.store((node as *const McsNode).cast_mut(), Ordering::Release);
         }
         let percpu = PercpuBlock::current();
+        // Record which lock we're spinning on (for transitive PI chain following)
+        percpu.waiting_on_lock.store(
+            (self as *const McsRawLock).cast_mut(),
+            Ordering::Release,
+        );
         let mut donated = false;
         while node.locked.load(Ordering::Acquire) {
             percpu.maybe_handle_tlb_shootdown();
-            // Donate priority to the lock holder once per acquisition
+            // Donate priority to the lock holder (transitively) once per acquisition
             if !donated {
                 self.maybe_donate_priority(percpu);
                 donated = true;
             }
             hint::spin_loop();
         }
-        // We now hold the lock
+        // Clear waiting_on_lock before proceeding — we now hold the lock
+        percpu.waiting_on_lock.store(ptr::null_mut(), Ordering::Release);
         self.holder_cpu.store(percpu.cpu_id.get(), Ordering::Release);
         true
     }
@@ -120,27 +135,61 @@
         ok
     }

-    /// Donate current CPU's context priority to the lock holder's CPU.
+    /// Donate current CPU's context priority to the lock holder's CPU,
+    /// following the PI chain transitively (A→B→C).
+    ///
     /// Reads priority from PercpuBlock::current_prio (cached by the scheduler)
     /// to avoid acquiring any lock in the MCS spin loop.
+    ///
+    /// Chain following: if the holder is itself waiting on another lock,
+    /// we propagate our priority to that lock's holder too, up to
+    /// MAX_PI_CHAIN_DEPTH hops.
+    ///
+    /// Each donation is a single atomic `fetch_min`, so concurrent donors
+    /// cannot overwrite a more urgent (numerically lower) priority with a
+    /// less urgent one.
     fn maybe_donate_priority(&self, my_percpu: &PercpuBlock) {
-        let holder_cpu_id = self.holder_cpu.load(Ordering::Relaxed);
-        if holder_cpu_id == u32::MAX {
-            return;
-        }
-        // Read our own priority from the per-CPU cache (set by scheduler,
-        // no lock required).
-        let my_prio = my_percpu.current_prio.get();
-        // Look up holder's PercpuBlock
-        let holder_percpu = crate::percpu::get_for_cpu(
-            crate::cpu_set::LogicalCpuId::new(holder_cpu_id),
-        );
-        if let Some(holder) = holder_percpu {
-            let current_donated = holder.pi_donated_prio.load(Ordering::Relaxed);
-            // Donate if our priority is higher (lower number)
-            if (my_prio as u32) < current_donated {
-                holder.pi_donated_prio.store(my_prio as u32, Ordering::Release);
+        let my_prio = my_percpu.current_prio.get() as u32;
+        let mut current_holder_cpu = self.holder_cpu.load(Ordering::Relaxed);
+
+        for _ in 0..MAX_PI_CHAIN_DEPTH {
+            if current_holder_cpu == u32::MAX {
+                return;
+            }
+            let holder_percpu = crate::percpu::get_for_cpu(
+                crate::cpu_set::LogicalCpuId::new(current_holder_cpu),
+            );
+            let Some(holder) = holder_percpu else {
+                return;
+            };
+
+            // Donate if our priority is higher (lower number) than the current
+            // donation. `fetch_min` does the compare and the write in one atomic
+            // step, so a racing donor cannot clobber a more urgent donation.
+            holder.pi_donated_prio.fetch_min(my_prio, Ordering::Release);
+
+            // Follow the chain: is this holder also waiting on another lock?
+            let next_lock_ptr = holder.waiting_on_lock.load(Ordering::Relaxed);
+            if next_lock_ptr.is_null() {
+                return;
+            }
+            // SAFETY: `waiting_on_lock` is set in acquire() to the lock a CPU is
+            // spinning on and reset to null afterwards. The holder may stop
+            // spinning right after the load above, so validity relies on the
+            // McsRawLock being a long-lived struct field (e.g., part of the run
+            // queue) that outlives this read. We only read holder_cpu (an atomic
+            // u32) — no mutable access needed.
+            let next_holder_cpu =
+                unsafe { (*next_lock_ptr).holder_cpu.load(Ordering::Relaxed) };
+
+            // Cycle detection: if the next holder is the same CPU we just
+            // visited, stop. Longer cycles are not detected here; they are
+            // cut off by the MAX_PI_CHAIN_DEPTH bound on this loop.
+            if next_holder_cpu == current_holder_cpu {
+                return;
             }
+            current_holder_cpu = next_holder_cpu;
         }
+        // Chain depth exhausted — stop to bound latency
     }
 }