RedBear-OS/local/patches/kernel/P11-mcs-lock.patch

--- /dev/null
+++ b/src/sync/mcs.rs
@@ -0,0 +1,149 @@
+//! MCS (Mellor-Crummey Scott) fair spinlock.
+//!
+//! Each waiter spins on its own local `locked` flag instead of a shared lock
+//! word, eliminating cache-line bouncing under contention. FIFO ordering
+//! guarantees fairness. O(1) cache-line transfers on unlock.
+//!
+//! The queue is threaded through caller-supplied [`McsNode`]s by raw pointer.
+//! A node passed to [`McsRawLock::acquire`], or to a successful
+//! [`McsRawLock::try_acquire`], must stay at the same address and must not be
+//! handed to another acquire call until the matching [`McsRawLock::release`]
+//! has returned.
+
+use core::sync::atomic::{AtomicBool, AtomicPtr, Ordering};
+use core::{hint, ptr};
+
+use crate::percpu::PercpuBlock;
+
+/// A node in the MCS lock queue.
+pub struct McsNode {
+    /// Successor in the queue; null while this node is the tail.
+    pub next: AtomicPtr<McsNode>,
+    /// `true` while the node's owner still has to wait for the lock.
+    pub locked: AtomicBool,
+}
+
+impl McsNode {
+    /// Creates an unlinked node.
+    pub const fn new() -> Self {
+        Self {
+            next: AtomicPtr::new(ptr::null_mut()),
+            locked: AtomicBool::new(false),
+        }
+    }
+}
+
+impl Default for McsNode {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+/// Raw MCS spinlock primitive.
+pub struct McsRawLock {
+    /// Last node in the waiter queue; null when the lock is free.
+    tail: AtomicPtr<McsNode>,
+}
+
+impl McsRawLock {
+    /// Creates an unlocked lock with an empty queue.
+    pub const fn new() -> Self {
+        Self {
+            tail: AtomicPtr::new(ptr::null_mut()),
+        }
+    }
+
+    /// Acquires the lock, queueing `node` behind any current holder.
+    ///
+    /// Returns `false` if the queue was empty and the lock was taken at once,
+    /// `true` if the caller had to spin behind a predecessor (contended).
+    ///
+    /// While spinning, `PercpuBlock::maybe_handle_tlb_shootdown` is called on
+    /// every iteration.
+    #[inline]
+    pub fn acquire(&self, node: &McsNode) -> bool {
+        let node_ptr = (node as *const McsNode).cast_mut();
+        node.next.store(ptr::null_mut(), Ordering::Relaxed);
+        node.locked.store(true, Ordering::Relaxed);
+        // The release half of this swap publishes the two stores above to
+        // whoever later reads `node_ptr` out of `tail`.
+        let prev = self.tail.swap(node_ptr, Ordering::AcqRel);
+        if prev.is_null() {
+            return false;
+        }
+        // SAFETY: `prev` came out of `tail`, so its owner is still queued: it
+        // cannot finish `release` before it has observed this `next` link.
+        unsafe {
+            (*prev).next.store(node_ptr, Ordering::Release);
+        }
+        let percpu = PercpuBlock::current();
+        while node.locked.load(Ordering::Acquire) {
+            percpu.maybe_handle_tlb_shootdown();
+            hint::spin_loop();
+        }
+        true
+    }
+
+    /// Releases the lock held through `node`, handing it to the successor if
+    /// one is queued.
+    ///
+    /// `node` must be the node this lock was acquired with.
+    #[inline]
+    pub fn release(&self, node: &McsNode) {
+        let mut next = node.next.load(Ordering::Acquire);
+        if next.is_null() {
+            // No successor linked yet: if `node` is still the tail, the queue
+            // is empty and the lock simply becomes free.
+            if self
+                .tail
+                .compare_exchange(
+                    (node as *const McsNode).cast_mut(),
+                    ptr::null_mut(),
+                    Ordering::AcqRel,
+                    Ordering::Acquire,
+                )
+                .is_ok()
+            {
+                return;
+            }
+            // A successor already replaced `tail` but has not stored its
+            // link into `node.next` yet; wait for it to show up.
+            loop {
+                next = node.next.load(Ordering::Acquire);
+                if !next.is_null() {
+                    break;
+                }
+                hint::spin_loop();
+            }
+        }
+        // SAFETY: `next` is non-null and its owner is spinning on `locked`
+        // inside `acquire`, so the node is still alive.
+        unsafe {
+            (*next).locked.store(false, Ordering::Release);
+        }
+    }
+
+    /// Takes the lock only if nobody holds or waits for it; never spins.
+    ///
+    /// Returns `true` when `node` now owns the lock.
+    #[inline]
+    #[must_use = "on success the lock is held and must be released"]
+    pub fn try_acquire(&self, node: &McsNode) -> bool {
+        node.next.store(ptr::null_mut(), Ordering::Relaxed);
+        node.locked.store(true, Ordering::Relaxed);
+        self.tail
+            .compare_exchange(
+                ptr::null_mut(),
+                (node as *const McsNode).cast_mut(),
+                Ordering::AcqRel,
+                Ordering::Acquire,
+            )
+            .is_ok()
+    }
+}
+
+impl Default for McsRawLock {
+    fn default() -> Self {
+        Self::new()
+    }
+}
--- a/src/sync/mod.rs
+++ b/src/sync/mod.rs
@@ -1,5 +1,6 @@
 pub use self::{ordered::*, wait_condition::WaitCondition, wait_queue::WaitQueue};

+pub mod mcs;
 pub mod ordered;
 pub mod wait_condition;
 pub mod wait_queue;
--- a/src/sync/ordered.rs
+++ b/src/sync/ordered.rs
@@ -52,7 +52,9 @@
 //! *g1 = 12;
 //! ```
 use alloc::sync::Arc;
+use core::cell::UnsafeCell;
 use core::marker::PhantomData;
+use core::ptr;

 use crate::percpu::PercpuBlock;

@@ -732,3 +734,171 @@
 /// This function can only be called if no lock is held by the calling thread/task
 #[inline]
 pub fn check_no_locks(_: LockToken<'_, L0>) {}
+
+// ---------------------------------------------------------------------------
+// MCS-based fair mutex (McsMutex)
+// ---------------------------------------------------------------------------
+
+/// A mutual exclusion lock using the MCS fair spinlock algorithm.
+///
+/// Unlike `Mutex<L, T>` which uses a simple spinlock (no fairness under
+/// contention), `McsMutex` uses Mellor-Crummey Scott queue-based spinning:
+///
+/// - Each waiter spins on its **own** local flag — no shared cache-line bouncing.
+/// - FIFO ordering prevents starvation.
+/// - O(1) cache-line transfers on unlock.
+///
+/// The MCS node is stored in [`crate::percpu::PercpuBlock::mcs_sched_node`], so
+/// this type is suitable for scheduler-internal locks where the holder is always
+/// the current CPU.
+///
+/// Every `McsMutex` queues through that one per-CPU node, and unlocking looks
+/// the node up again via `PercpuBlock::current()`: a CPU must not hold or wait
+/// on two `McsMutex` instances at once, nor drop a guard another CPU created.
+pub struct McsMutex<L: Level, T> {
+    raw: crate::sync::mcs::McsRawLock,
+    data: UnsafeCell<T>,
+    _phantom: PhantomData<L>,
+}
+
+// SAFETY: `data` is only reachable through a guard, and a guard exists only
+// while `raw` is held, so at most one CPU touches the `T` at a time.
+unsafe impl<L: Level, T: Send> Sync for McsMutex<L, T> {}
+unsafe impl<L: Level, T: Send> Send for McsMutex<L, T> {}
+
+impl<L: Level, T> McsMutex<L, T> {
+    /// Creates an unlocked mutex holding `val`.
+    pub const fn new(val: T) -> Self {
+        Self {
+            raw: crate::sync::mcs::McsRawLock::new(),
+            data: UnsafeCell::new(val),
+            _phantom: PhantomData,
+        }
+    }
+
+    /// Spins until the lock is held, using the current CPU's MCS node.
+    ///
+    /// Bumps `PercpuBlock::mcs_contention_count` when the lock was contended.
+    pub fn lock<'a, LP: Lower<L> + 'a>(
+        &'a self,
+        lock_token: LockToken<'a, LP>,
+    ) -> McsMutexGuard<'a, L, T> {
+        let percpu = PercpuBlock::current();
+        if self.raw.acquire(&percpu.mcs_sched_node) {
+            // Statistics only: wrap instead of risking an overflow panic
+            // inside the scheduler lock path.
+            let count = &percpu.mcs_contention_count;
+            count.set(count.get().wrapping_add(1));
+        }
+        McsMutexGuard {
+            lock: self,
+            lock_token: LockToken::downgraded(lock_token),
+        }
+    }
+
+    /// Takes the lock only if it is free; returns `None` without spinning
+    /// otherwise.
+    pub fn try_lock<'a, LP: Lower<L> + 'a>(
+        &'a self,
+        lock_token: LockToken<'a, LP>,
+    ) -> Option<McsMutexGuard<'a, L, T>> {
+        let percpu = PercpuBlock::current();
+        if self.raw.try_acquire(&percpu.mcs_sched_node) {
+            Some(McsMutexGuard {
+                lock: self,
+                lock_token: LockToken::downgraded(lock_token),
+            })
+        } else {
+            None
+        }
+    }
+
+    /// Releases the raw lock through the current CPU's MCS node.
+    fn unlock(&self) {
+        self.raw.release(&PercpuBlock::current().mcs_sched_node);
+    }
+}
+
+/// Guard giving access to the data of a locked [`McsMutex`] together with the
+/// lock token for level `L`; unlocks on drop.
+pub struct McsMutexGuard<'a, L: Level, T: 'a> {
+    lock: &'a McsMutex<L, T>,
+    lock_token: LockToken<'a, L>,
+}
+
+impl<'a, L: Level, T: 'a> McsMutexGuard<'a, L, T> {
+    /// Borrows the protected data and a level-`L` token at the same time.
+    pub fn token_split(&mut self) -> (&mut T, LockToken<'_, L>) {
+        // SAFETY: the guard proves the lock is held, and `&mut self` makes
+        // this the only live borrow of the data.
+        unsafe { (&mut *self.lock.data.get(), self.lock_token.token()) }
+    }
+
+    /// Splits the guard into a token-less [`McsRawGuard`] and the owned token,
+    /// without unlocking.
+    pub fn into_split(self) -> (McsRawGuard<'a, L, T>, LockToken<'a, L>) {
+        let this = core::mem::ManuallyDrop::new(self);
+        // SAFETY: `this` is never dropped, so the token is moved out exactly
+        // once and the lock is not released here.
+        let token = unsafe { ptr::read(&this.lock_token) };
+        (McsRawGuard { lock: this.lock }, token)
+    }
+
+    /// Reassembles a guard from the two halves produced by
+    /// [`McsMutexGuard::into_split`].
+    pub fn from_split(raw: McsRawGuard<'a, L, T>, token: LockToken<'a, L>) -> Self {
+        // Keep `raw`'s destructor from running: the lock stays held.
+        let raw = core::mem::ManuallyDrop::new(raw);
+        Self {
+            lock: raw.lock,
+            lock_token: token,
+        }
+    }
+}
+
+impl<L: Level, T> core::ops::Deref for McsMutexGuard<'_, L, T> {
+    type Target = T;
+    fn deref(&self) -> &Self::Target {
+        // SAFETY: the guard proves the lock is held.
+        unsafe { &*self.lock.data.get() }
+    }
+}
+
+impl<L: Level, T> core::ops::DerefMut for McsMutexGuard<'_, L, T> {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        // SAFETY: the guard proves the lock is held; `&mut self` is unique.
+        unsafe { &mut *self.lock.data.get() }
+    }
+}
+
+impl<L: Level, T> Drop for McsMutexGuard<'_, L, T> {
+    fn drop(&mut self) {
+        self.lock.unlock();
+    }
+}
+
+/// The data half of a split [`McsMutexGuard`]; unlocks on drop.
+pub struct McsRawGuard<'a, L: Level, T: 'a> {
+    lock: &'a McsMutex<L, T>,
+}
+
+impl<L: Level, T> core::ops::Deref for McsRawGuard<'_, L, T> {
+    type Target = T;
+    fn deref(&self) -> &Self::Target {
+        // SAFETY: the guard proves the lock is held.
+        unsafe { &*self.lock.data.get() }
+    }
+}
+
+impl<L: Level, T> core::ops::DerefMut for McsRawGuard<'_, L, T> {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        // SAFETY: the guard proves the lock is held; `&mut self` is unique.
+        unsafe { &mut *self.lock.data.get() }
+    }
+}
+
+impl<L: Level, T> Drop for McsRawGuard<'_, L, T> {
+    fn drop(&mut self) {
+        self.lock.unlock();
+    }
+}
--- a/src/percpu.rs
+++ b/src/percpu.rs
@@ -17,7 +17,7 @@
     cpu_set::{LogicalCpuId, MAX_CPU_COUNT},
     cpu_stats::{CpuStats, CpuStatsData},
     ptrace::Session,
-    sync::CleanLockToken,
+    sync::{mcs::McsNode, CleanLockToken},
     syscall::debug::SyscallDebugInfo,
 };

@@ -35,6 +35,12 @@
     pub balance: Cell<[usize; 40]>,
     pub last_queue: Cell<usize>,

+    /// Per-CPU MCS node for the scheduler run-queue lock (RUN_CONTEXTS).
+    pub mcs_sched_node: McsNode,
+
+    /// Counts how many times the scheduler MCS lock acquisition was contended.
+    pub mcs_contention_count: Cell<u64>,
+
     // TODO: Put mailbox queues here, e.g. for TLB shootdown? Just be sure to 128-byte align it
     // first to avoid cache invalidation.
     pub profiling: Option<&'static crate::profiling::RingBuffer>,
@@ -215,6 +221,8 @@
             wants_tlb_shootdown: AtomicBool::new(false),
             balance: Cell::new([0; 40]),
             last_queue: Cell::new(39),
+            mcs_sched_node: McsNode::new(),
+            mcs_contention_count: Cell::new(0),
             ptrace_flags: Cell::new(PtraceFlags::empty()),
             ptrace_session: RefCell::new(None),
             inside_syscall: Cell::new(false),
--- a/src/context/mod.rs
+++ b/src/context/mod.rs
@@ -14,8 +14,8 @@
     memory::{RmmA, RmmArch, TableKind},
     percpu::PercpuBlock,
     sync::{
-        ArcRwLockWriteGuard, CleanLockToken, LockToken, Mutex, MutexGuard, RwLock, RwLockReadGuard,
-        RwLockWriteGuard, L0, L1, L2, L4,
+        ArcRwLockWriteGuard, CleanLockToken, LockToken, McsMutex, McsMutexGuard, Mutex,
+        MutexGuard, RwLock, RwLockReadGuard, RwLockWriteGuard, L0, L1, L2, L4,
     },
     syscall::error::Result,
 };
@@ -74,10 +74,12 @@
 // the context file descriptors.
 static CONTEXTS: RwLock<L2, BTreeSet<ContextRef>> = RwLock::new(BTreeSet::new());

-// Actual context store for the scheduler
-static RUN_CONTEXTS: Mutex<L1, RunContextData> = Mutex::new(RunContextData::new());
+// Actual context store for the scheduler — uses MCS fair spinlock to
+// eliminate cache-line bouncing under multi-CPU contention.
+static RUN_CONTEXTS: McsMutex<L1, RunContextData> = McsMutex::new(RunContextData::new());

-// Context that has been pushed out from RUN_CONTEXTS after being idle
+// Context that has been pushed out from RUN_CONTEXTS after being idle.
+// Uses regular Mutex (lower contention; wakeup_contexts uses try_lock).
 static IDLE_CONTEXTS: Mutex<L2, VecDeque<WeakContextRef>> = Mutex::new(VecDeque::new());

 pub struct RunContextData {
@@ -113,7 +115,7 @@
     IDLE_CONTEXTS.try_lock(token)
 }

-pub fn run_contexts(token: LockToken<'_, L0>) -> MutexGuard<'_, L1, RunContextData> {
+pub fn run_contexts(token: LockToken<'_, L0>) -> McsMutexGuard<'_, L1, RunContextData> {
     RUN_CONTEXTS.lock(token)
 }