Files
RedBear-OS/local/patches/kernel/P11-mcs-lock.patch
T
vasilito cee25393d8 fix: boot process improvements — dependency cycle, INIT_NOTIFY, probing loop, and log spam fixes
- Fix P15-8-init-cycle-detection.patch: replace visiting+error with seen+silent-skip
  to eliminate 11 false-positive 'dependency cycle detected' errors on shared deps
- Fix P0-daemon-fix-init-notify-unwrap.patch: remove eprintln! for missing
  INIT_NOTIFY (expected for oneshot_async services, ~7 daemons affected)
- Fix driver-manager hotplug loop: add PERMANENTLY_SKIPPED static set shared
  between hotplug handler and DriverConfig::probe() to stop infinite re-probing
  of Fatal/NotSupported/deferred-exhausted device+driver pairs (e.g. ided)
- Fix driver-manager log_timeline: suppress repeated EPIPE/ENOENT errors with
  AtomicI32 dedup and AtomicBool one-shot guards for boot timeline JSON
- Add driver-manager SIGTERM handler, ACPI bus registration, --status mode,
  driver reap loop, graceful shutdown, and reduced deferred retries (30→3)
2026-05-17 12:34:02 +03:00

336 lines
10 KiB
Diff

--- /dev/null
+++ b/src/sync/mcs.rs
@@ -0,0 +1,96 @@
+//! MCS (Mellor-Crummey Scott) fair spinlock.
+//!
+//! Each waiter spins on its own local `locked` flag instead of a shared lock
+//! word, eliminating cache-line bouncing under contention. FIFO ordering
+//! guarantees fairness. O(1) cache-line transfers on unlock.
+
+use core::sync::atomic::{AtomicBool, AtomicPtr, Ordering};
+use core::{hint, ptr};
+
+use crate::percpu::PercpuBlock;
+
+/// A node in the MCS lock queue: waiters are linked through `next` and each spins on its own `locked`.
+pub struct McsNode {
+ pub next: AtomicPtr<McsNode>, // successor in the queue; null until a later waiter links itself in
+ pub locked: AtomicBool, // true while the owner must keep spinning; cleared by the predecessor's release
+}
+
+impl McsNode {
+ pub const fn new() -> Self { // detached node: null `next`, `locked` cleared
+ Self {
+ next: AtomicPtr::new(ptr::null_mut()),
+ locked: AtomicBool::new(false),
+ }
+ }
+}
+
+/// Raw MCS spinlock primitive: the lock's only state is the queue's `tail` pointer.
+pub struct McsRawLock {
+ tail: AtomicPtr<McsNode>, // most recently queued node; null when no node is queued
+}
+
+impl McsRawLock {
+    pub const fn new() -> Self { // unlocked state: a null `tail` means nobody is queued
+        Self {
+            tail: AtomicPtr::new(ptr::null_mut()),
+        }
+    }
+
+    /// Queues `node` and spins until it owns the lock; returns `true` iff it had to wait.
+    #[inline]
+    pub fn acquire(&self, node: &McsNode) -> bool {
+        node.next.store(ptr::null_mut(), Ordering::Relaxed);
+        node.locked.store(true, Ordering::Relaxed);
+        let prev = self.tail.swap((node as *const McsNode).cast_mut(), Ordering::AcqRel);
+        if prev.is_null() {
+            return false;
+        }
+        unsafe { // SAFETY: `prev` was the tail, so its owner is still queued and has not released
+            (*prev).next.store((node as *const McsNode).cast_mut(), Ordering::Release);
+        }
+        let percpu = PercpuBlock::current();
+        while node.locked.load(Ordering::Acquire) {
+            percpu.maybe_handle_tlb_shootdown();
+            hint::spin_loop();
+        }
+        true
+    }
+
+    /// Releases the lock held through `node`, handing it to the next queued waiter if any.
+    #[inline]
+    pub fn release(&self, node: &McsNode) {
+        let me = (node as *const McsNode).cast_mut();
+        if node.next.load(Ordering::Acquire).is_null() {
+            if self
+                .tail
+                .compare_exchange(me, ptr::null_mut(), Ordering::AcqRel, Ordering::Acquire)
+                .is_ok()
+            {
+                return;
+            }
+            // A waiter has already swapped `tail` but not linked itself yet; wait for the link.
+            while node.next.load(Ordering::Acquire).is_null() {
+                hint::spin_loop();
+            }
+        }
+        unsafe { // SAFETY: `next` is non-null here and its owner is spinning in `acquire`
+            (*node.next.load(Ordering::Acquire)).locked.store(false, Ordering::Release);
+        }
+    }
+
+    /// Takes the lock only if nobody holds or waits for it; never spins. `true` on success.
+    #[inline]
+    pub fn try_acquire(&self, node: &McsNode) -> bool {
+        // Test `tail` before touching `node`: a failed attempt must not reset a node that is
+        // already queued on this lock, or its successor link is lost and `release` spins forever.
+        if !self.tail.load(Ordering::Relaxed).is_null() {
+            return false;
+        }
+        node.next.store(ptr::null_mut(), Ordering::Relaxed);
+        node.locked.store(true, Ordering::Relaxed);
+        let me = (node as *const McsNode).cast_mut();
+        self.tail
+            .compare_exchange(ptr::null_mut(), me, Ordering::AcqRel, Ordering::Acquire)
+            .is_ok()
+    }
+}
--- a/src/sync/mod.rs
+++ b/src/sync/mod.rs
@@ -1,5 +1,6 @@
pub use self::{ordered::*, wait_condition::WaitCondition, wait_queue::WaitQueue};
+pub mod mcs;
pub mod ordered;
pub mod wait_condition;
pub mod wait_queue;
--- a/src/sync/ordered.rs
+++ b/src/sync/ordered.rs
@@ -52,7 +52,9 @@
//! *g1 = 12;
//! ```
use alloc::sync::Arc;
+use core::cell::UnsafeCell;
use core::marker::PhantomData;
+use core::ptr;
use crate::percpu::PercpuBlock;
@@ -732,3 +734,143 @@
/// This function can only be called if no lock is held by the calling thread/task
#[inline]
pub fn check_no_locks(_: LockToken<'_, L0>) {}
+
+// ---------------------------------------------------------------------------
+// MCS-based fair mutex (McsMutex)
+// ---------------------------------------------------------------------------
+
+/// A mutual exclusion lock using the MCS fair spinlock algorithm.
+///
+/// Unlike `Mutex<L, T>` which uses a simple spinlock (no fairness under
+/// contention), `McsMutex` uses Mellor-Crummey Scott queue-based spinning:
+///
+/// - Each waiter spins on its **own** local flag — no shared cache-line bouncing.
+/// - FIFO ordering prevents starvation.
+/// - O(1) cache-line transfers on unlock.
+///
+/// Every acquire and release uses the calling CPU's single
+/// [`crate::percpu::PercpuBlock::mcs_sched_node`], so a guard must be dropped on the
+/// CPU that took it, and a CPU can be queued on only one `McsMutex` at a time.
+pub struct McsMutex<L: Level, T> {
+ raw: crate::sync::mcs::McsRawLock, // queue lock guarding `data`
+ data: UnsafeCell<T>, // protected value; dereferenced by the guard types
+ _phantom: PhantomData<L>, // carries the lock level `L` without storing a value of it
+}
+
+unsafe impl<L: Level, T: Send> Sync for McsMutex<L, T> {} // SAFETY: `data` is only handed out by guards created after `raw` is acquired
+unsafe impl<L: Level, T: Send> Send for McsMutex<L, T> {} // SAFETY: the mutex owns its `T`, and only `T: Send` values are allowed to move
+
+impl<L: Level, T> McsMutex<L, T> {
+ pub const fn new(val: T) -> Self { // wraps `val` behind a fresh, unlocked raw lock
+ Self {
+ raw: crate::sync::mcs::McsRawLock::new(),
+ data: UnsafeCell::new(val),
+ _phantom: PhantomData,
+ }
+ }
+}
+
+impl<L: Level, T> McsMutex<L, T> {
+    /// Spins until the lock is held, bumping the per-CPU contention counter when the raw
+    /// acquire reports a wait. The guard keeps the result of `LockToken::downgraded`.
+    pub fn lock<'a, LP: Lower<L> + 'a>(
+        &'a self,
+        lock_token: LockToken<'a, LP>,
+    ) -> McsMutexGuard<'a, L, T> {
+        let cpu = PercpuBlock::current();
+        if self.raw.acquire(&cpu.mcs_sched_node) {
+            let waits = cpu.mcs_contention_count.get();
+            cpu.mcs_contention_count.set(waits + 1);
+        }
+        McsMutexGuard {
+            lock: self,
+            lock_token: LockToken::downgraded(lock_token),
+        }
+    }
+
+    /// Makes one non-blocking attempt; `None` when the raw `try_acquire` fails.
+    pub fn try_lock<'a, LP: Lower<L> + 'a>(
+        &'a self,
+        lock_token: LockToken<'a, LP>,
+    ) -> Option<McsMutexGuard<'a, L, T>> {
+        let cpu = PercpuBlock::current();
+        if !self.raw.try_acquire(&cpu.mcs_sched_node) {
+            return None;
+        }
+        Some(McsMutexGuard {
+            lock: self,
+            lock_token: LockToken::downgraded(lock_token),
+        })
+    }
+}
+
+pub struct McsMutexGuard<'a, L: Level, T: 'a> { // holds the lock; its `Drop` releases the raw lock
+ lock: &'a McsMutex<L, T>, // mutex this guard unlocks
+ lock_token: LockToken<'a, L>, // level-`L` token produced by `LockToken::downgraded` in `lock`/`try_lock`
+}
+
+impl<'a, L: Level, T: 'a> McsMutexGuard<'a, L, T> {
+ pub fn token_split(&mut self) -> (&mut T, LockToken<'_, L>) { // borrows the data together with `lock_token.token()`
+ unsafe { (&mut *self.lock.data.get(), self.lock_token.token()) }
+ }
+
+ pub fn into_split(self) -> (McsRawGuard<'a, L, T>, LockToken<'a, L>) { // separates guard and token without unlocking
+ let lock_ref = self.lock;
+ let token = unsafe { core::ptr::read(&self.lock_token) }; // bitwise move out; `self` is forgotten next, so no double use
+ core::mem::forget(self); // skip `Drop` so the lock stays held by the returned raw guard
+ (McsRawGuard { lock: lock_ref }, token)
+ }
+
+ pub fn from_split(raw: McsRawGuard<'a, L, T>, token: LockToken<'a, L>) -> Self { // inverse of `into_split`
+ let lock_ref = raw.lock;
+ core::mem::forget(raw); // skip the raw guard's `Drop`; the rebuilt guard now owns the unlock
+ Self {
+ lock: lock_ref,
+ lock_token: token,
+ }
+ }
+}
+
+impl<L: Level, T> core::ops::Deref for McsMutexGuard<'_, L, T> {
+ type Target = T;
+ fn deref(&self) -> &Self::Target { // shared view of the value stored in the mutex
+ unsafe { &*self.lock.data.get() } // SAFETY: a guard is only built after the raw lock was acquired
+ }
+}
+
+impl<L: Level, T> core::ops::DerefMut for McsMutexGuard<'_, L, T> {
+ fn deref_mut(&mut self) -> &mut Self::Target { // exclusive view of the value stored in the mutex
+ unsafe { &mut *self.lock.data.get() } // SAFETY: a guard is only built after the raw lock was acquired
+ }
+}
+
+impl<L: Level, T> Drop for McsMutexGuard<'_, L, T> {
+ fn drop(&mut self) { // unlocks when the guard goes out of scope
+ let percpu = PercpuBlock::current();
+ self.lock.raw.release(&percpu.mcs_sched_node); // same per-CPU field that `lock`/`try_lock` pass to the raw lock
+ }
+}
+
+pub struct McsRawGuard<'a, L: Level, T: 'a> { // token-less guard produced by `McsMutexGuard::into_split`
+ lock: &'a McsMutex<L, T>, // mutex this guard unlocks on drop
+}
+
+impl<L: Level, T> core::ops::Deref for McsRawGuard<'_, L, T> {
+ type Target = T;
+ fn deref(&self) -> &Self::Target { // shared view of the value stored in the mutex
+ unsafe { &*self.lock.data.get() } // SAFETY: only created by `into_split` from a guard that holds the lock
+ }
+}
+
+impl<L: Level, T> core::ops::DerefMut for McsRawGuard<'_, L, T> {
+ fn deref_mut(&mut self) -> &mut Self::Target { // exclusive view of the value stored in the mutex
+ unsafe { &mut *self.lock.data.get() } // SAFETY: only created by `into_split` from a guard that holds the lock
+ }
+}
+
+impl<L: Level, T> Drop for McsRawGuard<'_, L, T> {
+ fn drop(&mut self) { // unlocks when the raw guard goes out of scope
+ let percpu = PercpuBlock::current();
+ self.lock.raw.release(&percpu.mcs_sched_node); // same per-CPU field that `lock`/`try_lock` pass to the raw lock
+ }
+}
--- a/src/percpu.rs
+++ b/src/percpu.rs
@@ -17,7 +17,7 @@
cpu_set::{LogicalCpuId, MAX_CPU_COUNT},
cpu_stats::{CpuStats, CpuStatsData},
ptrace::Session,
- sync::CleanLockToken,
+ sync::{mcs::McsNode, CleanLockToken},
syscall::debug::SyscallDebugInfo,
};
@@ -35,6 +35,12 @@
pub balance: Cell<[usize; 40]>,
pub last_queue: Cell<usize>,
+ /// Per-CPU MCS queue node passed to every `McsMutex` operation on this CPU (currently only RUN_CONTEXTS).
+ pub mcs_sched_node: McsNode,
+
+ /// Counts how many times the scheduler MCS lock acquisition was contended.
+ pub mcs_contention_count: Cell<u64>,
+
// TODO: Put mailbox queues here, e.g. for TLB shootdown? Just be sure to 128-byte align it
// first to avoid cache invalidation.
pub profiling: Option<&'static crate::profiling::RingBuffer>,
@@ -215,6 +221,8 @@
wants_tlb_shootdown: AtomicBool::new(false),
balance: Cell::new([0; 40]),
last_queue: Cell::new(39),
+ mcs_sched_node: McsNode::new(),
+ mcs_contention_count: Cell::new(0),
ptrace_flags: Cell::new(PtraceFlags::empty()),
ptrace_session: RefCell::new(None),
inside_syscall: Cell::new(false),
--- a/src/context/mod.rs
+++ b/src/context/mod.rs
@@ -14,8 +14,8 @@
memory::{RmmA, RmmArch, TableKind},
percpu::PercpuBlock,
sync::{
- ArcRwLockWriteGuard, CleanLockToken, LockToken, Mutex, MutexGuard, RwLock, RwLockReadGuard,
- RwLockWriteGuard, L0, L1, L2, L4,
+ ArcRwLockWriteGuard, CleanLockToken, LockToken, McsMutex, McsMutexGuard, Mutex,
+ MutexGuard, RwLock, RwLockReadGuard, RwLockWriteGuard, L0, L1, L2, L4,
},
syscall::error::Result,
};
@@ -74,10 +74,12 @@
// the context file descriptors.
static CONTEXTS: RwLock<L2, BTreeSet<ContextRef>> = RwLock::new(BTreeSet::new());
-// Actual context store for the scheduler
-static RUN_CONTEXTS: Mutex<L1, RunContextData> = Mutex::new(RunContextData::new());
+// Actual context store for the scheduler — uses MCS fair spinlock to
+// eliminate cache-line bouncing under multi-CPU contention.
+static RUN_CONTEXTS: McsMutex<L1, RunContextData> = McsMutex::new(RunContextData::new());
-// Context that has been pushed out from RUN_CONTEXTS after being idle
+// Context that has been pushed out from RUN_CONTEXTS after being idle.
+// Uses regular Mutex (lower contention; wakeup_contexts uses try_lock).
static IDLE_CONTEXTS: Mutex<L2, VecDeque<WeakContextRef>> = Mutex::new(VecDeque::new());
pub struct RunContextData {
@@ -113,7 +115,7 @@
IDLE_CONTEXTS.try_lock(token)
}
-pub fn run_contexts(token: LockToken<'_, L0>) -> MutexGuard<'_, L1, RunContextData> {
+pub fn run_contexts(token: LockToken<'_, L0>) -> McsMutexGuard<'_, L1, RunContextData> {
RUN_CONTEXTS.lock(token)
}