cee25393d8
- Fix P15-8-init-cycle-detection.patch: replace visiting+error with seen+silent-skip to eliminate 11 false-positive 'dependency cycle detected' errors on shared deps - Fix P0-daemon-fix-init-notify-unwrap.patch: remove eprintln! for missing INIT_NOTIFY (expected for oneshot_async services, ~7 daemons affected) - Fix driver-manager hotplug loop: add PERMANENTLY_SKIPPED static set shared between hotplug handler and DriverConfig::probe() to stop infinite re-probing of Fatal/NotSupported/deferred-exhausted device+driver pairs (e.g. ided) - Fix driver-manager log_timeline: suppress repeated EPIPE/ENOENT errors with AtomicI32 dedup and AtomicBool one-shot guards for boot timeline JSON - Add driver-manager SIGTERM handler, ACPI bus registration, --status mode, driver reap loop, graceful shutdown, and reduced deferred retries (30→3)
336 lines
10 KiB
Diff
336 lines
10 KiB
Diff
--- /dev/null
|
|
+++ b/src/sync/mcs.rs
|
|
@@ -0,0 +1,96 @@
|
|
+//! MCS (Mellor-Crummey Scott) fair spinlock.
|
|
+//!
|
|
+//! Each waiter spins on its own local `locked` flag instead of a shared lock
|
|
+//! word, eliminating cache-line bouncing under contention. FIFO ordering
|
|
+//! guarantees fairness. O(1) cache-line transfers on unlock.
|
|
+
|
|
+use core::sync::atomic::{AtomicBool, AtomicPtr, Ordering};
|
|
+use core::{hint, ptr};
|
|
+
|
|
+use crate::percpu::PercpuBlock;
|
|
+
|
|
+/// A node in the MCS lock queue: one queue entry (holder or waiter), linked to its successor through `next`.
|
|
+pub struct McsNode {
|
|
+ pub next: AtomicPtr<McsNode>, // successor in the queue; null until a later `acquire` links its own node behind this one
|
|
+ pub locked: AtomicBool, // set to `true` by `acquire`/`try_acquire`; a waiter spins on it until the predecessor's `release` stores `false`
|
|
+}
|
|
+
|
|
+impl McsNode {
|
|
+ pub const fn new() -> Self { // builds an unlinked node: no successor, flag cleared
|
|
+ Self {
|
|
+ next: AtomicPtr::new(ptr::null_mut()), // null = no successor linked yet
|
|
+ locked: AtomicBool::new(false), // initial value only; `acquire`/`try_acquire` overwrite it with `true` before enqueueing
|
|
+ }
|
|
+ }
|
|
+}
|
|
+
|
|
+/// Raw MCS spinlock primitive: stores only the queue tail; the caller supplies the `McsNode` for every acquire/release.
|
|
+pub struct McsRawLock {
|
|
+ tail: AtomicPtr<McsNode>, // most recently enqueued node; null means the lock is free (no holder, no waiters)
|
|
+}
|
|
+
|
|
+impl McsRawLock {
|
|
+ pub const fn new() -> Self { // creates an unlocked lock with an empty queue
|
|
+ Self {
|
|
+ tail: AtomicPtr::new(ptr::null_mut()), // null tail = free
|
|
+ }
|
|
+ }
|
|
+
|
|
+ #[inline] // acquire: enqueue `node` at the tail and spin until the lock is owned; returns whether the caller had to wait
|
|
+ pub fn acquire(&self, node: &McsNode) -> bool { // returns `false` when uncontended, `true` when it queued behind a predecessor (opposite sense to `try_acquire`)
|
|
+ node.next.store(ptr::null_mut(), Ordering::Relaxed); // reset the node before it becomes visible to other CPUs via the swap below
|
|
+ node.locked.store(true, Ordering::Relaxed); // assume we must wait; only a predecessor's `release` clears this
|
|
+ let prev = self.tail.swap((node as *const McsNode).cast_mut(), Ordering::AcqRel); // publish `node` as the new tail; `prev` is the old tail
|
|
+ if prev.is_null() { // queue was empty: the lock was free and is now ours
|
|
+ return false;
|
|
+ }
|
|
+ unsafe { // NOTE(review): relies on `*prev` staying valid here; `release` on `prev` spins until this link is visible before it hands off
|
|
+ (*prev).next.store((node as *const McsNode).cast_mut(), Ordering::Release); // link ourselves behind the predecessor so its `release` can find us
|
|
+ }
|
|
+ let percpu = PercpuBlock::current();
|
|
+ while node.locked.load(Ordering::Acquire) { // spin on our own node's flag, not on the shared `tail`
|
|
+ percpu.maybe_handle_tlb_shootdown(); // presumably keeps this CPU responsive to TLB shootdown requests while it waits; verify against `PercpuBlock`
|
|
+ hint::spin_loop();
|
|
+ }
|
|
+ true // lock obtained after waiting
|
|
+ }
|
|
+
|
|
+ #[inline] // release: hand the lock to the successor of `node`, or mark the lock free if there is none
|
|
+ pub fn release(&self, node: &McsNode) { // expects `node` to be the node used for the matching `acquire`/`try_acquire`
|
|
+ let next = node.next.load(Ordering::Acquire);
|
|
+ if next.is_null() { // no successor linked (yet)
|
|
+ if self
|
|
+ .tail
|
|
+ .compare_exchange( // if we are still the tail, nobody is queued: swing the tail back to null
|
|
+ (node as *const McsNode).cast_mut(),
|
|
+ ptr::null_mut(),
|
|
+ Ordering::AcqRel,
|
|
+ Ordering::Acquire,
|
|
+ )
|
|
+ .is_ok()
|
|
+ {
|
|
+ return; // lock is now free
|
|
+ }
|
|
+ while node.next.load(Ordering::Acquire).is_null() { // CAS failed: a new waiter swapped the tail but has not linked itself yet; wait for the link
|
|
+ hint::spin_loop();
|
|
+ }
|
|
+ }
|
|
+ unsafe { // dereferences the successor pointer; non-null on both paths that reach this point
|
|
+ (*node.next.load(Ordering::Acquire)).locked.store(false, Ordering::Release); // clear the successor's flag, ending its spin in `acquire`
|
|
+ }
|
|
+ }
|
|
+
|
|
+ #[inline] // try_acquire: take the lock only if the queue is empty; never spins
|
|
+ pub fn try_acquire(&self, node: &McsNode) -> bool { // returns `true` on success (note: `acquire` returns `false` on its uncontended path)
|
|
+ node.next.store(ptr::null_mut(), Ordering::Relaxed); // NOTE(review): the node is reset even when the CAS below fails; if `node` is already enqueued this clobbers its `next` link
|
|
+ node.locked.store(true, Ordering::Relaxed);
|
|
+ self.tail
|
|
+ .compare_exchange( // succeeds only when the tail is null, i.e. no holder and no waiters
|
|
+ ptr::null_mut(),
|
|
+ (node as *const McsNode).cast_mut(),
|
|
+ Ordering::AcqRel,
|
|
+ Ordering::Acquire,
|
|
+ )
|
|
+ .is_ok()
|
|
+ }
|
|
+}
|
|
--- a/src/sync/mod.rs
|
|
+++ b/src/sync/mod.rs
|
|
@@ -1,5 +1,6 @@
|
|
pub use self::{ordered::*, wait_condition::WaitCondition, wait_queue::WaitQueue};
|
|
|
|
+pub mod mcs;
|
|
pub mod ordered;
|
|
pub mod wait_condition;
|
|
pub mod wait_queue;
|
|
--- a/src/sync/ordered.rs
|
|
+++ b/src/sync/ordered.rs
|
|
@@ -52,7 +52,9 @@
|
|
//! *g1 = 12;
|
|
//! ```
|
|
use alloc::sync::Arc;
|
|
+use core::cell::UnsafeCell;
|
|
use core::marker::PhantomData;
|
|
+use core::ptr;
|
|
|
|
use crate::percpu::PercpuBlock;
|
|
|
|
@@ -732,3 +734,143 @@
|
|
/// This function can only be called if no lock is held by the calling thread/task
|
|
#[inline]
|
|
pub fn check_no_locks(_: LockToken<'_, L0>) {}
|
|
+
|
|
+// ---------------------------------------------------------------------------
|
|
+// MCS-based fair mutex (McsMutex)
|
|
+// ---------------------------------------------------------------------------
|
|
+
|
|
+/// A mutual exclusion lock using the MCS fair spinlock algorithm.
|
|
+///
|
|
+/// Unlike `Mutex<L, T>` which uses a simple spinlock (no fairness under
|
|
+/// contention), `McsMutex` uses Mellor-Crummey Scott queue-based spinning:
|
|
+///
|
|
+/// - Each waiter spins on its **own** local flag — no shared cache-line bouncing.
|
|
+/// - FIFO ordering prevents starvation.
|
|
+/// - O(1) cache-line transfers on unlock.
|
|
+///
|
|
+/// The MCS node is stored in [`crate::percpu::PercpuBlock::mcs_sched_node`], so
|
|
+/// this type is suitable for scheduler-internal locks where the holder is always
|
|
+/// the current CPU.
|
|
+pub struct McsMutex<L: Level, T> { // NOTE(review): lock()/try_lock() always use the calling CPU's single `mcs_sched_node`, so a CPU holding or waiting on two `McsMutex` instances at once would re-initialize a node that is still enqueued
|
|
+ raw: crate::sync::mcs::McsRawLock, // queue tail only; the queue nodes live in `PercpuBlock`, not in the mutex
|
|
+ data: UnsafeCell<T>, // protected value; dereferenced only through the guard types
|
|
+ _phantom: PhantomData<L>, // carries the lock level `L` in the type without storing a value
|
|
+}
|
|
+
|
|
+unsafe impl<L: Level, T: Send> Sync for McsMutex<L, T> {} // SAFETY (review): `data` is only dereferenced through guards created after `raw` is acquired, which is what is meant to serialize access — confirm
|
|
+unsafe impl<L: Level, T: Send> Send for McsMutex<L, T> {} // SAFETY (review): presumably sound because the mutex owns its `T: Send` value — confirm
|
|
+
|
|
+impl<L: Level, T> McsMutex<L, T> {
|
|
+ pub const fn new(val: T) -> Self { // wraps `val` in an unlocked mutex; `const`, and used to initialize the `RUN_CONTEXTS` static
|
|
+ Self {
|
|
+ raw: crate::sync::mcs::McsRawLock::new(), // starts with an empty queue
|
|
+ data: UnsafeCell::new(val),
|
|
+ _phantom: PhantomData,
|
|
+ }
|
|
+ }
|
|
+}
|
|
+
|
|
+impl<L: Level, T> McsMutex<L, T> {
|
|
+ pub fn lock<'a, LP: Lower<L> + 'a>( // blocking acquire; spins until the lock is owned and returns a guard that releases it on drop
|
|
+ &'a self,
|
|
+ lock_token: LockToken<'a, LP>, // token at level `LP`, which must satisfy `Lower<L>`
|
|
+ ) -> McsMutexGuard<'a, L, T> {
|
|
+ let percpu = PercpuBlock::current();
|
|
+ let contended = self.raw.acquire(&percpu.mcs_sched_node); // enqueue this CPU's node; `true` means we had to wait behind another holder
|
|
+ if contended {
|
|
+ percpu
|
|
+ .mcs_contention_count
|
|
+ .set(percpu.mcs_contention_count.get() + 1); // per-CPU `Cell` counter, bumped once per contended acquisition
|
|
+ }
|
|
+ McsMutexGuard {
|
|
+ lock: self,
|
|
+ lock_token: LockToken::downgraded(lock_token), // presumably converts the `LP` token into the level-`L` token kept in the guard; defined outside this diff
|
|
+ }
|
|
+ }
|
|
+
|
|
+ pub fn try_lock<'a, LP: Lower<L> + 'a>( // non-blocking acquire; `None` when the lock is held or has waiters
|
|
+ &'a self,
|
|
+ lock_token: LockToken<'a, LP>,
|
|
+ ) -> Option<McsMutexGuard<'a, L, T>> {
|
|
+ let percpu = PercpuBlock::current();
|
|
+ if self.raw.try_acquire(&percpu.mcs_sched_node) { // succeeds only when the queue is empty; does not touch `mcs_contention_count`
|
|
+ Some(McsMutexGuard {
|
|
+ lock: self,
|
|
+ lock_token: LockToken::downgraded(lock_token),
|
|
+ })
|
|
+ } else {
|
|
+ None
|
|
+ }
|
|
+ }
|
|
+}
|
|
+
|
|
+pub struct McsMutexGuard<'a, L: Level, T: 'a> { // guard for a held `McsMutex`; dereferences to `T` and releases the lock on drop
|
|
+ lock: &'a McsMutex<L, T>, // the mutex this guard will release
|
|
+ lock_token: LockToken<'a, L>, // level-`L` token produced by `LockToken::downgraded` in `lock`/`try_lock`
|
|
+}
|
|
+
|
|
+impl<'a, L: Level, T: 'a> McsMutexGuard<'a, L, T> {
|
|
+ pub fn token_split(&mut self) -> (&mut T, LockToken<'_, L>) { // borrows the protected data and a token together; both are tied to the `&mut self` borrow
|
|
+ unsafe { (&mut *self.lock.data.get(), self.lock_token.token()) } // the guard exists only while the lock is held, and `&mut self` makes this the only access through it
|
|
+ }
|
|
+
|
|
+ pub fn into_split(self) -> (McsRawGuard<'a, L, T>, LockToken<'a, L>) { // splits the guard into a data-only guard plus the owned token; the lock stays held
|
|
+ let lock_ref = self.lock;
|
|
+ let token = unsafe { core::ptr::read(&self.lock_token) }; // moves the token out by bitwise read; `self` is forgotten next, so it is not duplicated
|
|
+ core::mem::forget(self); // skip `Drop` so the lock is not released here
|
|
+ (McsRawGuard { lock: lock_ref }, token) // the returned `McsRawGuard` now carries the responsibility to release
|
|
+ }
|
|
+
|
|
+ pub fn from_split(raw: McsRawGuard<'a, L, T>, token: LockToken<'a, L>) -> Self { // inverse of `into_split`: rebuilds the full guard without touching the lock
|
|
+ let lock_ref = raw.lock;
|
|
+ core::mem::forget(raw); // skip `McsRawGuard`'s `Drop`; the rebuilt guard releases instead
|
|
+ Self {
|
|
+ lock: lock_ref,
|
|
+ lock_token: token,
|
|
+ }
|
|
+ }
|
|
+}
|
|
+
|
|
+impl<L: Level, T> core::ops::Deref for McsMutexGuard<'_, L, T> { // shared access to the protected value through the guard
|
|
+ type Target = T;
|
|
+ fn deref(&self) -> &Self::Target {
|
|
+ unsafe { &*self.lock.data.get() } // reads through the `UnsafeCell`; the guard is only constructed after the raw lock was acquired
|
|
+ }
|
|
+}
|
|
+
|
|
+impl<L: Level, T> core::ops::DerefMut for McsMutexGuard<'_, L, T> { // exclusive access to the protected value through the guard
|
|
+ fn deref_mut(&mut self) -> &mut Self::Target {
|
|
+ unsafe { &mut *self.lock.data.get() } // `&mut self` ensures no other borrow through this guard is live
|
|
+ }
|
|
+}
|
|
+
|
|
+impl<L: Level, T> Drop for McsMutexGuard<'_, L, T> { // releases the MCS lock when the guard goes out of scope
|
|
+ fn drop(&mut self) {
|
|
+ let percpu = PercpuBlock::current(); // NOTE(review): the node is looked up again here rather than stored in the guard — assumes the guard is dropped on the CPU that acquired it; confirm
|
|
+ self.lock.raw.release(&percpu.mcs_sched_node); // hands off to the next waiter or marks the lock free
|
|
+ }
|
|
+}
|
|
+
|
|
+pub struct McsRawGuard<'a, L: Level, T: 'a> { // token-less guard produced by `McsMutexGuard::into_split`; still owns the held lock
|
|
+ lock: &'a McsMutex<L, T>, // the mutex this guard will release on drop
|
|
+}
|
|
+
|
|
+impl<L: Level, T> core::ops::Deref for McsRawGuard<'_, L, T> { // shared access to the protected value
|
|
+ type Target = T;
|
|
+ fn deref(&self) -> &Self::Target {
|
|
+ unsafe { &*self.lock.data.get() } // the only visible constructor is `into_split`, which consumes a guard for a lock that is still held
|
|
+ }
|
|
+}
|
|
+
|
|
+impl<L: Level, T> core::ops::DerefMut for McsRawGuard<'_, L, T> { // exclusive access to the protected value
|
|
+ fn deref_mut(&mut self) -> &mut Self::Target {
|
|
+ unsafe { &mut *self.lock.data.get() } // `&mut self` ensures no other borrow through this guard is live
|
|
+ }
|
|
+}
|
|
+
|
|
+impl<L: Level, T> Drop for McsRawGuard<'_, L, T> { // releases the MCS lock, same as `McsMutexGuard`'s `Drop`
|
|
+ fn drop(&mut self) {
|
|
+ let percpu = PercpuBlock::current(); // NOTE(review): uses the current CPU's node at drop time — assumes this is the CPU that acquired the lock; confirm
|
|
+ self.lock.raw.release(&percpu.mcs_sched_node);
|
|
+ }
|
|
+}
|
|
--- a/src/percpu.rs
|
|
+++ b/src/percpu.rs
|
|
@@ -17,7 +17,7 @@
|
|
cpu_set::{LogicalCpuId, MAX_CPU_COUNT},
|
|
cpu_stats::{CpuStats, CpuStatsData},
|
|
ptrace::Session,
|
|
- sync::CleanLockToken,
|
|
+ sync::{mcs::McsNode, CleanLockToken},
|
|
syscall::debug::SyscallDebugInfo,
|
|
};
|
|
|
|
@@ -35,6 +35,12 @@
|
|
pub balance: Cell<[usize; 40]>,
|
|
pub last_queue: Cell<usize>,
|
|
|
|
+ /// Per-CPU MCS node for the scheduler run-queue lock (RUN_CONTEXTS).
|
|
+ pub mcs_sched_node: McsNode,
|
|
+
|
|
+ /// Counts how many times the scheduler MCS lock acquisition was contended.
|
|
+ pub mcs_contention_count: Cell<u64>,
|
|
+
|
|
// TODO: Put mailbox queues here, e.g. for TLB shootdown? Just be sure to 128-byte align it
|
|
// first to avoid cache invalidation.
|
|
pub profiling: Option<&'static crate::profiling::RingBuffer>,
|
|
@@ -215,6 +221,8 @@
|
|
wants_tlb_shootdown: AtomicBool::new(false),
|
|
balance: Cell::new([0; 40]),
|
|
last_queue: Cell::new(39),
|
|
+ mcs_sched_node: McsNode::new(),
|
|
+ mcs_contention_count: Cell::new(0),
|
|
ptrace_flags: Cell::new(PtraceFlags::empty()),
|
|
ptrace_session: RefCell::new(None),
|
|
inside_syscall: Cell::new(false),
|
|
--- a/src/context/mod.rs
|
|
+++ b/src/context/mod.rs
|
|
@@ -14,8 +14,8 @@
|
|
memory::{RmmA, RmmArch, TableKind},
|
|
percpu::PercpuBlock,
|
|
sync::{
|
|
- ArcRwLockWriteGuard, CleanLockToken, LockToken, Mutex, MutexGuard, RwLock, RwLockReadGuard,
|
|
- RwLockWriteGuard, L0, L1, L2, L4,
|
|
+ ArcRwLockWriteGuard, CleanLockToken, LockToken, McsMutex, McsMutexGuard, Mutex,
|
|
+ MutexGuard, RwLock, RwLockReadGuard, RwLockWriteGuard, L0, L1, L2, L4,
|
|
},
|
|
syscall::error::Result,
|
|
};
|
|
@@ -74,10 +74,12 @@
|
|
// the context file descriptors.
|
|
static CONTEXTS: RwLock<L2, BTreeSet<ContextRef>> = RwLock::new(BTreeSet::new());
|
|
|
|
-// Actual context store for the scheduler
|
|
-static RUN_CONTEXTS: Mutex<L1, RunContextData> = Mutex::new(RunContextData::new());
|
|
+// Actual context store for the scheduler — uses MCS fair spinlock to
|
|
+// eliminate cache-line bouncing under multi-CPU contention.
|
|
+static RUN_CONTEXTS: McsMutex<L1, RunContextData> = McsMutex::new(RunContextData::new());
|
|
|
|
-// Context that has been pushed out from RUN_CONTEXTS after being idle
|
|
+// Context that has been pushed out from RUN_CONTEXTS after being idle.
|
|
+// Uses regular Mutex (lower contention; wakeup_contexts uses try_lock).
|
|
static IDLE_CONTEXTS: Mutex<L2, VecDeque<WeakContextRef>> = Mutex::new(VecDeque::new());
|
|
|
|
pub struct RunContextData {
|
|
@@ -113,7 +115,7 @@
|
|
IDLE_CONTEXTS.try_lock(token)
|
|
}
|
|
|
|
-pub fn run_contexts(token: LockToken<'_, L0>) -> MutexGuard<'_, L1, RunContextData> {
|
|
+pub fn run_contexts(token: LockToken<'_, L0>) -> McsMutexGuard<'_, L1, RunContextData> {
|
|
RUN_CONTEXTS.lock(token)
|
|
}
|
|
|