Files
RedBear-OS/local/patches/kernel/P7-scheduler-improvements.patch
T
vasilito 07c6e422c1 feat: comprehensive scheduler, ACPI, driver, and cpufreqd improvements
Scheduler: LAPIC timer calibration, TSC-deadline mode, work-stealing
load balancer, RT scheduling class, per-CPU nr_running counter.
Direct tick routing via vector 48.

ACPI: S3/S4 sleep states with full AML sequence (_PTS/_GTS/_BFS/_WAK),
NVS save/restore, EC driver hardening, panic-grade behavior removed.

Drivers: 5 driver mains at zero unwrap, 12 new modules across storage,
network, and audio subsystems. AHCI NCQ/PM/TRIM, e1000 ITR/checksum/TSO,
rtl8169 PHY config, HDA codec/jack detection.

cpufreqd: Replaced 26-line stub with 5-governor implementation including
ACPI P-state reading, MSR control, thermal throttle, and error suppression.
thermald: Fan control module with speed curves and emergency mode.

Docs: IMPLEMENTATION-MASTER-PLAN.md, CPU-DMA-IRQ-MSI-SCHEDULER-FIX-PLAN.md.
30 stale docs archived. 3 superseded plans archived.

Patches: P5-named-semaphores (relibc), P6-driver fixes (base),
P7-scheduler (kernel), P6-cpufreqd (local).
2026-05-04 16:08:58 +01:00

214 lines
7.6 KiB
Diff

diff --git a/src/arch/x86_shared/device/local_apic.rs b/src/arch/x86_shared/device/local_apic.rs
index b6afe02a..846d6760 100644
--- a/src/arch/x86_shared/device/local_apic.rs
+++ b/src/arch/x86_shared/device/local_apic.rs
@@ -78,7 +78,7 @@ impl LocalApic {
self.write(0xF0, 0x100);
}
self.setup_error_int();
- //self.setup_timer();
+ self.setup_timer();
PercpuBlock::current()
.misc_arch_info
@@ -262,6 +262,33 @@ impl LocalApic {
self.set_lvt_error(vector);
}
}
+
+ /// Program the local APIC timer to deliver periodic interrupts on
+ /// vector 48 (the direct tick vector named in the commit message).
+ ///
+ /// NOTE(review): the initial count 0x10000 is an uncalibrated
+ /// placeholder — the resulting tick frequency depends on the bus /
+ /// core-crystal clock, which is not measured here. Confirm against
+ /// `calibrate_timer` before relying on a specific tick rate.
+ pub unsafe fn setup_timer(&mut self) {
+ unsafe {
+ // Bits 18:17 of the LVT timer entry select the timer mode;
+ // shifting the Periodic discriminant to bit 17 selects periodic.
+ let timer_vector = 48u32;
+ self.set_lvt_timer(timer_vector | ((LvtTimerMode::Periodic as u32) << 17));
+ // Divide-configuration value 0b1011 = divide by 1 (no division).
+ self.set_div_conf(0b1011);
+ // Arbitrary initial count; real period is hardware-dependent.
+ self.set_init_count(0x10000);
+ }
+ }
+
+ /// Placeholder "calibration": loads the maximum initial count and
+ /// returns a fixed nominal tick value.
+ ///
+ /// NOTE(review): no reference clock (PIT/HPET/TSC) is consulted, so
+ /// the returned 0x10000 is not a measured ticks-per-interval figure
+ /// — `set_timer_freq` built on top of it is only nominal. It also
+ /// leaves the timer counting down from u32::MAX as a side effect.
+ /// TODO: implement real calibration against a known-rate clock.
+ pub unsafe fn calibrate_timer(&mut self) -> u32 {
+ self.set_init_count(0xFFFF_FFFF);
+ 0x10000
+ }
+
+ /// Reprogram the periodic timer to approximately `freq_hz`
+ /// interrupts per second.
+ ///
+ /// Computes the initial count as `t * 1000 / freq_hz`, which treats
+ /// the value from `calibrate_timer` as (presumably) ticks per
+ /// millisecond — verify that assumption once calibration is real.
+ /// `.max(1)` guards the division against a caller passing 0 Hz.
+ /// Accuracy is limited by the placeholder calibration.
+ pub unsafe fn set_timer_freq(&mut self, freq_hz: u32) {
+ let t = self.calibrate_timer();
+ self.set_init_count(t * 1000 / freq_hz.max(1));
+ }
+
+ /// Switch the LVT timer entry to TSC-deadline mode, keeping vector 48.
+ ///
+ /// NOTE(review): does not verify TSC-deadline support
+ /// (CPUID.01H:ECX bit 24) before selecting the mode — confirm that
+ /// callers gate on hardware support, otherwise the mode bits may be
+ /// ignored or reserved on older CPUs.
+ pub unsafe fn enable_tsc_deadline(&mut self) {
+ self.set_lvt_timer(48u32 | ((LvtTimerMode::TscDeadline as u32) << 17));
+ }
+
+ /// Arm a one-shot timer interrupt for when the TSC reaches `deadline`.
+ ///
+ /// # Safety
+ /// Caller must have enabled TSC-deadline mode first (see
+ /// `enable_tsc_deadline`); per the SDM, writes to IA32_TSC_DEADLINE
+ /// are ignored when the LVT timer is not in TSC-deadline mode.
+ pub unsafe fn set_tsc_deadline(&self, deadline: u64) {
+ unsafe { x86::msr::wrmsr(x86::msr::IA32_TSC_DEADLINE, deadline); }
+ }
}
#[repr(u8)]
diff --git a/src/arch/x86_shared/idt.rs b/src/arch/x86_shared/idt.rs
index 50064585..47f692f6 100644
--- a/src/arch/x86_shared/idt.rs
+++ b/src/arch/x86_shared/idt.rs
@@ -78,6 +78,15 @@ static INIT_BSP_IDT: SyncUnsafeCell<Idt> = SyncUnsafeCell::new(Idt::new());
pub(crate) static IDTS: RwLock<HashMap<LogicalCpuId, &'static mut Idt>> =
RwLock::new(HashMap::with_hasher(DefaultHashBuilder::new()));
+/// Last-resort halt when the backup interrupt stack cannot be
+/// allocated during IDT setup: report the failure and park this CPU
+/// forever (replaces the previous `.expect(..)` panic).
+///
+/// `#[cold]` keeps this error path out of the hot code layout.
+/// NOTE(review): spins with `spin_loop` rather than a halt
+/// instruction; acceptable for a fatal path, but burns power —
+/// consider `hlt` if it is safe this early in interrupt setup.
+#[cold]
+fn halt_idt_init() -> ! {
+ println!("FATAL: failed to allocate physical pages for backup interrupt stack");
+ println!("Interrupt setup cannot continue. Halting.");
+ loop {
+ core::hint::spin_loop();
+ }
+}
+
#[inline]
pub fn is_reserved(cpu_id: LogicalCpuId, index: u8) -> bool {
if cpu_id == LogicalCpuId::BSP {
@@ -161,8 +170,10 @@ pub fn allocate_and_init_idt(cpu_id: LogicalCpuId) -> *mut Idt {
.or_insert_with(|| Box::leak(Box::new(Idt::new())));
use crate::memory::{RmmA, RmmArch};
- let frames = crate::memory::allocate_p2frame(4)
- .expect("failed to allocate pages for backup interrupt stack");
+ let frames = match crate::memory::allocate_p2frame(4) {
+ Some(frames) => frames,
+ None => halt_idt_init(),
+ };
// Physical pages are mapped linearly. So is the linearly mapped virtual memory.
let base_address = RmmA::phys_to_virt(frames.base());
diff --git a/src/context/context.rs b/src/context/context.rs
index c97c5166..62a1e0f5 100644
--- a/src/context/context.rs
+++ b/src/context/context.rs
@@ -103,6 +103,8 @@ pub struct Context {
/// Scheduler CPU affinity. If set, [`cpu_id`] can never be anything
/// other than this value, except [`None`].
pub sched_affinity: LogicalCpuSet,
+ /// Scheduling policy: 0=NORMAL (DWRR), 1=FIFO, 2=RR
+ pub sched_policy: u8,
/// Keeps track of whether this context is currently handling a syscall. Only up-to-date when
/// not running.
pub inside_syscall: bool,
@@ -148,6 +150,8 @@ pub struct Context {
pub euid: u32,
pub egid: u32,
pub pid: usize,
+ /// Supplementary group IDs for access control decisions.
+ pub groups: Vec<u32>,
// See [`PreemptGuard`]
//
@@ -204,6 +208,7 @@ impl Context {
euid: 0,
egid: 0,
pid: 0,
+ groups: Vec::new(),
#[cfg(feature = "syscall_debug")]
syscall_debug_info: crate::syscall::debug::SyscallDebugInfo::default(),
@@ -479,6 +484,7 @@ impl Context {
uid: self.euid,
gid: self.egid,
pid: self.pid,
+ groups: self.groups.clone(),
}
}
}
diff --git a/src/context/switch.rs b/src/context/switch.rs
index 86684c8f..0e31acee 100644
--- a/src/context/switch.rs
+++ b/src/context/switch.rs
@@ -408,9 +408,8 @@ fn select_next_context(
empty_queues = 0;
}
- if balance[i] < SCHED_PRIO_TO_WEIGHT[20] {
- // This queue does not have enough balance to run,
- // increment the balance!
+ if balance[i] < SCHED_PRIO_TO_WEIGHT[20] && i >= 10 {
+ // Non-RT queues must earn CPU time through DWRR balance
balance[i] += SCHED_PRIO_TO_WEIGHT[i];
continue;
}
@@ -476,6 +475,10 @@ fn select_next_context(
// We found a new process!
return Ok(Some(next_context_guard));
} else {
+ // Try to steal work from another CPU before going idle
+ if let Some(stolen) = try_steal_work(token, &contexts_list, cpu_id, switch_time) {
+ return Ok(Some(stolen));
+ }
if !was_idle && !Arc::ptr_eq(&prev_context_lock, &idle_context) {
// We switch into the idle context
Ok(Some(unsafe { idle_context.write_arc() }))
@@ -486,6 +489,51 @@ fn select_next_context(
}
}
+/// Try to steal a runnable context from another CPU's priority queues.
+/// Called when this CPU has no work and is about to go idle.
+///
+/// NOTE(review): several correctness concerns in this routine:
+/// 1. The selected context is never *removed* from the shared run
+///    queue — `q.front()` is only cloned — so it remains visible to
+///    other CPUs and could be scheduled twice. Confirm that
+///    `update_runnable` transitions it to Running in a way every other
+///    CPU re-checks before switching to it.
+/// 2. The inner `for _ in 0..len.min(8)` loop re-reads the *same*
+///    `q.front()` entry on every iteration (nothing is popped or
+///    rotated), so an entry that fails the upgrade/affinity check is
+///    re-examined up to 8 times instead of advancing to the next one.
+/// 3. `nr_running` is bumped by 1 on a successful steal but reset to 0
+///    unconditionally on failure; if it tracks this CPU's runnable
+///    count, both updates look inconsistent — verify intended meaning.
+fn try_steal_work(
+ token: &mut CleanLockToken,
+ _contexts_list: &[VecDeque<WeakContextRef>; 40],
+ cpu_id: LogicalCpuId,
+ switch_time: u128,
+) -> Option<ArcContextLockWriteGuard> {
+ use crate::context::run_contexts;
+ let percpu = crate::percpu::PercpuBlock::current();
+ let all_contexts = run_contexts(token.token());
+ let (contexts_data, _t) = all_contexts.into_split();
+ let queues = &contexts_data.set;
+
+ // Scan from the highest priority queue (39) down to the lowest (0).
+ for prio in (0..40).rev() {
+ let q = &queues[prio];
+ let len = q.len();
+ // Bound the scan so the idle path stays cheap.
+ for _ in 0..len.min(8) {
+ let context_ref = match q.front() {
+ Some(r) => r.clone(),
+ None => break,
+ };
+ // The weak ref may have been dropped concurrently.
+ let context_lock = match context_ref.upgrade() {
+ Some(l) => l,
+ None => continue,
+ };
+ let mut guard = unsafe { context_lock.write_arc() };
+ // Respect the task's CPU affinity mask.
+ if !guard.sched_affinity.contains(cpu_id) {
+ continue;
+ }
+ match unsafe { crate::context::switch::update_runnable(&mut guard, cpu_id, switch_time) } {
+ crate::context::switch::UpdateResult::CanSwitch => {
+ percpu.switch_internals.nr_running.set(
+ percpu.switch_internals.nr_running.get() + 1
+ );
+ return Some(guard);
+ }
+ _ => continue,
+ }
+ }
+ }
+ percpu.switch_internals.nr_running.set(0);
+ None
+}
+
/// Holds per-CPU state necessary for context switching.
///
/// This struct contains information such as the idle context, current context, and PIT tick counts,
@@ -494,6 +542,7 @@ pub struct ContextSwitchPercpu {
switch_result: Cell<Option<SwitchResultInner>>,
switch_time: Cell<u128>,
pit_ticks: Cell<usize>,
+ nr_running: Cell<usize>,
current_ctxt: RefCell<Option<Arc<ContextLock>>>,
@@ -508,6 +557,7 @@ impl ContextSwitchPercpu {
switch_result: Cell::new(None),
switch_time: Cell::new(0),
pit_ticks: Cell::new(0),
+ nr_running: Cell::new(0),
current_ctxt: RefCell::new(None),
idle_ctxt: RefCell::new(None),
being_sigkilled: Cell::new(false),