diff --git a/local/patches/kernel/P7-scheduler-improvements.patch b/local/patches/kernel/P7-scheduler-improvements.patch
new file mode 100644
index 00000000..93fe24b5
--- /dev/null
+++ b/local/patches/kernel/P7-scheduler-improvements.patch
@@ -0,0 +1,232 @@
+diff --git a/src/arch/x86_shared/device/local_apic.rs b/src/arch/x86_shared/device/local_apic.rs
+index b6afe02a..846d6760 100644
+--- a/src/arch/x86_shared/device/local_apic.rs
++++ b/src/arch/x86_shared/device/local_apic.rs
+@@ -78,7 +78,7 @@
+             self.write(0xF0, 0x100);
+         }
+         self.setup_error_int();
+-        //self.setup_timer();
++        self.setup_timer();
+ 
+         PercpuBlock::current()
+             .misc_arch_info
+@@ -262,6 +262,45 @@
+             self.set_lvt_error(vector);
+         }
+     }
++
++    /// Program the LVT timer: periodic mode on vector 48, divide-by-1,
++    /// initial count 0x10000 ticks.
++    pub unsafe fn setup_timer(&mut self) {
++        unsafe {
++            let timer_vector = 48u32;
++            // Bits 17:18 of the LVT timer register select the timer mode.
++            self.set_lvt_timer(timer_vector | ((LvtTimerMode::Periodic as u32) << 17));
++            // 0b1011 in the divide configuration register selects divide-by-1.
++            self.set_div_conf(0b1011);
++            self.set_init_count(0x10000);
++        }
++    }
++
++    /// Placeholder calibration: reports an assumed rate of 0x10000 APIC-timer
++    /// ticks per millisecond without touching the timer registers.
++    /// TODO: calibrate against the PIT or HPET instead of assuming a rate.
++    pub unsafe fn calibrate_timer(&mut self) -> u32 {
++        0x10000
++    }
++
++    /// Re-arm the periodic timer to fire `freq_hz` times per second, using
++    /// the ticks-per-millisecond rate reported by `calibrate_timer`.
++    pub unsafe fn set_timer_freq(&mut self, freq_hz: u32) {
++        let ticks_per_ms = self.calibrate_timer();
++        // ticks/ms * 1000 ms/s / freq (1/s) = ticks per period; max(1)
++        // guards against division by zero for a bogus freq_hz of 0.
++        self.set_init_count(ticks_per_ms * 1000 / freq_hz.max(1));
++    }
++
++    /// Switch the LVT timer on vector 48 into TSC-deadline mode.
++    pub unsafe fn enable_tsc_deadline(&mut self) {
++        self.set_lvt_timer(48u32 | ((LvtTimerMode::TscDeadline as u32) << 17));
++    }
++
++    /// Arm a one-shot timer interrupt for when the TSC reaches `deadline`.
++    pub unsafe fn set_tsc_deadline(&self, deadline: u64) {
++        unsafe { x86::msr::wrmsr(x86::msr::IA32_TSC_DEADLINE, deadline); }
++    }
+ }
+ 
+ #[repr(u8)]
+diff --git a/src/arch/x86_shared/idt.rs b/src/arch/x86_shared/idt.rs
+index 50064585..47f692f6 100644
+--- a/src/arch/x86_shared/idt.rs
++++ b/src/arch/x86_shared/idt.rs
+@@ -78,6 +78,17 @@
+ pub(crate) static IDTS: RwLock<HashMap<LogicalCpuId, &'static mut Idt, DefaultHashBuilder>> =
+     RwLock::new(HashMap::with_hasher(DefaultHashBuilder::new()));
+ 
++// Halt this CPU with a clear diagnostic instead of panicking: this runs so
++// early during interrupt setup that a panic could not be handled usefully.
++#[cold]
++fn halt_idt_init() -> ! {
++    println!("FATAL: failed to allocate physical pages for backup interrupt stack");
++    println!("Interrupt setup cannot continue. Halting.");
++    loop {
++        core::hint::spin_loop();
++    }
++}
++
+ #[inline]
+ pub fn is_reserved(cpu_id: LogicalCpuId, index: u8) -> bool {
+     if cpu_id == LogicalCpuId::BSP {
+@@ -161,8 +172,10 @@
+         .or_insert_with(|| Box::leak(Box::new(Idt::new())));
+ 
+     use crate::memory::{RmmA, RmmArch};
+-    let frames = crate::memory::allocate_p2frame(4)
+-        .expect("failed to allocate pages for backup interrupt stack");
++    let frames = match crate::memory::allocate_p2frame(4) {
++        Some(frames) => frames,
++        None => halt_idt_init(),
++    };
+ 
+     // Physical pages are mapped linearly. So is the linearly mapped virtual memory.
+     let base_address = RmmA::phys_to_virt(frames.base());
+diff --git a/src/context/context.rs b/src/context/context.rs
+index c97c5166..62a1e0f5 100644
+--- a/src/context/context.rs
++++ b/src/context/context.rs
+@@ -103,6 +103,8 @@
+     /// Scheduler CPU affinity. If set, [`cpu_id`] can except [`None`] never be anything else than
+     /// this value.
+     pub sched_affinity: LogicalCpuSet,
++    /// Scheduling policy: 0=NORMAL (DWRR), 1=FIFO, 2=RR
++    pub sched_policy: u8,
+     /// Keeps track of whether this context is currently handling a syscall. Only up-to-date when
+     /// not running.
+     pub inside_syscall: bool,
+@@ -148,6 +150,8 @@
+     pub euid: u32,
+     pub egid: u32,
+     pub pid: usize,
++    /// Supplementary group IDs for access control decisions.
++    pub groups: Vec<u32>,
+ 
+     // See [`PreemptGuard`]
+     //
+@@ -204,6 +208,8 @@
+             euid: 0,
+             egid: 0,
+             pid: 0,
++            sched_policy: 0,
++            groups: Vec::new(),
+ 
+             #[cfg(feature = "syscall_debug")]
+             syscall_debug_info: crate::syscall::debug::SyscallDebugInfo::default(),
+@@ -479,6 +485,7 @@
+             uid: self.euid,
+             gid: self.egid,
+             pid: self.pid,
++            groups: self.groups.clone(),
+         }
+     }
+ }
+diff --git a/src/context/switch.rs b/src/context/switch.rs
+index 86684c8f..0e31acee 100644
+--- a/src/context/switch.rs
++++ b/src/context/switch.rs
+@@ -408,9 +408,9 @@
+             empty_queues = 0;
+         }
+ 
+-        if balance[i] < SCHED_PRIO_TO_WEIGHT[20] {
+-            // This queue does not have enough balance to run,
+-            // increment the balance!
++        // Queues 0..10 are treated as real-time and may always run; non-RT
++        // queues (i >= 10) must first earn CPU time through DWRR balance.
++        if balance[i] < SCHED_PRIO_TO_WEIGHT[20] && i >= 10 {
+             balance[i] += SCHED_PRIO_TO_WEIGHT[i];
+             continue;
+         }
+@@ -476,6 +476,10 @@
+             // We found a new process!
+             return Ok(Some(next_context_guard));
+         } else {
++            // Before going idle, try to steal runnable work queued elsewhere.
++            if let Some(stolen) = try_steal_work(token, &contexts_list, cpu_id, switch_time) {
++                return Ok(Some(stolen));
++            }
+             if !was_idle && !Arc::ptr_eq(&prev_context_lock, &idle_context) {
+                 // We switch into the idle context
+                 Ok(Some(unsafe { idle_context.write_arc() }))
+@@ -486,6 +490,53 @@
+     }
+ }
+ 
++/// Try to steal a runnable context from the shared run queues.
++///
++/// Called when this CPU found no work of its own and is about to go idle.
++/// Scans queues from highest priority to lowest, inspecting at most 8
++/// candidates per queue, and claims the first context that is runnable and
++/// whose affinity mask allows this CPU.
++fn try_steal_work(
++    token: &mut CleanLockToken,
++    _contexts_list: &[VecDeque<ContextRef>; 40],
++    cpu_id: LogicalCpuId,
++    switch_time: u128,
++) -> Option<ArcRwLockWriteGuard<RawRwSpinlock, Context>> {
++    use crate::context::run_contexts;
++    let percpu = crate::percpu::PercpuBlock::current();
++    let all_contexts = run_contexts(token.token());
++    let (contexts_data, _t) = all_contexts.into_split();
++    let queues = &contexts_data.set;
++
++    for prio in (0..40).rev() {
++        // Iterate over the queue rather than repeatedly peeking `front()`:
++        // peeking never advances, so a single rejected candidate would
++        // otherwise be re-examined on every pass of the inner loop.
++        for context_ref in queues[prio].iter().take(8) {
++            let Some(context_lock) = context_ref.clone().upgrade() else {
++                // The context has been dropped; skip the stale queue entry.
++                continue;
++            };
++            let mut guard = unsafe { context_lock.write_arc() };
++            if !guard.sched_affinity.contains(cpu_id) {
++                continue;
++            }
++            if let UpdateResult::CanSwitch =
++                unsafe { update_runnable(&mut guard, cpu_id, switch_time) }
++            {
++                percpu
++                    .switch_internals
++                    .nr_running
++                    .set(percpu.switch_internals.nr_running.get() + 1);
++                return Some(guard);
++            }
++        }
++    }
++    // Nothing stealable anywhere: this CPU really is going idle.
++    percpu.switch_internals.nr_running.set(0);
++    None
++}
++
+ /// Holds per-CPU state necessary for context switching.
+ ///
+ /// This struct contains information such as the idle context, current context, and PIT tick counts,
+@@ -494,6 +545,8 @@
+     switch_result: Cell<Option<SwitchResult>>,
+     switch_time: Cell<u128>,
+     pit_ticks: Cell<usize>,
++    /// Runnable-context count for this CPU, maintained by work stealing.
++    nr_running: Cell<usize>,
+ 
+     current_ctxt: RefCell<Option<Arc<RwSpinlock<Context>>>>,
+ 
+@@ -508,6 +561,7 @@
+             switch_result: Cell::new(None),
+             switch_time: Cell::new(0),
+             pit_ticks: Cell::new(0),
++            nr_running: Cell::new(0),
+             current_ctxt: RefCell::new(None),
+             idle_ctxt: RefCell::new(None),
+             being_sigkilled: Cell::new(false),
diff --git a/recipes/core/kernel/recipe.toml b/recipes/core/kernel/recipe.toml
index a160407a..1c5dafb6 100644
--- a/recipes/core/kernel/recipe.toml
+++ b/recipes/core/kernel/recipe.toml
@@ -15,7 +15,7 @@
 [source]
 git = "https://gitlab.redox-os.org/redox-os/kernel.git"
 rev = "866dfad0"
-patches = ["../../../local/patches/kernel/redbear-consolidated.patch"]
+patches = ["../../../local/patches/kernel/redbear-consolidated.patch", "../../../local/patches/kernel/P7-scheduler-improvements.patch"]
 
 [build]
 template = "custom"