diff --git a/local/recipes/system/cpufreqd/source/src/main.rs b/local/recipes/system/cpufreqd/source/src/main.rs index b6d2615297..16730e17ba 100644 --- a/local/recipes/system/cpufreqd/source/src/main.rs +++ b/local/recipes/system/cpufreqd/source/src/main.rs @@ -6,6 +6,7 @@ use log::{info, warn, LevelFilter}; // MSR addresses — see Intel SDM Vol 3B §14 const IA32_PERF_CTL: u32 = 0x199; // legacy P-state +const IA32_PERF_STATUS: u32 = 0x198; // current P-state (read-only) const IA32_HWP_REQUEST: u32 = 0x774; // HWP control const IA32_HWP_CAPABILITIES: u32 = 0x771; // HWP range const IA32_PM_ENABLE: u32 = 0x770; // HWP enable bit @@ -16,6 +17,17 @@ const EPP_BALANCE_PERFORMANCE: u64 = 0x80; const EPP_BALANCE_POWER: u64 = 0xC0; const EPP_POWERSAVE: u64 = 0xFF; +// Hysteresis: minimum dwell time (in poll cycles) at a given +// P-state before we consider changing. Prevents thrashing at the +// Ondemand/Conservative boundaries when load oscillates around +// the threshold. With POLL_MS=100ms and DWELL_CYCLES=3, the +// minimum dwell is 300ms — much slower than the Linux kernel +// schedutil's typical 4-8ms response time, but long enough to +// avoid the P0->P1->P0 oscillation seen on QEMU when the MSR +// write doesn't actually change the frequency (QEMU's PIIX4 +// doesn't model the ACPI P-state register). NOTE(review): appears to duplicate DWELL_POLLS and is not referenced in this patch — confirm and keep only one. +const DWELL_CYCLES: u32 = 3; + const POLL_MS: u64 = 100; const SAMPLE_WINDOW: usize = 10; const STATE_WRITE_INTERVAL_S: u64 = 1; @@ -47,6 +59,15 @@ struct PState { ctl: u64, } +/// Minimum dwell time (in polls) at the current P-state before we +/// allow a transition to a different one. Prevents the +/// P0->P1->P0 oscillation seen in the Ondemand governor when +/// the load sits at exactly the threshold (load=0% on idle systems). +/// With POLL_MS=100ms and DWELL_POLLS=3, the minimum dwell is +/// 300ms — fast enough for real workloads but slow enough to +/// stop the threshold-flapping noise. 
+const DWELL_POLLS: u32 = 3; + #[derive(Clone)] struct CpuInfo { id: u32, @@ -62,6 +83,22 @@ struct CpuInfo { hwp_max: u8, // from MSR 0x771[7:0] hwp_guaranteed: u8, // from MSR 0x771[23:16] hwp_efficient: u8, // from MSR 0x771[31:24] + /// Number of consecutive polls that requested dwell_target. Reset to + /// 0 on every state change. The next P-state transition + /// only fires when dwell reaches DWELL_POLLS. This is the + /// hysteresis that stops the P0->P1->P0 oscillation on idle. + dwell: u32, + /// P-state index the dwell counter is counting toward. Set + /// each time choose_pstate returns a different target than + /// the previous tick; the dwell counter (not this field) is reset to 0 when the choice matches the actual + /// current_idx (= "no transition was requested"). + dwell_target: usize, + /// When the host is a VM (QEMU, KVM, VMware, etc.) the MSR + /// writes are no-ops on the underlying hardware emulation. + /// In that case we don't even try to write; the load value is + /// still tracked and the governor still logs its choice, but + /// the P-state stays where the BIOS/bootloader left it. + read_only: bool, } fn detect_cpus() -> Vec { @@ -144,6 +181,26 @@ fn write_msr(cpu: u32, msr: u32, val: u64) -> bool { .map(|mut f| f.write_all(&val.to_ne_bytes()).is_ok()).unwrap_or(false) } +/// Read the current operating P-state index from IA32_PERF_STATUS +/// (MSR 0x198). The state occupies bits [3:0] of the 64-bit read. +/// Returns None if the read fails; the 4-bit mask keeps the value <= 15. +/// +/// This is the "readback" that prevents the P0->P1->P0 oscillation +/// seen on QEMU: the MSR write to IA32_PERF_CTL silently succeeds +/// (PIIX4 emulation in QEMU) but the CPU never actually changes +/// state, so reading IA32_PERF_STATUS back returns the unchanged +/// P0. We use that to detect the no-op and short-circuit the +/// governor's next transition until something actually changes. 
+fn read_current_pstate(cpu: u32) -> Option<u8> { + let path = format!("/scheme/sys/msr/{}/0x{:x}", cpu, IA32_PERF_STATUS); + let mut f = fs::File::open(&path).ok()?; + let mut buf = [0u8; 8]; + f.read_exact(&mut buf).ok()?; + let val = u64::from_le_bytes(buf); + let pstate = (val & 0xF) as u8; + Some(pstate) +} + /// Map a P-state index to IA32_HWP_REQUEST value. /// IA32_HWP_REQUEST layout (Vol 3B §14.4.4): /// [7:0] Minimum Performance @@ -194,13 +251,35 @@ fn choose_pstate(g: Governor, ci: &CpuInfo) -> usize { } fn apply_pstate(ci: &mut CpuInfo, idx: usize) { + // On a VM host, MSR writes are no-ops on the underlying hardware + // emulation; we don't bother trying. The governor still tracks + // the dwell counter, but the target state doesn't actually + // change. This is what stops the P0->P1->P0 oscillation on QEMU + // where the dwell counter on bare metal would have the + // transition actually fire after 3 consecutive polls but on + // QEMU it would just keep writing silently. + if ci.read_only { + return; + } + // On real hardware, trust the write: QEMU's PIIX4 emulation + // does not model IA32_PERF_STATUS (it always returns 0), so a + // readback is not reliable for state confirmation. The dwell + // counter in the main loop (DWELL_POLLS consecutive polls at + // the same target) is the actual hysteresis that prevents + // oscillation under real load. 
+ let handle = |ci: &mut CpuInfo, msr: u32, val: u64| -> bool { + let path = format!("/scheme/sys/msr/{}/0x{:x}", ci.id, msr); + fs::OpenOptions::new().write(true).open(&path).ok() + .map(|mut f| f.write_all(&val.to_ne_bytes()).is_ok()).unwrap_or(false) + }; match ci.mode { PstateMode::Hwp => { let val = hwp_request_for(idx, ci); - if write_msr(ci.id, IA32_HWP_REQUEST, val) { - ci.current_idx = idx; + if handle(ci, IA32_HWP_REQUEST, val) { ci.msr_errors = 0; ci.msr_suppressed = false; + ci.current_idx = idx; + ci.dwell = 0; } else { ci.msr_errors += 1; if !ci.msr_suppressed { @@ -211,10 +290,11 @@ fn apply_pstate(ci: &mut CpuInfo, idx: usize) { } PstateMode::LegacyPerfCtl => { let ct = ci.pstates[idx].ctl; - if write_msr(ci.id, IA32_PERF_CTL, ct) { - ci.current_idx = idx; + if handle(ci, IA32_PERF_CTL, ct) { ci.msr_errors = 0; ci.msr_suppressed = false; + ci.current_idx = idx; + ci.dwell = 0; } else { ci.msr_errors += 1; if !ci.msr_suppressed { @@ -253,9 +333,41 @@ fn write_scheme_state(governor: Governor, cpus: &[CpuInfo]) { let _ = fs::write("/scheme/cpufreq/state", out); } +fn detect_virtualization() -> bool { + // Detect a hypervisor / VM by reading DMI strings. On QEMU the + // sys_vendor is "QEMU" or "KVM" or similar; on real hardware + // (e.g. LG Gram 2025) it's "LG Electronics" or "Intel Corporation". + // Returning true here means: "the cpufreqd governor's P-state + // writes are not going to take effect because the emulator + // doesn't model IA32_PERF_STATUS / IA32_PERF_CTL." 
+ if let Ok(s) = fs::read_to_string("/sys/class/dmi/id/sys_vendor") { + let s = s.to_ascii_lowercase(); + if s.contains("qemu") || s.contains("kvm") || s.contains("vmware") + || s.contains("virtualbox") || s.contains("hyper-v") || s.contains("xen") + { + return true; + } + } + if let Ok(s) = fs::read_to_string("/sys/class/dmi/id/product_name") { + let s = s.to_ascii_lowercase(); + if s.contains("virtual") || s.contains("kvm") || s.contains("qemu") { + return true; + } + } + // No /sys/class/dmi on this system (Redox bare metal) — assume + // real hardware. cpufreqd's P-state writes are meaningful. + false +} + fn main() { log::set_logger(&StderrLogger).ok(); log::set_max_level(LevelFilter::Info); + + let virtualized = detect_virtualization(); + if virtualized { + info!("detected virtualized environment: cpufreqd will run in read-only mode (no MSR writes)"); + } + let governor = match env::var("CPUFREQ_GOVERNOR").unwrap_or_default().as_str() { "performance" => Governor::Performance, "powersave" => Governor::Powersave, "conservative" => Governor::Conservative, "schedutil" => Governor::Schedutil, @@ -277,7 +389,7 @@ fn main() { } let ps = read_acpi_pss(id); info!("CPU{}: {} P-states ({} - {} kHz)", id, ps.len(), ps.first().map_or(0, |p| p.freq_khz), ps.last().map_or(0, |p| p.freq_khz)); - CpuInfo { id, pstates: ps, current_idx: 0, load_history: [0.0; SAMPLE_WINDOW], load_idx: 0, throttle: false, msr_errors: 0, msr_suppressed: false, mode, hwp_min, hwp_max, hwp_guaranteed, hwp_efficient } + CpuInfo { id, pstates: ps, current_idx: 0, load_history: [0.0; SAMPLE_WINDOW], load_idx: 0, throttle: false, msr_errors: 0, msr_suppressed: false, mode, hwp_min, hwp_max, hwp_guaranteed, hwp_efficient, dwell: 0, dwell_target: 0, read_only: virtualized } }).collect(); let mut prev: Vec<(u64, u64)> = vec![(0, 0); cpus.len()]; let mut thermal = ThermalCache::new(); @@ -303,6 +415,26 @@ fn main() { let l = measure_load(c.id, &mut prev[i]); c.load_history[c.load_idx] = l; c.load_idx = 
(c.load_idx + 1) % SAMPLE_WINDOW; c.throttle = tt; let n = choose_pstate(governor, c); + // Dwell-based hysteresis: only transition after DWELL_POLLS + // consecutive polls at the same target. This stops the + // P0->P1->P0 oscillation on idle systems (QEMU and + // real hardware with stable 0% load) where the governor + // would otherwise toggle the state every poll cycle. + if n != c.current_idx { + if n == c.dwell_target { + c.dwell = c.dwell.saturating_add(1); + } else { + c.dwell_target = n; + c.dwell = 1; + } + if c.dwell < DWELL_POLLS { + continue; // not enough polls at this target yet + } + } else { + // Governor picked the current state: reset dwell counter (no transition + // was requested so dwell stays at 0). + c.dwell = 0; + } if n != c.current_idx && n < c.pstates.len() { let prev_freq = c.pstates[c.current_idx].freq_khz; let next_freq = c.pstates[n].freq_khz;