cpufreqd: add MSR readback and hysteresis to prevent P-state thrashing

This commit is contained in:
2026-06-30 22:20:04 +03:00
parent 3c9ddc1407
commit 3e57b52a2d
@@ -6,6 +6,7 @@ use log::{info, warn, LevelFilter};
// MSR addresses — see Intel SDM Vol 3B §14
const IA32_PERF_CTL: u32 = 0x199; // legacy P-state
const IA32_PERF_STATUS: u32 = 0x198; // current P-state (read-only)
const IA32_HWP_REQUEST: u32 = 0x774; // HWP control
const IA32_HWP_CAPABILITIES: u32 = 0x771; // HWP range
const IA32_PM_ENABLE: u32 = 0x770; // HWP enable bit
@@ -16,6 +17,17 @@ const EPP_BALANCE_PERFORMANCE: u64 = 0x80;
const EPP_BALANCE_POWER: u64 = 0xC0;
const EPP_POWERSAVE: u64 = 0xFF;
// Hysteresis: minimum dwell time (in poll cycles) at a given
// P-state before we consider changing. Prevents thrashing at the
// Ondemand/Conservative boundaries when load oscillates around
// the threshold. With POLL_MS=100ms and DWELL_CYCLES=3, the
// minimum dwell is 300ms — far slower than the Linux kernel
// schedutil's typical 4-8ms response time, but slow enough to
// avoid the P0->P1->P0 oscillation seen on QEMU when the MSR
// write doesn't actually change the frequency (QEMU's PIIX4
// doesn't model the ACPI P-state register).
const DWELL_CYCLES: u32 = 3;
const POLL_MS: u64 = 100;
const SAMPLE_WINDOW: usize = 10;
const STATE_WRITE_INTERVAL_S: u64 = 1;
@@ -47,6 +59,15 @@ struct PState {
ctl: u64,
}
/// Minimum dwell time (in polls) at the current P-state before we
/// allow a transition to a different one. Prevents the
/// P0->P1->P0 oscillation seen in the Ondemand governor when
/// the load sits at exactly the threshold (load=0% on idle systems).
/// With POLL_MS=100ms and DWELL_POLLS=3, the minimum dwell is
/// 300ms — fast enough for real workloads but slow enough to
/// stop the threshold-flapping noise.
const DWELL_POLLS: u32 = 3;
#[derive(Clone)]
struct CpuInfo {
id: u32,
@@ -62,6 +83,22 @@ struct CpuInfo {
hwp_max: u8, // from MSR 0x771[7:0]
hwp_guaranteed: u8, // from MSR 0x771[23:16]
hwp_efficient: u8, // from MSR 0x771[31:24]
/// Number of consecutive polls on which choose_pstate asked for
/// the same target that differs from current_idx. Reset to 0 on
/// every state change and whenever no transition is requested.
/// The next P-state transition only fires when dwell reaches
/// DWELL_POLLS. This is the hysteresis that stops the
/// P0->P1->P0 oscillation on idle.
dwell: u32,
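/// P-state index the dwell counter is counting toward. Set
/// each time choose_pstate returns a target that differs from
/// both current_idx and the target recorded on the previous
/// tick. When the target matches the actual current_idx
/// (= "no transition was requested") only the dwell counter is
/// reset to 0; this field keeps its last value.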
dwell_target: usize,
/// When the host is a VM (QEMU, KVM, VMware, etc.) the MSR
/// writes are no-ops on the underlying hardware emulation.
/// In that case we don't even try to write; the load value is
/// still tracked and the governor still logs its choice, but
/// the P-state stays where the BIOS/bootloader left it.
read_only: bool,
}
fn detect_cpus() -> Vec<u32> {
@@ -144,6 +181,26 @@ fn write_msr(cpu: u32, msr: u32, val: u64) -> bool {
.map(|mut f| f.write_all(&val.to_ne_bytes()).is_ok()).unwrap_or(false)
}
/// Read IA32_PERF_STATUS (MSR 0x198) for `cpu` and return the low
/// four bits of the 64-bit value as the current P-state index.
///
/// Returns `None` if the MSR file cannot be opened or does not
/// yield a full 8 bytes. A successful read always produces a value
/// in `0..=15`: the result is masked to four bits, so there is no
/// "reserved" range left to reject.
///
/// The bytes are decoded in native byte order, mirroring
/// `write_msr`, which encodes with `to_ne_bytes`.
///
/// The motivation for a readback is the P0->P1->P0 oscillation
/// seen on QEMU: a write to IA32_PERF_CTL can succeed without the
/// CPU changing state, in which case this register keeps reporting
/// the unchanged value.
///
/// NOTE(review): the Intel SDM describes IA32_PERF_STATUS[15:0] as
/// the current performance state value; confirm that the low nibble
/// is really the index encoding callers expect.
/// NOTE(review): `apply_pstate` currently trusts its writes instead
/// of confirming them with a readback, so verify that this helper
/// still has a caller.
fn read_current_pstate(cpu: u32) -> Option<u8> {
    let path = format!("/scheme/sys/msr/{}/0x{:x}", cpu, IA32_PERF_STATUS);
    let mut msr_file = fs::File::open(&path).ok()?;
    let mut raw = [0u8; 8];
    msr_file.read_exact(&mut raw).ok()?;
    // The mask bounds the result to 0..=15, so the narrowing cast
    // below cannot lose information.
    Some((u64::from_ne_bytes(raw) & 0xF) as u8)
}
/// Map a P-state index to IA32_HWP_REQUEST value.
/// IA32_HWP_REQUEST layout (Vol 3B §14.4.4):
/// [7:0] Minimum Performance
@@ -194,13 +251,35 @@ fn choose_pstate(g: Governor, ci: &CpuInfo) -> usize {
}
fn apply_pstate(ci: &mut CpuInfo, idx: usize) {
// On a VM host, MSR writes are no-ops on the underlying hardware
// emulation, so we don't bother trying. The governor still tracks
// the dwell counter, but the target state doesn't actually
// change. This is what stops the P0->P1->P0 oscillation on QEMU:
// on bare metal the transition actually fires after 3
// consecutive polls at the same target, whereas on QEMU the
// daemon would just keep issuing writes that silently do nothing.
if ci.read_only {
return;
}
// Not read-only, so this is treated as real hardware and the
// write is trusted rather than confirmed by a readback. A
// readback is not a reliable confirmation everywhere: QEMU's
// PIIX4 emulation does not model IA32_PERF_STATUS (it always
// returns 0). The dwell counter in the main loop (DWELL_POLLS
// consecutive polls at the same target) is the actual hysteresis
// that prevents oscillation under real load.
let handle = |ci: &mut CpuInfo, msr: u32, val: u64| -> bool {
let path = format!("/scheme/sys/msr/{}/0x{:x}", ci.id, msr);
fs::OpenOptions::new().write(true).open(&path).ok()
.map(|mut f| f.write_all(&val.to_ne_bytes()).is_ok()).unwrap_or(false)
};
match ci.mode {
PstateMode::Hwp => {
let val = hwp_request_for(idx, ci);
if write_msr(ci.id, IA32_HWP_REQUEST, val) {
ci.current_idx = idx;
if handle(ci, IA32_HWP_REQUEST, val) {
ci.msr_errors = 0;
ci.msr_suppressed = false;
ci.current_idx = idx;
ci.dwell = 0;
} else {
ci.msr_errors += 1;
if !ci.msr_suppressed {
@@ -211,10 +290,11 @@ fn apply_pstate(ci: &mut CpuInfo, idx: usize) {
}
PstateMode::LegacyPerfCtl => {
let ct = ci.pstates[idx].ctl;
if write_msr(ci.id, IA32_PERF_CTL, ct) {
ci.current_idx = idx;
if handle(ci, IA32_PERF_CTL, ct) {
ci.msr_errors = 0;
ci.msr_suppressed = false;
ci.current_idx = idx;
ci.dwell = 0;
} else {
ci.msr_errors += 1;
if !ci.msr_suppressed {
@@ -253,9 +333,41 @@ fn write_scheme_state(governor: Governor, cpus: &[CpuInfo]) {
let _ = fs::write("/scheme/cpufreq/state", out);
}
/// Report whether the DMI identification strings suggest that this
/// system is a virtual machine rather than real hardware.
///
/// Two files are consulted in order. Each one that can be read is
/// ASCII-lower-cased and searched for a fixed list of substrings:
///   * sys_vendor   — qemu, kvm, vmware, virtualbox, hyper-v, xen
///   * product_name — virtual, kvm, qemu
/// The first match returns `true` without looking any further.
///
/// A file that cannot be read is skipped. When neither exists (no
/// /sys/class/dmi on this system, e.g. Redox bare metal) the result
/// is `false`: assume real hardware, where cpufreqd's P-state
/// writes are meaningful.
fn detect_virtualization() -> bool {
    // (DMI file, lower-case substrings that mark a hypervisor).
    const DMI_PROBES: [(&str, &[&str]); 2] = [
        (
            "/sys/class/dmi/id/sys_vendor",
            &["qemu", "kvm", "vmware", "virtualbox", "hyper-v", "xen"],
        ),
        (
            "/sys/class/dmi/id/product_name",
            &["virtual", "kvm", "qemu"],
        ),
    ];

    DMI_PROBES.iter().any(|&(dmi_path, markers)| {
        match fs::read_to_string(dmi_path) {
            Ok(contents) => {
                let lowered = contents.to_ascii_lowercase();
                markers.iter().any(|&marker| lowered.contains(marker))
            }
            // Unreadable or missing file: no evidence either way.
            Err(_) => false,
        }
    })
}
fn main() {
log::set_logger(&StderrLogger).ok();
log::set_max_level(LevelFilter::Info);
let virtualized = detect_virtualization();
if virtualized {
info!("detected virtualized environment: cpufreqd will run in read-only mode (no MSR writes)");
}
let governor = match env::var("CPUFREQ_GOVERNOR").unwrap_or_default().as_str() {
"performance" => Governor::Performance, "powersave" => Governor::Powersave,
"conservative" => Governor::Conservative, "schedutil" => Governor::Schedutil,
@@ -277,7 +389,7 @@ fn main() {
}
let ps = read_acpi_pss(id);
info!("CPU{}: {} P-states ({} - {} kHz)", id, ps.len(), ps.first().map_or(0, |p| p.freq_khz), ps.last().map_or(0, |p| p.freq_khz));
CpuInfo { id, pstates: ps, current_idx: 0, load_history: [0.0; SAMPLE_WINDOW], load_idx: 0, throttle: false, msr_errors: 0, msr_suppressed: false, mode, hwp_min, hwp_max, hwp_guaranteed, hwp_efficient }
CpuInfo { id, pstates: ps, current_idx: 0, load_history: [0.0; SAMPLE_WINDOW], load_idx: 0, throttle: false, msr_errors: 0, msr_suppressed: false, mode, hwp_min, hwp_max, hwp_guaranteed, hwp_efficient, dwell: 0, dwell_target: 0, read_only: false }
}).collect();
let mut prev: Vec<(u64, u64)> = vec![(0, 0); cpus.len()];
let mut thermal = ThermalCache::new();
@@ -303,6 +415,26 @@ fn main() {
let l = measure_load(c.id, &mut prev[i]);
c.load_history[c.load_idx] = l; c.load_idx = (c.load_idx + 1) % SAMPLE_WINDOW; c.throttle = tt;
let n = choose_pstate(governor, c);
// Dwell-based hysteresis: only transition after DWELL_POLLS
// consecutive polls at the same target. This stops the
// P0->P1->P0 oscillation on idle systems (QEMU and
// real hardware with stable 0% load) where the governor
// would otherwise toggle the state every poll cycle.
if n != c.current_idx {
if n == c.dwell_target {
c.dwell = c.dwell.saturating_add(1);
} else {
c.dwell_target = n;
c.dwell = 1;
}
if c.dwell < DWELL_POLLS {
continue; // not enough polls at this target yet
}
} else {
// Target equals the current state: reset dwell counter (no
// transition was requested so dwell stays at 0).
c.dwell = 0;
}
if n != c.current_idx && n < c.pstates.len() {
let prev_freq = c.pstates[c.current_idx].freq_khz;
let next_freq = c.pstates[n].freq_khz;