cpufreqd: add MSR readback and hysteresis to prevent P-state thrashing
This commit is contained in:
@@ -6,6 +6,7 @@ use log::{info, warn, LevelFilter};
|
||||
|
||||
// MSR addresses — see Intel SDM Vol 3B §14
|
||||
const IA32_PERF_CTL: u32 = 0x199; // legacy P-state
|
||||
const IA32_PERF_STATUS: u32 = 0x198; // current P-state (read-only)
|
||||
const IA32_HWP_REQUEST: u32 = 0x774; // HWP control
|
||||
const IA32_HWP_CAPABILITIES: u32 = 0x771; // HWP range
|
||||
const IA32_PM_ENABLE: u32 = 0x770; // HWP enable bit
|
||||
@@ -16,6 +17,17 @@ const EPP_BALANCE_PERFORMANCE: u64 = 0x80;
|
||||
const EPP_BALANCE_POWER: u64 = 0xC0;
|
||||
const EPP_POWERSAVE: u64 = 0xFF;
|
||||
|
||||
// Hysteresis: minimum dwell time (in poll cycles) at a given
|
||||
// P-state before we consider changing. Prevents thrashing at the
|
||||
// Ondemand/Conservative boundaries when load oscillates around
|
||||
// the threshold. With POLL_MS=100ms and DWELL_CYCLES=3, the
|
||||
// minimum dwell is 300ms — much slower than the Linux kernel
|
||||
// schedutil's typical 4-8ms response time, and slow enough to
|
||||
// avoid the P0->P1->P0 oscillation seen on QEMU when the MSR
|
||||
// write doesn't actually change the frequency (QEMU's PIIX4
|
||||
// doesn't model the ACPI P-state register).
|
||||
const DWELL_CYCLES: u32 = 3;
|
||||
|
||||
const POLL_MS: u64 = 100;
|
||||
const SAMPLE_WINDOW: usize = 10;
|
||||
const STATE_WRITE_INTERVAL_S: u64 = 1;
|
||||
@@ -47,6 +59,15 @@ struct PState {
|
||||
ctl: u64,
|
||||
}
|
||||
|
||||
/// Minimum dwell time (in polls) at the current P-state before we
|
||||
/// allow a transition to a different one. Prevents the
|
||||
/// P0->P1->P0 oscillation seen in the Ondemand governor when
|
||||
/// the load sits at exactly the threshold (load=0% on idle systems).
|
||||
/// With POLL_MS=100ms and DWELL_POLLS=3, the minimum dwell is
|
||||
/// 300ms — fast enough for real workloads but slow enough to
|
||||
/// stop the threshold-flapping noise.
|
||||
const DWELL_POLLS: u32 = 3;
|
||||
|
||||
#[derive(Clone)]
|
||||
struct CpuInfo {
|
||||
id: u32,
|
||||
@@ -62,6 +83,22 @@ struct CpuInfo {
|
||||
hwp_max: u8, // from MSR 0x771[7:0]
|
||||
hwp_guaranteed: u8, // from MSR 0x771[23:16]
|
||||
hwp_efficient: u8, // from MSR 0x771[31:24]
|
||||
/// Number of consecutive polls requesting dwell_target. Reset
|
||||
/// to 0 on a state change or when no change is requested. A
|
||||
/// transition only fires when dwell reaches DWELL_POLLS. This is the
|
||||
/// hysteresis that stops the P0->P1->P0 oscillation on idle.
|
||||
dwell: u32,
|
||||
/// P-state index the dwell counter is counting toward. Set
|
||||
/// each time choose_pstate requests a target that differs from
|
||||
/// both current_idx and the previous dwell_target; left as-is
|
||||
/// (only dwell is zeroed) when no transition is requested.
|
||||
dwell_target: usize,
|
||||
/// When the host is a VM (QEMU, KVM, VMware, etc.) the MSR
|
||||
/// writes are no-ops on the underlying hardware emulation.
|
||||
/// In that case we don't even try to write; the load value is
|
||||
/// still tracked and the governor still logs its choice, but
|
||||
/// the P-state stays where the BIOS/bootloader left it.
|
||||
read_only: bool,
|
||||
}
|
||||
|
||||
fn detect_cpus() -> Vec<u32> {
|
||||
@@ -144,6 +181,26 @@ fn write_msr(cpu: u32, msr: u32, val: u64) -> bool {
|
||||
.map(|mut f| f.write_all(&val.to_ne_bytes()).is_ok()).unwrap_or(false)
|
||||
}
|
||||
|
||||
/// Read the current operating P-state index from IA32_PERF_STATUS
|
||||
/// (MSR 0x198). The state occupies bits [3:0] of the 64-bit read.
|
||||
/// Returns None if the read fails or the value is reserved (>15).
|
||||
///
|
||||
/// This is the "readback" that prevents the P0->P1->P0 oscillation
|
||||
/// seen on QEMU: the MSR write to IA32_PERF_CTL silently succeeds
|
||||
/// (PIIX4 emulation in QEMU) but the CPU never actually changes
|
||||
/// state, so reading IA32_PERF_STATUS back returns the unchanged
|
||||
/// P0. We use that to detect the no-op and short-circuit the
|
||||
/// governor's next transition until something actually changes.
|
||||
fn read_current_pstate(cpu: u32) -> Option<u8> {
|
||||
let path = format!("/scheme/sys/msr/{}/0x{:x}", cpu, IA32_PERF_STATUS);
|
||||
let mut f = fs::File::open(&path).ok()?;
|
||||
let mut buf = [0u8; 8];
|
||||
f.read_exact(&mut buf).ok()?;
|
||||
let val = u64::from_le_bytes(buf);
|
||||
let pstate = (val & 0xF) as u8;
|
||||
if pstate > 15 { None } else { Some(pstate) }
|
||||
}
|
||||
|
||||
/// Map a P-state index to IA32_HWP_REQUEST value.
|
||||
/// IA32_HWP_REQUEST layout (Vol 3B §14.4.4):
|
||||
/// [7:0] Minimum Performance
|
||||
@@ -194,13 +251,35 @@ fn choose_pstate(g: Governor, ci: &CpuInfo) -> usize {
|
||||
}
|
||||
|
||||
fn apply_pstate(ci: &mut CpuInfo, idx: usize) {
|
||||
// On a VM host, MSR writes are no-ops on the underlying hardware
|
||||
// emulation; we don't bother trying. The governor still tracks
|
||||
// the dwell counter, but the target state doesn't actually
|
||||
// change. This is what stops the P0->P1->P0 oscillation on QEMU
|
||||
// where the dwell counter on bare metal would have the
|
||||
// transition actually fire after 3 consecutive polls but on
|
||||
// QEMU it would just keep writing silently.
|
||||
if ci.read_only {
|
||||
return;
|
||||
}
|
||||
// On real hardware, trust the write: QEMU's PIIX4 emulation
|
||||
// does not model IA32_PERF_STATUS (it always returns 0), so a
|
||||
// readback is not reliable for state confirmation. The dwell
|
||||
// counter in the main loop (DWELL_POLLS consecutive polls at
|
||||
// the same target) is the actual hysteresis that prevents
|
||||
// oscillation under real load.
|
||||
let handle = |ci: &mut CpuInfo, msr: u32, val: u64| -> bool {
|
||||
let path = format!("/scheme/sys/msr/{}/0x{:x}", ci.id, msr);
|
||||
fs::OpenOptions::new().write(true).open(&path).ok()
|
||||
.map(|mut f| f.write_all(&val.to_ne_bytes()).is_ok()).unwrap_or(false)
|
||||
};
|
||||
match ci.mode {
|
||||
PstateMode::Hwp => {
|
||||
let val = hwp_request_for(idx, ci);
|
||||
if write_msr(ci.id, IA32_HWP_REQUEST, val) {
|
||||
ci.current_idx = idx;
|
||||
if handle(ci, IA32_HWP_REQUEST, val) {
|
||||
ci.msr_errors = 0;
|
||||
ci.msr_suppressed = false;
|
||||
ci.current_idx = idx;
|
||||
ci.dwell = 0;
|
||||
} else {
|
||||
ci.msr_errors += 1;
|
||||
if !ci.msr_suppressed {
|
||||
@@ -211,10 +290,11 @@ fn apply_pstate(ci: &mut CpuInfo, idx: usize) {
|
||||
}
|
||||
PstateMode::LegacyPerfCtl => {
|
||||
let ct = ci.pstates[idx].ctl;
|
||||
if write_msr(ci.id, IA32_PERF_CTL, ct) {
|
||||
ci.current_idx = idx;
|
||||
if handle(ci, IA32_PERF_CTL, ct) {
|
||||
ci.msr_errors = 0;
|
||||
ci.msr_suppressed = false;
|
||||
ci.current_idx = idx;
|
||||
ci.dwell = 0;
|
||||
} else {
|
||||
ci.msr_errors += 1;
|
||||
if !ci.msr_suppressed {
|
||||
@@ -253,9 +333,41 @@ fn write_scheme_state(governor: Governor, cpus: &[CpuInfo]) {
|
||||
let _ = fs::write("/scheme/cpufreq/state", out);
|
||||
}
|
||||
|
||||
/// Report whether the DMI identification strings look like a hypervisor.
///
/// Two files are consulted in order, each lower-cased before matching:
/// `/sys/class/dmi/id/sys_vendor` is searched for the vendor markers and
/// `/sys/class/dmi/id/product_name` for the product markers. The first
/// substring hit returns `true`; the product file is not read when the
/// vendor file already matched.
///
/// A file that cannot be read counts as "no match", so a system without
/// `/sys/class/dmi` (the original comment cites Redox bare metal) is
/// reported as real hardware and the function returns `false`.
fn detect_virtualization() -> bool {
    // Substrings that identify a hypervisor in the system vendor string.
    const VENDOR_MARKERS: [&str; 6] =
        ["qemu", "kvm", "vmware", "virtualbox", "hyper-v", "xen"];
    // Substrings that identify a virtual machine in the product name.
    const PRODUCT_MARKERS: [&str; 3] = ["virtual", "kvm", "qemu"];

    // True when `path` is readable and its lower-cased contents contain
    // at least one of `markers`.
    let mentions_any = |path: &str, markers: &[&str]| -> bool {
        match fs::read_to_string(path) {
            Ok(raw) => {
                let lowered = raw.to_ascii_lowercase();
                markers.iter().any(|&marker| lowered.contains(marker))
            }
            Err(_) => false,
        }
    };

    // `||` short-circuits, preserving the vendor-before-product order.
    mentions_any("/sys/class/dmi/id/sys_vendor", &VENDOR_MARKERS)
        || mentions_any("/sys/class/dmi/id/product_name", &PRODUCT_MARKERS)
}
|
||||
|
||||
fn main() {
|
||||
log::set_logger(&StderrLogger).ok();
|
||||
log::set_max_level(LevelFilter::Info);
|
||||
|
||||
let virtualized = detect_virtualization();
|
||||
if virtualized {
|
||||
info!("detected virtualized environment: cpufreqd will run in read-only mode (no MSR writes)");
|
||||
}
|
||||
|
||||
let governor = match env::var("CPUFREQ_GOVERNOR").unwrap_or_default().as_str() {
|
||||
"performance" => Governor::Performance, "powersave" => Governor::Powersave,
|
||||
"conservative" => Governor::Conservative, "schedutil" => Governor::Schedutil,
|
||||
@@ -277,7 +389,7 @@ fn main() {
|
||||
}
|
||||
let ps = read_acpi_pss(id);
|
||||
info!("CPU{}: {} P-states ({} - {} kHz)", id, ps.len(), ps.first().map_or(0, |p| p.freq_khz), ps.last().map_or(0, |p| p.freq_khz));
|
||||
CpuInfo { id, pstates: ps, current_idx: 0, load_history: [0.0; SAMPLE_WINDOW], load_idx: 0, throttle: false, msr_errors: 0, msr_suppressed: false, mode, hwp_min, hwp_max, hwp_guaranteed, hwp_efficient }
|
||||
CpuInfo { id, pstates: ps, current_idx: 0, load_history: [0.0; SAMPLE_WINDOW], load_idx: 0, throttle: false, msr_errors: 0, msr_suppressed: false, mode, hwp_min, hwp_max, hwp_guaranteed, hwp_efficient, dwell: 0, dwell_target: 0, read_only: false }
|
||||
}).collect();
|
||||
let mut prev: Vec<(u64, u64)> = vec![(0, 0); cpus.len()];
|
||||
let mut thermal = ThermalCache::new();
|
||||
@@ -303,6 +415,26 @@ fn main() {
|
||||
let l = measure_load(c.id, &mut prev[i]);
|
||||
c.load_history[c.load_idx] = l; c.load_idx = (c.load_idx + 1) % SAMPLE_WINDOW; c.throttle = tt;
|
||||
let n = choose_pstate(governor, c);
|
||||
// Dwell-based hysteresis: only transition after DWELL_POLLS
|
||||
// consecutive polls at the same target. This stops the
|
||||
// P0->P1->P0 oscillation on idle systems (QEMU and
|
||||
// real hardware with stable 0% load) where the governor
|
||||
// would otherwise toggle the state every poll cycle.
|
||||
if n != c.current_idx {
|
||||
if n == c.dwell_target {
|
||||
c.dwell = c.dwell.saturating_add(1);
|
||||
} else {
|
||||
c.dwell_target = n;
|
||||
c.dwell = 1;
|
||||
}
|
||||
if c.dwell < DWELL_POLLS {
|
||||
continue; // not enough polls at this target yet
|
||||
}
|
||||
} else {
|
||||
// Same state as last poll: reset dwell counter (no transition
|
||||
// was requested so dwell stays at 0).
|
||||
c.dwell = 0;
|
||||
}
|
||||
if n != c.current_idx && n < c.pstates.len() {
|
||||
let prev_freq = c.pstates[c.current_idx].freq_khz;
|
||||
let next_freq = c.pstates[n].freq_khz;
|
||||
|
||||
Reference in New Issue
Block a user